diff --git a/magic_doc/contrib/magic_html/__init__.py b/magic_doc/contrib/magic_html/__init__.py
deleted file mode 100644
index 545571c..0000000
--- a/magic_doc/contrib/magic_html/__init__.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-import json
-from urllib.parse import urlparse
-from magic_doc.contrib.magic_html.extractors.article_extractor import ArticleExtractor
-from magic_doc.contrib.magic_html.extractors.weixin_extractor import WeixinExtractor
-from magic_doc.contrib.magic_html.extractors.forum_extractor import ForumExtractor
-from magic_doc.contrib.magic_html.extractors.custom_extractor import CustomExtractor
-
-
-class GeneralExtractor:
-    def __init__(self, config_path=""):
-        if config_path:
-            """
-            demo rule config file json:
-            {
-                "www.***.com": {
-                    "clean": ["//script", "//style"],
-                    "title": {
-                        "mode": "xpath",
-                        "value": "//div[@class='media-body']/h4/text()"
-                    },
-                    "content": {
-                        "mode": "xpath",
-                        "value": "//div[@class='message break-all']"
-                    }
-                }
-            }
-            """
-            try:
-                with open(config_path, 'r', encoding='utf-8') as f:
-                    self.rule = json.loads(f.read())
-            except:
-                pass
-        else:
-            self.rule = {}
-
-    def extract(self, html="", **kwargs) -> dict:
-        base_url = kwargs.get("base_url", "")
-        html_type = kwargs.pop("html_type", None)
-        if html_type:
-            if html_type == "forum":
-                return ForumExtractor().extract(html=html, **kwargs)
-            elif html_type == "weixin":
-                return WeixinExtractor().extract(html=html, **kwargs)
-        if base_url:
-            netloc = urlparse(base_url).netloc
-            if netloc in self.rule:
-                try:
-                    new_kwargs = dict()
-                    new_kwargs["rule"] = self.rule[netloc]
-                    new_kwargs.update(kwargs)
-                    return CustomExtractor().extract(html=html, **new_kwargs)
-                except:
-                    # fall back when the custom rules do not cover every board on the site
-                    return ArticleExtractor().extract(html=html, **kwargs)
-            if netloc == "mp.weixin.qq.com":
-                return WeixinExtractor().extract(html=html, **kwargs)
-        return ArticleExtractor().extract(html=html, **kwargs)
diff --git a/magic_doc/contrib/magic_html/config.py b/magic_doc/contrib/magic_html/config.py
deleted file mode 100644
index f70bbb2..0000000
--- a/magic_doc/contrib/magic_html/config.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# -*- coding:utf-8 -*-
-
-Unique_ID = "all_ids_pjtest_20300101_921b9a"
-
-PAYWALL_DISCARD_XPATH = [
-    """.//*[(self::div or self::p)][
-    contains(@id, "paywall") or contains(@id, "premium") or
-    contains(@class, "paid-content") or contains(@class, "paidcontent") or
-    contains(@class, "obfuscated") or contains(@class, "blurred") or
-    contains(@class, "restricted") or contains(@class, "overlay")
-    ]""",
-]
-
-OVERALL_DISCARD_XPATH = [
-    # navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
-    """.//*[(self::div or self::item or self::ul
-    or self::p or self::section or self::span)][
-    contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
-    or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or
-    contains(@id, "viral") or contains(@class, "viral") or
-    starts-with(@id, "shar") or starts-with(@class, "shar") or
-    contains(@class, "share-") or
-    contains(translate(@id, "S", "s"), "share") or
-    contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or
-    contains(@id, "syndication") or contains(@class, "syndication") or
-    starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or
-    contains(@class, "embedded") or contains(@class, "embed")
-    or contains(@id, "newsletter") or contains(@class, "newsletter")
-    or contains(@class, "subnav") or
-    contains(@id, "cookie") or contains(@class, "cookie") or contains(@id, "tags")
-    or contains(@class, "tags") or contains(@id, "sidebar") or
-    contains(@class, "sidebar") or contains(@id, "banner") or contains(@class, "banner")
-    or contains(@class, "meta") or
-    contains(@id, "menu") or contains(@class, "menu") or
-    contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav")
-    or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or
-    contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav")
-    or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or
-    contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or
-    contains(@id, "author") or contains(@class, "author") or
-    contains(@id, "button") or contains(@class, "button")
-    or contains(translate(@class, "B", "b"), "byline")
-    or contains(@class, "rating") or starts-with(@class, "widget") or
-    contains(@class, "attachment") or contains(@class, "timestamp") or
-    contains(@class, "user-info") or contains(@class, "user-profile") or
-    contains(@class, "-ad-") or contains(@class, "-icon")
-    or contains(@class, "article-infos") or
-    contains(translate(@class, "I", "i"), "infoline")
-    or contains(@data-component, "MostPopularStories")
-    or contains(@class, "outbrain") or contains(@class, "taboola")
-    or contains(@class, "criteo") or contains(@class, "options")
-    or contains(@class, "consent") or contains(@class, "modal-content")
-    or contains(@class, "paid-content") or contains(@class, "paidcontent")
-    or contains(@id, "premium-") or contains(@id, "paywall")
-    or contains(@class, "obfuscated") or contains(@class, "blurred")
-    or contains(@class, " ad ")
-    or contains(@class, "next-post")
-    or contains(@class, "yin") or contains(@class, "zlylin") or
-    contains(@class, "xg1") or contains(@id, "bmdh")
-    or @data-lp-replacement-content]""",
-    # hidden parts
-    """.//*[starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden")
-    or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint")
-    or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true"
-    or contains(@class, "notloaded")]""",
-    # comment debris
-    # or contains(@class, "message-container") or contains(@id, "message_container")
-    """.//*[@class="comments-title" or contains(@class, "comments-title") or
-    contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or
-    contains(@class, "-reply-") or contains(@class, "message") or contains(@id, "message_container")
-    or contains(@id, "akismet") or contains(@class, "akismet")] """,
-]
-
-TEASER_DISCARD_XPATH = [
-    """.//*[(self::div or self::item or self::ul
-    or self::p or self::section or self::span)][
-    contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser")
-    ]""",
-]
-
-PRECISION_DISCARD_XPATH = [
-    ".//header",
-    """.//*[(self::div or self::item or self::ul
-    or self::p or self::section or self::span)][
-    contains(@id, "bottom") or contains(@class, "bottom") or
-    contains(@id, "link") or contains(@class, "link")
-    or contains(@style, "border")
-    ]""",
-]
-
-DISCARD_IMAGE_ELEMENTS = [
-    """.//*[(self::div or self::item or self::ul
-    or self::p or self::section or self::span)][
-    contains(@id, "caption") or contains(@class, "caption")
-    ]
-    """
-]
-
-REMOVE_COMMENTS_XPATH = [
-    """.//*[(self::div or self::ul or self::section)][
-    starts-with(translate(@id, "C","c"), 'comment') or
-    starts-with(translate(@class, "C","c"), 'comment') or starts-with(translate(@name, "C","c"), 'comment') or
-    contains(@class, 'article-comments') or contains(@class, 'post-comments')
-    or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread')
-    or starts-with(@id, 'dsq-comments')
-    ]"""
-]
-
-CONTENT_EXTRACTOR_NOISE_XPATHS = [
-    # '//div[contains(@class, "comment") or contains(@name, "comment") or contains(@id, "comment")]',
-    '//div[starts-with(@class, "advert") or starts-with(@name, "advert") or starts-with(@id, "advert")]',
-    '//div[contains(@style, "display: none")]',
-    '//div[contains(@style, "display:none")]',
-]
-
-# keep images, audio and video
-MANUALLY_CLEANED = [
-    "aside",
-    "embed",
-    "footer",
-    "head",
-    "iframe",
-    "menu",
-    "object",
-    "script",
-    "applet",
-    "canvas",
-    "map",
-    "svg",
-    "area",
-    "blink",
-    "button",
-    "datalist",
-    "dialog",
-    "frame",
-    "frameset",
-    "fieldset",
-    "link",
-    "input",
-    "ins",
-    "label",
-    "legend",
-    "marquee",
-    "menuitem",
-    "nav",
-    "noscript",
-    "optgroup",
-    "option",
-    "output",
-    "param",
-    "progress",
-    "rp",
-    "rt",
-    "rtc",
-    "select",
-    "style",
-    "track",
-    "textarea",
-    "time",
-    "use",
-]
-
-MANUALLY_STRIPPED = [
-    "abbr",
-    "acronym",
-    "address",
-    "bdi",
-    "bdo",
-    "big",
-    "cite",
-    "data",
-    "dfn",
-    "font",
-    "hgroup",
-    "ins",
-    "mark",
-    "meta",
-    "ruby",
-    "small",
-    "tbody",
-    "template",
-    "tfoot",
-    "thead",
-]
-
-CUT_EMPTY_ELEMS = {
-    "article",
-    "b",
-    "blockquote",
-    "dd",
-    "div",
-    "dt",
-    "em",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "i",
-    "li",
-    "main",
-    "p",
-    "pre",
-    "q",
-    "section",
-    "span",
-    "strong",
-}
-
-USELESS_ATTR = [
-    "share",
-    "contribution",
-    "copyright",
-    "copy-right",
-    "disclaimer",
-    "recommend",
-    "related",
-    "footer",
-    "social",
-    "submeta",
-    "report-infor",
-]
-
-BODY_XPATH = [
-    """.//*[(self::article or self::div or self::main or self::section)][
-    @class="post" or @class="entry" or
-    contains(@class, "post-text") or contains(@class, "post_text") or
-    contains(@class, "post-body") or contains(@class, "post-entry") or contains(@class, "postentry") or
-    contains(@class, "post-content") or contains(@class, "post_content") or
-    contains(@class, "postcontent") or contains(@class, "postContent") or
-    contains(@class, "article-text") or contains(@class, "articletext") or contains(@class, "articleText")
-    or contains(@id, "entry-content") or
-    contains(@class, "entry-content") or contains(@id, "article-content") or
-    contains(@class, "article-content") or contains(@id, "article__content") or
-    contains(@class, "article__content") or contains(@id, "article-body") or
-    contains(@class, "article-body") or contains(@id, "article__body") or
-    contains(@class, "article__body") or @itemprop="articleBody" or
-    contains(translate(@id, "B", "b"), "articlebody") or contains(translate(@class, "B", "b"), "articlebody")
-    or @id="articleContent" or contains(@class, "ArticleContent") or
-    contains(@class, "page-content") or contains(@class, "text-content") or
-    contains(@id, "body-text") or contains(@class, "body-text") or contains(@class, "body-content") or contains(translate(@class, "B", "b"), "textbody") or
-    contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]""",
-    "(.//article)[1]",
-    """(.//*[(self::article or self::div or self::main or self::section)][
-    contains(@class, 'post-bodycopy') or
-    contains(@class, 'storycontent') or contains(@class, 'story-content') or
-    @class='postarea' or @class='art-postcontent' or
-    contains(@class, 'theme-content') or contains(@class, 'blog-content') or
-    contains(@class, 'section-content') or contains(@class, 'single-content') or
-    contains(@class, 'single-post') or
-    contains(@class, 'main-column') or contains(@class, 'wpb_text_column') or
-    starts-with(@id, 'primary') or starts-with(@class, 'article ') or @class="text" or
-    @id="article" or @class="cell" or @id="story" or @class="story" or
-    contains(@class, "story-body") or contains(@class, "field-body") or
-    contains(translate(@class, "FULTEX","fultex"), "fulltext")
-    or @role='article'])[1]""",
-    """(.//*[(self::article or self::div or self::main or self::section)][
-    contains(@id, "content-main") or contains(@class, "content-main") or contains(@class, "content_main") or
-    contains(@id, "content-body") or contains(@class, "content-body") or contains(@id, "contentBody")
-    or contains(@class, "content__body") or contains(translate(@id, "CM","cm"), "main-content") or contains(translate(@class, "CM","cm"), "main-content")
-    or contains(translate(@class, "CP","cp"), "page-content") or
-    @id="content" or @class="content"])[1]""",
-    '(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]',
-]
-
-Forum_XPATH = [
-    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
-    contains(@id, 'question') or contains(@class, 'question')]""",
-    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
-    contains(@id, 'answer') or contains(@class, 'answer')]""",
-    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
-    contains(@id, 'comment') or contains(@class, 'comment') or contains(@class, 'Comment')]""",
-    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][contains(@class, "message-container") or contains(@id, "message_container") or contains(@class, "Messages_container")]""",
-    """.//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][
-    contains(@id, 'comment-content') or contains(@class, 'comment-content') or contains(@class, 'comment-body') or contains(@class, 'comment-body') or contains(@class, "post-reply") or contains(@class, "reply_content") or contains(@class, "reply-content") or contains(@class, "reply_post") or contains(@class, "post-reply") or contains(@id, "reply") or contains(@class, "post-text") or contains(@class, "post_text") or
-    contains(@class, "post-body") or contains(@class, "postbody") or contains(@class, "post-entry") or contains(@class, "postentry") or contains(@component, 'post') or
-    contains(@class, "post-content") or contains(@class, "post_content") or contains(@class, "p_content") or contains(@class, "Post_content") or contains(@class, "message-post") or contains(@class, "js-post")]""",
-    # ids of the form post- followed by digits
-    """.//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][contains(@id, 'post-') or contains(@id, 'post_')]"""
-]
-
-METAS = [
-    '//meta[starts-with(@property, "og:title")]/@content',
-    '//meta[starts-with(@name, "og:title")]/@content',
-    '//meta[starts-with(@property, "title")]/@content',
-    '//meta[starts-with(@name, "title")]/@content',
-    '//meta[starts-with(@property, "page:title")]/@content',
-    '//meta[starts-with(@name, "page:title")]/@content',
-]
diff --git a/magic_doc/contrib/magic_html/extractors/__init__.py b/magic_doc/contrib/magic_html/extractors/__init__.py
deleted file mode 100644
index 380474e..0000000
--- a/magic_doc/contrib/magic_html/extractors/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# -*- coding:utf-8 -*-
diff --git a/magic_doc/contrib/magic_html/extractors/article_extractor.py b/magic_doc/contrib/magic_html/extractors/article_extractor.py
deleted file mode 100644
index 69795d6..0000000
--- a/magic_doc/contrib/magic_html/extractors/article_extractor.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from magic_doc.contrib.magic_html.utils import *
-from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
-from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
-
-
-class ArticleExtractor(BaseExtractor):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def extract(self, html="", base_url="") -> dict:
-        html = html.replace(" ", " ").replace(" ", " ")
-        tree = load_html(html)
-        if tree is None:
-            raise ValueError
-
-        title = TitleExtractor().process(tree)
-
-        # base_url
-        base_href = tree.xpath("//base/@href")
-
-        if base_href and "http" in base_href[0]:
-            base_url = base_href[0]
-
-        # tag conversion, with extra handling for math tags
-        format_tree = self.convert_tags(tree, base_url=base_url)
-
-        # drop script/style tags together with their content
-        normal_tree = self.clean_tags(format_tree)
-
-        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
-        if xp_num == "others":
-            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
-        body_html = self.get_content_html(subtree, xp_num, base_url)
-
-        return {
-            "xp_num": xp_num,
-            "drop_list": drop_list,
-            "html": body_html,
-            "title": title,
-            "base_url": base_url,
-        }
diff --git a/magic_doc/contrib/magic_html/extractors/base_extractor.py b/magic_doc/contrib/magic_html/extractors/base_extractor.py
deleted file mode 100644
index 252b218..0000000
--- a/magic_doc/contrib/magic_html/extractors/base_extractor.py
+++ /dev/null
@@ -1,844 +0,0 @@
-# -*- coding:utf-8 -*-
-
-import html
-from collections import defaultdict
-from copy import deepcopy
-from urllib.parse import unquote, urljoin
-from lxml.etree import Comment, strip_elements
-from magic_doc.contrib.magic_html.config import *
-from magic_doc.contrib.magic_html.readability_plus import Document as DocumentPlus
-from magic_doc.contrib.magic_html.utils import *
-
-
-class BaseExtractor:
-    def __init__(self):
-        self.drop_ids = []
-        self.need_comment = False
-
-    def xp_1_5(self, tree: HtmlElement):
-        drop_list = False
-        xp_num = "others"
-        result_body = Element("body")
-
-        for idx, expr in enumerate(BODY_XPATH):
-            try:
-                subtree = tree.xpath(expr)[0]
-                xp_num = str(idx + 1)
-            except IndexError:
-                continue
-
-            subtree, drop_list = self.prune_unwanted_sections(subtree)
-
-            if len(subtree) == 0:
-                xp_num = "others"
-                continue
-
-            ptest = subtree.xpath(".//text()[not(ancestor::a)]")
-            ptest_len = text_len("".join(ptest))
-            all_text_len = text_len(
-                "".join(tree.xpath("//p//text()[not(ancestor::a)]"))
-            )
-            if drop_list:
-                if ptest_len <= 50:
-                    if all_text_len > 100:
-                        xp_num = "others"
-                        continue
-            else:
-                if ptest_len <= 20:
-                    if all_text_len > 100:
-                        xp_num = "others"
-                        continue
-            result_body.append(subtree)
-            return result_body, xp_num, drop_list
-
-        return result_body, xp_num, drop_list
-
-    def get_content_html(self, cleaned_tree_backup, xp_num="others", base_url=""):
-        # readability_plus
-        doc = DocumentPlus(
-            cleaned_tree_backup,
-            url=base_url,
-            xp_num=xp_num,
-            need_comment=self.need_comment,
-        )
-        body = doc.summary(html_partial=True)
-
-        return body
-
-    def prune_unwanted_nodes(self, tree, nodelist, with_backup=False):
-        if with_backup is True:
-            old_len = len(tree.text_content())
-            backup = deepcopy(tree)
-        for expr in nodelist:
-            for subtree in tree.xpath(expr):
-
-                # DISCARD_IMAGE_ELEMENTS needs a special check
-                if '"caption"' in expr and subtree.xpath(".//img"):
-                    continue
-                # some styles only scroll-hide content
-                if "hidden" in expr:
-                    try:
-                        if re.findall(
-                            "overflow-x:\s*hidden", subtree.attrib["style"]
-                        ) or re.findall(
-                            "overflow-y:\s*hidden", subtree.attrib["style"]
-                        ):
-                            continue
-                        if re.findall(
-                            "overflow:\s*hidden", subtree.attrib["style"]
-                        ) and re.findall("height:", subtree.attrib["style"]):
-                            height_px = re.findall(
-                                "height:\s*(\d+)", subtree.attrib["style"]
-                            )[0]
-                            if int(height_px) >= 800:
-                                continue
-                    except:
-                        pass
-                self.remove_node(subtree)
-        if with_backup is False:
-            return tree
-        # else:
-        new_len = len(tree.text_content())
-        if new_len > old_len / 7:
-            return tree
-        return backup
-
-    def prune_html(self, tree):
-        """Delete selected empty elements"""
-        for element in tree.xpath(".//*[not(node())]"):
-            if element.tag in CUT_EMPTY_ELEMS:
-                self.remove_node(element)
-        return tree
-
-    def remove_node(self, node: HtmlElement):
-        parent = node.getparent()
-        if text_strip(node.tail):
-            previous = node.getprevious()
-            if previous is None:
-                if parent is not None:
-                    if text_strip(parent.text):
-                        parent.text = "".join([parent.text, node.tail])
-                    else:
-                        parent.text = node.tail
-            else:
-                if text_strip(previous.tail):
-                    previous.tail = "".join([previous.tail, node.tail])
-                else:
-                    previous.tail = node.tail
-
-        if parent is not None:
-            idx = node.attrib.get(Unique_ID, "")
-            parent.remove(node)
-            if idx:
-                self.drop_ids.append(int(idx))
-
-    def clean_tags(self, tree):
-        strip_elements(tree, Comment)
-
-        xp_lists = []
-        if not self.need_comment:
-            xp_lists.append(REMOVE_COMMENTS_XPATH)
-        xp_lists.append(CONTENT_EXTRACTOR_NOISE_XPATHS)
-        for xp_list in xp_lists:
-            tree = self.prune_unwanted_nodes(tree, xp_list)
-
-        cleaning_list, stripping_list = (
-            MANUALLY_CLEANED.copy(),
-            MANUALLY_STRIPPED.copy(),
-        )
-
-        for elem in tree.xpath(".//figure[descendant::table]"):
-            elem.tag = "div"
-
-        for expression in cleaning_list + ["form"]:
-            for element in tree.getiterator(expression):
-                # special handling for form tags
-                if element.tag == "form":
-                    ptest = element.xpath(".//text()[not(ancestor::a)]")
-                    if text_len("".join(ptest)) <= 60:  # 50
-                        self.remove_node(element)
-                else:
-                    self.remove_node(element)
-
-        HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
-        cleaned_tree = HTML_CLEANER.clean_html(self.prune_html(tree))
-
-        return cleaned_tree
-
-    def generate_unique_id(self, element):
-        idx = 0
-        for node in iter_node(element):
-            l_tag = node.tag.lower()
-            if l_tag not in ["html", "body"]:
-                node.attrib[Unique_ID] = str(idx)
-                idx += 1
-
-    def clean_unique_id(self, raw_element, content_html):
-        ids = re.findall(f' {Unique_ID}="(\d+)"', content_html)
-        self.drop_ids = list(set(self.drop_ids))
-        self.drop_ids.sort()
-        skip_ids = [-1]
-        for x in ids:
-            if int(x) > int(skip_ids[-1]):
-                skip_ids.append(int(x))
-                drop_node = raw_element.xpath(
-                    f"//*[@{Unique_ID}='{x}']"
-                )
-                if drop_node:
-                    new_div = Element("div")
-                    for j in self.drop_ids:
-                        if int(j) > int(skip_ids[-1]):
-                            append_element = drop_node[0].xpath(
-                                f".//*[@{Unique_ID}='{j}']"
-                            )
-                            if append_element:
-                                skip_ids.append(j)
-                                if len(append_element[0]) > 0:
-                                    skip_ids.extend(
-                                        [
-                                            int(pjid)
-                                            for pjid in append_element[0].xpath(
-                                                f".//*/@{Unique_ID}"
-                                            )
-                                        ]
-                                    )
-                                append_element[0].tail = None
-                                new_div.append(append_element[0])
-
-                    try:
-                        drop_node[0].addnext(new_div)
-                        parent = drop_node[0].getparent()
-                        if parent is not None:
-                            parent.remove(drop_node[0])
-                    except:
-                        pass
-
-        content_html = re.sub(f' {Unique_ID}="\d+"', "", content_html)
-
-        drop_html = re.sub(
-            f' {Unique_ID}="\d+"',
-            "",
-            tostring(raw_element, encoding=str),
-        )
-        return content_html, drop_html
-
-    def math_latex_processing(self, node):
-        # 1. text containing \\begin{align} or \\begin{equation}
-        if node.tag not in ["script", "style"] and text_strip(node.text):
-            regex = r"\\begin{align}(.*?)\\end{align}"
-            text = node.text
-            matches = re.findall(regex, text, re.DOTALL)
-            if matches:
-                node.text = text.replace("\\begin{align}", "").replace(
-                    "\\end{align}", ""
-                )
-
-        if node.tag not in ["script", "style"] and text_strip(node.text):
-            regex = r"\\begin{equation}(.*?)\\end{equation}"
-            text = node.text
-            matches = re.findall(regex, text, re.DOTALL)
-            for match in matches:
-                match = match.replace("\\begin{equation}", "")
-                match = match.replace("\\end{equation}", "")
-                wrapped_text = wrap_math(match, display=True)
-                text = text.replace(match, wrapped_text)
-            if matches:
-                # Remove the \begin{equation} and \end{equation} tags
-                text = text.replace("\\begin{equation}", "").replace(
-                    "\\end{equation}", ""
-                )
-                node.text = text
-
-        if node.tag not in ["script", "style"] and text_strip(node.tail):
-            regex = r"\\begin{align}(.*?)\\end{align}"
-            text = node.tail
-            matches = re.findall(regex, text, re.DOTALL)
-            if matches:
-                node.tail = text.replace("\\begin{align}", "").replace(
-                    "\\end{align}", ""
-                )
-
-        if node.tag not in ["script", "style"] and text_strip(node.tail):
-            regex = r"\\begin{equation}(.*?)\\end{equation}"
-            text = node.tail
-            matches = re.findall(regex, text, re.DOTALL)
-            for match in matches:
-                match = match.replace("\\begin{equation}", "")
-                match = match.replace("\\end{equation}", "")
-                wrapped_text = wrap_math(match, display=True)
-                text = text.replace(match, wrapped_text)
-            if matches:
-                # Remove the \begin{equation} and \end{equation} tags
-                text = text.replace("\\begin{equation}", "").replace(
-                    "\\end{equation}", ""
-                )
-                node.tail = text
-
-        node_class = node.get("class")
-
-        parent = node.getparent()
-
-        # 2. tags whose class is texerror
-        # Find the text between {} (maximum length) and replace the texerror with that text
-
-        # 3. LaTeX inside img tags
-        if node.tag == "img":
-            if node_class:
-                class_list = node_class.split(" ")
-                if any(
-                    [img_class in class_list for img_class in latex_image_class_names]
-                ):
-                    alt = node.get("alt")
-                    if text_strip(alt):
-                        new_span = Element("span")
-                        wrapped_alt = wrap_math(alt)
-                        new_span.text = wrapped_alt
-                        node.addprevious(new_span)
-            src = node.get("src")
-            if src:
-                if "codecogs.com" in src:
-                    try:
-                        latex = src.split("?")[1:]
-                        latex = "?".join(
-                            latex
-                        )  # In case there are multiple ? in the latex
-                        latex = unquote(latex)
-                        new_span = Element("span")
-                        wrapped_latex = wrap_math(latex)
-                        new_span.text = wrapped_latex
-                        node.addprevious(new_span)
-                    except:
-                        pass
-                if "latex.php" in src:
-                    try:
-                        # they usually have "alt='-i u_t + \Delta u = |u|^2 u'"
-                        alt = node.get("alt")
-                        if text_strip(alt):
-                            # Unescape the latex
-                            alt = unquote(alt)
-                            # Get the latex
-                            wrapped_alt = wrap_math(alt)
-                            new_span = Element("span")
-                            new_span.text = wrapped_alt
-                            node.addprevious(new_span)
-                    except:
-                        pass
-                if "/images/math/codecogs" in src:
-                    try:
-                        # they usually have "alt='-i u_t + \Delta u = |u|^2 u'"
-                        alt = node.get("alt")
-                        if text_strip(alt):
-                            # Unescape the latex
-                            alt = unquote(alt)
-                            # Get the latex
-                            wrapped_alt = wrap_math(alt)
-                            new_span = Element("span")
-                            new_span.text = wrapped_alt
-                            node.addprevious(new_span)
-                    except:
-                        pass
-                if "mimetex.cgi" in src:
-                    try:
-                        latex = src.split("?")[1:]
-                        latex = "?".join(
-                            latex
-                        )  # In case there are multiple ? in the latex
-                        latex = unquote(latex)
-                        new_span = Element("span")
-                        wrapped_latex = wrap_math(latex)
-                        new_span.text = wrapped_latex
-                        node.addprevious(new_span)
-                    except:
-                        pass
-                if "mathtex.cgi" in src:
-                    try:
-                        latex = src.split("?")[1:]
-                        latex = "?".join(
-                            latex
-                        )  # In case there are multiple ? in the latex
-                        latex = unquote(latex)
-                        new_span = Element("span")
-                        wrapped_latex = wrap_math(latex)
-                        new_span.text = wrapped_latex
-                        node.addprevious(new_span)
-                    except:
-                        pass
-            if node_class:
-                if "x-ck12" in node_class:
-                    try:
-                        latex = node.get("alt")
-                        if text_strip(latex):
-                            latex = unquote(latex)
-                            new_span = Element("span")
-                            wrapped_latex = wrap_math(latex)
-                            new_span.text = wrapped_latex
-                            node.addprevious(new_span)
-                    except:
-                        pass
-
-        # 4. class math-container
-        if node_class == "math-container":
-            try:
-                text = node.text
-                if text_strip(text):
-                    new_span = Element("span")
-                    wrapped_math = wrap_math(text, display=True)
-                    new_span.text = wrapped_math
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                pass
-
-        # 5. class wp-katex-eq
-        if node_class == "wp-katex-eq":
-            try:
-                text = node.text
-                if text_strip(text):
-                    new_span = Element("span")
-                    display_attr = node.get("data-display")
-                    if display_attr is not None:
-                        display = display_attr == "true"
-                    else:
-                        display = False
-                    wrapped_math = wrap_math(text, display=display)
-                    new_span.text = wrapped_math
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                pass
-
-        # 6. script[type="math/tex"]
-        if node.tag == "script" and node.get("type") == "math/tex":
-            try:
-                text = node.text
-                if text_strip(text):
-                    new_span = Element("span")
-                    wrapped_text = wrap_math(text)
-                    new_span.text = wrapped_text
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                pass
-
-        # 7. script[type="math/asciimath"]
-        if node.tag == "script" and node.get("type") == "math/asciimath":
-            try:
-                text = node.text
-                if text_strip(text):
-                    new_span = Element("span")
-                    wrapped_asciimath = wrap_math(extract_asciimath(text))
-                    new_span.text = wrapped_asciimath
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                # Delete this script tag
-                self.remove_node(node)
-
-        # 8. class tex
-        if node_class == "tex":
-            try:
-                # Check if they have data-expr attr
-                expr = node.get("data-expr")
-                if text_strip(expr):
-                    # Replace with a span
-                    new_span = Element("span")
-                    wrapped_expr = wrap_math(expr)
-                    new_span.text = wrapped_expr
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                pass
-
-        # 9. span.katex
-        if node.tag == "span" and node_class == "katex":
-            # Find any spans with class "katex-html" and remove them
-            katex_html_spans = node.xpath('.//span[@class="katex-html"]')
-            for katex_html_span in katex_html_spans:
-                self.remove_node(katex_html_span)
-
-        # 10. Remove any .MathJax_Preview spans
-        if node.tag == "span" and node_class == "MathJax_Preview":
-            self.remove_node(node)
-
-        if node.tag == "span" and node_class and "x-ck12-mathEditor" in node_class:
-            try:
-                expr = node.get("data-tex")
-                if text_strip(expr):
-                    expr = unquote(expr).replace("\"", "").replace("&quot;", "")
-                    # Replace with a span
-                    new_span = Element("span")
-                    wrapped_expr = wrap_math(expr)
-                    new_span.text = wrapped_expr
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                pass
-
-        # 11. all math tags
-        if node.tag == "math":
-            annotation_tags = node.xpath('.//annotation[@encoding="application/x-tex"]')
-            if len(annotation_tags) > 0:
-                annotation_tag = annotation_tags[0]
-                text = annotation_tag.text
-                if text_strip(text):
-                    new_span = Element("span")
-                    wrapped_text = wrap_math(text)
-                    new_span.text = wrapped_text
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-                        style_value = parent.get("style")
-                        if style_value:
-                            normalized_style_value = (
-                                style_value.lower()
-                                .strip()
-                                .replace(" ", "")
-                                .replace(";", "")
-                            )
-                            if "display:none" in normalized_style_value:
-                                parent.style = ""
-            elif text_strip(node.get("alttext")):
-                # Get the alttext attribute
-                alttext = node.get("alttext")
-                if text_strip(alttext):
-                    new_span = Element("span")
-                    wrapped_alttext = wrap_math(alttext)
-                    new_span.text = wrapped_alttext
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            else:
-                try:
-                    # Try translating to LaTeX
-                    tmp_node = deepcopy(node)
-                    tmp_node.tail = None
-                    mathml = tostring(tmp_node, encoding=str)
-                    # If this includes xmlns:mml, then we need to replace all
-                    # instances of mml: with nothing
-                    if "xmlns:mml" in mathml:
-                        mathml = mathml.replace("mml:", "")
-                        # replace xmlns:mml="..." with nothing
-                        mathml = re.sub(r'xmlns:mml=".*?"', "", mathml)
-                    # if 'xmlns=' in mathml:
-                    #     mathml = re.sub(r"xmlns='.*?'", '', mathml)
-                    latex = mml_to_latex(mathml)
-                    # Make a new span tag
-                    new_span = Element("span")
-                    # Set the html of the new span tag to the text
-                    wrapped_latex = wrap_math(latex)
-                    new_span.text = wrapped_latex
-                    # Then, we need to replace the math tag with the new span tag
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-                except:
-                    self.remove_node(node)
-
-        if node.tag == "mathjax":
-            try:
-                # Get the inner text of the mathjax tag
-                text = node.text
-                if text_strip(text):
-                    text = html.unescape(text)
-                    # Use regex to find text wrapped in hashes
-                    matches = re.findall(r"#(.+?)#", text)
-                    # For each match, replace the match with the LaTeX
-                    for match in matches:
-                        try:
-                            latex = extract_asciimath(match)
-                            # Replace the match with the LaTeX
-                            text = text.replace(f"#{match}#", latex)
-                        except:
-                            pass
-                    # Create a new span tag
-                    new_span = Element("span")
-                    # Set the html of the new span tag to the text
-                    new_span.text = text
-                    # Then, we need to replace the mathjax tag with the new span tag
-                    if parent is not None:
-                        if text_strip(node.tail):
-                            new_span.tail = node.tail
-                        parent.replace(node, new_span)
-            except:
-                pass
-
-    def convert_tags(self, element, base_url=""):
-        USELESS_ATTR_LIST = USELESS_ATTR
-        if not self.need_comment:
-            USELESS_ATTR_LIST = USELESS_ATTR_LIST + ["comment"]
-        for node in iter_node(element):
-
-            # math tag conversion
-            self.math_latex_processing(node)
-
-            if "data-src" in node.attrib and "src" not in node.attrib:
-                node.attrib["src"] = node.attrib["data-src"]
-            if "src" in node.attrib and node.attrib["src"] and base_url:
-                src_url = node.attrib["src"]
-                absolute_url = urljoin(base_url, src_url)
-                node.attrib["src"] = absolute_url
-
-            if node.tag.lower() == "div" and not node.getchildren():
-                node.tag = "p"
-
-            class_name = node.get("class")
-            if class_name:
-                if class_name.lower() in USELESS_ATTR_LIST:
-                    self.remove_node(node)
-
-        return element
-
-    def delete_by_link_density(
-        self, subtree, tagname, backtracking=False, favor_precision=False
-    ):
-        need_del_par = []
-        skip_par = []
-        drop_list = False
-        for descendant in subtree.iter(tagname):
-            pparent = descendant.getparent()
-            if pparent in need_del_par or pparent in skip_par:
-                continue
-            siblings = descendant.xpath(f"following-sibling::{tagname}")
-            nn = [descendant]
-            nn.extend(siblings)
-            txt_max_num = 0
-            if len(siblings) + 1 >= 4:
-                pass
-            else:
-                txt_max_dict = {
-                    "read": 0,
-                    "more": 0,
-                    "...": 0,
-                    "阅读": 0,
-                    "更多": 0,
-                    "详细": 0,
-                    "detail": 0,
-                    "article": 0,
-                    "blog": 0,
-                    "news": 0,
-                }
-                if tagname == "div" or tagname == "article" or tagname == "section":
-                    for j in nn:
-                        txt = "".join(j.xpath(".//text()")).strip()
-                        for x in [
-                            "read",
-                            "more",
-                            "...",
-                            "阅读",
-                            "更多",
-                            "详细",
-                            "detail",
-                            "article",
-                            "blog",
-                            "news",
-                        ]:
-                            if txt.lower().endswith(x):
-                                txt_max_dict[x] += 1
-                        txt_num = max(txt_max_dict.values())
-                        if txt_max_num < txt_num:
-                            txt_max_num = txt_num
-                        if txt_max_num >= 3:
-                            break
-                if txt_max_num >= 3:
-                    pass
-                else:
-                    continue
-            skip_par.append(pparent)
-            a_num = 0
-            for j in siblings:
-                if j.xpath(".//a"):
-                    if tagname == "p":
-                        if density_of_a_text(j, pre=0.8):
-                            a_num += 1
-                    elif tagname in ["div", "section", "article"]:
-                        if density_of_a_text(j, pre=0.2):
-                            a_num += 1
-                    else:
-                        if self.need_comment:
-                            # check for comment content before deciding to delete
-                            break_flg = False
-                            for c_xpath in Forum_XPATH[:-1]:
-                                if j.xpath(c_xpath.replace(".//*", "self::*")):
-                                    break_flg = True
-                                    break
-                            if break_flg:
-                                continue
-                        if tagname == "li":
-                            if text_len("".join(j.xpath(".//text()[not(ancestor::a)]"))) > 50:
-                                continue
-                        a_num += 1
-
-            if a_num < len(siblings):
-                if a_num >= 15 and (
-                    tagname == "div" or tagname == "article" or tagname == "section"
-                ):
-                    pass
-                else:
-                    continue
-
-            similarity_with_siblings_nums = similarity_with_siblings(
-                descendant, siblings
-            )
-            if tagname == "article" or tagname == "item":  # or tagname == "section"
-                similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
-            # lists are a special case; another pattern is that the descendant and its siblings all contain title/h1 | h2 tags
-            if tagname == "div" or tagname == "article" or tagname == "section":
-                title_max_num = 0
-                for ll in [".//head[@rend='h2']", ".//head[@rend='h1']", "./article"]:
-                    title_num = 0
-                    for jj in nn:
-                        if jj.xpath(ll):
-                            title_num += 1
-                    if title_max_num < title_num:
-                        title_max_num = title_num
-                if title_max_num >= 4:
-                    similarity_with_siblings_nums = similarity_with_siblings_nums * 1.5
-
-            if txt_max_num >= 3:
-                pass
-            elif similarity_with_siblings_nums < 0.84:
-                if len(siblings) >= 15 and (
-                    tagname == "div" or tagname == "article" or tagname == "section"
-                ):
-                    pass
-                else:
-                    continue
-            # when a parent div holds several sibling divs with class post-, delete the rest and keep the first article
-            class_attr = descendant.get("class") if descendant.get("class") else ""
-            if (
-                re.findall("post-", class_attr, re.I)
-                or re.findall("-post", class_attr, re.I)
-                or re.findall("blog|aricle", class_attr, re.I)
-            ):
-                drop_list = True
-                sk_flg = True
-                for dl in siblings:
-                    if (
-                        text_len("".join(descendant.xpath(".//text()"))) * 2
-                        < text_len("".join(dl.xpath(".//text()")))
-                        and sk_flg
-                    ):
-                        self.remove_node(descendant)
-                        sk_flg = False
-                    else:
-                        self.remove_node(dl)
-            else:
-                need_del_par.append(descendant)
-                need_del_par.extend(siblings)
-        for node in need_del_par:
-            drop_list = True
-            try:
-                self.remove_node(node)
-            except Exception as e:
-                pass
-
-        myelems, deletions = defaultdict(list), []
-
-        if tagname == "div":
-            for elem in subtree.iter(tagname):
-                if density_of_a_text(elem, pre=0.8) and img_div_check(elem):
-                    deletions.append(elem)
-
-        for elem in subtree.iter(tagname):
-            elemtext = trim(elem.text_content())
-            result, templist = link_density_test(elem, elemtext, favor_precision)
-            if result is True and img_div_check(elem):
-                deletions.append(elem)
-            elif backtracking is True and len(templist) > 0:  # if?
-                myelems[elemtext].append(elem)
-        if backtracking is True:
-            if favor_precision is False:
-                threshold = 100
-            else:
-                threshold = 200
-            for text, elem in myelems.items():
-                if 0 < len(text) < threshold and len(elem) >= 3:
-                    deletions.extend(elem)
-
-        for elem in uniquify_list(deletions):
-            try:
-                if self.need_comment:
-                    # check for comment content before deciding to delete
-                    break_flg = False
-                    for c_xpath in Forum_XPATH[:-1]:
-                        if elem.xpath(c_xpath):
-                            break_flg = True
-                            break
-                    if break_flg:
-                        continue
-                self.remove_node(elem)
-            except AttributeError:
-                pass
-        return subtree, drop_list
-
-    def prune_unwanted_sections(self, tree):
-        tmp_OVERALL_DISCARD_XPATH = OVERALL_DISCARD_XPATH
-        if self.need_comment:
-            tmp_OVERALL_DISCARD_XPATH = tmp_OVERALL_DISCARD_XPATH[:-1]
-        tree = self.prune_unwanted_nodes(
-            tree, tmp_OVERALL_DISCARD_XPATH, with_backup=True
-        )
-        for xp_list in [
-            PAYWALL_DISCARD_XPATH,
-            TEASER_DISCARD_XPATH,
-            DISCARD_IMAGE_ELEMENTS,
-        ]:
-            tree = self.prune_unwanted_nodes(tree, xp_list)
-        # remove elements by link density
-        tree, drop_list_1 = self.delete_by_link_density(
-            tree, "div", backtracking=True, favor_precision=False
-        )
-        tree, drop_list_1_1 = self.delete_by_link_density(
-            tree, "article", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_1_2 = self.delete_by_link_density(
-            tree, "section", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_2_1 = self.delete_by_link_density(
-            tree, "ul", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_2_2 = self.delete_by_link_density(
-            tree, "li", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_3_1 = self.delete_by_link_density(
-            tree, "dl", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_3_3 = self.delete_by_link_density(
-            tree, "dt", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_3_2 = self.delete_by_link_density(
-            tree, "dd", backtracking=False, favor_precision=False
-        )
-        tree, drop_list_3 = self.delete_by_link_density(
-            tree, "p", backtracking=False, favor_precision=False
-        )
-
-        return (
-            tree,
-            drop_list_1
-            or drop_list_2_1
-            or drop_list_2_2
-            or drop_list_3
-            or drop_list_1_1
-            or drop_list_1_2
-            or drop_list_3_1
-            or drop_list_3_2
-            or drop_list_3_3,
-        )
diff --git a/magic_doc/contrib/magic_html/extractors/custom_extractor.py b/magic_doc/contrib/magic_html/extractors/custom_extractor.py
deleted file mode 100644
index 56649b0..0000000
--- a/magic_doc/contrib/magic_html/extractors/custom_extractor.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# -*- coding:utf-8 -*-
-import re
-
-from magic_doc.contrib.magic_html.utils import *
-from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
-from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
-
-
-class CustomExtractor(BaseExtractor):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def use_clean_rule(self, tree, clean_rules):
-        for clean_rule in clean_rules:
-            for x in tree.xpath(clean_rule):
-                self.remove_node(x)
-        return tree
-
-    def use_extract_rule(self, tree, extract_rule):
-        if "/text()" in extract_rule["value"]:
-            return "".join(tree.xpath(extract_rule["value"])).strip()
-        return tree.xpath(extract_rule["value"])[0]
-
-    def extract(self, html="", base_url="", rule={}) -> dict:
-        tree = load_html(html)
-        if tree is None:
-            raise ValueError
-
-        # base_url
-        base_href = tree.xpath("//base/@href")
-
-        if base_href and "http" in base_href[0]:
-            base_url = base_href[0]
-
-        if "clean" in rule:
-            tree = self.use_clean_rule(tree, rule["clean"])
-
-        # extract the title
-        if "title" not in rule:
-            title = TitleExtractor().process(tree)
-        else:
-            title = self.use_extract_rule(tree, rule["title"])
-
-        # article region
-        try:
-            body_tree = self.use_extract_rule(tree, rule["content"])
-        except:
-            raise ValueError
-        body_html = tostring(body_tree, encoding=str)
-
-        return {
-            "xp_num": "custom",
-            "drop_list": False,
-            "html": body_html,
-            "title": title,
-            "base_url": base_url
-        }
diff --git a/magic_doc/contrib/magic_html/extractors/forum_extractor.py b/magic_doc/contrib/magic_html/extractors/forum_extractor.py
deleted file mode 100644
index a7358ca..0000000
--- a/magic_doc/contrib/magic_html/extractors/forum_extractor.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# -*- coding:utf-8 -*-
-import re
-
-from magic_doc.contrib.magic_html.config import Forum_XPATH, Unique_ID
-from magic_doc.contrib.magic_html.utils import *
-from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
-from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
-
-
-class ForumExtractor(BaseExtractor):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def extract(self, html="", base_url="") -> dict:
-        self.need_comment = True
-        html = html.replace(" ", " ").replace(" ", " ")
-        tree = load_html(html)
-        if tree is None:
-            raise ValueError
-
-        # extract the title
-        title = TitleExtractor().process(tree)
-
-        # base_url
-        base_href = tree.xpath("//base/@href")
-
-        if base_href and "http" in base_href[0]:
-            base_url = base_href[0]
-        self.generate_unique_id(tree)
-
-        format_tree = self.convert_tags(tree, base_url=base_url)
-
-        normal_tree = self.clean_tags(format_tree)
-
-        subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
-        if xp_num == "others":
-            subtree, drop_list = self.prune_unwanted_sections(normal_tree)
-        body_html = self.get_content_html(subtree, xp_num, base_url)
-
-        # forum-specific handling
-        body_html_tree = fromstring(body_html)
-        try:
-            body_tree = body_html_tree.body
-        except:
-            body_tree = Element("body")
-            body_tree.extend(body_html_tree)
-        main_ids = body_tree.xpath(f".//@{Unique_ID}")
-
-        for main_id in main_ids:
-            main_tree = normal_tree.xpath(
-                f".//*[@{Unique_ID}={main_id}]"
-            )
-            if main_tree:
-                self.remove_node(main_tree[0])
-        if not main_ids:
-            main_ids = [-1]
-
-        if xp_num != "others":
-            normal_tree, _ = self.prune_unwanted_sections(normal_tree)
-        for c_xpath in Forum_XPATH:
-            while normal_tree.xpath(c_xpath):
-                x = normal_tree.xpath(c_xpath)[0]
-                self.remove_node(x)
-                if "'post-'" in c_xpath:
-                    if not (re.findall('post-\d+', x.attrib.get("id", "").lower()) or re.findall('post_\d+',
-                                                                                                 x.attrib.get("id",
-                                                                                                              "").lower())):
-                        continue
-                if (
-                    "header" in x.attrib.get("class", "").lower()
-                    or "header" in x.attrib.get("id", "").lower()
-                ):
-                    continue
-                try:
-                    if int(x.attrib.get(Unique_ID, "0")) > int(
-                        main_ids[-1]
-                    ):
-                        body_tree.append(x)
-                    else:
-                        prefix_div = Element("div")
-                        suffix_div = Element("div")
-                        need_prefix = False
-                        need_suffix = False
-                        while x.xpath(
-                            f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
-                        ):
-                            tmp_x = x.xpath(
-                                f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]"
-                            )[0]
-                            self.remove_node(tmp_x)
-                            suffix_div.append(tmp_x)
-                            need_suffix = True
-                        while x.xpath(
-                            f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
-                        ):
-                            tmp_x = x.xpath(
-                                f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]"
-                            )[0]
-                            self.remove_node(tmp_x)
-                            prefix_div.append(tmp_x)
-                            need_prefix = True
-                        if need_prefix:
-                            body_tree.insert(0, prefix_div)
-                        if need_suffix:
-                            body_tree.append(suffix_div)
-
-                except:
-                    pass
-
-        body_html = re.sub(
-            f' {Unique_ID}="\d+"',
-            "",
-            tostring(body_tree, encoding=str),
-        )
-
-        return {
-            "xp_num": xp_num,
-            "drop_list": drop_list,
-            "html": body_html,
-            "title": title,
-            "base_url": base_url
-        }
diff --git a/magic_doc/contrib/magic_html/extractors/title_extractor.py b/magic_doc/contrib/magic_html/extractors/title_extractor.py
deleted file mode 100644
index d49dad7..0000000
--- a/magic_doc/contrib/magic_html/extractors/title_extractor.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from magic_doc.contrib.magic_html.utils import *
-from magic_doc.contrib.magic_html.config import *
-
-
-class TitleExtractor:
-    def extract_by_meta(self, element: HtmlElement):
-        for xpath in METAS:
-            title = element.xpath(xpath)
-            if title:
-                return "".join(title)
-
-    def extract_by_title(self, element: HtmlElement):
-        return "".join(element.xpath("//title//text()")).strip()
-
-    def extract_by_hs(self, element: HtmlElement):
-        hs = element.xpath("//h1//text()|//h2//text()|//h3//text()")
-        return hs or []
-
-    def extract_by_h(self, element: HtmlElement):
-        for xpath in ["//h1", "//h2", "//h3"]:
-            children = element.xpath(xpath)
-            if not children:
-                continue
-            child = children[0]
-            texts = child.xpath("./text()")
-            if texts and len(texts):
-                return texts[0].strip()
-
-    def process(self, element: HtmlElement):
-        title_extracted_by_meta = self.extract_by_meta(element)
-        if title_extracted_by_meta:
-            return title_extracted_by_meta
-        title_extracted_by_h = self.extract_by_h(element)
-        title_extracted_by_hs = self.extract_by_hs(element)
-        title_extracted_by_title = self.extract_by_title(element)
-        title_extracted_by_hs = sorted(
-            title_extracted_by_hs,
-            key=lambda x: similarity2(x, title_extracted_by_title),
-            reverse=True,
-        )
-        if title_extracted_by_hs:
-            return lcs_of_2(title_extracted_by_hs[0], title_extracted_by_title)
-
-        if title_extracted_by_title:
-            return title_extracted_by_title
-
-        return title_extracted_by_h
diff --git a/magic_doc/contrib/magic_html/extractors/weixin_extractor.py b/magic_doc/contrib/magic_html/extractors/weixin_extractor.py
deleted file mode 100644
index 8549061..0000000
--- a/magic_doc/contrib/magic_html/extractors/weixin_extractor.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# -*- coding:utf-8 -*-
-
-from magic_doc.contrib.magic_html.utils import *
-from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
-from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
-
-
-class WeixinExtractor(BaseExtractor):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def extract(self, html="", base_url="") -> dict:
-        html = html.replace(" ", " ")
-        tree = load_html(html)
-        if tree is None:
-            raise ValueError
-
-        # extract the title
-        title = TitleExtractor().process(tree)
-
-        # base_url
-        base_href = tree.xpath("//base/@href")
-
-        if base_href and "http" in base_href[0]:
-            base_url = base_href[0]
-
-        # article region
-        try:
-            body_tree = tree.xpath('.//*[@id="img-content"]')[0]
-        except:
-            raise ValueError
-
-        # strip js, style and comment nodes
-        for script in body_tree.xpath(".//script"):
-            self.remove_node(script)
-        for style in body_tree.xpath(".//style"):
-            self.remove_node(style)
-        for comment in body_tree.xpath(".//comment()"):
-            self.remove_node(comment)
-
-        # remove all official-account intro blocks
-        for mp in body_tree.xpath('.//div[@id="meta_content"]'):
-            self.remove_node(mp)
-        for mp in body_tree.xpath('.//div[@id="js_tags"]'):
-            self.remove_node(mp)
-        for mp in body_tree.xpath('.//div[@class="original_area_primary"]'):
-            self.remove_node(mp)
-        # hidden ban notices
-        for mp in body_tree.xpath('.//section[@class="wx_profile_card_inner"]'):
-            self.remove_node(mp)
-        # special WeChat profile-card intros
-        for mp in body_tree.xpath(
-            ".//section[contains(@class, 'wx_profile_msg_inner')]"
-        ):
-            self.remove_node(mp)
-
-        # strip cluttered invisible content
-        all_raga = body_tree.xpath(
-            ".//*[contains(@style, 'color: rgba(255, 255, 255, 0)')] | .//*[contains(@style, 'color: rgba(255 255 255 0)')]"
-        )
-
-        for mp in all_raga:
-            flag_have_color_rgb, detail_style = self.ensure_have_color_rgb(
-                mp.attrib["style"]
-            )
-
-            if not flag_have_color_rgb:
-                continue
-            self.remove_node(mp)
-
-        for img in body_tree.xpath(".//img"):
-
-            if "data-src" not in img.attrib:
-                continue
-
-            try:
-                img.set("src", img.attrib["data-src"])
-            except Exception as e:
-                continue
-
-        for h1 in body_tree.xpath(".//h1"):
-            if not h1.text:
-                continue
-            h1.text = h1.text.replace("\n", "").strip()
-
-        body_html = tostring(body_tree, encoding=str)
-
-        return {
-            "xp_num": "weixin",
-            "drop_list": False,
-            "html": body_html,
-            "title": title,
-            "base_url": base_url
-        }
-
-    @staticmethod
-    def ensure_have_color_rgb(htmlstr):
-        pattern = r"(?
diff --git a/magic_doc/contrib/magic_html/mmltex/cmarkup.xsl b/magic_doc/contrib/magic_html/mmltex/cmarkup.xsl deleted file mode 100644 index 9e72dda..0000000 --- a/magic_doc/contrib/magic_html/mmltex/cmarkup.xsl +++ /dev/null @@ -1,1093 +0,0 @@ - - - - - - - - - - - - - + - - i - - - - - / - - - - - - _{} - - - - - e^{i - - } - - - - - E - - - - - - - - \mathrm{} - - - - - - - - - - - - - ( - - - , - - ) - - - - - () - - - - - - - \left( - - \left[ - - - , - - - - \right) - - \right] - - - - - \left\{\right\} - - - - - ^{(-1)} - - - - - - - - \mathrm{lambda}\: - - .\: - - - - - - - - - - \circ - - - - -\mathrm{id} - - - - \mathop{\mathrm{ - - }} - - - - - - - - \begin{cases} - - - \end{cases} - - - - - & \text{if $ - - $} - \\ - - - - - & \text{otherwise} - - - - - \left\lfloor\frac{ - - }{ - - }\right\rfloor - - - - - - - - ! - - - - - - - \left( - \frac{ - - - }{ - - - } - \right) - - - - - \ - - \{ - - - - , - - - - - - , - - - - \} - - - - - - - - - - - - - - - - - - - - - - - - - - ( - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - ) - - - - - - - - - ^{ - - - - } - - - - - - - \mod - - - - - - - - - - ( - - - - \times - - - - - - - - - - ) - - - - - \sqrt - - [ - - ] - - { - - } - - - -\gcd - - - - - - - - \land - - - - - - - - - - \lor - - - - - - - - - - \mathop{\mathrm{xor}} - - - - - - \neg - - - - - - - - - - \implies - - - - - - - - \ - - - - - , - - - \colon - - - - - - - \left| - - \right| - - - - - \overline{} - - - -\Re - - -\Im - - - - \lfloor - - \rfloor - - - - - \lceil - - \rceil - - - - - - - - - = - - - - - - - - - - \neq - - - - - - - - - - > - - - - - - - - - - < - - - - - - - - - - \ge - - - - - - - - - - \le - - - - - - - - - - \equiv - - - - - - - - - - \approx - - - - - - - - | - - - - - - - - \int - - _{ - - } - - - ^{ - - } - - - - \,d - - - - - - - ^\prime - - - - \frac{ - - - d^{ - - } - - }{d - - ^{ - - } - - - d - - }{d - - } - - - } - - - - - D_{ - - - , - - } - - - - - \frac{\partial^{ - - - - - - - - + - - - + - - - - - - - - } - - }{ - - \partial - - - ^{ - - } - - - } - - - - - - - - - , - - - -\mathop{\mathrm{div}} - - -\nabla^2 - - - - \{\} - - - - - \left[\right] - - - - - - - \colon - - - - - - , - - - - - - - - - - - - \cup - - - - - - - - - - \cap - - - - - - - - \in - - - - - - - - - - \notin - - - - - - - - - - - - \subseteq - - - - - - - - - - \subset - - - - - - - - - - \nsubseteq - - - - - - - - - - \not\subset - - - - - - - - - - \setminus - - - - - - | - - | - - - - - - - - - \times - - - - - - - - ^{ - - } - - - - - \sum - - - - - \prod - - - - - _{ - - - = - - - } - - - ^{ - - } - - - - - - - - \lim_{ - - } - - - - - - \to - - - - - - - - - - - - \searrow - \nearrow - \rightarrow - \to - - - - - - - - \ - - - - - - - - - \ - - - - - - \mathrm{ - - \,} - - - - - - - \mathrm{ - - } - - - - - e^{} - - - - - \lg - - - - - - - \log_{ - - } - - - - - - - - \langle - - - , - - \rangle - - - -\sigma - - - - \sigma( - - )^2 - - - - - \langle - - ^{ - - }\rangle - - _{ - - } - - - - - - - \left(\begin{array}{c} - - - \\ - - \end{array}\right) - - - - - \begin{pmatrix} - - \end{pmatrix} - - - - - - - & - - \\ - - - - - \det - - - - - - - \begin{vmatrix} - - \end{vmatrix} - - - - - - - - ^T - - - - - - - - _{ - - - , - - } - - - - - - - - - \dot - - - - - - - - - - - -\mathbb{Z} - - -\mathbb{R} - - -\mathbb{Q} - - -\mathbb{N} - - -\mathbb{C} - - -\mathbb{P} - - -e - - -i - - -NaN - - -\mbox{true} - - -\mbox{false} - - -\emptyset - - -\pi - - -\gamma - - -\infty - - - - - - - ( - - - - - - - - - ) - - - - - - - ( - - - - - - - - 
diff --git a/magic_doc/contrib/magic_html/mmltex/entities.xsl b/magic_doc/contrib/magic_html/mmltex/entities.xsl
deleted file mode 100644
index d62e9e7..0000000
--- a/magic_doc/contrib/magic_html/mmltex/entities.xsl
+++ /dev/null
@@ -1,316 +0,0 @@
[316 deleted lines of XSLT character-entity mapping templates; markup stripped during extraction, content not recoverable]
\ No newline at end of file
diff --git a/magic_doc/contrib/magic_html/mmltex/glayout.xsl b/magic_doc/contrib/magic_html/mmltex/glayout.xsl
deleted file mode 100644
index 5aaa19d..0000000
--- a/magic_doc/contrib/magic_html/mmltex/glayout.xsl
+++ /dev/null
@@ -1,220 +0,0 @@
[220 deleted lines of XSLT general-layout templates (fractions, roots, fenced expressions, phantom and color boxes); markup stripped during extraction, content not recoverable]
\ No newline at end of file
diff --git a/magic_doc/contrib/magic_html/mmltex/mmltex.xsl b/magic_doc/contrib/magic_html/mmltex/mmltex.xsl
deleted file mode 100644
index 45c9e6f..0000000
--- a/magic_doc/contrib/magic_html/mmltex/mmltex.xsl
+++ /dev/null
@@ -1,45 +0,0 @@
[45 deleted lines: the top-level stylesheet that pulls in the other mmltex modules and wraps the generated LaTeX in $ ... $ delimiters; markup stripped during extraction, content not recoverable]
\ No newline at end of file
diff --git a/magic_doc/contrib/magic_html/mmltex/scripts.xsl b/magic_doc/contrib/magic_html/mmltex/scripts.xsl
deleted file mode 100644
index a81909e..0000000
--- a/magic_doc/contrib/magic_html/mmltex/scripts.xsl
+++ /dev/null
@@ -1,292 +0,0 @@
[292 deleted lines of XSLT script/limit templates (overline, underbrace, sub- and superscripts, underset/overset); markup stripped during extraction, content not recoverable]
\ No newline at end of file
diff --git a/magic_doc/contrib/magic_html/mmltex/tables.xsl b/magic_doc/contrib/magic_html/mmltex/tables.xsl
deleted file mode 100644
index e60592a..0000000
--- a/magic_doc/contrib/magic_html/mmltex/tables.xsl
+++ /dev/null
@@ -1,130 +0,0 @@
[130 deleted lines of XSLT table templates (mtable/mtr/mtd to LaTeX array with multicolumn and hline handling); markup stripped during extraction, content not recoverable]
\ No newline at end of file
diff --git a/magic_doc/contrib/magic_html/readability_plus.py b/magic_doc/contrib/magic_html/readability_plus.py deleted file mode 100644 index 9e26d42..0000000 --- a/magic_doc/contrib/magic_html/readability_plus.py +++ /dev/null @@ -1,521 +0,0 @@ -# -*- coding:utf-8 -*- - -from lxml.etree import tounicode -from lxml.html import document_fromstring, fragment_fromstring - -from magic_doc.contrib.magic_html.utils import * - -
-def to_int(x): - if not x: - return None - x = x.strip() - if x.endswith("px"): - return int(x[:-2]) - if x.endswith("em"): - return int(x[:-2]) * 12 - return int(x) - -
-def clean(text): - text = re.sub(r"\s{255,}", " " * 255, text) - text = re.sub(r"\s*\n\s*", "\n", text) - text = re.sub(r"\t|[ \t]{2,}", " ", text) - return text.strip() - -
-def text_length(i): - return len(clean(i.text_content() or "")) - -
-bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"] -single_quoted = "'[^']+'" -double_quoted = '"[^"]+"' -non_space = "[^ \"'>]+" -htmlstrip = re.compile( - "<" # open - "([^>]+) " # prefix - "(?:%s) *" % ("|".join(bad_attrs),) - + "= *(?:%s|%s|%s)" # undesirable attributes - % (non_space, single_quoted, double_quoted) - + "([^>]*)" # value # postfix - ">", # end - re.I, -) - -
-def clean_attributes(html): - while htmlstrip.search(html): - html = htmlstrip.sub("<\\1\\2>", html) - return html - -
-class Document: - """Class to build an etree document out of html.""" - - def __init__( - self, - input, - url=None, - min_text_length=25, - retry_length=250, - xpath=False, - handle_failures="discard", - xp_num="others", - need_comment=False, - ): - self.input = input - self.html = None - self.encoding = None - self.positive_keywords = None - self.negative_keywords = None - self.url = url - self.min_text_length = min_text_length - self.retry_length = retry_length - self.xpath = xpath - self.handle_failures = handle_failures - self.xp_num = xp_num - self.need_comment = need_comment - if not 
need_comment: - self.REGEXES = { - "unlikelyCandidatesRe": re.compile( - r"combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter", - re.I, - ), - "okMaybeItsACandidateRe": re.compile( - r"and|article|body|column|main|shadow", re.I - ), - "positiveRe": re.compile( - r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story", - re.I, - ), - "negativeRe": re.compile( - r"combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget", - re.I, - ), - "divToPElementsRe": re.compile( - r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I - ), - "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I), - } - else: - self.REGEXES = { - "unlikelyCandidatesRe": re.compile( - r"combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter", - re.I, - ), - "okMaybeItsACandidateRe": re.compile( - r"and|article|body|column|main|shadow", re.I - ), - "positiveRe": re.compile( - r"article|body|content|entry|hentry|main|page|pagination|post|text|blog|story", - re.I, - ), - "negativeRe": re.compile( - r"combx|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget", - re.I, - ), - "divToPElementsRe": re.compile( - r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I - ), - "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I), - } - - def _html(self, force=False): - if force or self.html is None: - self.html = self._parse(self.input) - if self.xpath: - root = self.html.getroottree() - for i in self.html.getiterator(): - i.attrib["x"] = root.getpath(i) - return self.html - - def _parse(self, input: HtmlElement): - doc = input - base_href = self.url - if base_href: - try: - doc.make_links_absolute( - base_href, - resolve_base_href=True, - handle_failures=self.handle_failures, - ) - except TypeError: - doc.make_links_absolute( - base_href, - resolve_base_href=True, - handle_failures=self.handle_failures, - ) - else: - doc.resolve_base_href(handle_failures=self.handle_failures) - return doc - - def summary(self, html_partial=False): - try: - ruthless = True - while True: - self._html(True) - for i in self.tags(self.html, "body"): - i.set("id", "readabilityplusBody") - if ruthless and self.xp_num == "others": - self.remove_unlikely_candidates() - self.transform_misused_divs_into_paragraphs() - if self.xp_num == "others": - candidates = self.score_paragraphs() - best_candidate = self.select_best_candidate(candidates) - else: - best_candidate = None - ruthless = False - candidates = {} - if best_candidate: - article = self.get_article( - candidates, best_candidate, html_partial=html_partial - ) - else: - if ruthless: - ruthless = False - continue - else: - article = self.html.find("body") - if article is None: - article = self.html - cleaned_article = self.sanitize(article, candidates) - - article_length = len(cleaned_article or "") - retry_length = self.retry_length - of_acceptable_length = article_length >= retry_length - if ruthless and not of_acceptable_length: - ruthless = False - continue - else: - return cleaned_article - except Exception as e: - return None - - def get_article(self, candidates, best_candidate, html_partial=False): - sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2]) - if html_partial: - 
output = fragment_fromstring("
") - else: - output = document_fromstring("
") - best_elem = best_candidate["elem"] - parent = best_elem.getparent() - siblings = parent.getchildren() if parent is not None else [best_elem] - for sibling in siblings: - append = False - if sibling is best_elem: - append = True - sibling_key = sibling - if ( - sibling_key in candidates - and candidates[sibling_key]["content_score"] >= sibling_score_threshold - ): - append = True - - if sibling.tag == "p": - link_density = self.get_link_density(sibling) - node_content = sibling.text or "" - node_length = len(node_content) - - if node_length > 80 and link_density < 0.25: - append = True - elif ( - node_length <= 80 - and link_density == 0 - and re.search(r"\.( |$)", node_content) - ): - append = True - - if append: - if html_partial: - output.append(sibling) - else: - output.getchildren()[0].getchildren()[0].append(sibling) - return output - - def select_best_candidate(self, candidates): - if not candidates: - return None - - sorted_candidates = sorted( - candidates.values(), key=lambda x: x["content_score"], reverse=True - ) - for candidate in sorted_candidates[:5]: - elem = candidate["elem"] - - best_candidate = sorted_candidates[0] - return best_candidate - - def get_link_density(self, elem): - link_length = 0 - for i in elem.findall(".//a"): - link_length += text_length(i) - total_length = text_length(elem) - return float(link_length) / max(total_length, 1) - - def score_paragraphs(self): - MIN_LEN = self.min_text_length - candidates = {} - ordered = [] - for elem in self.tags(self._html(), "p", "pre", "td"): - parent_node = elem.getparent() - if parent_node is None: - continue - grand_parent_node = parent_node.getparent() - - inner_text = clean(elem.text_content() or "") - inner_text_len = len(inner_text) - - if inner_text_len < MIN_LEN: - continue - - if parent_node not in candidates: - candidates[parent_node] = self.score_node(parent_node) - ordered.append(parent_node) - - if grand_parent_node is not None and grand_parent_node not in candidates: - candidates[grand_parent_node] = self.score_node(grand_parent_node) - ordered.append(grand_parent_node) - - content_score = 1 - content_score += len(inner_text.split(",")) - content_score += len(inner_text.split(",")) - content_score += min((inner_text_len / 100), 3) - - candidates[parent_node]["content_score"] += content_score - if grand_parent_node is not None: - candidates[grand_parent_node]["content_score"] += content_score / 2.0 - - for elem in ordered: - candidate = candidates[elem] - ld = self.get_link_density(elem) - score = candidate["content_score"] - - candidate["content_score"] *= 1 - ld - - return candidates - - def class_weight(self, e): - weight = 0 - for feature in [e.get("class", None), e.get("id", None)]: - if feature: - if self.xp_num == "others": - if self.REGEXES["negativeRe"].search(feature): - weight -= 25 - - if self.REGEXES["positiveRe"].search(feature): - weight += 25 - else: - if self.REGEXES["positiveRe"].search(feature): - weight += 25 - - if self.positive_keywords and self.positive_keywords.search(feature): - weight += 25 - - if self.negative_keywords and self.negative_keywords.search(feature): - weight -= 25 - - if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag): - weight += 25 - - if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag): - weight -= 25 - - return weight - - def score_node(self, elem): - content_score = self.class_weight(elem) - name = elem.tag.lower() - if name in ["div", "article"]: - content_score += 5 - elif name in ["pre", "td", "blockquote"]: 
- content_score += 3 - elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]: - content_score -= 3 - elif name in [ - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "th", - "header", - "footer", - "nav", - ]: - content_score -= 5 - return {"content_score": content_score, "elem": elem} - - def remove_unlikely_candidates(self): - for elem in self.html.findall(".//*"): - s = "%s %s" % (elem.get("class", ""), elem.get("id", "")) - if len(s) < 2: - continue - if ( - self.REGEXES["unlikelyCandidatesRe"].search(s) - and (not self.REGEXES["okMaybeItsACandidateRe"].search(s)) - and elem.tag not in ["html", "body"] - ): - elem.drop_tree() - - def transform_misused_divs_into_paragraphs(self): - for elem in self.tags(self.html, "div"): - if not self.REGEXES["divToPElementsRe"].search( - str(b"".join(map(tostring, list(elem)))) - ): - elem.tag = "p" - - for elem in self.tags(self.html, "div"): - if elem.text and elem.text.strip(): - p = fragment_fromstring("
<p/>") - p.text = elem.text - elem.text = None - elem.insert(0, p) - - for pos, child in reversed(list(enumerate(elem))): - if child.tail and child.tail.strip(): - p = fragment_fromstring("
<p/>") - p.text = child.tail - child.tail = None - elem.insert(pos + 1, p) - if child.tag == "br": - child.drop_tree() - -
- def tags(self, node, *tag_names): - for tag_name in tag_names: - for e in node.findall(".//%s" % tag_name): - yield e - -
- def reverse_tags(self, node, *tag_names): - for tag_name in tag_names: - for e in reversed(node.findall(".//%s" % tag_name)): - yield e - -
- def sanitize(self, node, candidates): - MIN_LEN = self.min_text_length - for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): - if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: - header.drop_tree() - - for elem in self.tags(node, "iframe"): - if "src" in elem.attrib and self.REGEXES["videoRe"].search( - elem.attrib["src"] - ): - elem.text = "VIDEO" - else: - elem.drop_tree() - - allowed = {} - # Conditionally clean <table>s, <ul>s, and <div>s - for el in self.reverse_tags( - node, "table", "ul", "div", "aside", "header", "footer", "section" - ): - if el in allowed: - continue - weight = self.class_weight(el) - if el in candidates: - content_score = candidates[el]["content_score"] - else: - content_score = 0 - tag = el.tag - - if weight + content_score < 0: - el.drop_tree() - elif el.text_content().count(",") + el.text_content().count("，") < 10: - counts = {} - for kind in ["p", "img", "li", "a", "embed", "input"]: - counts[kind] = len(el.findall(".//%s" % kind)) - counts["li"] -= 100 - counts["input"] -= len(el.findall('.//input[@type="hidden"]')) - - content_length = text_length(el) - link_density = self.get_link_density(el) - - to_remove = False - reason = "" - - # modified: keep divs that carry an image - if el.tag == "div" and counts["img"] >= 1: - continue - if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: - reason = "too many images (%s)" % counts["img"] - # to_remove = True - elif counts["li"] > counts["p"] and tag not in ("ol", "ul"): - reason = "more
 <li>s than <p>s" - # to_remove = True - elif counts["input"] > (counts["p"] / 3): - reason = "less than 3x <p>s than <input>s" - to_remove = True - elif content_length < MIN_LEN and counts["img"] == 0: - reason = ( - "too short content length %s without a single image" - % content_length - ) - to_remove = True - elif content_length < MIN_LEN and counts["img"] > 2: - reason = ( - "too short content length %s and too many images" - % content_length - ) - to_remove = True - elif weight < 25 and link_density > 0.2: - if tag in ["div", "ul", "table"]: - ptest = el.xpath(".//text()[not(ancestor::a)]") - ptest_len = text_len("".join(ptest)) - if ptest_len >= MIN_LEN and link_density <= 0.3: - continue - reason = "too many links %.3f for its weight %s" % ( - link_density, - weight, - ) - to_remove = True - elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( - link_density, - weight, - ) - to_remove = True - elif (counts["embed"] == 1 and content_length < 75) or counts[ - "embed" - ] > 1: - reason = ( - "<embed>s with too short content length, or too many <embed>s" - ) - to_remove = True - elif not content_length: - reason = "no content" - to_remove = True - - i, j = 0, 0 - x = 1 - siblings = [] - for sib in el.itersiblings(): - sib_content_length = text_length(sib) - if sib_content_length: - i += 1 - siblings.append(sib_content_length) - if i == x: - break - for sib in el.itersiblings(preceding=True): - sib_content_length = text_length(sib) - if sib_content_length: - j += 1 - siblings.append(sib_content_length) - if j == x: - break - if siblings and sum(siblings) > 1000: - to_remove = False - for desnode in self.tags(el, "table", "ul", "div", "section"): - allowed[desnode] = True - - if to_remove: - el.drop_tree() - else: - pass - - self.html = node - return self.get_clean_html() - -
- def get_clean_html(self): - return clean_attributes(tounicode(self.html, method="html"))
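A minimal usage sketch for the Document class deleted above (assumptions: constructor arguments as listed in __init__, and an already-parsed lxml HtmlElement as input, since _parse treats its argument as an element rather than a string):

from lxml.html import fromstring
from magic_doc.contrib.magic_html.readability_plus import Document  # as it existed before this commit

tree = fromstring("<html><body><div><p>A sufficiently long paragraph of article text ...</p></div></body></html>")
doc = Document(tree, url="https://example.com/post/1", min_text_length=25, retry_length=250)
cleaned = doc.summary(html_partial=True)  # cleaned main-content HTML as a string, or None if extraction failed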
    s than s" - to_remove = True - elif content_length < MIN_LEN and counts["img"] == 0: - reason = ( - "too short content length %s without a single image" - % content_length - ) - to_remove = True - elif content_length < MIN_LEN and counts["img"] > 2: - reason = ( - "too short content length %s and too many images" - % content_length - ) - to_remove = True - elif weight < 25 and link_density > 0.2: - if tag in ["div", "ul", "table"]: - ptest = el.xpath(".//text()[not(ancestor::a)]") - ptest_len = text_len("".join(ptest)) - if ptest_len >= MIN_LEN and link_density <= 0.3: - continue - reason = "too many links %.3f for its weight %s" % ( - link_density, - weight, - ) - to_remove = True - elif weight >= 25 and link_density > 0.5: - reason = "too many links %.3f for its weight %s" % ( - link_density, - weight, - ) - to_remove = True - elif (counts["embed"] == 1 and content_length < 75) or counts[ - "embed" - ] > 1: - reason = ( - "s with too short content length, or too many s" - ) - to_remove = True - elif not content_length: - reason = "no content" - to_remove = True - - i, j = 0, 0 - x = 1 - siblings = [] - for sib in el.itersiblings(): - sib_content_length = text_length(sib) - if sib_content_length: - i = +1 - siblings.append(sib_content_length) - if i == x: - break - for sib in el.itersiblings(preceding=True): - sib_content_length = text_length(sib) - if sib_content_length: - j = +1 - siblings.append(sib_content_length) - if j == x: - break - if siblings and sum(siblings) > 1000: - to_remove = False - for desnode in self.tags(el, "table", "ul", "div", "section"): - allowed[desnode] = True - - if to_remove: - el.drop_tree() - else: - pass - - self.html = node - return self.get_clean_html() - - def get_clean_html(self): - return clean_attributes(tounicode(self.html, method="html")) diff --git a/magic_doc/contrib/magic_html/utils.py b/magic_doc/contrib/magic_html/utils.py deleted file mode 100644 index 5bec08f..0000000 --- a/magic_doc/contrib/magic_html/utils.py +++ /dev/null @@ -1,428 +0,0 @@ -# -*- coding:utf-8 -*- - -import os -import re -from gzip import decompress - -import numpy as np -from lxml import etree -from lxml.html import Element, HtmlElement, HTMLParser, fromstring, tostring -from lxml.html.clean import Cleaner -from urllib3.response import HTTPResponse -from magic_doc.contrib.magic_html.config import Unique_ID - -try: - import brotli -except ImportError: - brotli = None - -try: - from cchardet import detect as cchardet_detect -except ImportError: - cchardet_detect = None - -from difflib import SequenceMatcher - -from charset_normalizer import from_bytes - -HTML_PARSER = HTMLParser( - collect_ids=False, - default_doctype=False, - encoding="utf-8", - remove_comments=True, - remove_pis=True, -) -DOCTYPE_TAG = re.compile("^< ?! 
?DOCTYPE.+?/ ?>", re.I) -UNICODE_ALIASES = {"utf-8", "utf_8"} - -HTML_CLEANER = Cleaner( - annoying_tags=False, - comments=True, - embedded=False, - forms=False, - frames=False, - javascript=False, - links=False, - meta=False, - page_structure=False, - processing_instructions=True, - remove_unknown_tags=False, - safe_attrs_only=False, - scripts=False, - style=False, -) - -color_regex = re.compile(r"\\textcolor\[.*?\]\{.*?\}") - -latex_image_class_names = [ - "latexcenter", - "latex", - "tex", - "latexdisplay", - "latexblock", - "latexblockcenter", -] - - -def _translator(): - import py_asciimath.translator.translator as _translator - - return _translator - - -def ASCIIMath2Tex(*args, **kwargs): - return _translator().ASCIIMath2Tex(*args, **kwargs) - - -def MathML2Tex(*args, **kwargs): - return _translator().MathML2Tex(*args, **kwargs) - - -asciimath2tex = ASCIIMath2Tex(log=False) - - -def lcs_of_2(a, b): - match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b)) - return a[match[0]: match[0] + match[2]] - - -def lcs_of_list(*args): - if len(args) == 2: - return lcs_of_2(args[0], args[1]) - first = args[0] - remains = args[1:] - return lcs_of_2(first, lcs_of_list(*remains)) - - -def isutf8(data): - try: - data.decode("UTF-8") - except UnicodeDecodeError: - return False - return True - - -def handle_compressed_file(filecontent): - if isinstance(filecontent, bytes): - if filecontent[:2] == b"\x1f\x8b": - try: - filecontent = decompress(filecontent) - except (EOFError, OSError): - pass - elif brotli is not None: - try: - filecontent = brotli.decompress(filecontent) - except brotli.error: - pass - return filecontent - - -def detect_encoding(bytesobject): - if isutf8(bytesobject): - return ["utf-8"] - guesses = [] - if cchardet_detect is not None: - cchardet_guess = cchardet_detect(bytesobject)["encoding"] - if cchardet_guess is not None: - guesses.append(cchardet_guess.lower()) - detection_results = from_bytes(bytesobject[:15000]) or from_bytes(bytesobject) - if len(detection_results) > 0: - guesses.extend([r.encoding for r in detection_results]) - return [g for g in guesses if g not in UNICODE_ALIASES] - - -def decode_file(filecontent): - if isinstance(filecontent, str): - return filecontent - htmltext = None - filecontent = handle_compressed_file(filecontent) - for guessed_encoding in detect_encoding(filecontent): - try: - htmltext = filecontent.decode(guessed_encoding) - except (LookupError, UnicodeDecodeError): - htmltext = None - else: - break - return htmltext or str(filecontent, encoding="utf-8", errors="replace") - - -def strip_faulty_doctypes(htmlstring: str, beginning: str) -> str: - if "doctype" in beginning: - firstline, _, rest = htmlstring.partition("\n") - return DOCTYPE_TAG.sub("", firstline, count=1) + "\n" + rest - return htmlstring - - -def is_dubious_html(beginning: str) -> bool: - return "html" not in beginning - - -def fromstring_bytes(htmlobject): - tree = None - try: - tree = fromstring( - htmlobject.encode("utf8", "surrogatepass"), parser=HTML_PARSER - ) - except Exception as err: - pass - return tree - - -def load_html(htmlobject): - if isinstance(htmlobject, HtmlElement): - return htmlobject - if isinstance(htmlobject, HTTPResponse) or hasattr(htmlobject, "data"): - htmlobject = htmlobject.data - if not isinstance(htmlobject, (bytes, str)): - raise TypeError("incompatible input type", type(htmlobject)) - tree = None - htmlobject = decode_file(htmlobject) - beginning = htmlobject[:50].lower() - check_flag = is_dubious_html(beginning) - htmlobject = 
strip_faulty_doctypes(htmlobject, beginning) - fallback_parse = False - try: - tree = fromstring(htmlobject, parser=HTML_PARSER) - except ValueError: - tree = fromstring_bytes(htmlobject) - fallback_parse = True - except Exception as err: - pass - if (tree is None or len(tree) < 1) and not fallback_parse: - tree = fromstring_bytes(htmlobject) - if tree is not None and check_flag is True and len(tree) < 2: - tree = None - return tree - -
-def is_empty_element(node: HtmlElement): - return not node.getchildren() and not node.text - -
-def iter_node(element: HtmlElement): - yield element - for sub_element in element: - if isinstance(sub_element, HtmlElement): - yield from iter_node(sub_element) - -
-def img_div_check(tree): - """ - Keep a div that holds only a single image and fewer than 4 sub-nodes - """ - if len(tree.xpath(".//img")) == 1 and len(tree.xpath(".//*")) < 4: - return False - else: - return True - -
-def text_len(s): - s = re.sub(" +", " ", s) # collapse consecutive spaces into a single space - s = re.sub("[\n\t\r]+", "\n", s) - english_words = s.split() - chinese_characters = re.findall(r"[\u4e00-\u9fff]", s) - japanese_characters = re.findall(r"[\u3040-\u309F\u30A0-\u30FF]", s) - arabic_characters = re.findall(r"[\u0600-\u06FF]", s) - return ( - len(english_words) - + len(chinese_characters) - + len(japanese_characters) - + len(arabic_characters) - ) - -
-def alias(element): - if element is None: - return "" - tag = element.tag - # skip nth-child - if tag in ["html", "body"]: - return tag - attribs = [tag] - for k, v in element.attrib.items(): - if k == Unique_ID: - continue - k, v = re.sub(r"\s*", "", k), re.sub(r"\s*", "", v) - v = re.sub(r"-\d+", "", v) - attribs.append(f'[{k}="{v}"]' if v else f"[{k}]") - result = "".join(attribs) - - # surface the immediate children's attributes directly - nth = "" - for child in element.getchildren(): - if child.tag in ["dt", "dd", "li"]: - try: - # number of child nodes - nth += str(len(list(child.getchildren()))) - except: - pass - continue - attribs = [child.tag] - for k, v in child.attrib.items(): - if k == Unique_ID: - continue - k, v = re.sub(r"\s*", "", k), re.sub(r"\s*", "", v) - v = re.sub(r"-\d+", "", v) - attribs.append(f"[{k}]" if v else f"[{k}]") - nth += "".join(attribs) - - result += f":{nth}" - return result - -
-def similarity2(s1, s2): - if not s1 or not s2: - return 0 - s1_set = set(list(s1)) - s2_set = set(list(s2)) - intersection = s1_set.intersection(s2_set) - union = s1_set.union(s2_set) - return len(intersection) / len(union) - -
-def similarity_with_element(element1, element2): - alias1 = alias(element1) - alias2 = alias(element2) - return similarity2(alias1, alias2) - -
-def similarity_with_siblings(element, siblings): - scores = [] - for sibling in siblings: - # TODO: maybe compare all children not only alias - scores.append(similarity_with_element(element, sibling)) - if not scores: - return 0 - # drop the single lowest score - min_value = min(scores) - scores.remove(min_value) - return np.mean(scores) - -
-def number_of_a_char(ele, xpath=".//a//text()"): - s = "".join(ele.xpath(xpath)).strip() - return text_len(s) - -
-def number_of_char(ele, xpath=".//text()"): - s = "".join(ele.xpath(xpath)).strip() - return text_len(s) + 1 - -
-def density_of_a_text(ele, pre=0.7): - a_char = number_of_a_char(ele) - t_char = number_of_char(ele) - if a_char / t_char >= pre: - return True - else: - return False - -
-def uniquify_list(l): - return list(dict.fromkeys(l)) - -
-def trim(string): - """Remove unnecessary spaces within a text string""" - try: - return " ".join(string.split()).strip() - except (AttributeError, TypeError): - return None - -
-def 
collect_link_info(links_xpath, favor_precision=False): - shortelems, mylist = 0, [] - threshold = 10 if not favor_precision else 50 - for subelem in links_xpath: - subelemtext = trim(subelem.text_content()) - if subelemtext: - mylist.append(subelemtext) - if len(subelemtext) < threshold: - shortelems += 1 - lengths = sum(len(text) for text in mylist) - return lengths, len(mylist), shortelems, mylist - -
-def link_density_test(element, text, favor_precision=False): - links_xpath, mylist = element.findall(".//a"), [] - if links_xpath: - if element.tag == "p": - if favor_precision is False: - if element.getnext() is None: - limitlen, threshold = 60, 0.8 - else: - limitlen, threshold = 30, 0.8 - else: - limitlen, threshold = 200, 0.8 - else: - if element.getnext() is None: - limitlen, threshold = 300, 0.8 - else: - limitlen, threshold = 100, 0.8 - elemlen = len(text) - if elemlen < limitlen: - linklen, elemnum, shortelems, mylist = collect_link_info( - links_xpath, favor_precision - ) - if elemnum == 0: - return True, mylist - if density_of_a_text(element, 0.5): - if linklen > threshold * elemlen or ( - elemnum > 1 and shortelems / elemnum > 0.8 - ): - return True, mylist - return False, mylist - -
-def text_strip(text): - return text.strip() if text else text - -
-def wrap_math(s, display=False): - s = re.sub(r"\s+", " ", s) - s = color_regex.sub("", s) - s = s.replace("$", "") - s = s.replace("\n", " ").replace("\\n", "") - s = s.strip() - if len(s) == 0: - return s - # Don't wrap if it's already in \align - if "align" in s: - return s - if display: - return "$$" + s + "$$" - return "$" + s + "$" - -
-def extract_asciimath(s): - parsed = asciimath2tex.translate(s) - return parsed - -
-cur_file = os.path.abspath(__file__) -xsl_path = os.path.join(os.path.dirname(cur_file), "mmltex/mmltex.xsl") - -xslt = etree.parse(xsl_path) -transform = etree.XSLT(xslt) - -
-def mml_to_latex(mml_code): - # Remove any attributes from the math tag - mml_code = re.sub(r"(<math.*?>)", r"\1", mml_code) - mml_ns = mml_code.replace( - "<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"' - ) # Required. 
- - mml_ns = mml_ns.replace("&quot;", '"') - mml_ns = mml_ns.replace("'\\\"", '"').replace("\\\"'", '"') - - # in many pages the attribute markup inside the tags is simply malformed - # pattern = r"(<[^<>]*?\s)(mathbackground|mathsize|mathvariant|mathfamily|class|separators|style|id|rowalign|columnspacing|rowlines|columnlines|frame|framespacing|equalrows|equalcolumns|align|linethickness|lspace|rspace|mathcolor|rowspacing|displaystyle|style|columnalign|open|close|right|left)(?=\s|>)(?![\"'][^<>]*?>)" - - pattern = r'"([^"]+?)\'' - mml_ns = re.sub(pattern, r'"\1"', mml_ns) - - mml_dom = etree.fromstring(mml_ns) - mmldom = transform(mml_dom) - latex_code = str(mmldom) - return latex_code
diff --git a/magic_doc/contrib/model/__init__.py b/magic_doc/contrib/model/__init__.py index 1bb61cf..1ace468 100644 --- a/magic_doc/contrib/model/__init__.py +++ b/magic_doc/contrib/model/__init__.py @@ -45,7 +45,7 @@ class Page(TypedDict): class Extractor(ABC): @abstractmethod - def setup(): + def setup(self): pass @abstractmethod
diff --git a/magic_doc/contrib/office/doc.py b/magic_doc/contrib/office/doc.py index b07c2f3..4ec7e45 100644 --- a/magic_doc/contrib/office/doc.py +++ b/magic_doc/contrib/office/doc.py @@ -57,13 +57,6 @@ def extract( with open(pure_text_path, "r") as f: content = f.read() - # img_map: dict[Path, str] = {} - # imgs = media_dir.glob("*") - # for img in imgs: - # img_map[img] = self.generate_img_path(id, img.name) - # - # self.upload_background(id, img_map) - pages = [ Page(page_no=idx, content=x) for idx, x in enumerate(content.split("[pedia-page]")) @@ -80,14 +73,8 @@ def extract( for content in content_list: if not content["data"].startswith("[pedia-"): continue - if content["data"] == "[pedia-badpic]": - content["data"] = "" - content["type"] = "image" - elif content["data"].startswith("[pedia-pic"): - content["type"] = "image" - img_name = content["data"][len("[pedia-") : -1] - img_path = media_dir.joinpath(img_name) - content["data"] = img_map[img_path] + if content["data"] == "[pedia-badpic]" or content["data"].startswith("[pedia-pic"): + continue else: content["data"] = content["data"] + "\n"
diff --git a/magic_doc/contrib/office/docx_extract.py b/magic_doc/contrib/office/docx_extract.py index ab49767..167d8b9 100644 --- a/magic_doc/contrib/office/docx_extract.py +++ b/magic_doc/contrib/office/docx_extract.py @@ -20,9 +20,7 @@ def setup(self): def __word2markdown( self, - id: str, docx_file_stream: IO[bytes], - save_fig_dir, ): tag_w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" tag_body = f"{tag_w}body" @@ -119,14 +117,13 @@ def extract( if type(r) is FileStorage: page = Page( page_no=0, - content_list=self.__word2markdown(id, r.stream, media_dir), + content_list=self.__word2markdown(r.stream), ) else: page = Page( page_no=0, - content_list=self.__word2markdown(id, open(r, "rb"), media_dir), + content_list=self.__word2markdown(open(r, "rb")), ) - # self.clean_up(id) return [page]
diff --git a/magic_doc/contrib/office/ppt_extract.py b/magic_doc/contrib/office/ppt_extract.py deleted file mode 100644 index 8798b75..0000000 --- a/magic_doc/contrib/office/ppt_extract.py +++ /dev/null @@ -1,40 +0,0 @@ -import requests -import json - -from pathlib import Path - - -from werkzeug.datastructures import FileStorage - -from magic_doc.contrib.office import OfficeExtractor -from magic_doc.contrib.model import ExtractResponse - - -class PptExtractor(OfficeExtractor): - def __init__(self) -> None: - super().__init__() - - def setup(self): - pass - - def extract( - self, - r: FileStorage | Path, - id: str, - dir: Path, - media_dir: 
Path, - skip_image: bool, - ) -> ExtractResponse: - - if type(r) is FileStorage: - data = r.stream.read() - elif issubclass(type(r), Path): - with open(r, "rb") as data_file: - data = data_file.read() - - files = {"file": data} - response = requests.post(f"{self.config.tika}/api/v1/parse", files=files) - self.upload_background(id, {}) - return response.json()["pages"] - -
diff --git a/magic_doc/contrib/office/pptx_extract.py b/magic_doc/contrib/office/pptx_extract.py index febccf4..82d73f4 100644 --- a/magic_doc/contrib/office/pptx_extract.py +++ b/magic_doc/contrib/office/pptx_extract.py @@ -7,10 +7,8 @@ from loguru import logger from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE -from pptx.parts.image import Image from pptx.presentation import Presentation as ppt from pptx.shapes.autoshape import Shape -from pptx.shapes.picture import Picture from pptx.shapes.graphfrm import GraphicFrame from pptx.table import Table, _Row, _Cell from pptx.slide import Slide @@ -32,10 +30,6 @@ def handle_shape( self, shape: Shape, content_list: List[Content], - media_dir: Path, - img_map: dict[Path, str], - id: str, - skip_image: bool, ): if shape.has_text_frame: for paragraph in shape.text_frame.paragraphs: content_list.append( Content( type="text", data=paragraph.text + "\n", ) ) - elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE and not skip_image: - shape: Picture - image: Image = shape.image - image_bytes = image.blob - img_path = media_dir.joinpath(f"pic-{len(img_map)}.{image.ext}") - img_s3_path = self.generate_img_path(id, img_path.name) - img_map[img_path] = img_s3_path - content_list.append(Content(type="image", data=img_s3_path)) - with open(img_path, "wb") as file: - file.write(image_bytes) + elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE: + pass elif shape.shape_type == MSO_SHAPE_TYPE.TABLE: shape: GraphicFrame table: Table = shape.table @@ -75,7 +61,7 @@ elif shape.shape_type == MSO_SHAPE_TYPE.GROUP: shape: GroupShape for sub_shape in shape.shapes: - self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image) + self.handle_shape(sub_shape, content_list) else: # print(shape.shape_type, type(shape), file=sys.stderr) pass @@ -83,14 +69,9 @@ def handle_shape( def extract( self, r: FileStorage | Path, - id: str, - dir: Path, - media_dir: Path, - skip_image: bool, ) -> ExtractResponse: pages = [] - img_map = {} - + presentation: ppt = Presentation(r) for page_no, slide in enumerate(presentation.slides): slide: Slide @@ -99,15 +80,8 @@ self.handle_shape( shape, page["content_list"], - media_dir, - img_map, - id, - skip_image, ) pages.append(page) - - # self.upload_background(id, img_map) - return pages
diff --git a/magic_doc/conv/conv_html.py b/magic_doc/conv/conv_html.py deleted file mode 100644 index 87921b8..0000000 --- a/magic_doc/conv/conv_html.py +++ /dev/null @@ -1,45 +0,0 @@ -import json -from magic_doc.conv.base import BaseConv -from magic_doc.progress.filepupdator import FileBaseProgressUpdator -from magic_doc.contrib.magic_html import GeneralExtractor -from magic_doc.progress.pupdator import ConvProgressUpdator -from loguru import logger - -extractor = GeneralExtractor() - -
-class Html(BaseConv): - - def __init__(self): - super().__init__() - - @logger.catch - def to_md(self, html: str, pupdator: ConvProgressUpdator, **kwargs) -> str: - """ - Extract the main body content from HTML - :param html: HTML text - :param kwargs: optional parameters - base_url page URL - html_type page type (3 kinds supported) - 1. article articles - 2. forum forums - 3. 
weixin WeChat articles - :return: { - "base_url": "https://example.com/", - "drop_list": false, - "html": "
[the remainder of the conv_html.py deletion and the header of the following hunk were lost in extraction; the fragment below resumes mid-signature, apparently in magic_doc/conv/doc.py]
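Given the lost body, a hypothetical reconstruction of what the deleted to_md presumably did, based only on the surviving docstring and the GeneralExtractor API it imports (the json serialization is an assumption suggested by the json import above):

import json
from magic_doc.contrib.magic_html import GeneralExtractor  # module-level instance in the deleted file

extractor = GeneralExtractor()
data = extractor.extract("<html>...</html>", base_url="https://example.com/", html_type="article")
md = json.dumps(data, ensure_ascii=False)  # assumption: dict serialized to the str return value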
    list[Page]: if __name__ == '__main__': pupdator = FileBaseProgressUpdator("/tmp/p.txt") doc = Doc() - logger.info(doc.to_md(Path("/home/myhloli/文本+表+图1.doc").read_bytes(), pupdator)) + if 0: + logger.info(doc.to_md(Path("/home/myhloli/文本+表+图1.doc").read_bytes(), pupdator)) + if 1: + print( + doc.to_md( + Path( + r"/opt/data/magic_doc/20240605/doc/【英文-习题】MIT_prs_w05d2.doc" + ).read_bytes(), + pupdator, + ) + ) diff --git a/magic_doc/conv/doc_libreoffice.py b/magic_doc/conv/doc_libreoffice.py index 09a266f..d2c607e 100644 --- a/magic_doc/conv/doc_libreoffice.py +++ b/magic_doc/conv/doc_libreoffice.py @@ -16,21 +16,21 @@ class Doc(BaseConv): def __init__(self): super().__init__() - + def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: page_list = self.doc_to_pagelist(bits, pupdator) md_content_list = [] for page in page_list: - page_content_list = page['content_list'] + page_content_list = page["content_list"] total = len(page_content_list) for index, content in enumerate(page_content_list): progress = 50 + int(index / total * 50) # logger.info(f"progress: {progress}") pupdator.update(progress) - if content['type'] == 'image': + if content["type"] == "image": pass - elif content['type'] in ["text", "md"]: - data = content['data'] + elif content["type"] in ["text", "md"]: + data = content["data"] md_content_list.append(data) return "\n".join(md_content_list) @@ -40,14 +40,14 @@ def doc_to_docx(self, doc_path: str, dir_path: str) -> str: process = Popen(cmd, shell=True) process.wait() fname = str(Path(doc_path).stem) - docx_path = os.path.join(os.path.dirname(doc_path), f'{fname}.docx') + docx_path = os.path.join(os.path.dirname(doc_path), f"{fname}.docx") if not os.path.exists(docx_path): # logger.error(f"> !!! File conversion failed {doc_path} ==> {docx_path}") raise Exception(f"> !!! 
File conversion failed {doc_path} ==> {docx_path}") else: return docx_path - def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: + def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: with tempfile.TemporaryDirectory() as temp_path: temp_dir = Path(temp_path) media_dir = temp_dir / "media" @@ -57,12 +57,31 @@ def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: docx_file_path = self.doc_to_docx(str(file_path), str(temp_path)) pupdator.update(50) docx_extractor = DocxExtractor() - pages = docx_extractor.extract(Path(docx_file_path), "tmp", temp_dir, media_dir, True) + pages = docx_extractor.extract( + Path(docx_file_path), "tmp", temp_dir, media_dir, True + ) pupdator.update(80) return pages -if __name__ == '__main__': +if __name__ == "__main__": pupdator = FileBaseProgressUpdator("/tmp/p.txt") doc = Doc() - logger.info(doc.to_md(Path(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图1.doc").read_bytes(), pupdator)) + if 0: + logger.info( + doc.to_md( + Path( + r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图1.doc" + ).read_bytes(), + pupdator, + ) + ) + if 1: + print( + doc.to_md( + Path( + r"/opt/data/magic_doc/20240605/doc/【英文-习题】MIT_prs_w05d2.doc" + ).read_bytes(), + pupdator, + ) + ) diff --git a/magic_doc/conv/docx_xml_parse.py b/magic_doc/conv/docx_xml_parse.py index 9f1a161..e74ea68 100644 --- a/magic_doc/conv/docx_xml_parse.py +++ b/magic_doc/conv/docx_xml_parse.py @@ -49,4 +49,7 @@ def docx_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: if __name__ == '__main__': pupdator = FileBaseProgressUpdator("/tmp/p.txt") docx = Docx() - logger.info(docx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图.docx", "rb").read(), pupdator)) + if 0: + logger.info(docx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图.docx", "rb").read(), pupdator)) + if 1: + print(docx.to_md(Path("/opt/data/magic_doc/20240605/doc/星际迷航.docx").read_bytes(), pupdator)) \ No newline at end of file diff --git a/magic_doc/conv/ppt_libreoffice.py b/magic_doc/conv/ppt_libreoffice.py index 093ced1..3c4c84c 100644 --- a/magic_doc/conv/ppt_libreoffice.py +++ b/magic_doc/conv/ppt_libreoffice.py @@ -1,7 +1,7 @@ import os -from subprocess import Popen import tempfile from pathlib import Path +from subprocess import Popen from loguru import logger @@ -23,13 +23,13 @@ def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: for index, page in enumerate(page_list): progress = 80 + int(index / total * 20) # logger.info(f"progress: {progress}") - page_content_list = page['content_list'] + page_content_list = page["content_list"] for content in page_content_list: pupdator.update(progress) - if content['type'] == 'image': + if content["type"] == "image": pass - elif content['type'] == "text": - data = content['data'] + elif content["type"] == "text": + data = content["data"] md_content_list.append(data) return "\n".join(md_content_list) @@ -39,7 +39,7 @@ def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str: process = Popen(cmd, shell=True) process.wait() fname = str(Path(ppt_path).stem) - pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx') + pptx_path = os.path.join(os.path.dirname(ppt_path), f"{fname}.pptx") if not os.path.exists(pptx_path): # logger.error(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}") raise Exception(f"> !!! 
File conversion failed {ppt_path} ==> {pptx_path}") @@ -56,14 +56,30 @@ def ppt_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: pptx_file_path = self.ppt_to_pptx(str(file_path), str(temp_path)) pupdator.update(50) pptx_extractor = PptxExtractor() - pages = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True) + pages = pptx_extractor.extract(Path(pptx_file_path)) pupdator.update(80) return pages -if __name__ == '__main__': +if __name__ == "__main__": pupdator = FileBaseProgressUpdator("/tmp/p.txt") ppt = Ppt() - logger.info( - ppt.to_md( - open(r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-课件】MIT15_082JF10_lec10.3MB.ppt", "rb").read(), pupdator)) + if 0: + logger.info( + ppt.to_md( + open( + r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-课件】MIT15_082JF10_lec10.3MB.ppt", + "rb", + ).read(), + pupdator, + ) + ) + if 1: + print( + ppt.to_md( + open( + "/opt/data/magic_doc/20240605/doc/MIT15_082JF10_lec10.3MB.ppt", "rb" + ).read(), + pupdator, + ) + ) diff --git a/magic_doc/conv/pptx_python_pptx.py b/magic_doc/conv/pptx_python_pptx.py index 01d0394..00d363e 100644 --- a/magic_doc/conv/pptx_python_pptx.py +++ b/magic_doc/conv/pptx_python_pptx.py @@ -21,13 +21,13 @@ def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: for index, page in enumerate(page_list): progress = 50 + int(index / total * 50) # logger.info(f"progress: {progress}") - page_content_list = page['content_list'] + page_content_list = page["content_list"] for content in page_content_list: pupdator.update(progress) - if content['type'] == 'image': + if content["type"] == "image": pass - elif content['type'] == "text": - data = content['data'] + elif content["type"] == "text": + data = content["data"] md_content_list.append(data) return "\n".join(md_content_list) @@ -39,13 +39,41 @@ def pptx_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: file_path = temp_dir / "tmp.pptx" file_path.write_bytes(bits) pptx_extractor = PptxExtractor() - pages = pptx_extractor.extract(file_path, "tmp", temp_dir, media_dir, True) + pages = pptx_extractor.extract(file_path) pupdator.update(50) return pages -if __name__ == '__main__': +if __name__ == "__main__": pupdator = FileBaseProgressUpdator("/tmp/p.txt") pptx = Pptx() - logger.info( - pptx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-模板】Professional Pack Standard.pptx", "rb").read(), pupdator)) + if 0: + logger.info( + pptx.to_md( + open( + r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-模板】Professional Pack Standard.pptx", + "rb", + ).read(), + pupdator, + ) + ) + if 0: + print( + pptx.to_md( + open( + r"/opt/data/magic_doc/20240605/doc/【英文-模板】Professional Pack Standard.pptx", + "rb", + ).read(), + pupdator, + ) + ) + if 1: + print( + pptx.to_md( + open( + r"/home/PJLAB/xurui1/Git/Magic-Doc/MIT15_082JF10_lec10.3MB.pptx", + "rb", + ).read(), + pupdator, + ) + ) diff --git a/magic_doc/restful_api/api/analysis/__init__.py b/magic_doc/restful_api/api/analysis/__init__.py index 1e73909..3319caa 100644 --- a/magic_doc/restful_api/api/analysis/__init__.py +++ b/magic_doc/restful_api/api/analysis/__init__.py @@ -1,10 +1,8 @@ from flask import Blueprint from .magic_pdf_view import * -from .magic_html_view import * from ..extentions import Api analysis_blue = Blueprint('analysis', __name__, url_prefix='/analysis') api = Api(analysis_blue) -api.add_resource(MagicPdfView, '/pdf') -api.add_resource(MagicHtmlView, '/html') \ No newline at end of file +api.add_resource(MagicPdfView, '/pdf') \ No newline at end of file 
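For the record, a hypothetical client call against the /analysis/html route removed below (field names follow MagicHtmlSchema as read by the deleted handler; host and port are placeholders):

import requests

payload = {
    "pageUrl": "https://example.com/thread/1",  # optional page URL
    "html_type": "forum",                        # or "weixin"; omit for article extraction
    "html": "<html>...</html>",
}
resp = requests.post("http://127.0.0.1:5000/analysis/html", json=payload)
print(resp.json())  # extracted data wrapped by generate_response(data=...)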
diff --git a/magic_doc/restful_api/api/analysis/magic_html_view.py b/magic_doc/restful_api/api/analysis/magic_html_view.py deleted file mode 100644 index f3acce8..0000000 --- a/magic_doc/restful_api/api/analysis/magic_html_view.py +++ /dev/null @@ -1,28 +0,0 @@ -from flask import request -from flask_restful import Resource -from .serialization import MagicHtmlSchema -from marshmallow import ValidationError -from magic_doc.restful_api.common.custom_response import generate_response -from magic_doc.contrib.magic_html import GeneralExtractor -from loguru import logger - -extractor = GeneralExtractor() - -
-class MagicHtmlView(Resource): - @logger.catch - def post(self): - """ - Web page extraction - :return: - """ - magic_html_schema = MagicHtmlSchema() - try: - params = magic_html_schema.load(request.get_json()) - except ValidationError as err: - return generate_response(code=400, msg=err.messages) - url = params.get("pageUrl", "") - html_type = params.get("html_type") - html = params.get("html") - data = extractor.extract(html, base_url=url, html_type=html_type) - return generate_response(data=data)