fuergaosi233 · YuzukiTsuru · Feb 27, 2021 · Feb 28, 2021
diff --git a/README.md b/README.md
@@ -20,8 +20,11 @@
   Can be in a few seconds the data capture.
 
 - Grab the text can be replicated
+
   ![](./screenshots/copy-feature.png)
+
 - Save the original directory structure
+
   ![](./screenshots/index.png)
 
 - Retain the original hyperlinks

diff --git a/gitbook.py b/gitbook.py
@@ -1,5 +1,11 @@
-import sys
+import argparse
 from gitbook2pdf import Gitbook2PDF
+
 if __name__ == '__main__':
-    url = sys.argv[1]
-    Gitbook2PDF(url).run()
+    parser = argparse.ArgumentParser()
+    # The URL is mandatory: without required=True argparse would silently
+    # hand None to Gitbook2PDF when the option is omitted.
+    parser.add_argument("-u", '--url', type=str, required=True,
+                        help='the gitbook url')
+    args = parser.parse_args()
+    Gitbook2PDF(args.url).run()
diff --git a/gitbook2pdf/ChapterParser.py b/gitbook2pdf/ChapterParser.py
@@ -0,0 +1,133 @@
+"""Parsers for GitBook chapter pages and for the entries of the book index."""
+import html
+from urllib.parse import urljoin
+from lxml import etree
+
+
+class ChapterParser:
+    """Extract the main content section of one chapter page.
+
+    Args:
+        original: HTML source of the chapter page.
+        index_title: Title of the chapter as listed in the book index.
+        baselevel: Number appended to ``level`` to form the ``class``
+            attribute of the chapter's first heading.
+    """
+
+    def __init__(self, original, index_title, baselevel=0):
+        self.head = ''
+        self.heads = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}
+        self.original = original
+        self.baselevel = baselevel
+        self.index_title = index_title
+
+    def parser(self):
+        """Return the chapter's content section as an HTML string.
+
+        The ``normal markdown-section`` section is preferred, falling back to
+        the plain ``normal`` section; a direct ``<footer>`` child is dropped.
+
+        Raises:
+            IndexError: If the page contains neither section.
+        """
+        tree = etree.HTML(self.original)
+        sections = tree.xpath('//section[@class="normal markdown-section"]')
+        if not sections:
+            sections = tree.xpath('//section[@class="normal"]')
+        context = sections[0]
+        # An lxml element without children is falsy, so the result of find()
+        # has to be compared against None rather than tested for truth.
+        footer = context.find('footer')
+        if footer is not None:
+            context.remove(footer)
+        context = self.parsehead(context)
+        return html.unescape(etree.tostring(context, encoding='utf-8').decode())
+
+    def parsehead(self, context):
+        """Mark the first heading of *context* with the chapter level.
+
+        Direct children are searched from ``h1`` down to ``h6`` and only the
+        first match is modified: its title is stored in ``self.head``, its
+        text is replaced by ``index_title`` when the title is contained in
+        it, and its ``class`` becomes ``level<baselevel>``.
+
+        Returns:
+            The same *context* element, modified in place.
+        """
+        for tag in self.heads:
+            matches = context.xpath(tag)
+            if not matches:
+                continue
+            heading = matches[0]
+            self.head = IndexParser.titleparse(heading)
+            # titleparse() yields None for a heading with no text and no
+            # children; guard it because ``None in str`` raises TypeError.
+            if self.head is not None and self.head in self.index_title:
+                heading.text = self.index_title
+            heading.attrib['class'] = 'level' + str(self.baselevel)
+            break
+        return context
+
+
+class IndexParser:
+    """Flatten the entries of the book index into a list of chapters.
+
+    Args:
+        lis: Iterable of index elements, each exposing ``attrib``.
+        start_url: URL against which every ``data-path`` is resolved.
+    """
+
+    def __init__(self, lis, start_url):
+        self.lis = lis
+        self.start_url = start_url
+
+    @classmethod
+    def titleparse(cls, li):
+        """Return the title of *li*.
+
+        With child elements present, the title is the text of the first child
+        with whitespace runs collapsed to single spaces; otherwise it is
+        ``li.text``, which may be None.
+        """
+        if len(li) != 0:
+            primeval_title = ''.join(li[0].itertext())
+            return ' '.join(primeval_title.split())
+        return li.text
+
+    @staticmethod
+    def _entry_level(li):
+        """Return the depth encoded in ``data-level`` (``1.2.3`` -> 3), or 1."""
+        data_level = li.attrib.get('data-level')
+        return len(data_level.split('.')) if data_level else 1
+
+    def parse(self):
+        """Build the ordered list of index entries.
+
+        Entries whose class contains ``header`` or ``chapter`` are kept.
+        Headers and chapters without ``data-path`` get an empty ``url``; a
+        chapter whose resolved URL was already emitted is skipped.
+
+        Returns:
+            A list of dicts with the keys ``url``, ``level`` and ``title``.
+        """
+        found_urls = set()
+        content_urls = []
+        for li in self.lis:
+            element_class = li.attrib.get('class')
+            if not element_class:
+                continue
+            is_header = 'header' in element_class
+            if not is_header and 'chapter' not in element_class:
+                continue
+            url = ''
+            if not is_header and 'data-path' in li.attrib:
+                url = urljoin(self.start_url, li.attrib.get('data-path'))
+                if url in found_urls:
+                    continue
+                found_urls.add(url)
+            content_urls.append({
+                'url': url,
+                'level': self._entry_level(li),
+                'title': self.titleparse(li),
+            })
+        return content_urls
diff --git a/gitbook2pdf/HtmlGenerator.py b/gitbook2pdf/HtmlGenerator.py
@@ -0,0 +1,71 @@
+"""Assemble a complete HTML document from a body fragment and metadata."""
+import html
+import re
+
+
+class HtmlGenerator:
+    """Build an HTML page and prefix its relative links with ``base_url``.
+
+    Args:
+        base_url: String put in front of every relative ``src``/``href``.
+    """
+
+    # A tag whose src/href value does not start with "http".
+    _LINK_PATTERN = re.compile(r"<(.*?)(src|href)=\"(?!http)(.*?)\"(.*?)>")
+    # Targets that are not relative paths and therefore must stay untouched.
+    _UNTOUCHED_PREFIXES = ("//", "data:", "mailto:")
+
+    def __init__(self, base_url):
+        self.html_start = """<!DOCTYPE html>\n<html lang="en">\n<head>\n<meta charset="UTF-8">\n"""
+        self.html_end = """\n</body>\n</html>"""
+        self.title_ele = ""
+        self.meta_list = []
+        self.body = ""
+        self.base_url = base_url
+
+    def add_meta_data(self, key, value):
+        """Queue a ``<meta>`` tag with the given name and content.
+
+        Both attribute values are quoted and HTML-escaped so that spaces,
+        quotes or angle brackets in them cannot break the tag.
+        """
+        meta_string = '<meta name="{key}" content="{value}">'.format(
+            key=html.escape(str(key), quote=True),
+            value=html.escape(str(value), quote=True),
+        )
+        self.meta_list.append(meta_string)
+
+    def add_body(self, body):
+        """Set the HTML fragment emitted between ``<body>`` and ``</body>``."""
+        self.body = body
+
+    def srcrepl(self, match):
+        """Return the matched tag with ``base_url`` prefixed to its path.
+
+        Args:
+            match: Match of ``_LINK_PATTERN``; group 3 holds the path.
+        """
+        path = match.group(3)
+        if path.startswith(self._UNTOUCHED_PREFIXES):
+            return match.group(0)
+        # Strip every leading "../" segment, not only the first one, so the
+        # joined URL never climbs above base_url.
+        while path.startswith("../"):
+            path = path[3:]
+        if path == "..":
+            path = ""
+        return '<{}{}="{}{}"{}>'.format(
+            match.group(1), match.group(2), self.base_url, path, match.group(4)
+        )
+
+    def relative_to_absolute_path(self, origin_text):
+        """Return *origin_text* with every relative src/href made absolute."""
+        return self._LINK_PATTERN.sub(self.srcrepl, origin_text)
+
+    def output(self):
+        """Return the assembled document with relative links rewritten."""
+        full_html = (
+            self.html_start + self.title_ele + "".join(self.meta_list)
+            + "<body>" + self.body + self.html_end
+        )
+        return self.relative_to_absolute_path(full_html)
diff --git a/gitbook2pdf/__init__.py b/gitbook2pdf/__init__.py
@@ -1,2 +1,3 @@
 from .gitbook2pdf import Gitbook2PDF
-__all__ = ('Gitbook2PDF',)
+
+__all__ = ('Gitbook2PDF',)