diff --git a/README.md b/README.md
index 746b838..07ee3fd 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,11 @@ Captures the data in just a few seconds.
 - Grabbed text can be selected and copied
+  ![](./screenshots/copy-feature.png)
+
 - Preserves the original directory structure
+  ![](./screenshots/index.png)
 - Retains the original hyperlinks
diff --git a/gitbook.py b/gitbook.py
index 9a88ab2..5b99d60 100755
--- a/gitbook.py
+++ b/gitbook.py
@@ -1,5 +1,8 @@
-import sys
+import argparse
 from gitbook2pdf import Gitbook2PDF
+
 if __name__ == '__main__':
-    url = sys.argv[1]
-    Gitbook2PDF(url).run()
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-u', '--url', type=str, required=True, help='the GitBook URL')
+    args = parser.parse_args()
+    Gitbook2PDF(args.url).run()
diff --git a/gitbook2pdf/ChapterParser.py b/gitbook2pdf/ChapterParser.py
new file mode 100644
index 0000000..1fcd395
--- /dev/null
+++ b/gitbook2pdf/ChapterParser.py
@@ -0,0 +1,96 @@
+import html
+from urllib.parse import urljoin
+from lxml import etree
+
+
+class ChapterParser:
+    def __init__(self, original, index_title, baselevel=0):
+        self.head = ''
+        self.heads = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}
+        self.original = original
+        self.baselevel = baselevel
+        self.index_title = index_title
+
+    def parser(self):
+        tree = etree.HTML(self.original)
+        if tree.xpath('//section[@class="normal markdown-section"]'):
+            context = tree.xpath('//section[@class="normal markdown-section"]')[0]
+        else:
+            context = tree.xpath('//section[@class="normal"]')[0]
+        # compare against None: lxml elements without children are falsy
+        if context.find('footer') is not None:
+            context.remove(context.find('footer'))
+        context = self.parsehead(context)
+        return html.unescape(etree.tostring(context, encoding='utf-8').decode())
+
+    def parsehead(self, context):
+        def level(num):
+            return 'level' + str(num)
+
+        for head in self.heads:
+            if context.xpath(head):
+                self.head = IndexParser.titleparse(context.xpath(head)[0])
+                if self.head in self.index_title:
+                    context.xpath(head)[0].text = self.index_title
+                context.xpath(head)[0].attrib['class'] = level(self.baselevel)
+                break
+        return context
+
+
+class IndexParser:
+    def __init__(self, lis, start_url):
+        self.lis = lis
+        self.start_url = start_url
+
+    @classmethod
+    def titleparse(cls, li):
+        children = li.getchildren()
+        if len(children) != 0:
+            firstchildren = children[0]
+            primeval_title = ''.join(firstchildren.itertext())
+            title = ' '.join(primeval_title.split())
+        else:
+            title = li.text
+        return title
+
+    def parse(self):
+        found_urls = []
+        content_urls = []
+        for li in self.lis:
+            element_class = li.attrib.get('class')
+            if not element_class:
+                continue
+            if 'header' in element_class:
+                title = self.titleparse(li)
+                data_level = li.attrib.get('data-level')
+                level = len(data_level.split('.')) if data_level else 1
+                content_urls.append({
+                    'url': "",
+                    'level': level,
+                    'title': title
+                })
+            elif "chapter" in element_class:
+                data_level = li.attrib.get('data-level')
+                level = len(data_level.split('.')) if data_level else 1
+                if 'data-path' in li.attrib:
+                    data_path = li.attrib.get('data-path')
+                    url = urljoin(self.start_url, data_path)
+                    title = self.titleparse(li)
+                    if url not in found_urls:
+                        content_urls.append(
+                            {
+                                'url': url,
+                                'level': level,
+                                'title': title
+                            }
+                        )
+                        found_urls.append(url)
+
+                # Unclickable link
+                else:
+                    title = self.titleparse(li)
+                    content_urls.append({
+                        'url': "",
+                        'level': level,
+                        'title': title
+                    })
+        return content_urls
diff --git a/gitbook2pdf/HtmlGenerator.py b/gitbook2pdf/HtmlGenerator.py
new file mode 100644
index 0000000..949c907
--- /dev/null
+++ b/gitbook2pdf/HtmlGenerator.py
@@ -0,0 +1,40 @@
+import re
+
+
+class HtmlGenerator:
+    def __init__(self, base_url):
+        self.html_start = """<!DOCTYPE html>\n<html>\n<head>\n<meta charset="UTF-8">\n"""
+        self.html_end = """\n</body>\n</html>\n"""
+        self.title_ele = ""
+        self.meta_list = []
+        self.body = ""
+        self.base_url = base_url
+
+    def add_meta_data(self, key, value):
+        meta_string = "<meta name='{key}' content='{value}'>".format_map({
+            'key': key,
+            'value': value
+        })
+        self.meta_list.append(meta_string)
+
+    def add_body(self, body):
+        self.body = body
+
+    def srcrepl(self, match):
+        """
+        Return the file contents with paths replaced
+        """
+        absolutePath = self.base_url
+        pathStr = match.group(3)
+        if pathStr.startswith(".."):
+            pathStr = pathStr[3:]
+        return "<" + match.group(1) + match.group(2) + "=" + "\"" + absolutePath + pathStr + "\"" + match.group(4) + ">"
+
+    def relative_to_absolute_path(self, origin_text):
+        p = re.compile(r"<(.*?)(src|href)=\"(?!http)(.*?)\"(.*?)>")
+        updated_text = p.sub(self.srcrepl, origin_text)
+        return updated_text
+
+    def output(self):
+        full_html = self.html_start + self.title_ele + "".join(self.meta_list) + "</head><body>" + self.body + self.html_end
+        return self.relative_to_absolute_path(full_html)
diff --git a/gitbook2pdf/__init__.py b/gitbook2pdf/__init__.py
index 16db1c4..a965859 100644
--- a/gitbook2pdf/__init__.py
+++ b/gitbook2pdf/__init__.py
@@ -1,2 +1,3 @@
 from .gitbook2pdf import Gitbook2PDF
-__all__ = ('Gitbook2PDF',)
\ No newline at end of file
+
+__all__ = ('Gitbook2PDF',)
diff --git a/gitbook2pdf/gitbook2pdf.py b/gitbook2pdf/gitbook2pdf.py
index ae94ed2..257643a 100644
--- a/gitbook2pdf/gitbook2pdf.py
+++ b/gitbook2pdf/gitbook2pdf.py
@@ -1,183 +1,19 @@
-import html
-import requests
 import asyncio
-import aiohttp
-import weasyprint
 import datetime
-import re
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-from lxml import etree as ET
-import sys
-import os
-BASE_DIR = os.path.dirname(__file__)
+import logging
+from urllib.parse import urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+from .ChapterParser import *
+from .HtmlGenerator import *
+from .util import *
+
+logging.basicConfig(level=logging.INFO)
+
+
-
-async def request(url, headers, timeout=None):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url, headers=headers, timeout=timeout) as resp:
-            return await resp.text()
-
-
-def local_ua_stylesheets(self):
-    return [weasyprint.CSS(os.path.join(BASE_DIR, './libs/html5_ua.css'))]
-
-
-# weasyprint's monkey patch for level
-
-def load_gitbook_css():
-    with open(
-        os.path.join(BASE_DIR, './libs/gitbook.css'), 'r'
-    ) as f:
-        return f.read()
-
-
-def get_level_class(num):
-    '''
-    return 'level'+num
-    '''
-    return 'level' + str(num)
-
-
-class HtmlGenerator():
-    def __init__(self, base_url):
-        self.html_start = """
-<!DOCTYPE html>
-<html>
-<head>
-<meta charset="UTF-8">
-"""
-
-        self.title_ele = ""
-        self.meta_list = []
-        self.body = ""
-        self.html_end = """
-</body>
-</html>
-"""
-        self.base_url = base_url
-
-    def add_meta_data(self, key, value):
-        meta_string = "<meta name='{key}' content='{value}'>".format_map({
-            'key': key,
-            'value': value
-        })
-        self.meta_list.append(meta_string)
-
-    def add_body(self, body):
-        self.body = body
-
-    def srcrepl(self, match):
-        "Return the file contents with paths replaced"
-        absolutePath = self.base_url
-        pathStr = match.group(3)
-        if pathStr.startswith(".."):
-            pathStr = pathStr[3:]
-        return "<" + match.group(1) + match.group(2) + "=" + "\"" + absolutePath + pathStr + "\"" + match.group(
-            4) + ">"
-
-    def relative_to_absolute_path(self, origin_text):
-        p = re.compile(r"<(.*?)(src|href)=\"(?!http)(.*?)\"(.*?)>")
-        updated_text = p.sub(self.srcrepl, origin_text)
-        return updated_text
-
-    def output(self):
-        full_html = self.html_start + self.title_ele + "".join(self.meta_list) \
-                    + "</head><body>" + self.body + self.html_end
-        return self.relative_to_absolute_path(full_html)
-
-
-class ChapterParser():
-    def __init__(self, original,index_title, baselevel=0):
-        self.heads = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}
-        self.original = original
-        self.baselevel = baselevel
-        self.index_title = index_title
-
-    def parser(self):
-        tree = ET.HTML(self.original)
-        if tree.xpath('//section[@class="normal markdown-section"]'):
-            context = tree.xpath('//section[@class="normal markdown-section"]')[0]
-        else:
-            context = tree.xpath('//section[@class="normal"]')[0]
-        if context.find('footer'):
-            context.remove(context.find('footer'))
-        context = self.parsehead(context)
-        return html.unescape(ET.tostring(context).decode())
-
-    def parsehead(self, context):
-        def level(num):
-            return 'level' + str(num)
-        for head in self.heads:
-            if context.xpath(head):
-                self.head = IndexParser.titleparse(context.xpath(head)[0])
-                if self.head in self.index_title:
-                    context.xpath(head)[0].text = self.index_title
-                context.xpath(head)[0].attrib['class'] = level(self.baselevel)
-                break
-        return context
-
-
-class IndexParser():
-    def __init__(self, lis, start_url):
-        self.lis = lis
-        self.start_url = start_url
-
-    @classmethod
-    def titleparse(cls, li):
-        children = li.getchildren()
-        if len(children) != 0:
-            firstchildren = children[0]
-            primeval_title = ''.join(firstchildren.itertext())
-            title = ' '.join(primeval_title.split())
-        else:
-            title = li.text
-        return title
-
-    def parse(self):
-        found_urls = []
-        content_urls = []
-        for li in self.lis:
-            element_class = li.attrib.get('class')
-            if not element_class:
-                continue
-            if 'header' in element_class:
-                title = self.titleparse(li)
-                data_level = li.attrib.get('data-level')
-                level = len(data_level.split('.')) if data_level else 1
-                content_urls.append({
-                    'url': "",
-                    'level': level,
-                    'title': title
-                })
-            elif "chapter" in element_class:
-                data_level = li.attrib.get('data-level')
-                level = len(data_level.split('.'))
-                if 'data-path' in li.attrib:
-                    data_path = li.attrib.get('data-path')
-                    url = urljoin(self.start_url, data_path)
-                    title = self.titleparse(li)
-                    if url not in found_urls:
-                        content_urls.append(
-                            {
-                                'url': url,
-                                'level': level,
-                                'title': title
-                            }
-                        )
-                        found_urls.append(url)
-
-                # Unclickable link
-                else:
-                    title = self.titleparse(li)
-                    content_urls.append({
-                        'url': "",
-                        'level': level,
-                        'title': title
-                    })
-        return content_urls
-
-
-class Gitbook2PDF():
+class Gitbook2PDF:
     def __init__(self, base_url, fname=None):
         self.fname = fname
         self.base_url = base_url
@@ -186,14 +22,13 @@ def __init__(self, base_url, fname=None):
         }
         self.content_list = []
         self.meta_list = []
-        self.meta_list.append(
-            ('generator', 'gitbook2pdf')
-        )
+        self.meta_list.append(('generator', 'gitbook2pdf'))
         weasyprint.HTML._ua_stylesheets = local_ua_stylesheets
 
     def run(self):
         content_urls = self.collect_urls_and_metadata(self.base_url)
         self.content_list = ["" for _ in range(len(content_urls))]
+
         loop = asyncio.get_event_loop()
         loop.run_until_complete(self.crawl_main_content(content_urls))
         loop.close()
@@ -208,17 +43,17 @@ def run(self):
         html_text = html_g.output()
         css_text = load_gitbook_css()
 
-        self.write_pdf(self.fname, html_text, css_text)
+        write_pdf(self.fname, html_text, css_text)
 
     async def crawl_main_content(self, content_urls):
         tasks = []
         for index, urlobj in enumerate(content_urls):
             if urlobj['url']:
-                tasks.append(self.gettext(index, urlobj['url'], urlobj['level'],urlobj['title']))
+                tasks.append(self.gettext(index, urlobj['url'], urlobj['level'], urlobj['title']))
             else:
                 tasks.append(self.getext_fake(index, urlobj['title'], urlobj['level']))
         await asyncio.gather(*tasks)
-        print("crawl : all done!")
+        logging.info("crawl : all done!")
 
     async def getext_fake(self, index, title, level):
         await asyncio.sleep(0.01)
@@ -227,33 +62,22 @@ async def getext_fake(self, index, title, level):
         self.content_list[index] = string
 
     async def gettext(self, index, url, level, title):
-        '''
+        """
         return path's html
-        '''
-        print("crawling : ", url)
+        """
+        logging.info("crawling : " + url)
         try:
             metatext = await request(url, self.headers, timeout=10)
-        except Exception as e:
-            print("retrying : ", url)
+        except asyncio.TimeoutError:
+            logging.warning("retrying : " + url)
             metatext = await request(url, self.headers)
         try:
             text = ChapterParser(metatext, title, level, ).parser()
-            print("done : ", url)
+            logging.info("done : " + url)
             self.content_list[index] = text
         except IndexError:
-            print('faild at : ', url, ' maybe content is empty?')
-
-    def write_pdf(self, fname, html_text, css_text):
-        tmphtml = weasyprint.HTML(string=html_text)
-        tmpcss = weasyprint.CSS(string=css_text)
-        fname = "./output/" + fname
-        htmlname = fname.replace('.pdf', '.html')
-        with open(htmlname, 'w', encoding='utf-8') as f:
-            f.write(html_text)
-        print('Generating pdf,please wait patiently')
-        tmphtml.write_pdf(fname, stylesheets=[tmpcss])
-        print('Generated')
+            logging.error('failed at: ' + url + ', maybe the content is empty?')
 
     def collect_urls_and_metadata(self, start_url):
         response = requests.get(start_url, headers=self.headers)
@@ -273,17 +97,13 @@ def collect_urls_and_metadata(self, start_url):
             title = title.split('|')[1]
         title = title.replace(' ', '').replace('/', '-')
         self.fname = title + '.pdf'
-        self.meta_list.append(
-            ('title', self.fname.replace('.pdf', ''))
-        )
+        self.meta_list.append(('title', self.fname.replace('.pdf', '')))
 
         # get description meta data
         comments_section = soup.find_all(class_='comments-section')
         if comments_section:
             description = comments_section[0].text.replace('\n', '').replace('\t', '')
-            self.meta_list.append(
-                ('description', description)
-            )
+            self.meta_list.append(('description', description))
 
         # get author meta
         author_meta = soup.find('meta', {'name': 'author'})
@@ -300,6 +120,5 @@ def collect_urls_and_metadata(self, start_url):
         now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
         self.meta_list.append(('dcterms.created', now))
         self.meta_list.append(('dcterms.modified', now))
-        lis = ET.HTML(text).xpath("//ul[@class='summary']//li")
+        lis = etree.HTML(text).xpath("//ul[@class='summary']//li")
         return IndexParser(lis, start_url).parse()
-
diff --git a/gitbook2pdf/util.py b/gitbook2pdf/util.py
index e69de29..e49615a 100644
--- a/gitbook2pdf/util.py
+++ b/gitbook2pdf/util.py
@@ -0,0 +1,41 @@
+import os
+
+import aiohttp
+import weasyprint
+
+BASE_DIR = os.path.dirname(__file__)
+
+
+async def request(url, headers, timeout=None):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url, headers=headers, timeout=timeout) as resp:
+            return await resp.text()
+
+
+# replacement UA stylesheet, monkey-patched onto weasyprint in Gitbook2PDF
+def local_ua_stylesheets(self):
+    return [weasyprint.CSS(os.path.join(BASE_DIR, './libs/html5_ua.css'))]
+
+
+def load_gitbook_css():
+    with open(os.path.join(BASE_DIR, './libs/gitbook.css'), 'r') as f:
+        return f.read()
+
+
+def get_level_class(num):
+    """
+    return 'level' + num
+    """
+    return 'level' + str(num)
+
+
+def write_pdf(fname, html_text, css_text):
+    tmphtml = weasyprint.HTML(string=html_text)
+    tmpcss = weasyprint.CSS(string=css_text)
+    fname = "./output/" + fname
+    htmlname = fname.replace('.pdf', '.html')
+    with open(htmlname, 'w', encoding='utf-8') as f:
+        f.write(html_text)
+    print('Generating PDF, please wait patiently')
+    tmphtml.write_pdf(fname, stylesheets=[tmpcss])
+    print('Generated')
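
Usage after this change (a minimal sketch; the URL is a placeholder, and the ./output/ directory must already exist, since write_pdf() writes both the intermediate HTML and the final PDF there):

    $ python gitbook.py -u https://example.gitbook.io/mybook/

or, equivalently, from Python:

    # placeholder URL; pass any GitBook (legacy) site root
    from gitbook2pdf import Gitbook2PDF

    Gitbook2PDF('https://example.gitbook.io/mybook/').run()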