diff --git a/pukiWikiDumper/dump/content/content.py b/pukiWikiDumper/dump/content/content.py
index 4d763f7..7b02dc2 100644
--- a/pukiWikiDumper/dump/content/content.py
+++ b/pukiWikiDumper/dump/content/content.py
@@ -114,29 +114,3 @@ def dump_page(dumpDir: str,
     if current_only:
         print(msg_header, ' [[%s]] saved.' % (page['title']))
         return
-
-    # revs = get_revisions(puki_url, page, session=session, msg_header=msg_header)
-
-
-    # save_page_changes(dumpDir=dumpDir, child_path=child_path, title=page,
-    #                   revs=revs, msg_header=msg_header)
-
-
-    for rev in revs[1:]:
-        if 'id' in rev and rev['id']:
-            try:
-                txt = getSource(puki_url, page, rev['id'], session=session)
-                smkdirs(dumpDir, '/attic/' + child_path)
-                with uopen(dumpDir + '/attic/' + page.replace(':', '/') + '.' + rev['id'] + '.txt', 'w') as f:
-                    f.write(txt)
-                print(msg_header, ' Revision %s of [[%s]] saved.' % (
-                    rev['id'], page))
-            except DispositionHeaderMissingError:
-                print(msg_header, ' Revision %s of [[%s]] is empty. (probably deleted)' % (
-                    rev['id'], page))
-        else:
-            print(msg_header, ' Revision %s of [[%s]] failed: %s' % (rev['id'], page, 'Rev id not found (please check ?do=revisions of this page)'))
-
-
-    # time.sleep(1.5)
-
diff --git a/pukiWikiDumper/dump/content/revisions.py b/pukiWikiDumper/dump/content/revisions.py
index 15d61de..5f7b630 100644
--- a/pukiWikiDumper/dump/content/revisions.py
+++ b/pukiWikiDumper/dump/content/revisions.py
@@ -1,16 +1,9 @@
-from datetime import datetime
-import html
-import os
-import re
-from ipaddress import ip_address, IPv4Address, IPv6Address
-import time
 import urllib.parse as urlparse
 
 import requests
 from bs4 import BeautifulSoup
 
-from pukiWikiDumper.exceptions import ActionEditDisabled, ActionEditTextareaNotFound, ContentTypeHeaderNotTextPlain, HTTPStatusError
-from pukiWikiDumper.utils.util import check_int, print_with_lock as print, smkdirs, uopen
+from pukiWikiDumper.exceptions import ActionEditDisabled, ActionEditTextareaNotFound
 from pukiWikiDumper.utils.config import running_config
 
 
@@ -82,264 +75,3 @@ def get_source_edit(url, page, rev='', session: requests.Session = None,):
         raise ActionEditTextareaNotFound(page['title'])
 
     return source
-
-
-def get_revisions(puki_url, title, session: requests.Session = None, msg_header: str = ''):
-    """ Get the revisions of a page. This is nontrivial because different versions of DokuWiki return completely different revision HTML.
-
-    Returns a dict with the following keys: (None if not found or failed)
-    - id: str|None
-    - user: str|None
-    - sum: str|None
-    - date: str|None
-    - minor: bool
-    """
-    revs = []
-    rev_tmplate = {
-        'id': None,  # str(int)
-        'user': None,  # str
-        'sum': None,  # str
-        'date': None,  # str
-        'minor': False,  # bool
-        'sizechange': 0,
-    }
-
-    i = 0
-    continue_index = -1
-    cont = True
-
-    while cont:
-        r = session.get(
-            puki_url,
-            params={
-                'id': title,
-                'do': 'revisions',
-                'first': continue_index})
-
-        soup = BeautifulSoup(r.content, running_config.html_parser)
-
-        try:
-            lis = soup.find('form', {'id': 'page__revisions'}).find(
-                'ul').findAll('li')
-        except AttributeError:
-            # outdated DokuWiki version? try another way.
-            try:
-                lis = soup.find('div', {'class': 'page'}).find(
-                    'ul').findAll('li')
-            except:
-                # still fail
-                print(msg_header, 'Error: cannot find revisions list.')
-                raise
-
-        for li in lis:
-            rev = {}
-
-            checkbox = li.find('input', {'type': 'checkbox'})
-            rev_hrefs = li.findAll(
-                'a', href=lambda href: href and (
-                    '&rev=' in href or '?rev=' in href))
-
-            # id: optional(str(id)): rev_id, not title name.
-            if checkbox and rev.get('id', None) is None:
-                rev['id'] = checkbox.get('value', None)
-                rev['id'] = check_int(rev['id'])
-
-            if rev_hrefs and rev.get('id', None) is None:
-                obj1 = rev_hrefs[0]['href']
-                obj2 = urlparse.urlparse(obj1).query
-                obj3 = urlparse.parse_qs(obj2)
-                if 'rev' in obj3:
-                    rev['id'] = obj3['rev'][0]
-                    rev['id'] = check_int(rev['id'])
-                else:
-                    rev['id'] = None
-                del (obj1, obj2, obj3)
-
-            if use_hidden_rev and rev.get('id', None) is None:
-                obj1 = li.find('input', {'type': 'hidden'})
-                if obj1 is not None and 'value' in obj1:
-                    rev['id'] = obj1['value']
-                    rev['id'] = check_int(rev['id'])
-                del (obj1)
-
-            # minor: bool
-            rev['minor'] = li.has_attr('class') and 'minor' in li['class']
-
-            # summary: optional(str)
-            sum_span = li.findAll('span', {'class': 'sum'})
-            if sum_span and not select_revs:
-                sum_span = sum_span[0]
-                sum_text = sum_span.text.split(' ')[1:]
-                if sum_span.findAll('bdi'):
-                    rev['sum'] = html.unescape(
-                        sum_span.find('bdi').text).strip()
-                else:
-                    rev['sum'] = html.unescape(' '.join(sum_text)).strip()
-            elif not select_revs:
-                print(msg_header, ' ', repr(
-                    li.text).replace('\\n', ' ').strip())
-                wikilink1 = li.find('a', {'class': 'wikilink1'})
-                text_node = wikilink1 and wikilink1.next and wikilink1.next.next or ''
-                if text_node.strip:
-                    rev['sum'] = html.unescape(text_node).strip(u'\u2013 \n')
-
-            # date: optional(str)
-            date_span = li.find('span', {'class': 'date'})
-            if date_span:
-                rev['date'] = date_span.text.strip()
-            else:
-                rev['date'] = ' '.join(li.text.strip().split(' ')[:2])
-                matches = re.findall(
-                    r'([0-9./]+ [0-9]{1,2}:[0-9]{1,2})',
-                    rev['date'])
-                if matches:
-                    rev['date'] = matches[0]
-
-            # sizechange: optional(int)
-            sizechange_span = li.find('span', {'class': 'sizechange'})
-
-            if sizechange_span:
-                sizechange_text = sizechange_span.text.replace('\xC2\xA0', ' ').strip()
-                units = ['B', 'KB', 'MB', 'GB']
-                positive = '−' not in sizechange_text
-                size_change = re.sub(r'[^0-9.]', '', sizechange_text)
-                try:
-                    size_change = float(size_change)
-                except ValueError:
-                    size_change = 0.0
-
-                for unit in units[1:]:
-                    if unit in sizechange_text:
-                        size_change *= 1024
-                rev['sizechange'] = positive and int(size_change) or int(-size_change)
-
-            # user: optional(str)
-            # legacy
-            # if not (select_revs and len(revs) > i and revs[i]['user']):
-            user_span = li.find('span', {'class': 'user'})
-            if user_span and user_span.text is not None:
-                rev['user'] = html.unescape(user_span.text).strip()
-
-            # if select_revs and len(revs) > i:
-            #     revs[i].update(rev)
-            # else:
-            #     revs.append(rev)
-
-            _rev = {**rev_tmplate, **rev}  # merge dicts
-            revs.append(_rev)
-
-            i += 1
-
-        # next page
-        first = soup.findAll('input', {'name': 'first', 'value': True})
-        continue_index = first and max(map(lambda x: int(x['value']), first))
-        cont = soup.find('input', {'class': 'button', 'accesskey': 'n'})
-        # time.sleep(1.5)
-
-    # if revs and use_hidden_rev and not select_revs:
-    #     soup2 = BeautifulSoup(session.get(url, params={'id': title}).text)
-    #     revs[0]['id'] = soup2.find(
-    #         'input', {
-    #             'type': 'hidden', 'name': 'rev', 'value': True})['value']
-
-    return revs
-
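> Reviewer note on the removed `get_revisions`: the `rev_hrefs` branch recovers a revision ID from the `rev` query parameter of links such as `doku.php?id=start&rev=1368756903`. A minimal standalone sketch of that extraction, using only the standard library (the URL and function name below are hypothetical, not pukiWikiDumper API):

```python
import urllib.parse as urlparse

def rev_id_from_href(href: str):
    """Return the 'rev' query value from a DokuWiki revision link, or None."""
    query = urlparse.urlparse(href).query  # e.g. 'id=start&rev=1368756903'
    params = urlparse.parse_qs(query)      # e.g. {'id': ['start'], 'rev': ['1368756903']}
    return params['rev'][0] if 'rev' in params else None

# Hypothetical example URL:
assert rev_id_from_href('https://wiki.example.org/doku.php?id=start&rev=1368756903') == '1368756903'
```

> Pagination worked the same way: the hidden `first` inputs on each revisions page supplied the next `continue_index`, and the loop stopped once no "next" button was found.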
-
-DATE_FORMATS = ["%Y-%m-%d %H:%M",  #
-                "%Y-%m-%d",  #
-                "%Y/%m/%d",  #
-                "%Y/%m/%d %H:%M",
-                "%Y-%m-%d %H:%M:%S",
-                "%Y/%m/%d %H:%M:%S",
-
-                "%d.%m.%Y %H:%M",
-                "%d/%m/%Y %H:%M",  #
-                "%d.%m.%Y %H:%M:%S",
-                "%d/%m/%Y %H:%M:%S",  #
-
-                "%d/%m/%Y alle %H:%M",  # # 01/03/2007 alle 14:20 (16 anni fa)
-
-                "Le %d/%m/%Y, %H:%M",  # # Le 23/09/2020, 17:01
-
-                "%H:%M %d/%m/%Y",  #
-                "%d. %m. %Y (%H:%M)",  #
-                ]
-""" Why are there so many date formats in the world? :( """
-
-def save_page_changes(dumpDir, title: str, revs, child_path, msg_header: str):
-    changes_file = dumpDir + '/meta/' + title.replace(':', '/') + '.changes'
-    if os.path.exists(changes_file):
-        print(msg_header, ' meta change file exists:', changes_file)
-        return
-
-    revidOfPage: set[str] = set()
-    rows2write = []
-    # Loop through revisions in reverse.
-    for rev in revs[::-1]:
-        print(msg_header, ' meta change saving:', rev)
-        summary = 'sum' in rev and rev['sum'].strip() or ''
-        rev_id = str(0)
-
-        ip = '127.0.0.1'
-        user = ''
-        minor = 'minor' in rev and rev['minor']
-
-        if 'id' in rev and rev['id']:
-            rev_id = rev['id']
-        else:
-            # Different date formats in different versions of DokuWiki.
-            # If no ID was found, make one up based on the date (since rev IDs are Unix times)
-            # Maybe this is evil. Not sure.
-
-            print(msg_header, ' One revision of [[%s]] missing rev_id. Using date to rebuild...'
-                  % title, end=' ')
-
-            for date_format in DATE_FORMATS:
-                try:
-                    date = datetime.strptime(
-                        # remove " (x days ago)" in f"{date_format} (x days ago)" if date_format does not contain '('
-                        rev['date'].split('(')[0].strip(),
-                        date_format
-                    ) if '(' not in date_format else datetime.strptime(
-                        # date_format contains '('
-                        rev['date'].strip(),
-                        date_format
-                    )
-                    rev_id = str(int(time.mktime(date.utctimetuple())))
-                    break
-                except Exception:
-                    rev_id = None
-
-            assert rev_id is not None, 'Cannot parse date: %s' % rev['date']
-        assert isinstance(rev_id, str), 'rev_id must be str, not %s' % type(rev_id)
-
-        # if rev_id is not unique, add 1 to it until it is.
-        while rev_id in revidOfPage:
-            rev_id = str(int(rev_id) + 1)
-            print(msg_header, 'rev_id is now %s' % rev_id)
-
-        revidOfPage.add(rev_id)
-
-        rev['user'] = rev['user'] if 'user' in rev else 'unknown'
-        try:
-            ip_parsed = ip_address(rev['user'])
-            assert isinstance(ip_parsed, (IPv4Address, IPv6Address))
-            ip = rev['user']
-        except ValueError:
-            user = rev['user']
-
-        sizechange = rev['sizechange'] if 'sizechange' in rev else ''
-
-        extra = ''  # TODO: use this
-        # max 255 chars(utf-8) for summary. (dokuwiki limitation)
-        summary = summary[:255]
-        row = '\t'.join([rev_id, ip, 'e' if minor else 'E',
-                         title, user, summary, extra, str(sizechange)])
-        row = row.replace('\n', ' ')
-        row = row.replace('\r', ' ')
-        rows2write.append(row)
-
-
-    smkdirs(dumpDir, '/meta/' + child_path)
-    with uopen(changes_file, 'w') as f:
-        f.write('\n'.join(rows2write)+'\n')
\ No newline at end of file
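> Reviewer note on the removed `save_page_changes`: each revision was serialized as one tab-separated row of DokuWiki's `meta/<page>.changes` format. A sketch of the row layout inferred from the deleted code (all sample values are made up):

```python
# Field order written by the deleted save_page_changes():
#   rev_id  ip  type  title  user  summary  extra  sizechange
# where type is 'e' for a minor edit and 'E' for a normal edit.
row = '\t'.join([
    '1368756903',  # rev_id: a Unix timestamp (rebuilt from the date when missing)
    '127.0.0.1',   # ip: set when 'user' parses as an IPv4/IPv6 address
    'E',           # 'e' if the revision was minor, else 'E'
    'wiki:start',  # page title (sample value)
    'alice',       # editor name (sample value)
    'fix typo',    # summary, truncated to 255 chars
    '',            # extra (unused, reserved)
    '42',          # size change in bytes
])
print(row)
```

> When a revision had no ID, the code rebuilt one by parsing `rev['date']` against `DATE_FORMATS` and converting with `time.mktime(date.utctimetuple())`, since DokuWiki revision IDs are Unix timestamps; duplicate IDs were then bumped by one second until unique.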
diff --git a/pukiWikiDumper/dump/dokuDumper.py b/pukiWikiDumper/dump/dokuDumper.py
index 6c7ef27..895d7d5 100644
--- a/pukiWikiDumper/dump/dokuDumper.py
+++ b/pukiWikiDumper/dump/dokuDumper.py
@@ -11,13 +11,12 @@
 from pukiWikiDumper.__version__ import DUMPER_VERSION, pukiWikiDumper_outdated_check
 from pukiWikiDumper.dump.content.content import dump_content
-from pukiWikiDumper.dump.html import dump_HTML
 from pukiWikiDumper.dump.info import update_info
 from pukiWikiDumper.dump.media import dump_attachs
 from pukiWikiDumper.utils.config import update_config, running_config
 from pukiWikiDumper.utils.patch import SessionMonkeyPatch
 from pukiWikiDumper.utils.session import createSession, load_cookies
-from pukiWikiDumper.utils.util import avoidSites, buildBaseUrl, getPukiUrl, smkdirs, standardizeUrl, uopen, url2prefix
+from pukiWikiDumper.utils.util import avoidSites, buildBaseUrl, getPukiUrl, smkdirs, standardizeUrl, url2prefix
 
 DEFAULT_THREADS = -1  # magic number, -1 means use 1 thread.
diff --git a/pukiWikiDumper/dump/html/__init__.py b/pukiWikiDumper/dump/html/__init__.py
deleted file mode 100644
index a2e924c..0000000
--- a/pukiWikiDumper/dump/html/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .html import dump_HTML
\ No newline at end of file
diff --git a/pukiWikiDumper/dump/html/html.py b/pukiWikiDumper/dump/html/html.py
deleted file mode 100644
index c88eb2f..0000000
--- a/pukiWikiDumper/dump/html/html.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import os
-import threading
-import time
-import requests
-from pukiWikiDumper.dump.content.revisions import get_revisions, save_page_changes
-from pukiWikiDumper.dump.content.titles import get_pages
-
-from pukiWikiDumper.utils.util import load_pages, smkdirs, uopen
-from pukiWikiDumper.utils.util import print_with_lock as print
-from pukiWikiDumper.utils.config import running_config
-
-HTML_DIR = 'html/'
-HTML_PAGR_DIR = HTML_DIR + 'pages/'
-HTML_OLDPAGE_DIR = HTML_DIR + 'attic/'
-
-sub_thread_error = None
-
-def dump_HTML(puki_url, dumpDir,
-              session: requests.Session, skipTo: int = 0, threads: int = 1,
-              ignore_errors: bool = False, current_only: bool = False):
-    smkdirs(dumpDir, HTML_PAGR_DIR)
-
-    titles = load_pages(pagesFilePath=dumpDir + '/dumpMeta/titles.txt')
-    if titles is None:
-        titles = get_pages(url=puki_url, session=session)
-        with uopen(dumpDir + '/dumpMeta/titles.txt', 'w') as f:
-            f.write('\n'.join(titles))
-            f.write('\n--END--\n')
-
-    if not len(titles):
-        print('Empty wiki')
-        return False
-
-    index_of_title = -1  # 0-based
-    if skipTo > 0:
-        index_of_title = skipTo - 2
-        titles = titles[skipTo-1:]
-
-    def try_dump_html_page(*args, **kwargs):
-        try:
-            dump_html_page(*args, **kwargs)
-        except Exception as e:
-            if not ignore_errors:
-                global sub_thread_error
-                sub_thread_error = e
-                raise e
-            print('[',args[1]+1,']Error in sub thread: (', e, ') ignored')
-    for title in titles:
-        while threading.active_count() > threads:
-            time.sleep(0.1)
-        if sub_thread_error:
-            raise sub_thread_error
-
-        index_of_title += 1
-        t = threading.Thread(target=try_dump_html_page, args=(dumpDir,
-                                                              index_of_title,
-                                                              title,
-                                                              puki_url,
-                                                              session,
-                                                              current_only))
-        print('HTML: (%d/%d): [[%s]] ...' % (index_of_title+1, len(titles), title))
-        t.daemon = True
-        t.start()
-
-    while threading.active_count() > 1:
-        time.sleep(2)
-        print('Waiting for %d threads to finish' %
-              (threading.active_count() - 1), end='\r')
-
-def dump_html_page(dumpDir, index_of_title, title, puki_url, session: requests.Session, current_only: bool = False):
-    r = session.get(puki_url, params={'do': running_config.export_xhtml_action, 'id': title})
-    # export_html is an alias of export_xhtml, but it does not exist in older versions of DokuWiki
-    r.raise_for_status()
-    if r.text is None or r.text == '':
-        raise Exception(f'Empty response (r.text)')
-
-    msg_header = '['+str(index_of_title + 1)+']: '
-
-    title2path = title.replace(':', '/')
-    child_path = os.path.dirname(title2path)
-    html_path = dumpDir + '/' + HTML_PAGR_DIR + title2path + '.html'
-    smkdirs(dumpDir, HTML_PAGR_DIR, child_path)
-    with uopen(html_path, 'w') as f:
-        f.write(r.text)
-    print(msg_header, '[[%s]]' % title, 'saved')
-
-    if current_only:
-        return True
-
-    revs = get_revisions(puki_url=puki_url, session=session, title=title, msg_header=msg_header)
-
-    for rev in revs[1:]:
-        if 'id' in rev and rev['id']:
-            try:
-                r = session.get(puki_url, params={'do': running_config.export_xhtml_action, 'id': title, 'rev': rev['id']})
-                r.raise_for_status()
-                if r.text is None or r.text == '':
-                    raise Exception(f'Empty response (r.text)')
-                smkdirs(dumpDir, HTML_OLDPAGE_DIR, child_path)
-                old_html_path = dumpDir + '/' + HTML_OLDPAGE_DIR + title2path + '.' + rev['id'] + '.html'
-
-                with uopen(old_html_path, 'w') as f:
-                    f.write(r.text)
-                print(msg_header, ' Revision %s of [[%s]] saved.' % (rev['id'], title))
-            except requests.HTTPError as e:
-                print(msg_header, ' Revision %s of [[%s]] failed: %s' % (rev['id'], title, e))
-        else:
-            print(msg_header, ' Revision %s of [[%s]] failed: %s' % (rev['id'], title, 'Rev id not found (please check ?do=revisions of this page)'))
-
-    save_page_changes(dumpDir=dumpDir, child_path=child_path, title=title,
-                      revs=revs, msg_header=msg_header)
\ No newline at end of file
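> Reviewer note on the removed `dump_HTML`: it capped concurrency by polling `threading.active_count()` instead of using a thread pool. A condensed, runnable sketch of that throttling pattern (`worker` is a hypothetical stand-in for `dump_html_page`):

```python
import threading
import time

def worker(index: int, title: str) -> None:
    # Stand-in for the real per-page dump work.
    print(f'[{index + 1}] dumped [[{title}]]')

def dump_all(titles: list[str], threads: int = 4) -> None:
    for index, title in enumerate(titles):
        # Throttle: block until a thread slot frees up (the main thread counts too).
        while threading.active_count() > threads:
            time.sleep(0.1)
        t = threading.Thread(target=worker, args=(index, title))
        t.daemon = True
        t.start()

    # Daemon threads die with the process, so wait for stragglers.
    while threading.active_count() > 1:
        time.sleep(0.5)

dump_all(['start', 'wiki:syntax'], threads=2)
```

> The original stored the first worker exception in the module-global `sub_thread_error` and re-raised it from the scheduling loop, which is why the sketch above omits error handling.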