Skip to content

Commit

Permalink
refactor: clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed Jun 14, 2024
1 parent 5fc14dd commit c6f0eec
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 432 deletions.
19 changes: 13 additions & 6 deletions ChinaXivXiv/defines.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

END_FILEID = 78000
DEFAULT_HEADERS = {
"User-Agent": "ChinaXiv Archive Mirror Project/0.1.0 (STW; SaveTheWeb; +github.com/saveweb; [email protected]) (qos-rate-limit: 3q/s)",
"User-Agent": "ChinaXiv Archive Mirror Project/0.2.0 (STW; SaveTheWeb; +github.com/saveweb; [email protected]) (qos-rate-limit: 3q/s)",
}
DEBUG = 1

Expand Down Expand Up @@ -51,14 +51,21 @@ def __post_init__(self):
assert self.status in Status.__dict__.values()


'''
authors = None
pubyear = None
title = None
journal = None
prefer_identifier = None
'''
@dataclass
class ChinaXivHtmlMetadata:
chinaxiv_id: int
""" 又名 fileid """
title: str
authors: List[str]
journal: str
pubyear: int
title: str | None
authors: List[str] | None
journal: str | None
pubyear: int | None
version: int
csoaid: str
""" 又名 article-id """
Expand All @@ -67,7 +74,7 @@ class ChinaXivHtmlMetadata:
subjects: List[str]
keywords: List[str]

prefer_identifier: Optional[str]
prefer_identifier: str | None
""" DOI or csoaid """

@dataclass
Expand Down
13 changes: 1 addition & 12 deletions ChinaXivXiv/main.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,13 @@
import asyncio
from datetime import datetime
import os
import time
import motor.motor_asyncio
import httpx

from ChinaXivXiv.defines import DEBUG, DEFAULT_HEADERS, Status
from ChinaXivXiv.mongo_ops import create_fileids_queue_index, find_max_id, init_queue
from ChinaXivXiv.util import arg_parser
from ChinaXivXiv.defines import DEFAULT_HEADERS
from ChinaXivXiv.workers.IA_uploader import IA_upload_worker
from ChinaXivXiv.workers.file_downloader import file_downloader_worker
from ChinaXivXiv.workers.fileid_finder import fileid_finder_worker
from ChinaXivXiv.workers.metadata_scraper import metadata_scraper_worker
from ChinaXivXiv.workers.status_mover import status_mover_worker
from ChinaXivXiv.workers.task_provider import task_provider_worker


async def main():
args = arg_parser()
transport = httpx.AsyncHTTPTransport(retries=3)
h_client = httpx.AsyncClient(timeout=60, transport=transport)
h_client.headers.update(DEFAULT_HEADERS)
Expand All @@ -30,7 +20,6 @@ async def main():
IA_upload_worker(
client=h_client,
collection=global_chinaxiv_collection,
args=args
) for _ in range(5)]
await asyncio.gather(*cors)

Expand Down
33 changes: 0 additions & 33 deletions ChinaXivXiv/mongo_ops.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,9 @@
from datetime import datetime
import time
from typing import Optional
import httpx
import motor.motor_asyncio

from ChinaXivXiv.defines import Status, Task


async def init_queue(queue_col: motor.motor_asyncio.AsyncIOMotorCollection, start_id: int, end_id: int, status: str = Status.TODO):
    """
    Populate the ``fileids_queue`` collection with one document per id.

    The id range is half-open: ``start_id=1, end_id=5`` creates ids
    1, 2, 3 and 4. Each inserted document has the shape
    ``{"id": int, "status": str}``.

    Args:
        queue_col: Target collection; its ``name`` must be ``"fileids_queue"``.
        start_id: First id to insert (must be > 0).
        end_id: One past the last id to insert (must be >= ``start_id``).
        status: Initial status stored on every document; must be one of
            the values defined on ``Status``.

    Raises:
        AssertionError: If any of the preconditions above is violated.
    """
    assert queue_col.name == "fileids_queue"
    assert status in Status.__dict__.values()
    assert start_id > 0
    assert start_id <= end_id
    if start_id == end_id:
        # Empty range: nothing to insert.
        print(f"start_id == end_id: {start_id}")
        return

    docs = []
    for i in range(start_id, end_id):
        docs.append({
            "id": i,
            "status": status,
        })
        # Flush in fixed-size batches so a huge id range is never held
        # in memory (or sent to the server) all at once.
        if len(docs) == 100000:
            s_time = time.time()
            await queue_col.insert_many(docs, ordered=False)
            e_time = time.time()
            docs = []
            print(f"inserted c_queue={i} | {e_time - s_time}", end="\r")

    if docs:
        # Final partial batch. Use an unordered insert here as well so the
        # tail behaves the same as every full batch above when a write error
        # occurs (previously this one call fell back to the ordered default).
        await queue_col.insert_many(docs, ordered=False)
    print(f"inserted c_queue={end_id}", end="\r")


async def claim_task(queue: motor.motor_asyncio.AsyncIOMotorCollection,
status_from: str = Status.TODO,
Expand Down
33 changes: 0 additions & 33 deletions ChinaXivXiv/util.py

This file was deleted.

31 changes: 14 additions & 17 deletions ChinaXivXiv/workers/IA_uploader.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,17 @@
import asyncio
from datetime import datetime
import os
from pprint import pprint
import random
import time
import io
from dataclasses import dataclass
import traceback
from typing import Dict, List, Optional
from typing import Dict, Optional
import httpx

import internetarchive

import motor.motor_asyncio
from ChinaXivXiv.defines import ChinaXivGlobalMetadata, ChinaXivHtmlMetadata, Status, Task
from ChinaXivXiv.mongo_ops import claim_task, create_fileids_queue_index, find_max_id, init_queue, update_task
from ChinaXivXiv.util import Args
from ChinaXivXiv.workers.metadata_scraper import get_chinaxivhtmlmetadata_from_html, get_core_html, parse_keywords, parse_subjects
from ChinaXivXiv.defines import ChinaXivGlobalMetadata, ChinaXivHtmlMetadata, Status
from ChinaXivXiv.mongo_ops import claim_task, update_task
from ChinaXivXiv.workers.metadata_scraper import get_chinaxivhtmlmetadata_from_html, get_core_html

NOTES = """\
- 元数据由脚本提取,仅供参考,以 ChinaXiv.org 官网为准。(如元数据识别有误/需要更新,请留言)
Expand All @@ -25,7 +20,7 @@
- “版本历史”的“下载全文”按钮链接到的是 ChinaXiv.org 的原始链接,未来可能会失效。
"""

async def IA_upload_worker(client: httpx.AsyncClient, collection: motor.motor_asyncio.AsyncIOMotorCollection, args: Args):
async def IA_upload_worker(client: httpx.AsyncClient, collection: motor.motor_asyncio.AsyncIOMotorCollection):
while not os.path.exists("stop"):
# 1. claim a task
TASK = await claim_task(collection, status_from=Status.TODO, status_to=Status.UPLOADTOIA_PROCESSING)
Expand Down Expand Up @@ -78,7 +73,7 @@ async def IA_upload_worker(client: httpx.AsyncClient, collection: motor.motor_as
core_html = get_core_html(html=r_html.content, url=chinaxiv_permanent_with_version_url)


ia_identifier = await async_upload(client, html_metadata, core_html)
ia_identifier = await async_upload(client, metadata_from_browse_db, html_metadata, core_html)
print(f"uploaded to IA: {ia_identifier}")

await update_task(collection, TASK, status=Status.UPLOADTOIA_DONE)
Expand Down Expand Up @@ -116,7 +111,7 @@ def load_ia_keys():
"""


async def async_upload(client: httpx.AsyncClient, html_metadata: ChinaXivHtmlMetadata, core_html: str):
async def async_upload(client: httpx.AsyncClient, metadata_from_browse_db: Dict, html_metadata: ChinaXivHtmlMetadata, core_html: str):
assert html_metadata, "metadata is None"
assert f'{html_metadata.csoaid}v{html_metadata.version}.pdf'

Expand All @@ -133,16 +128,16 @@ async def async_upload(client: httpx.AsyncClient, html_metadata: ChinaXivHtmlMet
f'urn:cstr:32003.36.ChinaXiv.{html_metadata.csoaid}.V{html_metadata.version}',
]

YYYY = html_metadata.pubyear
YYYY = html_metadata.pubyear if html_metadata.pubyear else int(metadata_from_browse_db["year"])
assert 1900 <= YYYY <= 2100
MM = html_metadata.csoaid[4:6]
assert len(MM) == 2 and (1 <= int(MM) <= 12)

metadata = {
"title": html_metadata.title,
"creator": html_metadata.authors, # List[str]
"title": html_metadata.title if html_metadata.title else metadata_from_browse_db["title"],
"creator": html_metadata.authors if html_metadata.authors else metadata_from_browse_db["authors"],
"date": f'{YYYY}-{MM}',
"subject": ["ChinaXiv", html_metadata.journal] + html_metadata.subjects + html_metadata.keywords, # TODO: may overflow 255 chars
"subject": ["ChinaXiv"] + html_metadata.subjects + html_metadata.keywords, # TODO: may overflow 255 chars
"description": core_html,
"source": chinaxiv_permanent_with_version_url,
"external-identifier": external_identifier,
Expand All @@ -156,12 +151,14 @@ async def async_upload(client: httpx.AsyncClient, html_metadata: ChinaXivHtmlMet
# ^^^^ IA native metadata ^^^^

# vvvv custom metadata field vvvv
"journal": html_metadata.journal,

"chinaxiv": html_metadata.csoaid, # == chinaxiv_csoaid, so we can just search "chinaxiv:yyyymm.nnnnnn" to get the item on IA
"chinaxiv_id": html_metadata.chinaxiv_id, # each version has a unique id, even if they have the same csoaid
"chinaxiv_copyQuotation": html_metadata.copyQuotation, # 推荐引用格式 | suggested citation format
}
if html_metadata.journal:
metadata['subject'].append(html_metadata.journal)
metadata['journal'] = html_metadata.journal
identifier = f"ChinaXiv-{html_metadata.csoaid}V{html_metadata.version}"
# identifier = f"TEST-ChinaXiv-{metadata.csoaid}V{metadata.version}"
core_html_filename = f"{html_metadata.csoaid}v{html_metadata.version}-abs.html"
Expand Down
66 changes: 0 additions & 66 deletions ChinaXivXiv/workers/file_downloader.py

This file was deleted.

57 changes: 0 additions & 57 deletions ChinaXivXiv/workers/fileid_finder.py

This file was deleted.

Loading

0 comments on commit c6f0eec

Please sign in to comment.