
Commit

modified to handle JSONError and other errors
Pawween committed Oct 21, 2024
1 parent 9f0a87f commit b9db7dd
Showing 1 changed file with 48 additions and 31 deletions.
79 changes: 48 additions & 31 deletions stages/python/02_download.py
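
For orientation, here is a minimal standalone sketch of the JSON guard this commit introduces (not the committed code: the helper name fetch_crossref_message and the direct Crossref call without ScraperAPI are assumptions for illustration). The idea is to call response.json() only when the Content-Type header says the body is JSON, so an HTML error page yields None instead of raising a JSONDecodeError.

import requests

def fetch_crossref_message(doi):
    # Hypothetical helper, not part of the commit; the real script routes
    # the request through ScraperAPI with an API key.
    response = requests.get(f"https://api.crossref.org/works/{doi}", timeout=30)
    if response.status_code != 200:
        return None
    # Only parse bodies that the server labels as JSON.
    if 'application/json' not in response.headers.get('Content-Type', ''):
        return None
    return response.json().get('message')
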
@@ -8,7 +8,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-scraperapi_key = "Enter api key"
+scraperapi_key = "Api Key"
 
 metadata_columns = [
     'id', 'doi', 'url', 'type', 'type_crossref', 'publication_date', 'journal', 'publisher',
@@ -48,35 +48,50 @@ def download_pdf(url, output_dir, downloaded_hashes):
     if content_hash in downloaded_hashes:
         return None, None
 
+    if len(content) < 1024:
+        return None, None
+
     filename = f"{content_hash}.pdf"
     outfile_path = output_dir / filename
 
     with outfile_path.open('wb') as file:
         file.write(content)
 
-    if content.startswith(b'%PDF-'):
-        with outfile_path.open('rb') as file:
-            reader = pypdf.PdfReader(file)
-            if len(reader.pages) > 0:
-                return outfile_path, content_hash
-
-    outfile_path.unlink()
+    if content.startswith(b'%PDF-'):
+        try:
+            with outfile_path.open('rb') as file:
+                reader = pypdf.PdfReader(file)
+                if len(reader.pages) > 1:
+                    return outfile_path, content_hash
+        except pypdf.errors.PdfReadError:
+            outfile_path.unlink()
+            return None, None
+        except Exception as e:
+            if outfile_path.exists():
+                outfile_path.unlink()
+            return None, None
+    if outfile_path.exists():
+        outfile_path.unlink()
 
     return None, None
 
 # retrieve publisher and journal from crossref
+# handling JSONDecodeError
 def extract_metadata(doi):
     base_url = "https://api.crossref.org/works/"
     response = requests.get(f"http://api.scraperapi.com",params={"api_key":scraperapi_key,"url":f"{base_url}{doi}"})
     if response.status_code == 200:
-        data = response.json()['message']
-        journal = data.get('container-title', [])
-        journal = journal[0] if journal else None
-        publisher = data.get('publisher', [])
-        publisher = publisher if publisher else None
-        return {'journal': journal, 'publisher': publisher}
+        if 'application/json' in response.headers.get('Content-Type', ''):
+            data = response.json()['message']
+            journal = data.get('container-title', [])
+            journal = journal[0] if journal else None
+            publisher = data.get('publisher', [])
+            publisher = publisher if publisher else None
+            return {'journal': journal, 'publisher': publisher}
+        else:
+            return None
     else:
-        return {'journal': None, 'publisher': None}
+        return None
 
 ##### Execution #####
 
@@ -90,7 +105,7 @@ def extract_metadata(doi):
 
 # process each file
 for file in input_dir.glob('*.parquet'):
-    df = pd.read_parquet(file)[:100]
+    df = pd.read_parquet(file)[:7500]
 
     # Get the latest row where content_hash is assigned
     existing_metadata = load_existing_metadata(output_dir, file.stem)
@@ -116,7 +131,8 @@ def extract_metadata(doi):
                 doi_url[1],
                 doi_url[0],
                 *download_pdf(doi_url[1], output_dir, downloaded_hashes),
-                *extract_metadata(doi_url[0]).values()
+                #*(extract_metadata(doi_url[0]).values() if extract_metadata(doi_url[0]) is not None else [None, None])
+                *(extract_metadata(doi_url[0]) or [None, None])
             ),
             zip(dois, urls)
         ), total=len(dois), desc='Downloading PDFs'))
@@ -125,8 +141,8 @@ def extract_metadata(doi):
         if result and result[2] not in downloaded_hashes:
             if result[2] is not None and result[3] is not None:
                 results_list.append({
-                    'doi': result[1], #DOI
-                    'url': result[0], #URL
+                    'doi': result[1], # DOI
+                    'url': result[0], # URL
                     'content_hash': str(result[3]), # Hash
                     'file_path': str(result[2]), # Path
                     'journal': str(result[4]), # Journal
@@ -138,26 +154,27 @@
     results_df = pd.DataFrame(results_list)
 
     # merge hash, pdf path, journal and publisher to original data
-    # original columns
-    original_columns = [
+    # handle issues when no new data is generated
+    if not results_df.empty:
+        original_columns = [
     'id', 'doi','type', 'type_crossref', 'publication_date', 'title',
     'is_oa', 'authors', 'areas', 'themes', 'keywd', 'volume', 'issue', 'language'
-    ]
+        ]
 
-    original_df = df[original_columns]
+        original_df = df[original_columns]
 
-    metadata_df = pd.merge(results_df, original_df, on='doi', how='left')
+        metadata_df = pd.merge(results_df, original_df, on='doi', how='left')
 
-    # remove rows with no downloaded pdfs
-    metadata_df = metadata_df.dropna(subset=['content_hash', 'file_path'])
+        # remove rows with no downloaded pdfs
+        metadata_df = metadata_df.dropna(subset=['content_hash', 'file_path'])
 
-    metadata_df = metadata_df[metadata_columns]
+        metadata_df = metadata_df[metadata_columns]
 
-    # save new metadata
-    if not metadata_df.empty:
-        output_file = output_dir / f"{file_stem}_pdfs.parquet"
-        save_metadata(metadata_df, output_dir, file_stem)
+        # save new metadata
+        if not metadata_df.empty:
+            output_file = output_dir / f"{file_stem}_pdfs.parquet"
+            save_metadata(metadata_df, output_dir, file_stem)



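As a companion sketch, the PDF check that the commit wraps in try/except can be read as a small predicate (the helper name is_usable_pdf is hypothetical; the committed code is stricter, requiring more than one page, and also deletes the file when validation fails or an exception is raised):

from pathlib import Path

import pypdf
from pypdf.errors import PdfReadError

def is_usable_pdf(path: Path) -> bool:
    # Hypothetical helper: a download only counts if the bytes carry a
    # %PDF- header and pypdf can open them and report at least one page.
    content = path.read_bytes()
    if not content.startswith(b'%PDF-'):
        return False
    try:
        reader = pypdf.PdfReader(str(path))
        return len(reader.pages) > 0
    except PdfReadError:
        return False

The same defensive idea shows up on the metadata side: because extract_metadata now returns None on non-JSON responses, the call site unpacks *(extract_metadata(doi_url[0]) or [None, None]) so the result tuple still receives two placeholders for journal and publisher.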