
Commit

modified to handle JSONError and other errors
Pawween committed Oct 21, 2024
1 parent 9f0a87f commit b9db7dd
Showing 1 changed file with 48 additions and 31 deletions.
79 changes: 48 additions & 31 deletions stages/python/02_download.py
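
For orientation, here is a minimal standalone sketch of the JSON guard this commit introduces (not the committed code: the helper name fetch_crossref_message and the direct Crossref call without ScraperAPI are assumptions for illustration). The idea is to call response.json() only when the Content-Type header says the body is JSON, so an HTML error page yields None instead of raising a JSONDecodeError.

import requests

def fetch_crossref_message(doi):
    # Hypothetical helper, not part of the commit; the real script routes
    # the request through ScraperAPI with an API key.
    response = requests.get(f"https://api.crossref.org/works/{doi}", timeout=30)
    if response.status_code != 200:
        return None
    # Only parse bodies that the server labels as JSON.
    if 'application/json' not in response.headers.get('Content-Type', ''):
        return None
    return response.json().get('message')
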
@@ -8,7 +8,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from tqdm import tqdm
 
-scraperapi_key = "Enter api key"
+scraperapi_key = "Api Key"
 
 metadata_columns = [
     'id', 'doi', 'url', 'type', 'type_crossref', 'publication_date', 'journal', 'publisher',
@@ -48,35 +48,50 @@ def download_pdf(url, output_dir, downloaded_hashes):
     if content_hash in downloaded_hashes:
         return None, None
 
+    if len(content) < 1024:
+        return None, None
+
     filename = f"{content_hash}.pdf"
     outfile_path = output_dir / filename
 
     with outfile_path.open('wb') as file:
         file.write(content)
 
-    if content.startswith(b'%PDF-'):
-        with outfile_path.open('rb') as file:
-            reader = pypdf.PdfReader(file)
-            if len(reader.pages) > 0:
-                return outfile_path, content_hash
-
-    outfile_path.unlink()
+    if content.startswith(b'%PDF-'):
+        try:
+            with outfile_path.open('rb') as file:
+                reader = pypdf.PdfReader(file)
+                if len(reader.pages) > 1:
+                    return outfile_path, content_hash
+        except pypdf.errors.PdfReadError:
+            outfile_path.unlink()
+            return None, None
+        except Exception as e:
+            if outfile_path.exists():
+                outfile_path.unlink()
+            return None, None
+    if outfile_path.exists():
+        outfile_path.unlink()
 
     return None, None
 
 # retrieve publisher and journal from crossref
+# handling JSONDecodeError
 def extract_metadata(doi):
     base_url = "https://api.crossref.org/works/"
     response = requests.get(f"http://api.scraperapi.com",params={"api_key":scraperapi_key,"url":f"{base_url}{doi}"})
     if response.status_code == 200:
-        data = response.json()['message']
-        journal = data.get('container-title', [])
-        journal = journal[0] if journal else None
-        publisher = data.get('publisher', [])
-        publisher = publisher if publisher else None
-        return {'journal': journal, 'publisher': publisher}
+        if 'application/json' in response.headers.get('Content-Type', ''):
+            data = response.json()['message']
+            journal = data.get('container-title', [])
+            journal = journal[0] if journal else None
+            publisher = data.get('publisher', [])
+            publisher = publisher if publisher else None
+            return {'journal': journal, 'publisher': publisher}
+        else:
+            return None
     else:
-        return {'journal': None, 'publisher': None}
+        return None
 
 ##### Execution #####
 
@@ -90,7 +105,7 @@ def extract_metadata(doi):
 
 # process each file
 for file in input_dir.glob('*.parquet'):
-    df = pd.read_parquet(file)[:100]
+    df = pd.read_parquet(file)[:7500]
 
     # Get the latest row where content_hash is assigned
     existing_metadata = load_existing_metadata(output_dir, file.stem)
@@ -116,7 +131,8 @@ def extract_metadata(doi):
                 doi_url[1],
                 doi_url[0],
                 *download_pdf(doi_url[1], output_dir, downloaded_hashes),
-                *extract_metadata(doi_url[0]).values()
+                #*(extract_metadata(doi_url[0]).values() if extract_metadata(doi_url[0]) is not None else [None, None])
+                *(extract_metadata(doi_url[0]) or [None, None])
             ),
             zip(dois, urls)
         ), total=len(dois), desc='Downloading PDFs'))
@@ -125,8 +141,8 @@ def extract_metadata(doi):
         if result and result[2] not in downloaded_hashes:
             if result[2] is not None and result[3] is not None:
                 results_list.append({
-                    'doi': result[1], #DOI
-                    'url': result[0], #URL
+                    'doi': result[1], # DOI
+                    'url': result[0], # URL
                     'content_hash': str(result[3]), # Hash
                     'file_path': str(result[2]), # Path
                     'journal': str(result[4]), # Journal
@@ -138,26 +154,27 @@
     results_df = pd.DataFrame(results_list)
 
     # merge hash, pdf path, journal and publisher to original data
-    # original columns
-    original_columns = [
+    # handle issues when no new data is generated
+    if not results_df.empty:
+        original_columns = [
     'id', 'doi','type', 'type_crossref', 'publication_date', 'title',
     'is_oa', 'authors', 'areas', 'themes', 'keywd', 'volume', 'issue', 'language'
-    ]
+        ]
 
-    original_df = df[original_columns]
+        original_df = df[original_columns]
 
-    metadata_df = pd.merge(results_df, original_df, on='doi', how='left')
+        metadata_df = pd.merge(results_df, original_df, on='doi', how='left')
 
-    # remove rows with no downloaded pdfs
-    metadata_df = metadata_df.dropna(subset=['content_hash', 'file_path'])
+        # remove rows with no downloaded pdfs
+        metadata_df = metadata_df.dropna(subset=['content_hash', 'file_path'])
 
-    metadata_df = metadata_df[metadata_columns]
+        metadata_df = metadata_df[metadata_columns]
 
-    # save new metadata
-    if not metadata_df.empty:
-        output_file = output_dir / f"{file_stem}_pdfs.parquet"
-        save_metadata(metadata_df, output_dir, file_stem)
+        # save new metadata
+        if not metadata_df.empty:
+            output_file = output_dir / f"{file_stem}_pdfs.parquet"
+            save_metadata(metadata_df, output_dir, file_stem)



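As a companion sketch, the PDF check that the commit wraps in try/except can be read as a small predicate (the helper name is_usable_pdf is hypothetical; the committed code is stricter, requiring more than one page, and also deletes the file when validation fails or an exception is raised):

from pathlib import Path

import pypdf
from pypdf.errors import PdfReadError

def is_usable_pdf(path: Path) -> bool:
    # Hypothetical helper: a download only counts if the bytes carry a
    # %PDF- header and pypdf can open them and report at least one page.
    content = path.read_bytes()
    if not content.startswith(b'%PDF-'):
        return False
    try:
        reader = pypdf.PdfReader(str(path))
        return len(reader.pages) > 0
    except PdfReadError:
        return False

The same defensive idea shows up on the metadata side: because extract_metadata now returns None on non-JSON responses, the call site unpacks *(extract_metadata(doi_url[0]) or [None, None]) so the result tuple still receives two placeholders for journal and publisher.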