working on some validation logic to catch bad PDFs
bhlieberman committed Aug 2, 2024
1 parent 33ca9db commit f7e887f
Showing 3 changed files with 34 additions and 2 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-requests
+requests
+pypdf
2 changes: 1 addition & 1 deletion stages/02_download.sh
Expand Up @@ -14,5 +14,5 @@ for file in "$rawpath"/*.csv; do
xargs -P 14 -I {} curl -G {} -d "binary_target=binary" \
-d "ultra_premium=false" \
-d "api_key=$SCRAPERAPI_KEY" -O -J -L --output-dir "$pdfpath" -w "%{json}\n" |
-jq '{filename_effective, url}' >> "log.json"
+jq '{filename_effective, url, response_code}' >> "log.json"
done
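
A note on the log format: curl's -w "%{json}\n" write-out emits a JSON object of transfer metadata after every request, and jq trims it down to the listed fields before appending to log.json; logging response_code as well makes failed downloads identifiable afterwards. Because jq pretty-prints by default, log.json ends up as a stream of concatenated multi-line JSON objects rather than JSON Lines, so reading it back takes a streaming decode. A minimal sketch (the path and field names come from the script above; everything else is an assumption):

import json
from pathlib import Path

def read_log(log_path="log.json"):
    """Yield each JSON object from jq's concatenated, pretty-printed output."""
    decoder = json.JSONDecoder()
    text = Path(log_path).read_text()
    pos = 0
    while pos < len(text):
        # skip the whitespace separating one object from the next
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        obj, pos = decoder.raw_decode(text, pos)
        yield obj

if __name__ == "__main__":
    # downloads whose HTTP status was not 200 are candidates for a retry
    failed = [rec["url"] for rec in read_log() if rec.get("response_code") != 200]
    print(f"{len(failed)} downloads returned a non-200 status")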
31 changes: 31 additions & 0 deletions stages/03_check_pdfs.py
@@ -0,0 +1,31 @@
import json
import requests
from pypdf import PdfReader
from pathlib import Path

def is_readable_pdf(file_path):
    try:
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)
            if len(reader.pages) > 0:
                # Try to extract text from the first page
                text = reader.pages[0].extract_text()
                return len(text.strip()) > 0
    except Exception:
        return False
    # A PDF with no pages at all is not usable either
    return False

good_pdfs = []
bad_pdfs = []

if __name__ == '__main__':
    pdfs = Path("pdfs")
    for file in pdfs.iterdir():
        if is_readable_pdf(file):
            good_pdfs.append(file.name)
        else:
            bad_pdfs.append(file.name)
    print("PDFs captured:", len(good_pdfs))
    print(f"Issues encountered with {len(bad_pdfs)} files. Retrying...")
    # find a way to replace this with the actual URL...
    retry_urls = [{f: "url"} for f in bad_pdfs]
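
One way to resolve the TODO above: 02_download.sh now logs filename_effective, url, and response_code for every request, so the bad filenames can be joined back against log.json to recover the URLs to retry. A sketch reusing read_log() from the note after the 02_download.sh diff, and assuming filename_effective carries the same basename as the files in pdfs/ (not something this commit confirms):

from pathlib import Path

# Map each downloaded file's basename to the URL it was fetched from.
# read_log() is the log.json reader sketched earlier.
url_by_name = {
    Path(rec["filename_effective"]).name: rec["url"]
    for rec in read_log()
    if rec.get("filename_effective")
}
retry_urls = [{f: url_by_name[f]} for f in bad_pdfs if f in url_by_name]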
