working on some validation logic to catch bad PDFs
bhlieberman committed Aug 2, 2024
1 parent 33ca9db commit f7e887f
Showing 3 changed files with 34 additions and 2 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
-requests
+requests
+pypdf
2 changes: 1 addition & 1 deletion stages/02_download.sh
Expand Up @@ -14,5 +14,5 @@ for file in "$rawpath"/*.csv; do
xargs -P 14 -I {} curl -G {} -d "binary_target=binary" \
-d "ultra_premium=false" \
-d "api_key=$SCRAPERAPI_KEY" -O -J -L --output-dir "$pdfpath" -w "%{json}\n" |
-jq '{filename_effective, url}' >> "log.json"
+jq '{filename_effective, url, response_code}' >> "log.json"
done
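
A note on the log format: curl's -w "%{json}\n" write-out emits a JSON object of transfer metadata after every request, and jq trims it down to the listed fields before appending to log.json; logging response_code as well makes failed downloads identifiable afterwards. Because jq pretty-prints by default, log.json ends up as a stream of concatenated multi-line JSON objects rather than JSON Lines, so reading it back takes a streaming decode. A minimal sketch (the path and field names come from the script above; everything else is an assumption):

import json
from pathlib import Path

def read_log(log_path="log.json"):
    """Yield each JSON object from jq's concatenated, pretty-printed output."""
    decoder = json.JSONDecoder()
    text = Path(log_path).read_text()
    pos = 0
    while pos < len(text):
        # skip the whitespace separating one object from the next
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        obj, pos = decoder.raw_decode(text, pos)
        yield obj

if __name__ == "__main__":
    # downloads whose HTTP status was not 200 are candidates for a retry
    failed = [rec["url"] for rec in read_log() if rec.get("response_code") != 200]
    print(f"{len(failed)} downloads returned a non-200 status")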
31 changes: 31 additions & 0 deletions stages/03_check_pdfs.py
@@ -0,0 +1,31 @@
import json
import requests
from pypdf import PdfReader
from pathlib import Path

def is_readable_pdf(file_path):
    try:
        with open(file_path, 'rb') as file:
            reader = PdfReader(file)
            if len(reader.pages) > 0:
                # Try to extract text from the first page
                text = reader.pages[0].extract_text()
                return len(text.strip()) > 0
    except Exception:
        return False
    # A PDF with no pages at all is not usable either
    return False

good_pdfs = []
bad_pdfs = []

if __name__ == '__main__':
    pdfs = Path("pdfs")
    for file in pdfs.iterdir():
        if is_readable_pdf(file):
            good_pdfs.append(file.name)
        else:
            bad_pdfs.append(file.name)
    print("PDFs captured:", len(good_pdfs))
    print(f"Issues encountered with {len(bad_pdfs)} files. Retrying...")
    # find a way to replace this with the actual URL...
    retry_urls = [{f: "url"} for f in bad_pdfs]
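
One way to resolve the TODO above: 02_download.sh now logs filename_effective, url, and response_code for every request, so the bad filenames can be joined back against log.json to recover the URLs to retry. A sketch reusing read_log() from the note after the 02_download.sh diff, and assuming filename_effective carries the same basename as the files in pdfs/ (not something this commit confirms):

from pathlib import Path

# Map each downloaded file's basename to the URL it was fetched from.
# read_log() is the log.json reader sketched earlier.
url_by_name = {
    Path(rec["filename_effective"]).name: rec["url"]
    for rec in read_log()
    if rec.get("filename_effective")
}
retry_urls = [{f: url_by_name[f]} for f in bad_pdfs if f in url_by_name]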
