diff --git a/requirements.txt b/requirements.txt
index 663bd1f..1d8662f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-requests
\ No newline at end of file
+requests
+pypdf
\ No newline at end of file
diff --git a/stages/02_download.sh b/stages/02_download.sh
index 21d86da..e4f939b 100755
--- a/stages/02_download.sh
+++ b/stages/02_download.sh
@@ -14,5 +14,5 @@ for file in "$rawpath"/*.csv; do
     xargs -P 14 -I {} curl -G {} -d "binary_target=binary" \
       -d "ultra_premium=false" \
       -d "api_key=$SCRAPERAPI_KEY" -O -J -L --output-dir "$pdfpath" -w "%{json}\n" |
-    jq '{filename_effective, url}' >> "log.json"
+    jq '{filename_effective, url, response_code}' >> "log.json"
 done
\ No newline at end of file
diff --git a/stages/03_check_pdfs.py b/stages/03_check_pdfs.py
new file mode 100644
index 0000000..5c31fd7
--- /dev/null
+++ b/stages/03_check_pdfs.py
@@ -0,0 +1,31 @@
+import json
+from pypdf import PdfReader
+from pathlib import Path
+
+def is_readable_pdf(file_path):
+    """Return True if the PDF opens and its first page yields any text."""
+    try:
+        with open(file_path, 'rb') as file:
+            reader = PdfReader(file)
+            if len(reader.pages) == 0:
+                return False
+            # Try to extract text from the first page
+            text = reader.pages[0].extract_text()
+            return len(text.strip()) > 0
+    except Exception:
+        # Truncated or corrupt downloads raise inside pypdf
+        return False
+
+if __name__ == '__main__':
+    good_pdfs = []
+    bad_pdfs = []
+    pdfs = Path("pdfs")
+    for file in pdfs.iterdir():
+        if is_readable_pdf(file):
+            good_pdfs.append(file.name)
+        else:
+            bad_pdfs.append(file.name)
+    print("PDFs captured:", len(good_pdfs))
+    print(f"Issues encountered with {len(bad_pdfs)} files. Retrying...")
+    # find a way to replace this with the actual URL...
+    retry_urls = [{f: "url"} for f in bad_pdfs]
\ No newline at end of file
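
A possible way to close the TODO at the end of stages/03_check_pdfs.py: stages/02_download.sh already records a {filename_effective, url, response_code} object in log.json for every download, so the source URL of each bad file can be looked up there. One caveat: jq pretty-prints by default, so log.json is a stream of concatenated multi-line JSON objects, not JSON lines. A minimal sketch under those assumptions (load_url_map is a hypothetical helper; it assumes log.json sits in the working directory and that filename_effective matches the names in pdfs/):

import json
from pathlib import Path

def load_url_map(log_path="log.json"):
    # log.json is jq's default pretty-printed output: a stream of
    # concatenated JSON objects, so decode one object at a time.
    decoder = json.JSONDecoder()
    text = Path(log_path).read_text()
    url_map = {}
    pos = 0
    while pos < len(text):
        while pos < len(text) and text[pos].isspace():
            pos += 1  # skip whitespace between objects
        if pos >= len(text):
            break
        entry, pos = decoder.raw_decode(text, pos)
        # filename_effective may carry the --output-dir prefix; key on the bare name
        url_map[Path(entry["filename_effective"]).name] = entry["url"]
    return url_map

With that in place, the placeholder could become retry_urls = [{f: load_url_map().get(f)} for f in bad_pdfs]. Alternatively, adding -c to the jq call in 02_download.sh would emit one object per line and reduce the reader to a per-line json.loads.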
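
The "Retrying..." message also has nothing behind it yet. A hedged sketch of what a retry pass might look like, mirroring the query parameters the curl invocation in stages/02_download.sh sends via -G/-d; retry_downloads is hypothetical, and it assumes SCRAPERAPI_KEY is exported in the environment as in the shell stage and that each retry_urls entry is a {filename: url} dict as built above:

import os
import requests
from pathlib import Path

def retry_downloads(retry_urls, out_dir="pdfs"):
    # Same query parameters the curl call passes with -G/-d
    params = {
        "binary_target": "binary",
        "ultra_premium": "false",
        "api_key": os.environ["SCRAPERAPI_KEY"],
    }
    for item in retry_urls:
        for filename, url in item.items():
            resp = requests.get(url, params=params, timeout=120)
            if resp.ok:
                # Reuse the original filename rather than trusting new headers
                Path(out_dir, filename).write_bytes(resp.content)
            else:
                print(f"Retry failed for {filename}: HTTP {resp.status_code}")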