diff --git a/requirements.txt b/requirements.txt
index 663bd1f..1d8662f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-requests
\ No newline at end of file
+requests
+pypdf
\ No newline at end of file
diff --git a/stages/02_download.sh b/stages/02_download.sh
index 21d86da..e4f939b 100755
--- a/stages/02_download.sh
+++ b/stages/02_download.sh
@@ -14,5 +14,5 @@ for file in "$rawpath"/*.csv; do
     xargs -P 14 -I {} curl -G {} -d "binary_target=binary" \
       -d "ultra_premium=false" \
       -d "api_key=$SCRAPERAPI_KEY" -O -J -L --output-dir "$pdfpath" -w "%{json}\n" |
-    jq '{filename_effective, url}' >> "log.json"
+    jq '{filename_effective, url, response_code}' >> "log.json"
 done
\ No newline at end of file
diff --git a/stages/03_check_pdfs.py b/stages/03_check_pdfs.py
new file mode 100644
index 0000000..5c31fd7
--- /dev/null
+++ b/stages/03_check_pdfs.py
@@ -0,0 +1,31 @@
+import json
+from pypdf import PdfReader
+from pathlib import Path
+
+def is_readable_pdf(file_path):
+    """Return True if the PDF opens and its first page yields any text."""
+    try:
+        with open(file_path, 'rb') as file:
+            reader = PdfReader(file)
+            if len(reader.pages) == 0:
+                return False
+            # Try to extract text from the first page
+            text = reader.pages[0].extract_text()
+            return len(text.strip()) > 0
+    except Exception:
+        # Truncated or corrupt downloads raise inside pypdf
+        return False
+
+if __name__ == '__main__':
+    good_pdfs = []
+    bad_pdfs = []
+    pdfs = Path("pdfs")
+    for file in pdfs.iterdir():
+        if is_readable_pdf(file):
+            good_pdfs.append(file.name)
+        else:
+            bad_pdfs.append(file.name)
+    print("PDFs captured:", len(good_pdfs))
+    print(f"Issues encountered with {len(bad_pdfs)} files. Retrying...")
+    # find a way to replace this with the actual URL...
+    retry_urls = [{f: "url"} for f in bad_pdfs]
\ No newline at end of file
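
A possible way to close the TODO at the end of stages/03_check_pdfs.py: stages/02_download.sh already records a {filename_effective, url, response_code} object in log.json for every download, so the source URL of each bad file can be looked up there. One caveat: jq pretty-prints by default, so log.json is a stream of concatenated multi-line JSON objects, not JSON lines. A minimal sketch under those assumptions (load_url_map is a hypothetical helper; it assumes log.json sits in the working directory and that filename_effective matches the names in pdfs/):

import json
from pathlib import Path

def load_url_map(log_path="log.json"):
    # log.json is jq's default pretty-printed output: a stream of
    # concatenated JSON objects, so decode one object at a time.
    decoder = json.JSONDecoder()
    text = Path(log_path).read_text()
    url_map = {}
    pos = 0
    while pos < len(text):
        while pos < len(text) and text[pos].isspace():
            pos += 1  # skip whitespace between objects
        if pos >= len(text):
            break
        entry, pos = decoder.raw_decode(text, pos)
        # filename_effective may carry the --output-dir prefix; key on the bare name
        url_map[Path(entry["filename_effective"]).name] = entry["url"]
    return url_map

With that in place, the placeholder could become retry_urls = [{f: load_url_map().get(f)} for f in bad_pdfs]. Alternatively, adding -c to the jq call in 02_download.sh would emit one object per line and reduce the reader to a per-line json.loads.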
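
The "Retrying..." message also has nothing behind it yet. A hedged sketch of what a retry pass might look like, mirroring the query parameters the curl invocation in stages/02_download.sh sends via -G/-d; retry_downloads is hypothetical, and it assumes SCRAPERAPI_KEY is exported in the environment as in the shell stage and that each retry_urls entry is a {filename: url} dict as built above:

import os
import requests
from pathlib import Path

def retry_downloads(retry_urls, out_dir="pdfs"):
    # Same query parameters the curl call passes with -G/-d
    params = {
        "binary_target": "binary",
        "ultra_premium": "false",
        "api_key": os.environ["SCRAPERAPI_KEY"],
    }
    for item in retry_urls:
        for filename, url in item.items():
            resp = requests.get(url, params=params, timeout=120)
            if resp.ok:
                # Reuse the original filename rather than trusting new headers
                Path(out_dir, filename).write_bytes(resp.content)
            else:
                print(f"Retry failed for {filename}: HTTP {resp.status_code}")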