dataforgoodfr · Phiphigengen · Apr 21, 2024 · Apr 21, 2024
diff --git a/.gitignore b/.gitignore
@@ -168,4 +168,7 @@ cython_debug/
 *.pkl
 
 # Ref data file
-data_step2_before-currency-units.csv
+data_step2_before-currency-units.csv
+
+# PDF downloads
+pdf_downloads/
diff --git a/collecte/pdf_downloader.py b/collecte/pdf_downloader.py
@@ -1,11 +1,22 @@
+import os
 from pathlib import Path
 
 import pandas as pd
 import requests
+from dotenv import load_dotenv
+
+load_dotenv()
 
 keywords = "tax country by country reporting GRI 207-4"
-api_key = ""
-cx = ""
+api_key = os.getenv("GOOGLE_API_KEY")  # None if the variable is not set
+cx = os.getenv("GOOGLE_CX")  # None if the variable is not set
+# Browser-like headers that _download_pdf falls back to when a header-less request times out.
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+    ),
+}
 
 
 def cbcr_finder(
@@ -37,6 +48,42 @@ def cbcr_finder(
     return pdf_urls
 
 
+def _download_pdf(
+    url: str,
+    local_filename: Path,
+    headers: dict[str, str] | None = None,
+) -> None:
+    """
+    Download a PDF from `url` and save it to `local_filename`.
+
+    The body is streamed to disk in 8 KiB chunks. If a request sent without
+    headers times out, the download is retried once with the browser-like
+    HEADERS. Request failures are printed rather than raised, and any partially
+    written file is removed so a later run does not treat it as already downloaded.
+
+    Parameters:
+        url (str): The URL from which to download the PDF file.
+        local_filename (Path): The local path (including filename) where the PDF will be saved.
+        headers (dict, optional): HTTP headers to send with the request. If None,
+            the first attempt sends none and a timeout triggers the HEADERS retry.
+    """
+    try:
+        with requests.get(url, stream=True, timeout=10, headers=headers) as r:
+            r.raise_for_status()
+            with Path.open(local_filename, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+        print(f"Downloaded: {local_filename}")
+    except requests.exceptions.Timeout:
+        if headers is None:  # Check if headers were not used in the initial request
+            print("Initial request timed out. Retrying with headers...")
+            return _download_pdf(url, local_filename, HEADERS)
+        print("The request timed out with headers.")
+    except requests.exceptions.RequestException as e:
+        local_filename.unlink(missing_ok=True)  # drop any truncated download
+        print(f"Failed to download {url}: {e!s}")
+
+
 def download_pdf(url: str, download_folder: str, company_name: str) -> str:
     # Create a sanitized version of the company name for the directory
     company_folder = Path(download_folder) / "".join(
@@ -48,15 +95,7 @@ def download_pdf(url: str, download_folder: str, company_name: str) -> str:
     local_filename = Path(company_folder) / url.split("/")[-1]
 
     if not Path.exists(local_filename):
-        try:
-            with requests.get(url, stream=True) as r:
-                r.raise_for_status()
-                with Path.open(local_filename, "wb") as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
-            print(f"Downloaded: {local_filename}")
-        except requests.RequestException as e:
-            print(f"Failed to download {url}: {e!s}")
+        _download_pdf(url, local_filename)
     else:
         print(f"Already exists: {local_filename}")
     return str(local_filename)

diff --git a/collecte/test/test_pdf_downloader.py b/collecte/test/test_pdf_downloader.py
@@ -65,7 +65,12 @@ def test_download_pdf(
         assert result == expected_filename
         mock_file.assert_called_once_with(PosixPath(expected_filename), "wb")
         mock_exists.assert_any_call(PosixPath(expected_filename))
-        mock_get.assert_called_once_with(self.pdf_url, stream=True)
+        mock_get.assert_called_once_with(
+            self.pdf_url,
+            stream=True,
+            timeout=10,
+            headers=None,
+        )
 
 
 if __name__ == "__main__":