Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix pdf downloads blocked on some urls #71

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,7 @@ cython_debug/
*.pkl

# Ref data file
data_step2_before-currency-units.csv
data_step2_before-currency-units.csv

# PDF downloads
pdf_downloads/
61 changes: 50 additions & 11 deletions collecte/pdf_downloader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
import os
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv

load_dotenv()

keywords = "tax country by country reporting GRI 207-4"
api_key = ""
cx = ""
api_key = os.getenv("GOOGLE_API_KEY")
cx = os.getenv("GOOGLE_CX")
# Browser-like request headers. `_download_pdf` falls back to these when a
# request sent without custom headers times out.
# NOTE(review): the User-Agent ends in "Safari/537.3" while the AppleWebKit
# token is "537.36" — possibly a truncated copy; confirm before changing.
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
),
}


def cbcr_finder(
Expand Down Expand Up @@ -37,6 +48,42 @@ def cbcr_finder(
return pdf_urls


def _download_pdf(
    url: str,
    local_filename: Path,
    headers: dict[str, str] | None = None,
) -> None:
    """
    Download a PDF from ``url`` and save it to ``local_filename``.

    The response is streamed to disk in 8 KiB chunks. If the request made
    without custom headers times out, the download is retried once using the
    module-level ``HEADERS`` (browser-like headers). All ``requests`` errors
    are reported on stdout rather than raised.

    If the download fails after the destination file has been opened, the
    partially written file is removed, so that a truncated PDF is never
    mistaken for a completed download by a later "already exists" check.

    Parameters:
        url (str): The URL from which to download the PDF file.
        local_filename (Path): The local path (including filename) where the
            PDF will be saved.
        headers (dict, optional): HTTP headers to send with the request.
            If None, the request is first sent without custom headers and, on
            timeout, retried with the default browser-like headers.

    Returns:
        None. Success or failure is only reported via ``print``.

    NOTE(review): only timeouts trigger the retry with headers; an HTTP error
    status (e.g. 403) raised by ``raise_for_status`` is reported but not
    retried — confirm this is intended.
    """
    # Tracks whether *this* call created/truncated the destination file, so
    # that cleanup never deletes a file this call did not touch.
    file_opened = False
    try:
        with requests.get(url, stream=True, timeout=10, headers=headers) as r:
            r.raise_for_status()
            file_opened = True
            # Called as Path.open(path, mode) so the call shape stays the same
            # for code that patches ``Path.open``.
            with Path.open(local_filename, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Downloaded: {local_filename}")
    except requests.exceptions.Timeout:
        if file_opened:
            # Timed out mid-stream: drop the truncated file.
            local_filename.unlink(missing_ok=True)
        if headers is None:  # Headers were not used in the initial request
            print("Initial request timed out. Retrying with headers...")
            _download_pdf(url, local_filename, HEADERS)
            return
        print("The request timed out with headers.")
    except requests.exceptions.RequestException as e:
        if file_opened:
            # Connection dropped mid-stream: drop the truncated file.
            local_filename.unlink(missing_ok=True)
        print(f"Failed to download {url}: {e!s}")


def download_pdf(url: str, download_folder: str, company_name: str) -> str:
# Create a sanitized version of the company name for the directory
company_folder = Path(download_folder) / "".join(
Expand All @@ -48,15 +95,7 @@ def download_pdf(url: str, download_folder: str, company_name: str) -> str:
local_filename = Path(company_folder) / url.split("/")[-1]

if not Path.exists(local_filename):
try:
with requests.get(url, stream=True) as r:
r.raise_for_status()
with Path.open(local_filename, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
print(f"Downloaded: {local_filename}")
except requests.RequestException as e:
print(f"Failed to download {url}: {e!s}")
_download_pdf(url, local_filename)
else:
print(f"Already exists: {local_filename}")
return str(local_filename)
Expand Down
7 changes: 6 additions & 1 deletion collecte/test/test_pdf_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,12 @@ def test_download_pdf(
assert result == expected_filename
mock_file.assert_called_once_with(PosixPath(expected_filename), "wb")
mock_exists.assert_any_call(PosixPath(expected_filename))
mock_get.assert_called_once_with(self.pdf_url, stream=True)
mock_get.assert_called_once_with(
self.pdf_url,
stream=True,
timeout=10,
headers=None,
)


if __name__ == "__main__":
Expand Down
Loading