Commit f9d25e6

MAINT: Use image_similarity function (#2073)
Rename get_pdf_from_url to get_data_from_url

MartinThoma authored Aug 9, 2023
1 parent 82e8681 commit f9d25e6

Showing 12 changed files with 204 additions and 257 deletions.
6 changes: 3 additions & 3 deletions tests/__init__.py
@@ -7,9 +7,9 @@
 from pypdf.generic import DictionaryObject, IndirectObject


-def get_pdf_from_url(url: str, name: str) -> bytes:
+def get_data_from_url(url: str, name: str) -> bytes:
     """
-    Download a PDF from a URL and return its contents.
+    Download a File from a URL and return its contents.

     This function makes sure the PDF is not downloaded too often.
     This function is a last resort for PDF files where we are uncertain if
@@ -20,7 +20,7 @@ def get_pdf_from_url(url: str, name: str) -> bytes:
         name: unique name across all files

     Returns:
-        Read PDF as bytes
+        Read File as bytes
     """
     if url.startswith("file://"):
         with open(url[7:].replace("\\", "/"), "rb") as fp:
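
The hunk above shows only the helper's signature, docstring, and its file:// branch. As a rough illustration of the caching behavior the docstring describes, here is a minimal sketch of such a helper; the cache directory name, the urllib-based download, and the exact control flow are assumptions for illustration, not pypdf's actual implementation.

# Sketch only: cache location and download logic are assumed, not copied
# from pypdf's tests/__init__.py.
from pathlib import Path
from urllib.request import urlopen

CACHE_DIR = Path(__file__).parent / "pdf_cache"  # assumed cache directory


def get_data_from_url(url: str, name: str) -> bytes:
    """Download a file once, then serve later calls from the local cache."""
    if url.startswith("file://"):
        # Local files are read directly, mirroring the branch shown in the diff.
        with open(url[7:].replace("\\", "/"), "rb") as fp:
            return fp.read()
    CACHE_DIR.mkdir(exist_ok=True)
    cache_path = CACHE_DIR / name  # `name` must be unique across all files
    if not cache_path.exists():
        with urlopen(url) as response:  # hit the network only on a cache miss
            cache_path.write_bytes(response.read())
    return cache_path.read_bytes()

Because caching is keyed on `name`, repeated test runs reuse the downloaded fixture instead of fetching it again; the test files below only need the import and call-site rename shown in their hunks.
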
20 changes: 10 additions & 10 deletions tests/test_cmap.py
@@ -7,7 +7,7 @@
 from pypdf._cmap import build_char_map
 from pypdf.errors import PdfReadWarning

-from . import get_pdf_from_url
+from . import get_data_from_url


 @pytest.mark.enable_socket()
@@ -41,7 +41,7 @@
     ],
 )
 def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict)
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict)
     for page in reader.pages:
         page.extract_text()
     assert caplog.text == ""
@@ -81,7 +81,7 @@ def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
 )
 def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
     """Text extraction runs without exceptions or warnings"""
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict)
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict)
     for page in reader.pages:
         page.extract_text()
     assert caplog.text == ""
@@ -92,7 +92,7 @@ def test_parse_encoding_advanced_encoding_not_implemented():
     url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf"
     name = "tika-957144.pdf"

-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"):
         for page in reader.pages:
             page.extract_text()
@@ -103,7 +103,7 @@ def test_ascii_charset():
     # iss #1312
     url = "https://github.com/py-pdf/pypdf/files/9472500/main.pdf"
     name = "ascii charset.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert "/a" not in reader.pages[0].extract_text()


@@ -129,15 +129,15 @@ def test_ascii_charset():
 def test_text_extraction_of_specific_pages(
     url: str, name: str, page_nb: int, within_text
 ):
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     assert within_text in reader.pages[page_nb].extract_text()


 @pytest.mark.enable_socket()
 def test_iss1533():
     url = "https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf"
     name = "iss1533.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     reader.pages[0].extract_text()  # no error
     assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü"

@@ -163,7 +163,7 @@ def test_iss1533():
     ],
 )
 def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text):
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     extracted = reader.pages[page_index].extract_text()  # no error
     for contained in within_text:
         assert contained in extracted
@@ -174,7 +174,7 @@ def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text)
 def test_latex():
     url = "https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf"
     name = "math_latex.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     txt = reader.pages[0].extract_text()  # no error
     for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"):
         assert pat in txt
@@ -185,7 +185,7 @@ def test_latex():
 def test_unixxx_glyphs():
     url = "https://arxiv.org/pdf/2201.00021.pdf"
     name = "unixxx_glyphs.pdf"
-    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     txt = reader.pages[0].extract_text()  # no error
     for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"):
         assert pat in txt