From f9d25e63e46fb16a3dc7d80d89674805b32179c0 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Wed, 9 Aug 2023 13:56:02 +0200 Subject: [PATCH] MAINT: Use image_similarity function (#2073) Rename get_pdf_from_url to get_data_from_url --- tests/__init__.py | 6 +- tests/test_cmap.py | 20 ++--- tests/test_filters.py | 155 +++++++++++++------------------------- tests/test_generic.py | 20 ++--- tests/test_images.py | 4 +- tests/test_merger.py | 48 ++++++------ tests/test_page.py | 28 +++---- tests/test_page_labels.py | 4 +- tests/test_reader.py | 68 ++++++++--------- tests/test_workflows.py | 52 ++++++------- tests/test_writer.py | 42 +++++------ tests/test_xmp.py | 14 ++-- 12 files changed, 204 insertions(+), 257 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 3d79f778a..6b4305bec 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,9 +7,9 @@ from pypdf.generic import DictionaryObject, IndirectObject -def get_pdf_from_url(url: str, name: str) -> bytes: +def get_data_from_url(url: str, name: str) -> bytes: """ - Download a PDF from a URL and return its contents. + Download a File from a URL and return its contents. This function makes sure the PDF is not downloaded too often. This function is a last resort for PDF files where we are uncertain if @@ -20,7 +20,7 @@ def get_pdf_from_url(url: str, name: str) -> bytes: name: unique name across all files Returns: - Read PDF as bytes + Read File as bytes """ if url.startswith("file://"): with open(url[7:].replace("\\", "/"), "rb") as fp: diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 6e7448651..fe769e1c5 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -7,7 +7,7 @@ from pypdf._cmap import build_char_map from pypdf.errors import PdfReadWarning -from . import get_pdf_from_url +from . import get_data_from_url @pytest.mark.enable_socket() @@ -41,7 +41,7 @@ ], ) def test_text_extraction_slow(caplog, url: str, name: str, strict: bool): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict) for page in reader.pages: page.extract_text() assert caplog.text == "" @@ -81,7 +81,7 @@ def test_text_extraction_slow(caplog, url: str, name: str, strict: bool): ) def test_text_extraction_fast(caplog, url: str, name: str, strict: bool): """Text extraction runs without exceptions or warnings""" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)), strict=strict) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name)), strict=strict) for page in reader.pages: page.extract_text() assert caplog.text == "" @@ -92,7 +92,7 @@ def test_parse_encoding_advanced_encoding_not_implemented(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957144.pdf" name = "tika-957144.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"): for page in reader.pages: page.extract_text() @@ -103,7 +103,7 @@ def test_ascii_charset(): # iss #1312 url = "https://github.com/py-pdf/pypdf/files/9472500/main.pdf" name = "ascii charset.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert "/a" not in reader.pages[0].extract_text() @@ -129,7 +129,7 @@ def test_ascii_charset(): def test_text_extraction_of_specific_pages( url: str, name: str, page_nb: int, within_text ): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert within_text in reader.pages[page_nb].extract_text() @@ -137,7 +137,7 @@ def test_text_extraction_of_specific_pages( def test_iss1533(): url = "https://github.com/py-pdf/pypdf/files/10376149/iss1533.pdf" name = "iss1533.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() # no error assert build_char_map("/F", 200, reader.pages[0])[3]["\x01"] == "Ü" @@ -163,7 +163,7 @@ def test_iss1533(): ], ) def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) extracted = reader.pages[page_index].extract_text() # no error for contained in within_text: assert contained in extracted @@ -174,7 +174,7 @@ def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text) def test_latex(): url = "https://github.com/py-pdf/pypdf/files/12163370/math-in-text-created-via-latex.pdf" name = "math_latex.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) txt = reader.pages[0].extract_text() # no error for pat in ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×"): assert pat in txt @@ -185,7 +185,7 @@ def test_latex(): def test_unixxx_glyphs(): url = "https://arxiv.org/pdf/2201.00021.pdf" name = "unixxx_glyphs.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) txt = reader.pages[0].extract_text() # no error for pat in ("闫耀庭", "龚龑", "张江水", "1′′.2"): assert pat in txt diff --git a/tests/test_filters.py b/tests/test_filters.py index 0daf67a26..6e145f549 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -3,12 +3,11 @@ import sys from io import BytesIO from itertools import product as cartesian_product -from math import sqrt from pathlib import Path from unittest.mock import patch import pytest -from PIL import Image, ImageChops +from PIL import Image from pypdf import PdfReader from pypdf.errors import PdfReadError, PdfStreamError @@ -21,7 +20,7 @@ ) from pypdf.generic import ArrayObject, DictionaryObject, NameObject, NumberObject -from . import get_pdf_from_url +from . import get_data_from_url from .test_images import image_similarity filter_inputs = ( @@ -144,7 +143,7 @@ def test_decode_ahx(): """ url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf" name = "NewJersey.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for p in reader.pages: _ = list(p.images.keys()) @@ -233,7 +232,7 @@ def test_ccitt_fax_decode(): def test_decompress_zlib_error(mock_logger_warning): url = "https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf" name = "tika-952445.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: page.extract_text() mock_logger_warning.assert_called_with( @@ -245,7 +244,7 @@ def test_decompress_zlib_error(mock_logger_warning): def test_lzw_decode_neg1(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf" name = "tika-921632.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[47] with pytest.raises(PdfReadError) as exc: page.extract_text() @@ -256,7 +255,7 @@ def test_lzw_decode_neg1(): def test_issue_399(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/976/976970.pdf" name = "tika-976970.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[1].extract_text() @@ -266,7 +265,7 @@ def test_image_without_imagemagic(): sys.modules["PIL"] = None url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf" name = "tika-914102.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=True) for page in reader.pages: @@ -282,7 +281,7 @@ def test_image_without_imagemagic(): def test_issue_1737(): url = "https://github.com/py-pdf/pypdf/files/11068604/tt1.pdf" name = "iss1737.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0]["/Resources"]["/XObject"]["/Im0"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im1"].get_data() reader.pages[0]["/Resources"]["/XObject"]["/Im2"].get_data() @@ -297,7 +296,7 @@ def test_pa_image_extraction(): """ url = "https://github.com/py-pdf/pypdf/files/11250359/test_img.pdf" name = "issue-1801.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page0 = reader.pages[0] images = page0.images @@ -305,7 +304,7 @@ def test_pa_image_extraction(): assert images[0].name == "Im1.png" # Ensure visual appearence - data = get_pdf_from_url( + data = get_data_from_url( "https://user-images.githubusercontent.com/" "1658117/232842886-9d1b0726-3a5b-430d-8464-595d919c266c.png", "issue-1801.png", @@ -318,7 +317,7 @@ def test_1bit_image_extraction(): """Cf issue #1814""" url = "https://github.com/py-pdf/pypdf/files/11336817/grimm10.pdf" name = "grimm10" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for p in reader.pages: p.images @@ -330,9 +329,7 @@ def test_png_transparency_reverse(): reader = PdfReader(pdf_path) url_png = "https://user-images.githubusercontent.com/4083478/236685544-a1940b06-fb42-4bb1-b589-1e4ad429d68e.png" name_png = "labeled-edges-center-image.png" - _refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + _refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) data = reader.pages[0].images[0] _img = Image.open(BytesIO(data.data)) assert ".jp2" in data.name @@ -344,12 +341,10 @@ def test_iss1787(): """Cf issue #1787""" url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" name = "pdf_font_garbled.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/236793172-09340aef-3440-4c8a-af85-a91cdad27d46.png" name_png = "watermark1.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name @@ -367,12 +362,10 @@ def test_tiff_predictor(): """Decode Tiff Predictor 2 Images""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977609.pdf" name = "tika-977609.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/236793166-288b4b59-dee3-49fd-a04e-410aab06199a.png" name_png = "tifimage.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) data = reader.pages[0].images[0] img = Image.open(BytesIO(data.data)) assert ".png" in data.name @@ -384,13 +377,13 @@ def test_rgba(): """Decode rgb with transparency""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" name = "tika-972174.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/238288207-b77dd38c-34b4-4f4f-810a-bf9db7ca0414.png" name_png = "tika-972174_p0-im0.png" data = reader.pages[0].images[0] assert ".jp2" in data.name similarity = image_similarity( - data.image, BytesIO(get_pdf_from_url(url_png, name=name_png)) + data.image, BytesIO(get_data_from_url(url_png, name=name_png)) ) assert similarity > 0.99 @@ -405,37 +398,25 @@ def test_cmyk(): return # the file is encrypted url = "https://github.com/py-pdf/pypdf/files/11962229/DB-5368770_Vitocal_200-G.pdf" name = "Vitocal.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/251283945-38c5b92c-cf94-473c-bb57-a51b74fc39be.jpg" name_png = "VitocalImage.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[1].images[0] assert data.image.mode == "CMYK" assert ".jpg" in data.name - diff = ImageChops.difference(data.image, refimg) - d = sqrt( - sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()]) - ) / (diff.size[0] * diff.size[1]) - assert d < 0.01 + assert image_similarity(data.image, refimg) > 0.99 # deflate url = "https://github.com/py-pdf/pypdf/files/12078533/cmyk2.pdf" name = "cmyk_deflate.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://github.com/py-pdf/pypdf/files/12078556/cmyk.tif.txt" name_png = "cmyk_deflate.tif" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert data.image.mode == "CMYK" assert ".tif" in data.name - diff = ImageChops.difference(data.image, refimg) - d = sqrt( - sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()]) - ) / (diff.size[0] * diff.size[1]) - assert d < 0.001 # lossless compression expected + assert image_similarity(data.image, refimg) > 0.999 # lossless compression expected @pytest.mark.enable_socket() @@ -443,7 +424,7 @@ def test_iss1863(): """Test doc from iss1863""" url = "https://github.com/py-pdf/pypdf/files/11578953/USC.EMBA.-.Pre-Season.and.Theme.I.pdf" name = "o1whh9b3.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for p in reader.pages: for i in p.images: i.name @@ -453,7 +434,7 @@ def test_iss1863(): def test_read_images(): url = "https://www.selbst.de/paidcontent/dl/64733/72916" name = "selbst.72916.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] for _ in page.images: pass @@ -463,7 +444,7 @@ def test_read_images(): def test_cascaded_filters_images(): url = "https://github.com/py-pdf/pypdf/files/11845099/GeoTopo-komprimiert.pdf" name = "iss1912.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # for focus, analyse the page 23 for p in reader.pages: for i in p.images: @@ -474,7 +455,7 @@ def test_cascaded_filters_images(): def test_calrgb(): url = "https://github.com/py-pdf/pypdf/files/12061061/tt.pdf" name = "calRGB.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] @@ -483,48 +464,32 @@ def test_index_lookup(): """The lookup is provided as an str and bytes""" url = "https://github.com/py-pdf/pypdf/files/12090523/2023.USDC_Circle.Examination.Report.May.2023.pdf" name = "2023USDC.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # TextStringObject Lookup url_png = "https://github.com/py-pdf/pypdf/files/12144094/im1.png.txt" name_png = "iss1982_im1.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[-1] assert data.image.mode == "RGB" - diff = ImageChops.difference(data.image, refimg) - d = sqrt(sum([(a * a + b * b + c * c) for a, b, c in diff.getdata()])) / ( - diff.size[0] * diff.size[1] - ) - assert d < 0.001 + assert image_similarity(data.image, refimg) > 0.999 # ByteStringObject Lookup url_png = "https://github.com/py-pdf/pypdf/files/12144093/im2.png.txt" name_png = "iss1982_im2.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[-1].images[-1] assert data.image.mode == "RGB" - diff = ImageChops.difference(data.image, refimg) - d = sqrt(sum([(a * a + b * b + c * c) for a, b, c in diff.getdata()])) / ( - diff.size[0] * diff.size[1] - ) - assert d < 0.001 + assert image_similarity(data.image, refimg) > 0.999 # indexed CMYK images # currently with a TODO as we convert to RBG the palette url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" name = "tika-972174.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42" name_png = "usa.png" - refimg = Image.open(BytesIO(get_pdf_from_url(url_png, name=name_png))) + refimg = Image.open(BytesIO(get_data_from_url(url_png, name=name_png))) data = reader.pages[0].images["/Im3"] # assert data.image.mode == "PA" but currently "RGBA" - diff = ImageChops.difference(data.image, refimg) - d = sqrt( - sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()]) - ) / (diff.size[0] * diff.size[1]) - assert d < 0.001 + assert image_similarity(data.image, refimg) > 0.999 @pytest.mark.enable_socket() @@ -532,18 +497,12 @@ def test_2bits_image(): """From #1954, test with 2bits image. TODO: 4bits also""" url = "https://github.com/py-pdf/pypdf/files/12050253/tt.pdf" name = "paid.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png" name_png = "Paid.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] - diff = ImageChops.difference(data.image, refimg) - d = sqrt( - sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()]) - ) / (diff.size[0] * diff.size[1]) - assert d < 0.01 + assert image_similarity(data.image, refimg) > 0.99 @pytest.mark.enable_socket() @@ -554,17 +513,13 @@ def test_gray_devicen_cmyk(): """ url = "https://github.com/py-pdf/pypdf/files/12080338/example_121.pdf" name = "gray_cmyk.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/254545494-42df4949-1557-4f2d-acca-6be6e8de1122.png" name_png = "velo.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert data.image.mode == "L" - diff = ImageChops.difference(data.image, refimg) - d = sqrt(sum([(a * a) for a in diff.getdata()])) / (diff.size[0] * diff.size[1]) - assert d < 0.001 + assert image_similarity(data.image, refimg) > 0.999 @pytest.mark.enable_socket() @@ -572,19 +527,15 @@ def test_runlengthdecode(): """From #1954, test with 2bits image. TODO: 4bits also""" url = "https://github.com/py-pdf/pypdf/files/12159941/out.pdf" name = "RunLengthDecode.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/255940800-6d63972e-a3d6-4cf9-aa6f-0793af24cded.png" name_png = "RunLengthDecode.png" - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] - diff = ImageChops.difference(data.image, refimg) - d = sqrt(sum([(a * a) for a in diff.getdata()])) / (diff.size[0] * diff.size[1]) - assert d < 0.001 + assert image_similarity(data.image, refimg) > 0.999 url = "https://github.com/py-pdf/pypdf/files/12162905/out.pdf" name = "FailedRLE1.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) with pytest.raises(PdfStreamError) as exc: reader.pages[0].images[0] assert exc.value.args[0] == "Unexpected EOD in RunLengthDecode" @@ -603,17 +554,13 @@ def test_gray_separation_cmyk(): """ url = "https://github.com/py-pdf/pypdf/files/12143372/tt.pdf" name = "TestWithSeparationBlack.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url_png = "https://user-images.githubusercontent.com/4083478/254545494-42df4949-1557-4f2d-acca-6be6e8de1122.png" name_png = "velo.png" # reused - refimg = Image.open( - BytesIO(get_pdf_from_url(url_png, name=name_png)) - ) # not a pdf but it works + refimg = BytesIO(get_data_from_url(url_png, name=name_png)) data = reader.pages[0].images[0] assert data.image.mode == "L" - diff = ImageChops.difference(data.image, refimg) - d = sqrt(sum([(a * a) for a in diff.getdata()])) / (diff.size[0] * diff.size[1]) - assert d < 0.001 + assert image_similarity(data.image, refimg) > 0.999 @pytest.mark.enable_socket() @@ -621,7 +568,7 @@ def test_singleton_device(): """From #2023""" url = "https://github.com/py-pdf/pypdf/files/12177287/tt.pdf" name = "pypdf_with_arr_deviceRGB.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] @@ -630,7 +577,7 @@ def test_jpx_no_spacecode(): """From #2061""" url = "https://github.com/py-pdf/pypdf/files/12253581/tt2.pdf" name = "jpx_no_spacecode.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) im = reader.pages[0].images[0] # create an object without filter and without colorspace # just for coverage diff --git a/tests/test_generic.py b/tests/test_generic.py index 642e0ef0e..fe91b7184 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -34,7 +34,7 @@ read_string_from_stream, ) -from . import ReaderDummy, get_pdf_from_url +from . import ReaderDummy, get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -630,7 +630,7 @@ def test_remove_child_in_tree(): ], ) def test_extract_text(caplog, url: str, name: str, caplog_content: str): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: page.extract_text() if caplog_content == "": @@ -645,7 +645,7 @@ def test_text_string_write_to_stream(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924562.pdf" name = "tika-924562.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.clone_document_from_reader(reader) for page in writer.pages: @@ -657,7 +657,7 @@ def test_bool_repr(tmp_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/932/932449.pdf" name = "tika-932449.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) write_path = tmp_path / "tmp-fields-report.txt" with open(write_path, "w") as fp: fields = reader.get_fields(fileobj=fp) @@ -683,7 +683,7 @@ def test_issue_997(mock_logger_warning, pdf_file_path): name = "gh-issue-997.pdf" merger = PdfMerger() - merger.append(BytesIO(get_pdf_from_url(url, name=name))) # here the error raises + merger.append(BytesIO(get_data_from_url(url, name=name))) # here the error raises with open(pdf_file_path, "wb") as f: merger.write(f) merger.close() @@ -694,7 +694,7 @@ def test_issue_997(mock_logger_warning, pdf_file_path): merger = PdfMerger(strict=True) with pytest.raises(PdfReadError) as exc: merger.append( - BytesIO(get_pdf_from_url(url, name=name)) + BytesIO(get_data_from_url(url, name=name)) ) # here the error raises assert exc.value.args[0] == "Could not find object." with open(pdf_file_path, "wb") as f: @@ -1155,7 +1155,7 @@ def test_append_with_indirectobject_not_pointing(caplog): """ url = "https://github.com/py-pdf/pypdf/files/10729142/document.pdf" name = "tst_iss1631.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=False) writer = PdfWriter() writer.append(reader) @@ -1171,7 +1171,7 @@ def test_iss1615_1673(): # #1615 url = "https://github.com/py-pdf/pypdf/files/10671366/graph_letter.pdf" name = "graph_letter.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) assert ( @@ -1183,7 +1183,7 @@ def test_iss1615_1673(): # #1673 url = "https://github.com/py-pdf/pypdf/files/10848750/budgeting-loan-form-sf500.pdf" name = "budgeting-loan-form-sf500.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.clone_document_from_reader(reader) @@ -1196,7 +1196,7 @@ def test_destination_withoutzoom(): "2021%20----%20book%20-%20Security%20of%20biquitous%20Computing%20Systems.pdf" ) name = "2021_book_security.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.outline out = BytesIO() diff --git a/tests/test_images.py b/tests/test_images.py index 4ec75f4f8..b159af0d3 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -15,7 +15,7 @@ from pypdf import PdfReader from pypdf._page import PageObject -from . import get_pdf_from_url +from . import get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -107,7 +107,7 @@ def test_image_similarity_mid(): def test_image_new_property(): url = "https://github.com/py-pdf/pypdf/files/11219022/pdf_font_garbled.pdf" name = "pdf_font_garbled.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.pages[0].images.keys() == [ "/I0", "/I1", diff --git a/tests/test_merger.py b/tests/test_merger.py index 094df1014..3820ef1a5 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -10,7 +10,7 @@ from pypdf.errors import DeprecationError from pypdf.generic import Destination, Fit -from . import get_pdf_from_url +from . import get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -323,7 +323,7 @@ def test_merge_write_closed_fh_with_writer(pdf_file_path): def test_trim_outline_list(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" name = "tika-995175.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -334,7 +334,7 @@ def test_trim_outline_list(pdf_file_path): def test_trim_outline_list_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/995/995175.pdf" name = "tika-995175.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -346,7 +346,7 @@ def test_trim_outline_list_with_writer(pdf_file_path): def test_zoom(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994759.pdf" name = "tika-994759.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -357,7 +357,7 @@ def test_zoom(pdf_file_path): def test_zoom_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/994/994759.pdf" name = "tika-994759.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -369,7 +369,7 @@ def test_zoom_with_writer(pdf_file_path): def test_zoom_xyz_no_left(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/933/933322.pdf" name = "tika-933322.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -380,7 +380,7 @@ def test_zoom_xyz_no_left(pdf_file_path): def test_zoom_xyz_no_left_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/933/933322.pdf" name = "tika-933322.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -392,7 +392,7 @@ def test_zoom_xyz_no_left_with_writer(pdf_file_path): def test_outline_item(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" name = "tika-997511.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -404,7 +404,7 @@ def test_outline_item(pdf_file_path): def test_outline_item_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" name = "tika-997511.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -417,7 +417,7 @@ def test_outline_item_with_writer(pdf_file_path): def test_trim_outline(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/982/982336.pdf" name = "tika-982336.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -429,7 +429,7 @@ def test_trim_outline(pdf_file_path): def test_trim_outline_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/982/982336.pdf" name = "tika-982336.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -442,7 +442,7 @@ def test_trim_outline_with_writer(pdf_file_path): def test1(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923621.pdf" name = "tika-923621.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -454,7 +454,7 @@ def test1(pdf_file_path): def test1_with_writer(pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/923/923621.pdf" name = "tika-923621.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -468,7 +468,7 @@ def test_sweep_recursion1(pdf_file_path): # TODO: This test looks like an infinite loop. url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -484,7 +484,7 @@ def test_sweep_recursion1_with_writer(pdf_file_path): # TODO: This test looks like an infinite loop. url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -512,7 +512,7 @@ def test_sweep_recursion1_with_writer(pdf_file_path): ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_sweep_recursion2(url, name, pdf_file_path): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -539,7 +539,7 @@ def test_sweep_recursion2(url, name, pdf_file_path): ], ) def test_sweep_recursion2_with_writer(url, name, pdf_file_path): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -554,7 +554,7 @@ def test_sweep_recursion2_with_writer(url, name, pdf_file_path): def test_sweep_indirect_list_newobj_is_none(caplog, pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" name = "tika-906769.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -569,7 +569,7 @@ def test_sweep_indirect_list_newobj_is_none(caplog, pdf_file_path): def test_sweep_indirect_list_newobj_is_none_with_writer(caplog, pdf_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/906/906769.pdf" name = "tika-906769.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfWriter() merger.append(reader) merger.write(pdf_file_path) @@ -587,7 +587,7 @@ def test_iss1145(): url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" name = "iss1145.pdf" merger = PdfMerger() - merger.append(PdfReader(BytesIO(get_pdf_from_url(url, name=name)))) + merger.append(PdfReader(BytesIO(get_data_from_url(url, name=name)))) merger.close() @@ -597,7 +597,7 @@ def test_iss1145_with_writer(): url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" name = "iss1145.pdf" merger = PdfWriter() - merger.append(PdfReader(BytesIO(get_pdf_from_url(url, name=name)))) + merger.append(PdfReader(BytesIO(get_data_from_url(url, name=name)))) merger.close() @@ -649,7 +649,7 @@ def test_iss1344(caplog): url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" name = "iss1344.pdf" m = PdfMerger() - m.append(PdfReader(BytesIO(get_pdf_from_url(url, name=name)))) + m.append(PdfReader(BytesIO(get_data_from_url(url, name=name)))) b = BytesIO() m.write(b) r = PdfReader(b) @@ -664,7 +664,7 @@ def test_iss1344_with_writer(caplog): url = "https://github.com/py-pdf/pypdf/files/9549001/input.pdf" name = "iss1344.pdf" m = PdfWriter() - m.append(PdfReader(BytesIO(get_pdf_from_url(url, name=name)))) + m.append(PdfReader(BytesIO(get_data_from_url(url, name=name)))) b = BytesIO() m.write(b) p = PdfReader(b).pages[0] @@ -677,7 +677,7 @@ def test_articles_with_writer(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "924666.pdf" m = PdfWriter() - m.append(PdfReader(BytesIO(get_pdf_from_url(url, name=name))), (2, 10)) + m.append(PdfReader(BytesIO(get_data_from_url(url, name=name))), (2, 10)) b = BytesIO() m.write(b) r = PdfReader(b) diff --git a/tests/test_page.py b/tests/test_page.py index 70a830a09..d1f6f3fcb 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -24,7 +24,7 @@ TextStringObject, ) -from . import get_pdf_from_url, normalize_warnings +from . import get_data_from_url, normalize_warnings TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -87,7 +87,7 @@ def test_page_operations(pdf_path, password): is as expected. """ if pdf_path.startswith("http"): - pdf_path = BytesIO(get_pdf_from_url(pdf_path, pdf_path.split("/")[-1])) + pdf_path = BytesIO(get_data_from_url(pdf_path, pdf_path.split("/")[-1])) else: pdf_path = RESOURCE_ROOT / pdf_path reader = PdfReader(pdf_path) @@ -349,7 +349,7 @@ def test_iss_1142(): # check fix for problem of context save/restore (q/Q) url = "https://github.com/py-pdf/pypdf/files/9150656/ST.2019.PDF" name = "st2019.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) txt = reader.pages[3].extract_text() # The following text is contained in two different cells: assert txt.find("有限公司") > 0 @@ -398,7 +398,7 @@ def test_iss_1142(): ], ) def test_extract_text(url, name): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: page.extract_text() @@ -408,7 +408,7 @@ def test_extract_text(url, name): def test_extract_text_page_pdf_impossible_decode_xform(caplog): url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972962.pdf" name = "tika-972962.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: page.extract_text() warn_msgs = normalize_warnings(caplog.text) @@ -420,7 +420,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog): def test_extract_text_operator_t_star(): # L1266, L1267 url = "https://corpora.tika.apache.org/base/docs/govdocs1/967/967943.pdf" name = "tika-967943.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for page in reader.pages: page.extract_text() @@ -862,7 +862,7 @@ def test_annotation_setter(pdf_file_path): def test_text_extraction_issue_1091(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/966/966635.pdf" name = "tika-966635.pdf" - stream = BytesIO(get_pdf_from_url(url, name=name)) + stream = BytesIO(get_data_from_url(url, name=name)) with pytest.warns(PdfReadWarning): reader = PdfReader(stream) for page in reader.pages: @@ -873,7 +873,7 @@ def test_text_extraction_issue_1091(): def test_empyt_password_1088(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf" name = "tika-941536.pdf" - stream = BytesIO(get_pdf_from_url(url, name=name)) + stream = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(stream) len(reader.pages) @@ -922,7 +922,7 @@ def test_read_link_annotation(): def test_no_resources(): url = "https://github.com/py-pdf/pypdf/files/9572045/108.pdf" name = "108.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page_one = reader.pages[0] page_two = reader.pages[0] page_one.merge_page(page_two) @@ -1095,10 +1095,10 @@ def test_merge_page_resources_smoke_test(): def test_merge_transformed_page_into_blank(): url = "https://github.com/py-pdf/pypdf/files/10768334/badges_3vjrh_7LXDZ_1-1.pdf" name = "badges_3vjrh_7LXDZ_1.pdf" - r1 = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + r1 = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/files/10768335/badges_3vjrh_7LXDZ_2-1.pdf" name = "badges_3vjrh_7LXDZ_2.pdf" - r2 = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + r2 = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.add_blank_page(100, 100) writer.pages[0].merge_translated_page(r1.pages[0], 0, 0, True, True) @@ -1139,7 +1139,7 @@ def test_pages_printing(): def test_del_pages(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf" name = "tika-941536.pdf" - writer = PdfWriter(clone_from=BytesIO(get_pdf_from_url(url, name=name))) + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) ll = len(writer.pages) pp = writer.pages[1].indirect_reference del writer.pages[1] @@ -1160,7 +1160,7 @@ def test_del_pages(): for p in pp: assert p not in pages["/Kids"] # del whole arborescence - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # error case pp = reader.pages[2] i = pp["/Parent"].get_object()["/Kids"].index(pp.indirect_reference) @@ -1168,7 +1168,7 @@ def test_del_pages(): with pytest.raises(PdfReadError): del reader.pages[2] # reader is corrupted we have to reload it - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) del reader.pages[:] assert len(reader.pages) == 0 assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0 diff --git a/tests/test_page_labels.py b/tests/test_page_labels.py index a0cc87355..8b2e11b0d 100644 --- a/tests/test_page_labels.py +++ b/tests/test_page_labels.py @@ -21,7 +21,7 @@ NumberObject, ) -from . import get_pdf_from_url +from . import get_data_from_url @pytest.mark.parametrize( @@ -72,7 +72,7 @@ def test_number2uppercase_letter(): def test_index2label(caplog): url = "https://github.com/py-pdf/pypdf/files/10773829/waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" name = "waarom-meisjes-het-beter-doen-op-HAVO-en-VWO-ROA.pdf" - r = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + r = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert index2label(r, 1) == "ii" assert index2label(r, 9) == "6" # very silly data to get test cover diff --git a/tests/test_reader.py b/tests/test_reader.py index dc8b44a2f..141d59aed 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -27,7 +27,7 @@ TextStringObject, ) -from . import get_pdf_from_url, normalize_warnings +from . import get_data_from_url, normalize_warnings try: from Crypto.Cipher import AES # noqa: F401 @@ -780,7 +780,7 @@ def test_converttoint_deprecated(): @pytest.mark.enable_socket() def test_iss925(): url = "https://github.com/py-pdf/pypdf/files/8796328/1.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name="iss925.pdf"))) + reader = PdfReader(BytesIO(get_data_from_url(url, name="iss925.pdf"))) for page_sliced in reader.pages: page_object = page_sliced.get_object() @@ -839,7 +839,7 @@ def test_read_form_416(): url = ( "https://www.fda.gov/downloads/AboutFDA/ReportsManualsForms/Forms/UCM074728.pdf" ) - reader = PdfReader(BytesIO(get_pdf_from_url(url, name="issue_416.pdf"))) + reader = PdfReader(BytesIO(get_data_from_url(url, name="issue_416.pdf"))) fields = reader.get_form_text_fields() assert len(fields) > 0 @@ -877,7 +877,7 @@ def test_extract_text_xref_issue_2(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/981/981961.pdf" msg = "incorrect startxref pointer(2)" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-981961.pdf"))) + reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-981961.pdf"))) for page in reader.pages: page.extract_text() assert normalize_warnings(caplog.text) == [msg] @@ -889,7 +889,7 @@ def test_extract_text_xref_issue_3(caplog): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/977/977774.pdf" msg = "incorrect startxref pointer(3)" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-977774.pdf"))) + reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-977774.pdf"))) for page in reader.pages: page.extract_text() assert normalize_warnings(caplog.text) == [msg] @@ -899,7 +899,7 @@ def test_extract_text_xref_issue_3(caplog): def test_extract_text_pdf15(): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/976/976030.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-976030.pdf"))) + reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-976030.pdf"))) for page in reader.pages: page.extract_text() @@ -908,7 +908,7 @@ def test_extract_text_pdf15(): def test_extract_text_xref_table_21_bytes_clrf(): # pdf/0264cf510015b2a4b395a15cb23c001e.pdf url = "https://corpora.tika.apache.org/base/docs/govdocs1/956/956939.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-956939.pdf"))) + reader = PdfReader(BytesIO(get_data_from_url(url, name="tika-956939.pdf"))) for page in reader.pages: page.extract_text() @@ -917,7 +917,7 @@ def test_extract_text_xref_table_21_bytes_clrf(): def test_get_fields(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972486.pdf" name = "tika-972486.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) fields = reader.get_fields() assert fields is not None assert "c1-1" in fields @@ -930,7 +930,7 @@ def test_get_fields(): def test_get_full_qualified_fields(): url = "https://github.com/py-pdf/pypdf/files/10142389/fields_with_dots.pdf" name = "fields_with_dots.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) fields = reader.get_form_text_fields(True) assert fields is not None assert "customer.name" in fields @@ -952,14 +952,14 @@ def test_get_fields_read_else_block(): # covers also issue 1089 url = "https://corpora.tika.apache.org/base/docs/govdocs1/934/934771.pdf" name = "tika-934771.pdf" - PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + PdfReader(BytesIO(get_data_from_url(url, name=name))) @pytest.mark.enable_socket() def test_get_fields_read_else_block2(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914902.pdf" name = "tika-914902.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) fields = reader.get_fields() assert fields is None @@ -969,14 +969,14 @@ def test_get_fields_read_else_block2(): def test_get_fields_read_else_block3(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/957/957721.pdf" name = "tika-957721.pdf" - PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + PdfReader(BytesIO(get_data_from_url(url, name=name))) @pytest.mark.enable_socket() def test_metadata_is_none(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/963/963692.pdf" name = "tika-963692.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.metadata is None @@ -984,7 +984,7 @@ def test_metadata_is_none(): def test_get_fields_read_write_report(txt_file_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/909/909655.pdf" name = "tika-909655.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) with open(txt_file_path, "w") as fp: fields = reader.get_fields(fileobj=fp) assert fields @@ -1006,7 +1006,7 @@ def test_xfa(src): def test_xfa_non_empty(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/942/942050.pdf" name = "tika-942050.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert list(reader.xfa.keys()) == [ "preamble", "config", @@ -1034,7 +1034,7 @@ def test_header(src, pdf_header): def test_outline_color(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.outline[0].color == [0, 0, 1] @@ -1042,7 +1042,7 @@ def test_outline_color(): def test_outline_font_format(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.outline[0].font_format == 2 @@ -1184,7 +1184,7 @@ def test_outline_missing_title(caplog): ids=["stored_directly", "dest_below_names_with_kids"], ) def test_named_destination(url, name): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert len(reader.named_destinations) > 0 @@ -1192,7 +1192,7 @@ def test_named_destination(url, name): def test_outline_with_missing_named_destination(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/913/913678.pdf" name = "tika-913678.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # outline items in document reference a named destination that is not defined assert reader.outline[1][0].title.startswith("Report for 2002AZ3B: Microbial") @@ -1201,7 +1201,7 @@ def test_outline_with_missing_named_destination(): def test_outline_with_empty_action(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # outline items (entitled Tables and Figures) utilize an empty action (/A) # that has no type or destination assert reader.outline[-4].title == "Tables" @@ -1219,7 +1219,7 @@ def test_pdfreader_multiple_definitions(caplog): """iss325""" url = "https://github.com/py-pdf/pypdf/files/9176644/multipledefs.pdf" name = "multipledefs.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() assert normalize_warnings(caplog.text) == [ "Multiple definitions in dictionary at byte 0xb5 for key /Group" @@ -1245,11 +1245,11 @@ def test_corrupted_xref_table(): # issue #1292 url = "https://github.com/py-pdf/pypdf/files/9444747/BreezeManual.orig.pdf" name = "BreezeMan1.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() url = "https://github.com/py-pdf/pypdf/files/9444748/BreezeManual.failed.pdf" name = "BreezeMan2.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].extract_text() @@ -1258,7 +1258,7 @@ def test_reader(caplog): # iss #1273 url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert "Previous trailer can not be read" in caplog.text caplog.clear() # first call requires some reparations... @@ -1279,7 +1279,7 @@ def test_zeroing_xref(): "UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" ) name = "UTA_OSHA.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) len(reader.pages) @@ -1290,11 +1290,11 @@ def test_thread(): "UTA_OSHA_3115_Fall_Protection_Training_09162021_.pdf" ) name = "UTA_OSHA.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert reader.threads is None url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert isinstance(reader.threads, ArrayObject) assert len(reader.threads) >= 1 @@ -1303,7 +1303,7 @@ def test_thread(): def test_build_outline_item(caplog): url = "https://github.com/py-pdf/pypdf/files/9464742/shiv_resume.pdf" name = "shiv_resume.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) outline = reader._build_outline_item( DictionaryObject( { @@ -1355,7 +1355,7 @@ def test_page_labels(src, page_labels): def test_iss1559(): url = "https://github.com/py-pdf/pypdf/files/10441992/default.pdf" name = "iss1559.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for p in reader.pages: p.extract_text() @@ -1365,7 +1365,7 @@ def test_iss1652(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/10818844/tt.pdf" name = "invalidNamesDest.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.named_destinations @@ -1373,7 +1373,7 @@ def test_iss1652(): def test_iss1689(): url = "https://github.com/py-pdf/pypdf/files/10948283/error_file_without_data.pdf" name = "iss1689.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0] @@ -1381,7 +1381,7 @@ def test_iss1689(): def test_iss1710(): url = "https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf" name = "irbookonlinereading.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.outline @@ -1423,7 +1423,7 @@ def test_broken_file_header(): def test_iss1756(): url = "https://github.com/py-pdf/pypdf/files/11105591/641-Attachment-B-Pediatric-Cardiac-Arrest-8-1-2019.pdf" name = "iss1756.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.trailer["/ID"] # removed to cope with missing cryptodome during commit check : len(reader.pages) @@ -1433,6 +1433,6 @@ def test_iss1756(): def test_iss1825(): url = "https://github.com/py-pdf/pypdf/files/11367871/MiFO_LFO_FEIS_NOA_Published.3.pdf" name = "iss1825.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) page = reader.pages[0] page.extract_text() diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 1c06c02df..fa16c27f1 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -19,7 +19,7 @@ from pypdf.errors import PdfReadError, PdfReadWarning from pypdf.generic import ContentStream, NameObject, read_object -from . import get_pdf_from_url, normalize_warnings +from . import get_data_from_url, normalize_warnings TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -253,7 +253,7 @@ def test_extract_textbench(enable, url, pages, print_result=False): if not enable: return try: - reader = PdfReader(BytesIO(get_pdf_from_url(url, url.split("/")[-1]))) + reader = PdfReader(BytesIO(get_data_from_url(url, url.split("/")[-1]))) for page_number in pages: if print_result: print(f"**************** {url} / page {page_number} ****************") @@ -322,7 +322,7 @@ def test_orientations(): ) def test_overlay(pdf_file_path, base_path, overlay_path): if base_path.startswith("http"): - base_path = BytesIO(get_pdf_from_url(base_path, name="tika-935981.pdf")) + base_path = BytesIO(get_data_from_url(base_path, name="tika-935981.pdf")) else: base_path = PROJECT_ROOT / base_path reader = PdfReader(base_path) @@ -351,7 +351,7 @@ def test_overlay(pdf_file_path, base_path, overlay_path): ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge_with_warning(tmp_path, url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) merger = PdfMerger() merger.append(reader) @@ -371,7 +371,7 @@ def test_merge_with_warning(tmp_path, url, name): ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_merge(tmp_path, url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) merger = PdfMerger() merger.append(reader) @@ -398,7 +398,7 @@ def test_merge(tmp_path, url, name): ], ) def test_get_metadata(url, name, expected_metadata): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) data = reader.metadata assert expected_metadata == data @@ -477,7 +477,7 @@ def test_get_metadata(url, name, expected_metadata): ], ) def test_extract_text(url, name, strict, exception): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=strict) if not exception: for page in reader.pages: @@ -517,7 +517,7 @@ def test_extract_text(url, name, strict, exception): ], ) def test_compress_raised(url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) writer = PdfWriter() writer.clone_document_from_reader(reader) @@ -537,7 +537,7 @@ def test_compress_raised(url, name): ], ) def test_get_fields_warns(tmp_path, caplog, url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) write_path = tmp_path / "tmp.txt" with open(write_path, "w") as fp: @@ -558,7 +558,7 @@ def test_get_fields_warns(tmp_path, caplog, url, name): ], ) def test_get_fields_no_warning(tmp_path, url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) write_path = tmp_path / "tmp.txt" with open(write_path, "w") as fp: @@ -571,7 +571,7 @@ def test_get_fields_no_warning(tmp_path, url, name): def test_scale_rectangle_indirect_object(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/999/999944.pdf" name = "tika-999944.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) for page in reader.pages: @@ -657,7 +657,7 @@ def test_merge_output(caplog): ], ) def test_image_extraction(url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) images_extracted = [] @@ -685,7 +685,7 @@ def test_image_extraction_strict(): # Emits log messages url = "https://corpora.tika.apache.org/base/docs/govdocs1/914/914102.pdf" name = "tika-914102.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=True) images_extracted = [] @@ -719,7 +719,7 @@ def test_image_extraction_strict(): ], ) def test_image_extraction2(url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) images_extracted = [] @@ -757,7 +757,7 @@ def test_image_extraction2(url, name): ], ) def test_get_outline(url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) reader.outline @@ -777,7 +777,7 @@ def test_get_outline(url, name): ], ) def test_get_xfa(url, name): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data) reader.xfa @@ -809,7 +809,7 @@ def test_get_xfa(url, name): ], ) def test_get_fonts(url, name, strict): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=strict) for page in reader.pages: page._get_fonts() @@ -842,7 +842,7 @@ def test_get_fonts(url, name, strict): ], ) def test_get_xmp(url, name, strict): - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=strict) xmp_info = reader.xmp_metadata if xmp_info: @@ -877,7 +877,7 @@ def test_get_xmp(url, name, strict): def test_tounicode_is_identity(): url = "https://github.com/py-pdf/pypdf/files/9998335/FP_Thesis.pdf" name = "FP_Thesis.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=False) reader.pages[0].extract_text() @@ -889,13 +889,13 @@ def test_append_forms(): url = "https://github.com/py-pdf/pypdf/files/10367412/pdfa.pdf" name = "form_a.pdf" - reader1 = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader1 = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader1.add_form_topname("form_a") writer.append(reader1) url = "https://github.com/py-pdf/pypdf/files/10367413/pdfb.pdf" name = "form_b.pdf" - reader2 = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader2 = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader2.add_form_topname("form_b") writer.append(reader2) @@ -911,7 +911,7 @@ def test_append_forms(): def test_extra_test_iss1541(): url = "https://github.com/py-pdf/pypdf/files/10418158/tst_iss1541.pdf" name = "tst_iss1541.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=False) reader.pages[0].extract_text() @@ -945,7 +945,7 @@ def test_fields_returning_stream(): """This problem was reported in #424""" url = "https://github.com/mstamy2/PyPDF2/files/1948267/Simple.form.pdf" name = "tst_iss424.pdf" - data = BytesIO(get_pdf_from_url(url, name=name)) + data = BytesIO(get_data_from_url(url, name=name)) reader = PdfReader(data, strict=False) assert "BtchIssQATit_time" in reader.get_form_text_fields()["TimeStampData"] @@ -990,10 +990,10 @@ def test_inline_images(): """This problem was reported in #424""" url = "https://arxiv.org/pdf/2201.00151.pdf" name = "2201.00151.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/assets/4083478/28e8b87c-be2c-40d9-9c86-15c7819021bf" name = "inline4.png" - img_ref = Image.open(BytesIO(get_pdf_from_url(url, name=name))) + img_ref = Image.open(BytesIO(get_data_from_url(url, name=name))) assert list(reader.pages[1].images[4].image.getdata()) == list(img_ref.getdata()) with pytest.raises(KeyError): reader.pages[0].images["~999~"] @@ -1022,7 +1022,7 @@ def test_inline_images(): def test_iss(): url = "https://github.com/py-pdf/pypdf/files/11801077/lv2018tconv.pdf" name = "lv2018tconv.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) for i, page in enumerate(reader.pages): print(i) page.extract_text() diff --git a/tests/test_writer.py b/tests/test_writer.py index 325a4a141..58fff1417 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -28,7 +28,7 @@ TextStringObject, ) -from . import get_pdf_from_url +from . import get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -806,7 +806,7 @@ def test_sweep_indirect_references_nullobject_exception(pdf_file_path): # TODO: Check this more closely... this looks weird url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) merger = PdfMerger() merger.append(reader) merger.write(pdf_file_path) @@ -830,7 +830,7 @@ def test_sweep_indirect_references_nullobject_exception(pdf_file_path): ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_some_appends(pdf_file_path, url, name): - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) # PdfMerger merger = PdfMerger() merger.append(reader) @@ -1013,7 +1013,7 @@ def test_startup_dest(): def test_iss471(): url = "https://github.com/py-pdf/pypdf/files/9139245/book.pdf" name = "book_471.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader, excluded_fields=[]) @@ -1026,7 +1026,7 @@ def test_iss471(): def test_reset_translation(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader, (0, 10)) nb = len(writer._objects) @@ -1064,7 +1064,7 @@ def test_threads_empty(): def test_append_without_annots_and_articles(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader, None, (0, 10), True, ["/B"]) writer.reset_translation() @@ -1083,7 +1083,7 @@ def test_append_without_annots_and_articles(): def test_append_multiple(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924666.pdf" name = "tika-924666.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append( reader, [0, 0, 0] @@ -1234,7 +1234,7 @@ def test_set_page_label(pdf_file_path): def test_iss1601(): url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" name = "badge-38.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() page_1 = writer.add_blank_page( reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] @@ -1305,13 +1305,13 @@ def test_iss1614(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/10669995/broke.pdf" name = "iss1614.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader) # test for 2nd error case reported in #1614 url = "https://github.com/py-pdf/pypdf/files/10696390/broken.pdf" name = "iss1614.2.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer.append(reader) @@ -1320,7 +1320,7 @@ def test_new_removes(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/10807951/tt.pdf" name = "iss1650.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.clone_document_from_reader(reader) @@ -1344,7 +1344,7 @@ def test_new_removes(): url = "https://github.com/py-pdf/pypdf/files/10832029/tt2.pdf" name = "GeoBaseWithComments.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer.append(reader) writer.remove_objects_from_page(writer.pages[0], [ObjectDeletionFlag.LINKS]) assert "/Links" not in [ @@ -1373,7 +1373,7 @@ def test_new_removes(): def test_late_iss1654(): url = "https://github.com/py-pdf/pypdf/files/10935632/bid1.pdf" name = "bid1.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.clone_document_from_reader(reader) for p in writer.pages: @@ -1387,7 +1387,7 @@ def test_iss1723(): # test of an annotation(link) directly stored in the /Annots in the page url = "https://github.com/py-pdf/pypdf/files/11015242/inputFile.pdf" name = "iss1723.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter() writer.append(reader, (3, 5)) @@ -1399,7 +1399,7 @@ def test_iss1767(): # cloning url = "https://github.com/py-pdf/pypdf/files/11138472/test.pdf" name = "iss1723.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) PdfWriter(clone_from=reader) @@ -1413,10 +1413,10 @@ def test_named_dest_page_number(): name = "central.pdf" writer = PdfWriter() writer.add_blank_page(100, 100) - writer.append(BytesIO(get_pdf_from_url(url, name=name)), pages=[0, 1, 2]) + writer.append(BytesIO(get_data_from_url(url, name=name)), pages=[0, 1, 2]) assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 2 assert writer._root_object["/Names"]["/Dests"]["/Names"][-1][0] == (1 + 1) - writer.append(BytesIO(get_pdf_from_url(url, name=name))) + writer.append(BytesIO(get_data_from_url(url, name=name))) assert len(writer._root_object["/Names"]["/Dests"]["/Names"]) == 6 writer2 = PdfWriter() writer2.add_blank_page(100, 100) @@ -1497,7 +1497,7 @@ def test_iss1862(): url = "https://github.com/py-pdf/pypdf/files/11708801/intro.pdf" name = "iss1862.pdf" writer = PdfWriter() - writer.append(BytesIO(get_pdf_from_url(url, name=name))) + writer.append(BytesIO(get_data_from_url(url, name=name))) # check that "/B" is in the font writer.pages[0]["/Resources"]["/Font"]["/F1"]["/CharProcs"]["/B"].get_data() @@ -1520,10 +1520,10 @@ def test_empty_objects_before_cloning(): def test_watermark(): url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" name = "bgwatermark.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) url = "https://github.com/py-pdf/pypdf/files/11985888/source.pdf" name = "srcwatermark.pdf" - writer = PdfWriter(clone_from=BytesIO(get_pdf_from_url(url, name=name))) + writer = PdfWriter(clone_from=BytesIO(get_data_from_url(url, name=name))) for p in writer.pages: p.merge_page(reader.pages[0], over=False) @@ -1539,7 +1539,7 @@ def test_watermark(): def test_da_missing_in_annot(): url = "https://github.com/py-pdf/pypdf/files/12136285/Building.Division.Permit.Application.pdf" name = "BuildingDivisionPermitApplication.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) writer = PdfWriter(clone_from=reader) writer.update_page_form_field_values( writer.pages[0], {"PCN-1": "0"}, auto_regenerate=False diff --git a/tests/test_xmp.py b/tests/test_xmp.py index 777a44141..e01e5c6fd 100644 --- a/tests/test_xmp.py +++ b/tests/test_xmp.py @@ -10,7 +10,7 @@ from pypdf import PdfReader from pypdf.errors import PdfReadError -from . import get_pdf_from_url +from . import get_data_from_url TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -126,7 +126,7 @@ def test_identity_function(x): ) def test_xmpmm_instance_id(url, name, xmpmm_instance_id): """XMPMM instance id is correctly extracted.""" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.xmpmm_instance_id == xmpmm_instance_id # cache hit: @@ -138,7 +138,7 @@ def test_xmp_dc_description_extraction(): """XMP dc_description is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf" name = "tika-953770.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.dc_description == { "x-default": "U.S. Title 50 Certification Form" @@ -154,7 +154,7 @@ def test_dc_creator_extraction(): """XMP dc_creator is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/953/953770.pdf" name = "tika-953770.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.dc_creator == ["U.S. Fish and Wildlife Service"] # cache hit: @@ -166,7 +166,7 @@ def test_custom_properties_extraction(): """XMP custom_properties is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/986/986065.pdf" name = "tika-986065.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.custom_properties == {"Style": "Searchable Image (Exact)"} # cache hit: @@ -178,7 +178,7 @@ def test_dc_subject_extraction(): """XMP dc_subject is correctly extracted.""" url = "https://corpora.tika.apache.org/base/docs/govdocs1/959/959519.pdf" name = "tika-959519.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) xmp_metadata = reader.xmp_metadata assert xmp_metadata.dc_subject == [ "P&P", @@ -214,7 +214,7 @@ def test_invalid_xmp_information_handling(): """ url = "https://github.com/py-pdf/pypdf/files/5536984/test.pdf" name = "pypdf-5536984.pdf" - reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) with pytest.raises(PdfReadError) as exc: reader.xmp_metadata assert exc.value.args[0].startswith("XML in XmpInformation was invalid")