From ced67e1b68abd82b007f30f71230b608fe3d8ca1 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 10 Apr 2024 09:02:57 +0200
Subject: [PATCH] ROB: Cope with some image extraction issues (#2591)

Closes #2343:
1st case : image with images in 1 byte encoding with Separation color space
2nd case: similar + \n to be ignored at the end of the image data
---
 pypdf/_xobj_image_helpers.py | 17 ++++++++++++++++-
 pypdf/filters.py             | 22 +++++++++++++++++-----
 tests/test_images.py         | 28 ++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py
index cebab1041..89341a460 100644
--- a/pypdf/_xobj_image_helpers.py
+++ b/pypdf/_xobj_image_helpers.py
@@ -125,6 +125,21 @@ def _get_imagemode(
     return mode, mode == "CMYK"
 
 
+def _extended_image_frombytes(
+    mode: str, size: Tuple[int, int], data: bytes
+) -> Image.Image:
+    try:
+        img = Image.frombytes(mode, size, data)
+    except ValueError as exc:
+        nb_pix = size[0] * size[1]
+        if len(data) % nb_pix != 0:
+            raise exc
+        k = nb_pix * len(mode) / len(data)
+        data = b"".join([bytes((x,) * int(k)) for x in data])
+        img = Image.frombytes(mode, size, data)
+    return img
+
+
 def _handle_flate(
     size: Tuple[int, int],
     data: bytes,
@@ -168,7 +183,7 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
     elif mode == "4bits":
         mode = "P"
         data = bits2byte(data, size, 4)
-    img = Image.frombytes(mode, size, data)
+    img = _extended_image_frombytes(mode, size, data)
     if color_space == "/Indexed":
         from .generic import TextStringObject
 
diff --git a/pypdf/filters.py b/pypdf/filters.py
index fe3f3c71a..9e2158b21 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -285,7 +285,9 @@ def decode(
         index = 0
         while True:
             if index >= len(data):
-                logger_warning("missing EOD in ASCIIHexDecode, check if output is OK", __name__)
+                logger_warning(
+                    "missing EOD in ASCIIHexDecode, check if output is OK", __name__
+                )
                 break  # reach End Of String even if no EOD
             char = data[index : index + 1]
             if char == b">":
@@ -341,7 +343,9 @@ def decode(
         index = 0
         while True:
             if index >= len(data):
-                logger_warning("missing EOD in RunLengthDecode, check if output is OK", __name__)
+                logger_warning(
+                    "missing EOD in RunLengthDecode, check if output is OK", __name__
+                )
                 break  # reach End Of String even if no EOD
             length = data[index]
             index += 1
@@ -733,6 +737,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     """
     from ._xobj_image_helpers import (
         Image,
+        _extended_image_frombytes,
         _get_imagemode,
         _handle_flate,
         _handle_jpx,
@@ -747,10 +752,12 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
     else:
         obj_as_text = x_object_obj.__repr__()
 
-    size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
+    size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
     data = x_object_obj.get_data()  # type: ignore
     if isinstance(data, str):  # pragma: no cover
         data = data.encode()
+    if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A:  # ie. '\n'
+        data = data[:-1]
     colors = x_object_obj.get("/Colors", 1)
     color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
     if isinstance(color_space, list) and len(color_space) == 1:
@@ -819,7 +826,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
         )
     elif mode == "CMYK":
         img, image_format, extension, invert_color = (
-            Image.frombytes(mode, size, data),
+            _extended_image_frombytes(mode, size, data),
             "TIFF",
             ".tif",
             False,
@@ -828,7 +835,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
         raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
     else:
         img, image_format, extension, invert_color = (
-            Image.frombytes(mode, size, data),
+            _extended_image_frombytes(mode, size, data),
             "PNG",
             ".png",
             False,
@@ -849,6 +856,11 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
         and color_space[0].get_object() == "/Indexed"
     ):
         decode = None  # decode is meanless of Indexed
+    if (
+        isinstance(color_space, ArrayObject)
+        and color_space[0].get_object() == "/Separation"
+    ):
+        decode = [1.0, 0.0] * len(img.getbands())
     if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
         lut: List[int] = []
         for i in range(0, len(decode), 2):
diff --git a/tests/test_images.py b/tests/test_images.py
index b1b907014..7a690f7d7 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -255,3 +255,31 @@ def test_cmyk_no_filter():
     name = "iss2522.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
     reader.pages[0].images[0].image
+
+
+@pytest.mark.enable_socket()
+def test_separation_1byte_to_rgb_inverted():
+    """Cf #2343"""
+    url = "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf"
+    name = "iss2343.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    url = "https://github.com/py-pdf/pypdf/assets/4083478/b7f41897-96ef-4ea6-b165-5ef307a92b87"
+    name = "iss2343.png"
+    img = Image.open(BytesIO(get_data_from_url(url, name=name)))
+    assert image_similarity(reader.pages[0].images[0].image, img) >= 0.99
+    obj = reader.pages[0].images[0].indirect_reference.get_object()
+    obj.set_data(obj.get_data() + b"\x00")
+    with pytest.raises(ValueError):
+        reader.pages[0].images[0]
+
+
+@pytest.mark.enable_socket()
+def test_data_with_lf():
+    """Cf #2343"""
+    url = "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf"
+    name = "iss2343b.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    url = "https://github.com/py-pdf/pypdf/assets/4083478/1120b0cf-a67a-403f-aa1a-9a191cbc087f"
+    name = "iss2343b0.png"
+    img = Image.open(BytesIO(get_data_from_url(url, name=name)))
+    assert image_similarity(reader.pages[8].images[9].image, img) == 1.0