From bff4b8e02565f6eff0896a005337eb6d100d9773 Mon Sep 17 00:00:00 2001 From: Harsh Date: Fri, 15 Nov 2024 19:38:46 +0530 Subject: [PATCH] Implement uncompress functionality for PDF files (#75) Co-authored-by: Lucas Cimon <925560+Lucas-C@users.noreply.github.com> --- README.md | 1 + pdfly/cli.py | 25 +++++++++++++++++ pdfly/uncompress.py | 52 ++++++++++++++++++++++++++++++++++++ tests/conftest.py | 9 ++++--- tests/test_extract_images.py | 2 -- tests/test_uncompress.py | 40 +++++++++++++++++++++++++++ tests/test_update_offsets.py | 4 +-- 7 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 pdfly/uncompress.py create mode 100644 tests/test_uncompress.py diff --git a/README.md b/README.md index 0667e65..a749d60 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ $ pdfly --help │ 2-up Create a booklet-style PDF from a single input. │ │ cat Concatenate pages from PDF files into a single PDF file. │ │ compress Compress a PDF. │ +│ uncompress Uncompresses a PDF. │ │ extract-images Extract images from PDF without resampling or altering. │ │ extract-text Extract text from a PDF file. 
│ │ meta Show metadata of a PDF file │ diff --git a/pdfly/cli.py b/pdfly/cli.py index d736d33..9c5fa31 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -16,6 +16,7 @@ import pdfly.metadata import pdfly.pagemeta import pdfly.rm +import pdfly.uncompress import pdfly.up2 import pdfly.update_offsets import pdfly.x2pdf @@ -205,6 +206,30 @@ def compress( pdfly.compress.main(pdf, output) +@entry_point.command(name="uncompress", help=pdfly.uncompress.__doc__) # type: ignore[misc] +def uncompress( + pdf: Annotated[ + Path, + typer.Argument( + exists=True, + file_okay=True, + dir_okay=False, + writable=False, + readable=True, + resolve_path=True, + ), + ], + output: Annotated[ + Path, + typer.Argument( + exists=False, + writable=True, + ), + ], +) -> None: + pdfly.uncompress.main(pdf, output) + + @entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__) # type: ignore[misc] def update_offsets( file_in: Annotated[ diff --git a/pdfly/uncompress.py b/pdfly/uncompress.py new file mode 100644 index 0000000..a543473 --- /dev/null +++ b/pdfly/uncompress.py @@ -0,0 +1,52 @@ +"""Module for uncompressing PDF content streams.""" + +import zlib +from pathlib import Path +from typing import Optional + +from pypdf import PdfReader, PdfWriter +from pypdf.generic import IndirectObject, PdfObject + + +def main(pdf: Path, output: Path) -> None: + reader = PdfReader(pdf) + writer = PdfWriter() + + for page in reader.pages: + if "/Contents" in page: + contents: Optional[PdfObject] = page["/Contents"] + if isinstance(contents, IndirectObject): + contents = contents.get_object() + if contents is not None: + if isinstance(contents, list): + for content in contents: + if isinstance(content, IndirectObject): + decompress_content_stream(content) + elif isinstance(contents, IndirectObject): + decompress_content_stream(contents) + writer.add_page(page) + + with open(output, "wb") as fp: + writer.write(fp) + + orig_size = pdf.stat().st_size + uncomp_size = output.stat().st_size + + 
print(f"Original Size : {orig_size:,}") + print( + f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)" + ) + + +def decompress_content_stream(content: IndirectObject) -> None: + """Decompress a content stream if it uses FlateDecode.""" + if content.get("/Filter") == "/FlateDecode": + try: + compressed_data = content.get_data() + uncompressed_data = zlib.decompress(compressed_data) + content.set_data(uncompressed_data) + del content["/Filter"] + except zlib.error as error: + print( + f"Some content stream with /FlateDecode failed to be decompressed: {error}" + ) diff --git a/tests/conftest.py b/tests/conftest.py index 9ab40d4..181d60c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,9 @@ import os from pathlib import Path -from fpdf import FPDF import pytest +from fpdf import FPDF + from pdfly.cli import entry_point try: @@ -35,7 +36,7 @@ def run_cli(args): return error.code -@pytest.fixture +@pytest.fixture() def two_pages_pdf_filepath(tmp_path): "A PDF with 2 pages, and a different image on each page" # Note: prior to v2.7.9, fpdf2 produced incorrect /Resources dicts for each page (cf. 
fpdf2 PR #1133), @@ -50,7 +51,7 @@ def two_pages_pdf_filepath(tmp_path): return pdf_filepath -@pytest.fixture +@pytest.fixture() def pdf_file_100(tmp_path): """A PDF with 100 pages; each has only the page index on it.""" pdf = FPDF() @@ -65,7 +66,7 @@ def pdf_file_100(tmp_path): return pdf_filepath -@pytest.fixture +@pytest.fixture() def pdf_file_abc(tmp_path): """A PDF with 100 pages; each has only the page index on it.""" pdf = FPDF() diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py index 3a3025e..4a0df2b 100644 --- a/tests/test_extract_images.py +++ b/tests/test_extract_images.py @@ -1,5 +1,3 @@ -import pytest - from .conftest import RESOURCES_ROOT, chdir, run_cli diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py new file mode 100644 index 0000000..dc19723 --- /dev/null +++ b/tests/test_uncompress.py @@ -0,0 +1,40 @@ +"""Tests for the `uncompress` command.""" + +from pathlib import Path + +import pytest +from pypdf import PdfReader +from typer.testing import CliRunner + +from pdfly.cli import entry_point + +runner = CliRunner() + + +@pytest.mark.parametrize( + "input_pdf_filepath", Path("sample-files").glob("*.pdf") +) +def test_uncompress_all_sample_files( + input_pdf_filepath: Path, tmp_path: Path +) -> None: + output_pdf_filepath = tmp_path / "uncompressed_output.pdf" + + result = runner.invoke( + entry_point, + ["uncompress", str(input_pdf_filepath), str(output_pdf_filepath)], + ) + + assert ( + result.exit_code == 0 + ), f"Error in uncompressing {input_pdf_filepath}: {result.output}" + assert ( + output_pdf_filepath.exists() + ), f"Output PDF {output_pdf_filepath} does not exist." 
+ + reader = PdfReader(str(output_pdf_filepath)) + for page in reader.pages: + contents = page.get("/Contents") + if contents: + assert ( + "/Filter" not in contents + ), "Content stream is still compressed" diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index bd5d506..c239577 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -4,12 +4,12 @@ Here should only be end-to-end tests. """ +import re from pathlib import Path import pytest -import re -from .conftest import RESOURCES_ROOT, chdir, run_cli +from .conftest import RESOURCES_ROOT, run_cli def test_update_offsets(capsys, tmp_path: Path) -> None: