From bff4b8e02565f6eff0896a005337eb6d100d9773 Mon Sep 17 00:00:00 2001
From: Harsh <harshdayal13@gmail.com>
Date: Fri, 15 Nov 2024 19:38:46 +0530
Subject: [PATCH] Implement uncompress functionality for PDF files (#75)

Co-authored-by: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
---
 README.md                    |  1 +
 pdfly/cli.py                 | 25 +++++++++++++++++
 pdfly/uncompress.py          | 52 ++++++++++++++++++++++++++++++++++++
 tests/conftest.py            |  9 ++++---
 tests/test_extract_images.py |  2 --
 tests/test_uncompress.py     | 40 +++++++++++++++++++++++++++
 tests/test_update_offsets.py |  4 +--
 7 files changed, 125 insertions(+), 8 deletions(-)
 create mode 100644 pdfly/uncompress.py
 create mode 100644 tests/test_uncompress.py

diff --git a/README.md b/README.md
index 0667e65..a749d60 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ $ pdfly --help
 │ 2-up             Create a booklet-style PDF from a single input.            │
 │ cat              Concatenate pages from PDF files into a single PDF file.   │
 │ compress         Compress a PDF.                                            │
+│ uncompress       Uncompresses a PDF.                                        │
 │ extract-images   Extract images from PDF without resampling or altering.    │
 │ extract-text     Extract text from a PDF file.                              │
 │ meta             Show metadata of a PDF file                                │
diff --git a/pdfly/cli.py b/pdfly/cli.py
index d736d33..9c5fa31 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -16,6 +16,7 @@
 import pdfly.metadata
 import pdfly.pagemeta
 import pdfly.rm
+import pdfly.uncompress
 import pdfly.up2
 import pdfly.update_offsets
 import pdfly.x2pdf
@@ -205,6 +206,30 @@ def compress(
     pdfly.compress.main(pdf, output)
 
 
+@entry_point.command(name="uncompress", help=pdfly.uncompress.__doc__)  # type: ignore[misc]
+def uncompress(
+    pdf: Annotated[  # input PDF: must be an existing, readable file
+        Path,
+        typer.Argument(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            writable=False,
+            readable=True,
+            resolve_path=True,
+        ),
+    ],
+    output: Annotated[  # destination path: not required to exist beforehand
+        Path,
+        typer.Argument(
+            exists=False,
+            writable=True,
+        ),
+    ],
+) -> None:
+    pdfly.uncompress.main(pdf, output)  # CLI help is that module's docstring
+
+
 @entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__)  # type: ignore[misc]
 def update_offsets(
     file_in: Annotated[
diff --git a/pdfly/uncompress.py b/pdfly/uncompress.py
new file mode 100644
index 0000000..a543473
--- /dev/null
+++ b/pdfly/uncompress.py
@@ -0,0 +1,52 @@
+"""Module for uncompressing PDF content streams."""
+
+import zlib
+from pathlib import Path
+from typing import Optional
+
+from pypdf import PdfReader, PdfWriter
+from pypdf.generic import IndirectObject, PdfObject
+
+
+def main(pdf: Path, output: Path) -> None:
+    """Rewrite `pdf` into `output` with its page content streams inflated."""
+    reader = PdfReader(pdf)
+    writer = PdfWriter()
+    for page in reader.pages:
+        # Keep the raw /Contents entry beside the object it resolves to: a lone
+        # stream used to be resolved *before* the IndirectObject check, so it
+        # was never handed to decompress_content_stream().
+        # NOTE(review): assumes .get() returns the unresolved entry - confirm.
+        contents: Optional[PdfObject] = page.get("/Contents")
+        target = contents
+        if isinstance(target, IndirectObject):
+            target = target.get_object()
+        refs = target if isinstance(target, list) else [contents]
+        for content in refs:
+            if isinstance(content, IndirectObject):
+                decompress_content_stream(content)
+        writer.add_page(page)
+
+    with open(output, "wb") as fp:
+        writer.write(fp)
+    # Report the size of the input file and of the file just written.
+    orig_size = pdf.stat().st_size
+    uncomp_size = output.stat().st_size
+    print(f"Original Size  : {orig_size:,}")
+    print(
+        f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)"
+    )
+
+
+def decompress_content_stream(content: IndirectObject) -> None:
+    """Inflate `content` in place when its /Filter is exactly /FlateDecode."""
+    if content.get("/Filter") != "/FlateDecode":
+        return
+    try:  # NOTE(review): assumes get_data() returns still-deflated bytes
+        inflated = zlib.decompress(content.get_data())
+        content.set_data(inflated)
+        del content["/Filter"]
+    except zlib.error as error:  # reported on stdout, not re-raised
+        print(
+            f"Some content stream with /FlateDecode failed to be decompressed: {error}"
+        )
diff --git a/tests/conftest.py b/tests/conftest.py
index 9ab40d4..181d60c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,8 +3,9 @@
 import os
 from pathlib import Path
 
-from fpdf import FPDF
 import pytest
+from fpdf import FPDF
+
 from pdfly.cli import entry_point
 
 try:
@@ -35,7 +36,7 @@ def run_cli(args):
         return error.code
 
 
-@pytest.fixture
+@pytest.fixture()
 def two_pages_pdf_filepath(tmp_path):
     "A PDF with 2 pages, and a different image on each page"
     # Note: prior to v2.7.9, fpdf2 produced incorrect /Resources dicts for each page (cf. fpdf2 PR #1133),
@@ -50,7 +51,7 @@ def two_pages_pdf_filepath(tmp_path):
     return pdf_filepath
 
 
-@pytest.fixture
+@pytest.fixture()
 def pdf_file_100(tmp_path):
     """A PDF with 100 pages; each has only the page index on it."""
     pdf = FPDF()
@@ -65,7 +66,7 @@ def pdf_file_100(tmp_path):
     return pdf_filepath
 
 
-@pytest.fixture
+@pytest.fixture()
 def pdf_file_abc(tmp_path):
     """A PDF with 100 pages; each has only the page index on it."""
     pdf = FPDF()
diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py
index 3a3025e..4a0df2b 100644
--- a/tests/test_extract_images.py
+++ b/tests/test_extract_images.py
@@ -1,5 +1,3 @@
-import pytest
-
 from .conftest import RESOURCES_ROOT, chdir, run_cli
 
 
diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py
new file mode 100644
index 0000000..dc19723
--- /dev/null
+++ b/tests/test_uncompress.py
@@ -0,0 +1,40 @@
+"""Tests for the `uncompress` command."""
+
+from pathlib import Path
+
+import pytest
+from pypdf import PdfReader
+from typer.testing import CliRunner
+
+from pdfly.cli import entry_point
+
+runner = CliRunner()
+
+
+@pytest.mark.parametrize(
+    "input_pdf_filepath",
+    # Anchored on this file rather than the CWD, which would silently yield
+    # an empty parameter set when pytest runs from another directory.
+    sorted((Path(__file__).parents[1] / "sample-files").glob("*.pdf")),
+)
+def test_uncompress_all_sample_files(
+    input_pdf_filepath: Path, tmp_path: Path
+) -> None:
+    """Uncompress each sample PDF; no page /Contents may keep a /Filter."""
+    output_pdf_filepath = tmp_path / "uncompressed_output.pdf"
+    result = runner.invoke(
+        entry_point,
+        ["uncompress", str(input_pdf_filepath), str(output_pdf_filepath)],
+    )
+    assert (
+        result.exit_code == 0
+    ), f"Error in uncompressing {input_pdf_filepath}: {result.output}"
+    assert (
+        output_pdf_filepath.exists()
+    ), f"Output PDF {output_pdf_filepath} does not exist."
+    for page in PdfReader(str(output_pdf_filepath)).pages:
+        contents = page.get("/Contents")
+        if contents:
+            assert (
+                "/Filter" not in contents
+            ), "Content stream is still compressed"
diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index bd5d506..c239577 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -4,12 +4,12 @@
 Here should only be end-to-end tests.
 """
 
+import re
 from pathlib import Path
 
 import pytest
-import re
 
-from .conftest import RESOURCES_ROOT, chdir, run_cli
+from .conftest import RESOURCES_ROOT, run_cli
 
 
 def test_update_offsets(capsys, tmp_path: Path) -> None: