Skip to content

Commit

Permalink
ENH: Added command update-offsets to adjust offsets and lengths. (#15)
Browse files Browse the repository at this point in the history
Co-authored-by: Lucas Cimon <[email protected]>
Co-authored-by: Cimon Lucas (LCM) <[email protected]>
  • Loading branch information
3 people authored Nov 7, 2024
1 parent 010d5a4 commit da75816
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 5 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ upload:
clean:
python setup.py clean --all
pyclean .
rm -rf Tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
rm -rf tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt

test:
pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30

mutation-test:
mutmut run
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ $ pdfly --help
│ meta Show metadata of a PDF file │
│ pagemeta Give details about a single page. │
│ rm Remove pages from PDF files. │
│ update-offsets Updates offsets and lengths in a simple PDF file. │
│ x2pdf Convert one or more files to PDF. Each file is a page. │
╰─────────────────────────────────────────────────────────────────────────────╯
```
Expand Down
16 changes: 16 additions & 0 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pdfly.pagemeta
import pdfly.rm
import pdfly.up2
import pdfly.update_offsets
import pdfly.x2pdf


Expand Down Expand Up @@ -228,6 +229,21 @@ def compress(
pdfly.compress.main(pdf, output)


@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__)  # type: ignore[misc]
def update_offsets(
    file_in: Path,
    file_out: Path,
    encoding: str = typer.Option(
        "ISO-8859-1",
        help="Encoding used to read and write the files, e.g. UTF-8.",
    ),
    verbose: bool = typer.Option(
        False, help="Show progress while processing."
    ),
) -> None:
    # The user-facing help text comes from the module docstring passed to
    # the decorator above; this command only forwards the parsed arguments.
    pdfly.update_offsets.main(
        file_in=file_in,
        file_out=file_out,
        encoding=encoding,
        verbose=verbose,
    )


@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__) # type: ignore[misc]
def x2pdf(
x: List[Path],
Expand Down
291 changes: 291 additions & 0 deletions pdfly/update_offsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
"""
Updates offsets and lengths in a simple PDF file.
The PDF specification requires that the xref section at the end
of a PDF file has the correct offsets of the PDF's objects.
It further requires that the dictionary of a stream object
contains a /Length-entry giving the length of the encoded stream.
When editing a PDF file using a text-editor (e.g. vim) it is
elaborate to compute or adjust these offsets and lengths.
This command tries to compute /Length-entries of the stream dictionaries
and the offsets in the xref-section automatically.
It expects that the PDF file has ASCII encoding only. It may
use ISO-8859-1 or UTF-8 in its comments.
The current implementation incorrectly replaces CR (0x0d) by LF (0x0a) in binary data.
It expects that there is one xref-section only.
It expects that the /Length-entries have default values containing
enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
Example:
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
"""

import re
import sys
from pathlib import Path

if sys.version_info >= (3, 9):
List = list
else: # Support for Python 3.8
from typing import List

from rich.console import Console

# Here, only simple regular expressions are used.
# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL)
RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
RE_LENGTH = re.compile(
    r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL
)
# Entry of a classic xref table: 10-digit offset, 5-digit generation, n or f.
RE_XREF_ENTRY = re.compile(r"^[0-9]{10} [0-9]{5} [nf]")
# Number of characters of an xref entry in front of its two-byte line end.
XREF_ENTRY_PREFIX_LEN = 18


def update_lines(
    lines_in: List[str], encoding: str, console: "Console", verbose: bool
) -> List[str]:
    """
    Iterates over the lines of a pdf-file and updates offsets and lengths.
    The input is expected to be a pdf without binary-sections.
    The xref-section is expected to be a single subsection starting at
    object 0, i.e. the entry of object N is found N + 2 lines after "xref".
    :param lines_in: A list over the lines including line-breaks.
    :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
    :param console: Console used to print messages (used only if verbose).
    :param verbose: True to activate logging of info-messages.
    :return The output is a list of lines to be written
        in the given encoding.
    :raises RuntimeError: if the file structure is not understood
        (no objects, no xref, no startxref, generation != 0,
        endstream without stream, missing or too short /Length).
    :raises NotImplementedError: if startxref is found without a
        preceding xref-section.
    """
    lines_out: List[str] = []  # lines to be written
    map_obj_offset = {}  # map from object-number to offset
    map_obj_line = {}  # map from object-number to line-number (1-based)
    offset_out = 0  # current offset in output-file
    line_xref = None  # line-number of xref-line (in xref-section only)
    line_startxref = None  # line-number of startxref-line
    curr_obj = None  # number of current object
    len_stream = None  # length of stream (None until "stream" is seen)
    offset_xref = None  # offset of xref-section
    map_stream_len = {}  # map from object-number to /Length of stream
    map_obj_length_line = {}  # map from object-number to /Length-line
    map_obj_length_ref = {}  # map from object-number to /Length-reference
    map_obj_length_line_no = {}  # map from object-number to line_no of
    # the /Length-line

    for idx, line in enumerate(lines_in):
        line_no = idx + 1  # line-numbers are 1-based
        # RE_CONTENT can match the empty string, so the match never fails;
        # content is the line without its line-break.
        m_content = RE_CONTENT.match(line)
        content = m_content.group(1) if m_content is not None else ""

        m_obj = RE_OBJ.match(line)
        if m_obj is not None:
            curr_obj = m_obj.group(1)
            curr_gen = m_obj.group(2)
            if verbose:
                console.print(f"line {line_no}: object {curr_obj}")
            if curr_gen != "0":
                raise RuntimeError(
                    f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported."
                )
            map_obj_offset[curr_obj] = int(offset_out)
            map_obj_line[curr_obj] = line_no
            len_stream = None

        if content == "xref":
            offset_xref = offset_out
            line_xref = line_no
        elif content == "startxref":
            line_startxref = line_no
            line_xref = None
        elif content == "stream":
            if verbose:
                console.print(f"line {line_no}: start stream")
            len_stream = 0
        elif content == "endstream":
            if verbose:
                console.print(f"line {line_no}: end stream")
            if curr_obj is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without object-start."
                )
            if len_stream is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without stream."
                )
            if len_stream > 0:
                # Ignore the last EOL in front of "endstream".
                if lines_in[idx - 1].endswith("\r\n"):
                    len_stream -= 2
                else:
                    len_stream -= 1
            if verbose:
                console.print(
                    f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}"
                )
            map_stream_len[curr_obj] = len_stream
        elif content == "endobj":
            curr_obj = None
        elif curr_obj is not None and len_stream is None:
            # Inside an object but in front of its stream: look for /Length.
            m_length_ref = RE_LENGTH_REF.match(line)
            if m_length_ref is not None:
                len_obj = m_length_ref.group(2)
                len_obj_gen = m_length_ref.group(3)
                if verbose:
                    console.print(
                        f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}"
                    )
                map_obj_length_ref[curr_obj] = len_obj
            else:
                m_length = RE_LENGTH.match(line)
                if m_length is not None:
                    if verbose:
                        console.print(f"line {line_no}, /Length: {content}")
                    map_obj_length_line[curr_obj] = line
                    map_obj_length_line_no[curr_obj] = line_no
        elif curr_obj is not None and len_stream is not None:
            # Inside a stream: count the encoded bytes of the line.
            len_stream += len(line.encode(encoding))
        elif line_xref is not None and line_no > line_xref + 2:
            obj_no = line_no - line_xref - 2
            # Rewrite only lines which look like xref entries of known
            # objects. The object number must not be compared with the
            # number of objects: object numbers may be sparse.
            if str(obj_no) in map_obj_offset and RE_XREF_ENTRY.match(line):
                eol = line[XREF_ENTRY_PREFIX_LEN:]
                xref_upd = f"{map_obj_offset[str(obj_no)]:010d} 00000 n"
                if verbose:
                    console.print(f"{content} -> {xref_upd}")
                line = xref_upd + eol
        elif line_startxref is not None and line_no == line_startxref + 1:
            if offset_xref is None:
                raise NotImplementedError(
                    "Unsupported file: startxref without preceding xref-section (probable cross-reference stream)"
                )
            # Keep the line-break of the input to avoid mixed line endings.
            eol = line[len(content):] or "\n"
            line = f"{offset_xref}{eol}"
        lines_out.append(line)

        offset_out += len(line.encode(encoding))

    # Some checks
    if not map_obj_offset:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find any PDF objects."
        )
    if offset_xref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a xref-section"
        )
    if line_startxref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a startxref-section"
        )

    # The lengths are patched in place without changing the size of a
    # line: the offsets computed above have to stay valid.
    for curr_obj, stream_len in map_stream_len.items():
        if curr_obj in map_obj_length_line:
            line = map_obj_length_line[curr_obj]
            m_length = RE_LENGTH.match(line)
            if m_length is None:
                raise RuntimeError(
                    f"Invalid PDF file: line '{line}' does not contain a valid /Length."
                )
            prev_length = m_length.group(2)
            len_digits = len(prev_length)
            updated_length = f"{stream_len:0{len_digits}d}"
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-entry {prev_length}"
                    f" of object {curr_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            line = m_length.group(1) + updated_length + m_length.group(3)
            lines_out[map_obj_length_line_no[curr_obj] - 1] = line
        elif curr_obj in map_obj_length_ref:
            len_obj = map_obj_length_ref[curr_obj]
            if len_obj not in map_obj_line:
                raise RuntimeError(
                    f"obj {curr_obj} has unknown length-obj {len_obj}"
                )
            # The 1-based line-number of the "N 0 obj" line is the 0-based
            # index of the following line, expected to hold the length.
            len_obj_line = map_obj_line[len_obj]
            if len_obj_line >= len(lines_out):
                raise RuntimeError(
                    f"Invalid PDF file: length-obj {len_obj} of obj {curr_obj} has no content line."
                )
            # Only the last character is treated as line-break. The line is
            # rewritten with the same number of characters ending with LF.
            prev_length = lines_out[len_obj_line][:-1]
            len_digits = len(prev_length)
            updated_length = f"{stream_len:0{len_digits}d}"
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-ref-entry {prev_length}"
                    f" of object {curr_obj} and len-object {len_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            if prev_length != updated_length:
                if verbose:
                    console.print(
                        f"line {len_obj_line + 1}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}"
                    )
                lines_out[len_obj_line] = updated_length + "\n"
        else:
            raise RuntimeError(
                f"obj {curr_obj} with stream-len {stream_len}"
                f" has no object-length-line: {map_obj_length_line}"
            )

    return lines_out


def read_binary_file(file_path: Path, encoding: str) -> List[str]:
    """
    Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
    Lines end at LF, CR or CRLF; the line-break stays part of the line.
    Encoding utf-8 can't be used to read random binary data.
    :param file_path: file to be read line by line
    :param encoding: encoding to be used (e.g. "iso-8859-1")
    :return lines including line-breaks
    :raises UnicodeDecodeError: if a line can't be decoded in the given encoding
    """
    eol_pattern = re.compile(b"\r\n|\n|\r")
    lines: List[str] = []
    pending = b""  # bytes read but not yet terminated by a line-break
    with file_path.open("rb") as file:
        while True:
            chunk = file.read(4096)  # Read in chunks of 4096 bytes
            if not chunk:
                break  # End of file

            data = pending + chunk
            start = 0
            for match in eol_pattern.finditer(data):
                end = match.end()
                if end == len(data) and match.group() == b"\r":
                    # A CR at the very end of the buffer may be the first
                    # half of a CRLF whose LF arrives with the next chunk.
                    break
                lines.append(data[start:end].decode(encoding, errors="strict"))
                start = end
            pending = data[start:]

    # Handle the last line (without line-break or ending with a lone CR)
    if pending:
        lines.append(pending.decode(encoding, errors="strict"))

    return lines


def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
    """
    Reads file_in, updates its offsets and lengths and writes file_out.
    :param file_in: PDF file to be read
    :param file_out: PDF file to be written
    :param encoding: encoding used to decode and encode the lines
    :param verbose: True to print info-messages while processing
    """
    console = Console()
    console.print(f"Read {file_in}")

    updated = update_lines(
        read_binary_file(file_in, encoding), encoding, console, verbose
    )

    with open(file_out, "wb") as target:
        target.writelines(text.encode(encoding) for text in updated)

    console.print(f"Wrote {file_out}", soft_wrap=True)
Binary file added resources/file-with-fixed-offsets.pdf
Binary file not shown.
Binary file added resources/file-with-invalid-offsets.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Utilities and fixtures that are available automatically for all tests."""

import io, os
import os
from pathlib import Path

from fpdf import FPDF
Expand Down Expand Up @@ -58,7 +58,7 @@ def pdf_file_100(tmp_path):
for i in range(100):
pdf.add_page()
pdf.set_font("helvetica", size=12)
pdf.cell(200, 10, txt=f"{i}", ln=True, align="C")
pdf.cell(200, 10, text=f"{i}", ln=True, align="C")

pdf_filepath = tmp_path / "pdf_file_100.pdf"
pdf.output(pdf_filepath)
Expand All @@ -73,7 +73,7 @@ def pdf_file_abc(tmp_path):
for char in [chr(i) for i in range(ord("a"), ord("z") + 1)]:
pdf.add_page()
pdf.set_font("helvetica", size=12)
pdf.cell(200, 10, txt=f"{char}", ln=True, align="C")
pdf.cell(200, 10, text=f"{char}", ln=True, align="C")

pdf_filepath = tmp_path / "abc.pdf"
pdf.output(pdf_filepath)
Expand Down
Loading

0 comments on commit da75816

Please sign in to comment.