Skip to content

Commit

Permalink
Pleasing mypy & typing imports under Python 3.8
Browse files Browse the repository at this point in the history
  • Loading branch information
Cimon Lucas (LCM) committed Nov 7, 2024
1 parent c3a6c88 commit fc42eb4
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 30 deletions.
4 changes: 2 additions & 2 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,10 @@ def update_offsets(
encoding: str = typer.Option(
"ISO-8859-1",
help="Encoding used to read and write the files, e.g. UTF-8.",
), # noqa
),
verbose: bool = typer.Option(
False, help="Show progress while processing."
), # noqa
),
) -> None:
pdfly.update_offsets.main(file_in, file_out, encoding, verbose)

Expand Down
57 changes: 34 additions & 23 deletions pdfly/update_offsets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python
"""
Updates offsets and lengths in a simple PDF file.
Expand All @@ -20,15 +19,21 @@
It expects that the /Length-entries have default values containing
enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
EXAMPLE
Example:
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
"""

from collections.abc import Iterable
from pathlib import Path
from rich.console import Console
import re
import sys
from pathlib import Path

if sys.version_info >= (3, 9):
List = list
else: # Support for Python 3.8
from typing import List

from rich.console import Console

# Here, only simple regular expressions are used.
# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
Expand All @@ -41,20 +46,20 @@


def update_lines(
lines_in: Iterable[str], encoding: str, console: Console, verbose: bool
) -> Iterable[str]:
"""Iterates over the lines of a pdf-files and updates offsets.
lines_in: List[str], encoding: str, console: Console, verbose: bool
) -> List[str]:
"""
Iterates over the lines of a PDF file and updates offsets.
The input is expected to be a pdf without binary-sections.
:param lines_in: An Iterable over the lines including line-breaks.
:param lines_in: A list of the lines, including line breaks.
:param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
:param console: Console used to print messages.
:param verbose: True to activate logging of info-messages.
:return: The list of lines to be written
in the given encoding.
"""

lines_out = [] # lines to be written
map_line_offset = {} # map from line-number to offset
map_obj_offset = {} # map from object-number to offset
Expand Down Expand Up @@ -184,22 +189,27 @@ def update_lines(

for curr_obj, stream_len in map_stream_len.items():
if curr_obj in map_obj_length_line:
m_length = RE_LENGTH.match(map_obj_length_line[curr_obj])
line = map_obj_length_line[curr_obj]
m_length = RE_LENGTH.match(line)
if m_length is None:
raise RuntimeError(
f"Invalid PDF file: line '{line}' does not contain a valid /Length."
)
prev_length = m_length.group(2)
len_digits = len(prev_length)
len_format = "%%0%dd" % len_digits
updated_length = len_format % stream_len
if len(updated_length) > len_digits:
raise RuntimeError(
f"Not enough digits in /Length-entry {prev_length}"
+ f" of object {curr_obj}:"
+ f" too short to take /Length {updated_length}"
f" of object {curr_obj}:"
f" too short to take /Length {updated_length}"
)
line = m_length.group(1) + updated_length + m_length.group(3)
lines_out[map_obj_length_line_no[curr_obj] - 1] = line
elif curr_obj in map_obj_length_ref:
len_obj = map_obj_length_ref[curr_obj]
if not len_obj in map_obj_line:
if len_obj not in map_obj_line:
raise RuntimeError(
f"obj {curr_obj} has unknown length-obj {len_obj}"
)
Expand All @@ -211,8 +221,8 @@ def update_lines(
if len(updated_length) > len_digits:
raise RuntimeError(
f"Not enough digits in /Length-ref-entry {prev_length}"
+ f" of object {curr_obj} and len-object {len_obj}:"
+ f" too short to take /Length {updated_length}"
f" of object {curr_obj} and len-object {len_obj}:"
f" too short to take /Length {updated_length}"
)
if prev_length != updated_length:
if verbose:
Expand All @@ -223,22 +233,23 @@ def update_lines(
else:
raise RuntimeError(
f"obj {curr_obj} with stream-len {stream_len}"
+ f" has no object-length-line: {map_obj_length_line}"
f" has no object-length-line: {map_obj_length_line}"
)

return lines_out


def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
"""Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
def read_binary_file(file_path: Path, encoding: str) -> List[str]:
"""
Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
Encoding utf-8 can't be used to read random binary data.
:param file_path: file to be read line by line
:param encoding: encoding to be used (e.g. "iso-8859-1")
:return lines including line-breaks
"""
chunks = []
with open(file_path, "rb") as file:
chunks: List[str] = []
with file_path.open("rb") as file:
buffer = bytearray()
while True:
chunk = file.read(4096) # Read in chunks of 4096 bytes
Expand All @@ -253,7 +264,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
if not match:
break # No more line breaks found, process the remaining buffer

start, end = match.start(), match.end()
end = match.end()
chunk_str = buffer[:end].decode(encoding, errors="strict")
buffer = buffer[end:]

Expand All @@ -277,4 +288,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
for line in lines_out:
f.write(line.encode(encoding))

console.print(f"Wrote {file_out}")
console.print(f"Wrote {file_out}", soft_wrap=True)
6 changes: 3 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Utilities and fixtures that are available automatically for all tests."""

import io, os
import os
from pathlib import Path

from fpdf import FPDF
Expand Down Expand Up @@ -58,7 +58,7 @@ def pdf_file_100(tmp_path):
for i in range(100):
pdf.add_page()
pdf.set_font("helvetica", size=12)
pdf.cell(200, 10, txt=f"{i}", ln=True, align="C")
pdf.cell(200, 10, text=f"{i}", ln=True, align="C")

pdf_filepath = tmp_path / "pdf_file_100.pdf"
pdf.output(pdf_filepath)
Expand All @@ -73,7 +73,7 @@ def pdf_file_abc(tmp_path):
for char in [chr(i) for i in range(ord("a"), ord("z") + 1)]:
pdf.add_page()
pdf.set_font("helvetica", size=12)
pdf.cell(200, 10, txt=f"{char}", ln=True, align="C")
pdf.cell(200, 10, text=f"{char}", ln=True, align="C")

pdf_filepath = tmp_path / "abc.pdf"
pdf.output(pdf_filepath)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_update_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
assert not captured.err
assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out)
assert output.exists()
with open(file_expected, "r", encoding="iso-8859-1") as file_exp:
with open(file_expected, encoding="iso-8859-1") as file_exp:
lines_exp = file_exp.readlines()
with open(output, "r", encoding="iso-8859-1") as file_act:
with open(output, encoding="iso-8859-1") as file_act:
lines_act = file_act.readlines()
assert len(lines_exp) == len(
lines_act
Expand Down

0 comments on commit fc42eb4

Please sign in to comment.