Skip to content

Commit

Permalink
Merge branch 'main' into pre-commit-updates
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan6419846 authored Oct 27, 2024
2 parents 968681e + 9f647e6 commit 794c7ee
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 4 deletions.
36 changes: 36 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,41 @@
# CHANGELOG

## Version 5.1.0, 2024-10-27

### New Features (ENH)
- Add `layout_mode_font_height_weight` argument to `PageObject.extract_text()` (#2920)

### Bug Fixes (BUG)
- Fix font specifier for FreeText annotation (#2893)
- Line breaks are not generated due to incorrect calculation of text leading (#2890)
- Improve handling of spaces in text extraction (#2882)

### Robustness (ROB)
- Soft failure for flate encode image mode 1 with wrong LUT size (#2900)

### Documentation (DOC)
- Use latest package versions (#2907)
- Correct example of reading FileAttachment annotation (#2906)

### Developer Experience (DEV)
- Update pinned requirements (#2918)
- Make make_release.py compatible with Windows environment (#2894)

### Maintenance (MAINT)
- Remove references to outdated Python versions (#2919)
- Generalize the method of obtaining space_code (#2891)
- Unnecessary character mapping process (#2888)
- New LZW decoding implementation (#2887)

### Testing (TST)
- Add LzwCodec for encoding (#2883)

### Code Style (STY)
- Capitalize error messages (#2903)
- Modify error messages in PdfWriter (#2902)

[Full Changelog](https://github.com/py-pdf/pypdf/compare/5.0.1...5.1.0)

## Version 5.0.1, 2024-09-29

### New Features (ENH)
Expand Down
8 changes: 7 additions & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2210,6 +2210,7 @@ def _layout_mode_text(
scale_weight: float = 1.25,
strip_rotated: bool = True,
debug_path: Optional[Path] = None,
font_height_weight: float = 1,
) -> str:
"""
Get text preserving fidelity to source PDF text layout.
Expand All @@ -2229,6 +2230,8 @@ def _layout_mode_text(
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
Defaults to None.
font_height_weight: multiplier for font height when calculating
blank lines. Defaults to 1.
Returns:
str: multiline string containing page text in a fixed width format that
Expand Down Expand Up @@ -2260,7 +2263,7 @@ def _layout_mode_text(

char_width = _layout_mode.fixed_char_width(bt_groups, scale_weight)

return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically)
return _layout_mode.fixed_width_page(ty_groups, char_width, space_vertically, font_height_weight)

def extract_text(
self,
Expand Down Expand Up @@ -2335,6 +2338,8 @@ def extract_text(
- tjs.json: individual text render ops with corresponding transform matrices
- bts.json: text render ops left justified and grouped by BT/ET operators
- bt_groups.json: BT/ET operations grouped by rendered y-coord (aka lines)
layout_mode_font_height_weight (float): multiplier for font height when calculating
blank lines. Defaults to 1.
Returns:
The extracted text
Expand All @@ -2358,6 +2363,7 @@ def extract_text(
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
strip_rotated=kwargs.get("layout_mode_strip_rotated", True),
debug_path=kwargs.get("layout_mode_debug_path"),
font_height_weight=kwargs.get("layout_mode_font_height_weight", 1)
)
if len(args) >= 1:
if isinstance(args[0], str):
Expand Down
5 changes: 3 additions & 2 deletions pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> fl


def fixed_width_page(
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool, font_height_weight: float
) -> str:
"""
Generate page text from text operations grouped by rendered y coordinate.
Expand All @@ -352,6 +352,7 @@ def fixed_width_page(
ty_groups: dict of text show ops as returned by y_coordinate_groups()
char_width: fixed character width
space_vertically: include blank lines inferred from y distance + font height.
font_height_weight: multiplier for font height when calculating blank lines.
Returns:
str: page text in a fixed width format that closely adheres to the rendered
Expand All @@ -363,7 +364,7 @@ def fixed_width_page(
for y_coord, line_data in ty_groups.items():
if space_vertically and lines:
blank_lines = (
int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1
int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1
)
lines.extend([""] * blank_lines)
line = ""
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "5.0.1"
__version__ = "5.1.0"
19 changes: 19 additions & 0 deletions resources/crazyones_layout_vertical_space.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
The Crazy Ones
October 14, 1998

Heres to the crazy ones. The misfits. The rebels. The troublemakers.
The round pegs in the square holes.
The ones who see things differently. Theyre not fond of rules. And
they have no respect for the status quo. You can quote them,
disagree with them, glorify or vilify them.
About the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.
Maybe they have to be crazy.
How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never been written? Or gaze at
a red planet and see a laboratory on wheels?
We make tools for these kinds of people.
While some see them as the crazy ones, we see genius. Because the
people who are crazy enough to think they can change the world,
are the ones who do.
25 changes: 25 additions & 0 deletions resources/crazyones_layout_vertical_space_font_height_weight.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
The Crazy Ones
October 14, 1998

Heres to the crazy ones. The misfits. The rebels. The troublemakers.
The round pegs in the square holes.

The ones who see things differently. Theyre not fond of rules. And
they have no respect for the status quo. You can quote them,
disagree with them, glorify or vilify them.

About the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.

Maybe they have to be crazy.

How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never been written? Or gaze at
a red planet and see a laboratory on wheels?

We make tools for these kinds of people.

While some see them as the crazy ones, we see genius. Because the
people who are crazy enough to think they can change the world,
are the ones who do.
41 changes: 41 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,44 @@ def test_text_leading_height_unit():
page = reader.pages[0]
extracted = page.extract_text()
assert "Something[cited]\n" in extracted


def test_layout_mode_space_vertically_font_height_weight():
    """
    Check layout mode with vertical spacing and font height weight (issue #2915).

    The first page of ``crazyones.pdf`` is extracted twice with
    ``layout_mode_space_vertically=True``: once with default settings and once
    with ``layout_mode_font_height_weight=0.85``. Each result is compared to
    its reference text file in the resources directory.
    """
    cases = (
        # Normal behaviour
        ("crazyones_layout_vertical_space.txt", {}),
        # Blank lines are added to truly separate paragraphs
        (
            "crazyones_layout_vertical_space_font_height_weight.txt",
            {"layout_mode_font_height_weight": 0.85},
        ),
    )

    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as pdf_stream:
        # Load PDF file from file
        first_page = PdfReader(pdf_stream).pages[0]

        for fixture_name, extra_options in cases:
            with open(RESOURCE_ROOT / fixture_name, "rb") as fixture:
                reference = fixture.read()

            extracted = first_page.extract_text(
                extraction_mode="layout",
                layout_mode_space_vertically=True,
                **extra_options,
            ).encode("utf-8")

            # Compare the text of the PDF to a known source, line by line
            for extracted_line, reference_line in zip(
                extracted.splitlines(), reference.splitlines()
            ):
                assert extracted_line == reference_line

            reference = reference.replace(b"\r\n", b"\n")  # fix for windows
            assert extracted == reference, (
                "PDF extracted text differs from expected value.\n\n"
                "Expected:\n\n%r\n\nExtracted:\n\n%r\n\n" % (reference, extracted)
            )

0 comments on commit 794c7ee

Please sign in to comment.