Skip to content

Commit

Permalink
BUG: Improve handling of spaces in text extraction (#2882)
Browse files Browse the repository at this point in the history
Closes #1153.
  • Loading branch information
ssjkamei authored Oct 3, 2024
1 parent 8e1799e commit d5233a0
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 141 deletions.
155 changes: 100 additions & 55 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,28 +75,28 @@ def build_char_map_from_dict(
for x in int_entry:
if x <= 255:
encoding[x] = chr(x)
try:
# override space_width with new params
space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
except Exception:
pass
# I consider the space_code is available on one byte
if isinstance(space_code, str):
try: # one byte
sp = space_code.encode("charmap")[0]
except Exception:
sp = space_code.encode("utf-16-be")
sp = sp[0] + 256 * sp[1]
try:
sp = ord(map_dict[chr(sp)])
except KeyError:
pass
else:
sp = space_code
sp_width = compute_space_width(ft, sp, space_width)
font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0

return (
font_type,
float(sp_width / 2),
half_space_width,
encoding,
# https://github.com/python/mypy/issues/4374
map_dict,
map_dict
)


Expand Down Expand Up @@ -402,78 +402,123 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
lst = lst[2:]


def compute_space_width(
ft: DictionaryObject, space_code: int, space_width: float
) -> float:
sp_width: float = space_width * 2.0 # default value
w = []
w1 = {}
def build_font_width_map(
ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float
) -> Dict[Any, float]:
font_width_map: Dict[Any, float] = {}
st: int = 0
en: int = 0
if ft is None:
font_width_map["default"] = default_font_width
return font_width_map
try:
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
except Exception:
pass
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
# Widths for a CIDFont are defined using the DW and W entries.
# DW2 and W2 are for vertical use. Vertical type is not implemented.
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
try:
w1[-1] = cast(float, ft1["/DW"])
font_width_map["default"] = cast(float, ft1["/DW"])
except Exception:
w1[-1] = 1000.0
font_width_map["default"] = default_font_width
if "/W" in ft1:
w = list(ft1["/W"])
w = ft1["/W"].get_object()
else:
w = []
while len(w) > 0:
st = w[0] if isinstance(w[0], int) else w[0].get_object()
second = w[1].get_object()
if isinstance(second, int):
for x in range(st, second):
w1[x] = w[2]
# C_first C_last same_W
en = second
for c_code in range(st, en + 1):
try:
conversion_char = map_dict[chr(c_code)]
font_width_map[conversion_char] = w[2]
except KeyError:
pass
w = w[3:]
elif isinstance(second, list):
for y in second:
w1[st] = y
st += 1
# Starting_C [W1 W2 ... Wn]
c_code = st
for width in second:
try:
conversion_char = map_dict[chr(c_code)]
font_width_map[conversion_char] = width
except KeyError:
pass
c_code += 1
w = w[2:]
else:
logger_warning(
"unknown widths : \n" + (ft1["/W"]).__repr__(),
__name__,
)
break
try:
sp_width = w1[space_code]
except Exception:
sp_width = (
w1[-1] / 2.0
) # if using default we consider space will be only half size
elif "/Widths" in ft:
w = list(ft["/Widths"]) # type: ignore
try:
st = cast(int, ft["/FirstChar"])
en: int = cast(int, ft["/LastChar"])
if st > space_code or en < space_code:
raise Exception("Not in range")
if w[space_code - st].get_object() == 0:
raise Exception("null width")
sp_width = w[space_code - st].get_object()
except Exception:
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
DictionaryObject, ft["/FontDescriptor"]
):
sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
else:
# will consider width of char as avg(width)/2
m = 0
cpt = 0
for xx in w:
xx = xx.get_object()
if xx > 0:
m += xx
cpt += 1
sp_width = m / max(1, cpt) / 2

if is_null_or_none(sp_width):
sp_width = 0.0
w = ft["/Widths"].get_object()
if "/FontDescriptor" in ft and "/MissingWidth" in cast(
DictionaryObject, ft["/FontDescriptor"]
):
font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
else:
# will consider width of char as avg(width)
m = 0
cpt = 0
for xx in w:
xx = xx.get_object()
if xx > 0:
m += xx
cpt += 1
font_width_map["default"] = m / max(1, cpt)
st = cast(int, ft["/FirstChar"])
en = cast(int, ft["/LastChar"])
for c_code in range(st, en + 1):
try:
width = w[c_code - st].get_object()
font_width_map[chr(c_code)] = width
except (IndexError, KeyError):
# The PDF structure is invalid. The array is too small
# for the specified font width.
pass
if is_null_or_none(font_width_map.get("default")):
font_width_map["default"] = default_font_width if default_font_width else 0.0
return font_width_map


def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width to use for the space character of a font.

    Args:
        font_width_map: Mapping from character to glyph width. The fallback
            width is read from the ``"default"`` key.
        space_char: The character that represents a space in this font.

    Returns:
        The width stored for ``space_char`` when it is present and non-zero,
        otherwise half of the ``"default"`` width.

    Raises:
        KeyError: If the fallback is needed but ``"default"`` is missing.
    """
    # A missing entry and a zero-width entry are handled the same way, so
    # fold both into a single check instead of raising and catching.
    sp_width = font_width_map.get(space_char, 0)
    if sp_width == 0:
        # if using default we consider space will be only half size
        return font_width_map["default"] / 2.0
    return sp_width


def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of a single character according to a font width map.

    Args:
        font_width_map: Mapping from character to glyph width. The fallback
            width is read from the ``"default"`` key.
        char: The character to look up.

    Returns:
        The width stored for ``char``, or the ``"default"`` width when the
        character has no entry of its own.

    Raises:
        KeyError: If ``char`` is missing and ``"default"`` is missing too.
    """
    try:
        return font_width_map[char]
    except KeyError:
        # Only touch "default" when it is really needed, so a map without
        # a default still works for characters it does contain.
        return font_width_map["default"]


def type1_alternative(
ft: DictionaryObject,
map_dict: Dict[Any, Any],
Expand Down
45 changes: 39 additions & 6 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
overload,
)

from ._cmap import build_char_map, unknown_char_map
from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map
from ._protocols import PdfCommonDocProtocol
from ._text_extraction import (
OrientationNotFoundError,
Expand Down Expand Up @@ -496,6 +496,7 @@ def __init__(
if not is_null_or_none(indirect_reference):
assert indirect_reference is not None, "mypy"
self.update(cast(DictionaryObject, indirect_reference.get_object()))
self._font_width_maps: Dict[str, Dict[str, float]] = {}

def hash_bin(self) -> int:
"""
Expand Down Expand Up @@ -1716,6 +1717,25 @@ def _debug_for_extract(self) -> str: # pragma: no cover
out += "No Font\n"
return out

def _get_acutual_font_widths(
    self,
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    add_text: str,
    font_size: float,
    default_space_width: float
) -> Tuple[float, float, float]:
    """
    Measure a piece of extracted text with the width map of its font.

    The width map is built once per font name (``cmap[2]``) through
    ``build_font_width_map`` and then kept in ``self._font_width_maps``
    for later calls.

    Args:
        cmap: Character-map tuple; index 1 is passed on as the character
            mapping, index 2 is used as the font name and index 3 as the
            font dictionary.
        add_text: The text whose character widths are summed up. An empty
            value contributes a width of zero.
        font_size: Factor applied to the summed width and to the space width.
        default_space_width: Space width to report; twice this value is
            handed to ``build_font_width_map`` as the default font width.

    Returns:
        A tuple ``(text width * font_size, default_space_width * font_size,
        font_size)``.
    """
    font_name: str = cmap[2]
    try:
        width_map: Dict[Any, float] = self._font_width_maps[font_name]
    except KeyError:
        # First time this font is seen: build its width map and cache it.
        width_map = build_font_width_map(cmap[3], cmap[1], default_space_width * 2)
        self._font_width_maps[font_name] = width_map
    total_width: float = sum(
        (compute_font_width(width_map, glyph) for glyph in add_text or ""), 0
    )
    return (total_width * font_size, default_space_width * font_size, font_size)

def _extract_text(
self,
obj: Any,
Expand Down Expand Up @@ -1793,19 +1813,25 @@ def _extract_text(
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
_actual_str_size: Dict[str, float] = {
"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set to string length calculation result
TL = 0.0
font_size = 12.0 # init just in case of

def current_spacewidth() -> float:
return _space_width / 1000.0

def current_strwidths() -> float:
return _actual_str_size["str_widths"] / 1000.0

def process_operation(operator: bytes, operands: List[Any]) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text, output, text
nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
str_widths: float = 0.0
# Table 5.4 page 405
if operator == b"BT":
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
Expand Down Expand Up @@ -1919,6 +1945,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
ty = float(operands[1])
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
str_widths = current_strwidths()
_actual_str_size["str_widths"] = 0.0
elif operator == b"Tm":
check_crlf_space = True
tm_matrix = [
Expand All @@ -1929,13 +1957,14 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
float(operands[4]),
float(operands[5]),
]
str_widths = current_strwidths()
_actual_str_size["str_widths"] = 0.0
elif operator == b"T*":
check_crlf_space = True
tm_matrix[5] -= TL

elif operator == b"Tj":
check_crlf_space = True
text, rtl_dir = handle_tj(
text, rtl_dir, add_text = handle_tj(
text,
operands,
cm_matrix,
Expand All @@ -1947,6 +1976,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
rtl_dir,
visitor_text,
)
current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = (
self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth()))
_actual_str_size["str_widths"] += current_font_widths
else:
return None
if check_crlf_space:
Expand All @@ -1961,7 +1993,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
output,
font_size,
visitor_text,
current_spacewidth(),
str_widths,
_actual_str_size["space_width"],
_actual_str_size["str_height"]
)
if text == "":
memo_cm = cm_matrix.copy()
Expand Down Expand Up @@ -2042,7 +2076,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
Expand Down
Loading

0 comments on commit d5233a0

Please sign in to comment.