BUG: Improve handling of spaces in text extraction (#2882)

Closes #1153.
py-pdf · Oct 3, 2024 · d5233a0
1 parent 8e1799e
commit d5233a0
Show file tree

Hide file tree

Showing 4 changed files with 187 additions and 141 deletions.
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -75,28 +75,28 @@ def build_char_map_from_dict(
         for x in int_entry:
             if x <= 255:
                 encoding[x] = chr(x)
-    try:
-        # override space_width with new params
-        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
-    except Exception:
-        pass
     # I consider the space_code is available on one byte
     if isinstance(space_code, str):
         try:  # one byte
             sp = space_code.encode("charmap")[0]
         except Exception:
             sp = space_code.encode("utf-16-be")
             sp = sp[0] + 256 * sp[1]
+        try:
+            sp = ord(map_dict[chr(sp)])
+        except KeyError:
+            pass
     else:
         sp = space_code
-    sp_width = compute_space_width(ft, sp, space_width)
+    font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0)
+    half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0
 
     return (
         font_type,
-        float(sp_width / 2),
+        half_space_width,
         encoding,
         # https://github.com/python/mypy/issues/4374
-        map_dict,
+        map_dict
     )
 
 
@@ -402,78 +402,123 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) ->
         lst = lst[2:]
 
 
-def compute_space_width(
-    ft: DictionaryObject, space_code: int, space_width: float
-) -> float:
-    sp_width: float = space_width * 2.0  # default value
-    w = []
-    w1 = {}
+def build_font_width_map(
+    ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float
+) -> Dict[Any, float]:
+    font_width_map: Dict[Any, float] = {}
     st: int = 0
+    en: int = 0
+    if ft is None:
+        font_width_map["default"] = default_font_width
+        return font_width_map
+    try:
+        default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0
+    except Exception:
+        pass
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
+        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
+        # Widths for a CIDFont are defined using the DW and W entries.
+        # DW2 and W2 are for vertical use. Vertical type is not implemented.
         ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
         try:
-            w1[-1] = cast(float, ft1["/DW"])
+            font_width_map["default"] = cast(float, ft1["/DW"])
         except Exception:
-            w1[-1] = 1000.0
+            font_width_map["default"] = default_font_width
         if "/W" in ft1:
-            w = list(ft1["/W"])
+            w = ft1["/W"].get_object()
         else:
             w = []
         while len(w) > 0:
             st = w[0] if isinstance(w[0], int) else w[0].get_object()
             second = w[1].get_object()
             if isinstance(second, int):
-                for x in range(st, second):
-                    w1[x] = w[2]
+                # C_first C_last same_W
+                en = second
+                for c_code in range(st, en + 1):
+                    try:
+                        conversion_char = map_dict[chr(c_code)]
+                        font_width_map[conversion_char] = w[2]
+                    except KeyError:
+                        pass
                 w = w[3:]
             elif isinstance(second, list):
-                for y in second:
-                    w1[st] = y
-                    st += 1
+                # Starting_C [W1 W2 ... Wn]
+                c_code = st
+                for width in second:
+                    try:
+                        conversion_char = map_dict[chr(c_code)]
+                        font_width_map[conversion_char] = width
+                    except KeyError:
+                        pass
+                    c_code += 1
                 w = w[2:]
             else:
                 logger_warning(
                     "unknown widths : \n" + (ft1["/W"]).__repr__(),
                     __name__,
                 )
                 break
-        try:
-            sp_width = w1[space_code]
-        except Exception:
-            sp_width = (
-                w1[-1] / 2.0
-            )  # if using default we consider space will be only half size
     elif "/Widths" in ft:
-        w = list(ft["/Widths"])  # type: ignore
-        try:
-            st = cast(int, ft["/FirstChar"])
-            en: int = cast(int, ft["/LastChar"])
-            if st > space_code or en < space_code:
-                raise Exception("Not in range")
-            if w[space_code - st].get_object() == 0:
-                raise Exception("null width")
-            sp_width = w[space_code - st].get_object()
-        except Exception:
-            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
-                DictionaryObject, ft["/FontDescriptor"]
-            ):
-                sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
-            else:
-                # will consider width of char as avg(width)/2
-                m = 0
-                cpt = 0
-                for xx in w:
-                    xx = xx.get_object()
-                    if xx > 0:
-                        m += xx
-                        cpt += 1
-                sp_width = m / max(1, cpt) / 2
-
-    if is_null_or_none(sp_width):
-        sp_width = 0.0
+        w = ft["/Widths"].get_object()
+        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
+            DictionaryObject, ft["/FontDescriptor"]
+        ):
+            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
+        else:
+            # will consider width of char as avg(width)
+            m = 0
+            cpt = 0
+            for xx in w:
+                xx = xx.get_object()
+                if xx > 0:
+                    m += xx
+                    cpt += 1
+            font_width_map["default"] = m / max(1, cpt)
+        st = cast(int, ft["/FirstChar"])
+        en = cast(int, ft["/LastChar"])
+        for c_code in range(st, en + 1):
+            try:
+                width = w[c_code - st].get_object()
+                font_width_map[chr(c_code)] = width
+            except (IndexError, KeyError):
+                # The PDF structure is invalid. The array is too small
+                # for the specified font width.
+                pass
+    if is_null_or_none(font_width_map.get("default")):
+        font_width_map["default"] = default_font_width if default_font_width else 0.0
+    return font_width_map
+
+
+def compute_space_width(
+    font_width_map: Dict[Any, float], space_char: str
+) -> float:
+    try:
+        sp_width = font_width_map[space_char]
+        if sp_width == 0:
+            raise ValueError("Zero width")
+    except (KeyError, ValueError):
+        sp_width = (
+            font_width_map["default"] / 2.0
+        )  # if using default we consider space will be only half size
+
     return sp_width
 
 
+def compute_font_width(
+    font_width_map: Dict[Any, float],
+    char: str
+) -> float:
+    char_width: float = 0.0
+    try:
+        char_width = font_width_map[char]
+    except KeyError:
+        char_width = (
+            font_width_map["default"]
+        )
+
+    return char_width
+
+
 def type1_alternative(
     ft: DictionaryObject,
     map_dict: Dict[Any, Any],

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -49,7 +49,7 @@
     overload,
 )
 
-from ._cmap import build_char_map, unknown_char_map
+from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
     OrientationNotFoundError,
@@ -496,6 +496,7 @@ def __init__(
         if not is_null_or_none(indirect_reference):
             assert indirect_reference is not None, "mypy"
             self.update(cast(DictionaryObject, indirect_reference.get_object()))
+        self._font_width_maps: Dict[str, Dict[str, float]] = {}
 
     def hash_bin(self) -> int:
         """
@@ -1716,6 +1717,25 @@ def _debug_for_extract(self) -> str:  # pragma: no cover
             out += "No Font\n"
         return out
 
+    def _get_acutual_font_widths(
+        self,
+        cmap: Tuple[
+            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+        ],
+        add_text: str,
+        font_size: float,
+        default_space_width: float
+    ) -> Tuple[float, float, float]:
+        font_widths: float = 0
+        font_name: str = cmap[2]
+        if font_name not in self._font_width_maps:
+            self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2)
+        font_width_map: Dict[Any, float] = self._font_width_maps[font_name]
+        if add_text:
+            for char in add_text:
+                font_widths += compute_font_width(font_width_map, char)
+        return (font_widths * font_size, default_space_width * font_size, font_size)
+
     def _extract_text(
         self,
         obj: Any,
@@ -1793,19 +1813,25 @@ def _extract_text(
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
+        _actual_str_size: Dict[str, float] = {
+            "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}  # will be set to string length calculation result
         TL = 0.0
         font_size = 12.0  # init just in case of
 
         def current_spacewidth() -> float:
             return _space_width / 1000.0
 
+        def current_strwidths() -> float:
+            return _actual_str_size["str_widths"] / 1000.0
+
         def process_operation(operator: bytes, operands: List[Any]) -> None:
             nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
-            nonlocal orientations, rtl_dir, visitor_text, output, text
+            nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
             check_crlf_space: bool = False
+            str_widths: float = 0.0
             # Table 5.4 page 405
             if operator == b"BT":
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1919,6 +1945,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 ty = float(operands[1])
                 tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
                 tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
+                str_widths = current_strwidths()
+                _actual_str_size["str_widths"] = 0.0
             elif operator == b"Tm":
                 check_crlf_space = True
                 tm_matrix = [
@@ -1929,13 +1957,14 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     float(operands[4]),
                     float(operands[5]),
                 ]
+                str_widths = current_strwidths()
+                _actual_str_size["str_widths"] = 0.0
             elif operator == b"T*":
                 check_crlf_space = True
                 tm_matrix[5] -= TL
-
             elif operator == b"Tj":
                 check_crlf_space = True
-                text, rtl_dir = handle_tj(
+                text, rtl_dir, add_text = handle_tj(
                     text,
                     operands,
                     cm_matrix,
@@ -1947,6 +1976,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     rtl_dir,
                     visitor_text,
                 )
+                current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = (
+                    self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth()))
+                _actual_str_size["str_widths"] += current_font_widths
             else:
                 return None
             if check_crlf_space:
@@ -1961,7 +1993,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                         output,
                         font_size,
                         visitor_text,
-                        current_spacewidth(),
+                        str_widths,
+                        _actual_str_size["space_width"],
+                        _actual_str_size["str_height"]
                     )
                     if text == "":
                         memo_cm = cm_matrix.copy()
@@ -2042,7 +2076,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                     text = ""
                     memo_cm = cm_matrix.copy()
                     memo_tm = tm_matrix.copy()
-
             else:
                 process_operation(operator, operands)
             if visitor_operand_after is not None: