diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 4cc112552..fde795b01 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -75,11 +75,6 @@ def build_char_map_from_dict( for x in int_entry: if x <= 255: encoding[x] = chr(x) - try: - # override space_width with new params - space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] - except Exception: - pass # I consider the space_code is available on one byte if isinstance(space_code, str): try: # one byte @@ -87,16 +82,21 @@ def build_char_map_from_dict( except Exception: sp = space_code.encode("utf-16-be") sp = sp[0] + 256 * sp[1] + try: + sp = ord(map_dict[chr(sp)]) + except KeyError: + pass else: sp = space_code - sp_width = compute_space_width(ft, sp, space_width) + font_width_map = build_font_width_map(ft, map_dict, space_width * 2.0) + half_space_width = compute_space_width(font_width_map, chr(sp)) / 2.0 return ( font_type, - float(sp_width / 2), + half_space_width, encoding, # https://github.com/python/mypy/issues/4374 - map_dict, + map_dict ) @@ -402,34 +402,55 @@ def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> lst = lst[2:] -def compute_space_width( - ft: DictionaryObject, space_code: int, space_width: float -) -> float: - sp_width: float = space_width * 2.0 # default value - w = [] - w1 = {} +def build_font_width_map( + ft: Union[DictionaryObject, None], map_dict: Dict[Any, Any], default_font_width: float +) -> Dict[Any, float]: + font_width_map: Dict[Any, float] = {} st: int = 0 + en: int = 0 + if ft is None: + font_width_map["default"] = default_font_width + return font_width_map + try: + default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])] * 2.0 + except Exception: + pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): + # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts") + # Widths for a CIDFont are defined using the DW and W entries. + # DW2 and W2 are for vertical use. 
Vertical type is not implemented. ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore try: - w1[-1] = cast(float, ft1["/DW"]) + font_width_map["default"] = cast(float, ft1["/DW"]) except Exception: - w1[-1] = 1000.0 + font_width_map["default"] = default_font_width if "/W" in ft1: - w = list(ft1["/W"]) + w = ft1["/W"].get_object() else: w = [] while len(w) > 0: st = w[0] if isinstance(w[0], int) else w[0].get_object() second = w[1].get_object() if isinstance(second, int): - for x in range(st, second): - w1[x] = w[2] + # C_first C_last same_W + en = second + for c_code in range(st, en + 1): + try: + conversion_char = map_dict[chr(c_code)] + font_width_map[conversion_char] = w[2] + except KeyError: + pass w = w[3:] elif isinstance(second, list): - for y in second: - w1[st] = y - st += 1 + # Starting_C [W1 W2 ... Wn] + c_code = st + for width in second: + try: + conversion_char = map_dict[chr(c_code)] + font_width_map[conversion_char] = width + except KeyError: + pass + c_code += 1 w = w[2:] else: logger_warning( @@ -437,43 +458,67 @@ def compute_space_width( __name__, ) break - try: - sp_width = w1[space_code] - except Exception: - sp_width = ( - w1[-1] / 2.0 - ) # if using default we consider space will be only half size elif "/Widths" in ft: - w = list(ft["/Widths"]) # type: ignore - try: - st = cast(int, ft["/FirstChar"]) - en: int = cast(int, ft["/LastChar"]) - if st > space_code or en < space_code: - raise Exception("Not in range") - if w[space_code - st].get_object() == 0: - raise Exception("null width") - sp_width = w[space_code - st].get_object() - except Exception: - if "/FontDescriptor" in ft and "/MissingWidth" in cast( - DictionaryObject, ft["/FontDescriptor"] - ): - sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore - else: - # will consider width of char as avg(width)/2 - m = 0 - cpt = 0 - for xx in w: - xx = xx.get_object() - if xx > 0: - m += xx - cpt += 1 - sp_width = m / max(1, cpt) / 2 - - if 
is_null_or_none(sp_width): - sp_width = 0.0 + w = ft["/Widths"].get_object() + if "/FontDescriptor" in ft and "/MissingWidth" in cast( + DictionaryObject, ft["/FontDescriptor"] + ): + font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore + else: + # will consider width of char as avg(width) + m = 0 + cpt = 0 + for xx in w: + xx = xx.get_object() + if xx > 0: + m += xx + cpt += 1 + font_width_map["default"] = m / max(1, cpt) + st = cast(int, ft["/FirstChar"]) + en = cast(int, ft["/LastChar"]) + for c_code in range(st, en + 1): + try: + width = w[c_code - st].get_object() + font_width_map[chr(c_code)] = width + except (IndexError, KeyError): + # The PDF structure is invalid. The array is too small + # for the specified font width. + pass + if is_null_or_none(font_width_map.get("default")): + font_width_map["default"] = default_font_width if default_font_width else 0.0 + return font_width_map + + +def compute_space_width( + font_width_map: Dict[Any, float], space_char: str +) -> float: + try: + sp_width = font_width_map[space_char] + if sp_width == 0: + raise ValueError("Zero width") + except (KeyError, ValueError): + sp_width = ( + font_width_map["default"] / 2.0 + ) # if using default we consider space will be only half size + return sp_width +def compute_font_width( + font_width_map: Dict[Any, float], + char: str +) -> float: + char_width: float = 0.0 + try: + char_width = font_width_map[char] + except KeyError: + char_width = ( + font_width_map["default"] + ) + + return char_width + + def type1_alternative( ft: DictionaryObject, map_dict: Dict[Any, Any], diff --git a/pypdf/_page.py b/pypdf/_page.py index 87b914ce2..c49a68c33 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -49,7 +49,7 @@ overload, ) -from ._cmap import build_char_map, unknown_char_map +from ._cmap import build_char_map, build_font_width_map, compute_font_width, unknown_char_map from ._protocols import PdfCommonDocProtocol from ._text_extraction import ( 
OrientationNotFoundError, @@ -496,6 +496,7 @@ def __init__( if not is_null_or_none(indirect_reference): assert indirect_reference is not None, "mypy" self.update(cast(DictionaryObject, indirect_reference.get_object())) + self._font_width_maps: Dict[str, Dict[str, float]] = {} def hash_bin(self) -> int: """ @@ -1716,6 +1717,25 @@ def _debug_for_extract(self) -> str: # pragma: no cover out += "No Font\n" return out + def _get_acutual_font_widths( + self, + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + add_text: str, + font_size: float, + default_space_width: float + ) -> Tuple[float, float, float]: + font_widths: float = 0 + font_name: str = cmap[2] + if font_name not in self._font_width_maps: + self._font_width_maps[font_name] = build_font_width_map(cmap[3], cmap[1], default_space_width * 2) + font_width_map: Dict[Any, float] = self._font_width_maps[font_name] + if add_text: + for char in add_text: + font_widths += compute_font_width(font_width_map, char) + return (font_widths * font_size, default_space_width * font_size, font_size) + def _extract_text( self, obj: Any, @@ -1793,19 +1813,25 @@ def _extract_text( char_scale = 1.0 space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf + _actual_str_size: Dict[str, float] = { + "str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set to string length calculation result TL = 0.0 font_size = 12.0 # init just in case of def current_spacewidth() -> float: return _space_width / 1000.0 + def current_strwidths() -> float: + return _actual_str_size["str_widths"] / 1000.0 + def process_operation(operator: bytes, operands: List[Any]) -> None: nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text, output, text + nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size global 
CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS check_crlf_space: bool = False + str_widths: float = 0.0 # Table 5.4 page 405 if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] @@ -1919,6 +1945,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: ty = float(operands[1]) tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] + str_widths = current_strwidths() + _actual_str_size["str_widths"] = 0.0 elif operator == b"Tm": check_crlf_space = True tm_matrix = [ @@ -1929,13 +1957,14 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: float(operands[4]), float(operands[5]), ] + str_widths = current_strwidths() + _actual_str_size["str_widths"] = 0.0 elif operator == b"T*": check_crlf_space = True tm_matrix[5] -= TL - elif operator == b"Tj": check_crlf_space = True - text, rtl_dir = handle_tj( + text, rtl_dir, add_text = handle_tj( text, operands, cm_matrix, @@ -1947,6 +1976,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: rtl_dir, visitor_text, ) + current_font_widths, _actual_str_size["space_width"], _actual_str_size["str_height"] = ( + self._get_acutual_font_widths(cmap, add_text, font_size, current_spacewidth())) + _actual_str_size["str_widths"] += current_font_widths else: return None if check_crlf_space: @@ -1961,7 +1993,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: output, font_size, visitor_text, - current_spacewidth(), + str_widths, + _actual_str_size["space_width"], + _actual_str_size["str_height"] ) if text == "": memo_cm = cm_matrix.copy() @@ -2042,7 +2076,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() - else: process_operation(operator, operands) if visitor_operand_after is not None: diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 3b1d687ea..a1c0d1d91 100644 --- 
a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -98,7 +98,9 @@ def crlf_space_check( output: str, font_size: float, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], + str_widths: float, spacewidth: float, + str_height: float, ) -> Tuple[str, str, List[float], List[float]]: cm_prev = cmtm_prev[0] tm_prev = cmtm_prev[1] @@ -112,88 +114,38 @@ def crlf_space_check( orientation = orient(m) delta_x = m[4] - m_prev[4] delta_y = m[5] - m_prev[5] - k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) - f = font_size * k + # Table 108 of the 1.7 reference ("Text positioning operators") + scale_prev_x = math.sqrt(tm_prev[0]**2 + tm_prev[1]**2) + scale_prev_y = math.sqrt(tm_prev[2]**2 + tm_prev[3]**2) + scale_y = math.sqrt(tm_matrix[2]**2 + tm_matrix[3]**2) cm_prev = m + if orientation not in orientations: raise OrientationNotFoundError + if orientation in (0, 180): + moved_height: float = delta_y + moved_width: float = delta_x + elif orientation in (90, 270): + moved_height = delta_x + moved_width = delta_y try: - if orientation == 0: - if delta_y < -0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif ( - abs(delta_y) < f * 0.3 - and abs(delta_x) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " - elif orientation == 180: - if delta_y > 0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif ( - abs(delta_y) < f * 0.3 - and abs(delta_x) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " - elif orientation == 90: - if delta_x > 0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, 
- ) - text = "" - elif ( - abs(delta_x) < f * 0.3 - and abs(delta_y) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " - elif orientation == 270: - if delta_x < -0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif ( - abs(delta_x) < f * 0.3 - and abs(delta_y) > spacewidth * f * 15 - and (output + text)[-1] != " " - ): - text += " " + if abs(moved_height) > 0.8 * min(str_height * scale_prev_y, font_size * scale_y): + if (output + text)[-1] != "\n": + output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) + text = "" + elif ( + (moved_width >= (spacewidth + str_widths) * scale_prev_x) + and (output + text)[-1] != " " + ): + text += " " except Exception: pass tm_prev = tm_matrix.copy() @@ -214,12 +166,14 @@ def handle_tj( font_size: float, rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], -) -> Tuple[str, bool]: +) -> Tuple[str, bool, str]: + add_text = "" m = mult(tm_matrix, cm_matrix) orientation = orient(m) if orientation in orientations and len(operands) > 0: if isinstance(operands[0], str): text += operands[0] + add_text = operands[0] else: t: str = "" tt: bytes = ( @@ -259,6 +213,7 @@ def handle_tj( or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... 
): text = x + text if rtl_dir else text + x + add_text = x if rtl_dir else add_text + x elif ( # right-to-left characters set 0x0590 <= xx <= 0x08FF or 0xFB1D <= xx <= 0xFDFF @@ -272,6 +227,7 @@ def handle_tj( visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = x + text + add_text = x + add_text else: # left-to-right # print(">",xx,x,end="") if rtl_dir: @@ -281,5 +237,6 @@ def handle_tj( visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" text = text + x + add_text += x # fmt: on - return text, rtl_dir + return text, rtl_dir, add_text diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 8bfa1809e..ff318f9fe 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -200,3 +200,14 @@ def test_space_with_one_unit_smaller_than_font_width(): page = reader.pages[0] extracted = page.extract_text() assert "Reporting crude oil leak.\n" in extracted + + +@pytest.mark.enable_socket() +def test_space_position_calculation(): + """Tests for #1153""" + url = "https://github.com/py-pdf/pypdf/files/9164743/file-0.pdf" + name = "iss1153.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[3] + extracted = page.extract_text() + assert "Shortly after the Geneva BOF session, the" in extracted