From 2b7619f05119d84ec4687d9d3763d446aa3764c1 Mon Sep 17 00:00:00 2001 From: Birgit Edel <67849440+biredel@users.noreply.github.com> Date: Fri, 29 Mar 2024 20:18:36 +0000 Subject: [PATCH 1/3] BUG: Ambiguous translated references (1/2) id(obj) can and will hit the same value for different objects, if they do not exist at the same time. In CPython, its just a memory address. When that collission occurs, PdfWriter when asked to insert a page from one PdfReader object may silently confuse it with one from a since-deleted PdfReader object, thus corrupting output in not-easily-noticed ways. Replacing dict[id(obj)] with dict[weakref(obj)] should suffice, as we do not replace entries - we only ever insert or delete. --- pypdf/_protocols.py | 3 ++- pypdf/_writer.py | 16 +++++++++------- pypdf/generic/_base.py | 20 ++++++++++---------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 9f413660b..8a316cf03 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -3,6 +3,7 @@ from abc import abstractmethod from pathlib import Path from typing import IO, Any, Dict, List, Optional, Tuple, Union +from weakref import WeakKeyDictionary try: # Python 3.8+: https://peps.python.org/pep-0586 @@ -78,7 +79,7 @@ def trailer(self) -> Dict[str, Any]: class PdfWriterProtocol(PdfCommonDocProtocol, Protocol): _objects: List[Any] - _id_translated: Dict[int, Dict[int, int]] + _id_translated: WeakKeyDictionary["PdfReaderProtocol", Dict[int, int]] @abstractmethod def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index c7569e31e..cac51fa6a 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -52,7 +52,9 @@ Union, cast, ) +from weakref import WeakKeyDictionary +from ._protocols import PdfReaderProtocol from ._cmap import build_char_map_from_dict from ._doc_common import PdfDocCommon from ._encryption import EncryptAlgorithm, Encryption @@ -178,7 +180,7 @@ def __init__( self._idnum_hash: Dict[bytes, IndirectObject] = {} """Maps hash values of indirect objects to their IndirectObject instances.""" - self._id_translated: Dict[int, Dict[int, int]] = {} + self._id_translated: WeakKeyDictionary[PdfReaderProtocol, dict[int, int]] = WeakKeyDictionary() # The root of our page tree node. pages = DictionaryObject() @@ -389,7 +391,7 @@ def _add_page( # page, we need to create a new dictionary for the page, however the # objects below (including content) are not duplicated: try: # delete an already existing page - del self._id_translated[id(page_org.indirect_reference.pdf)][ # type: ignore + del self._id_translated[page_org.indirect_reference.pdf][ # type: ignore page_org.indirect_reference.idnum # type: ignore ] except Exception: @@ -2451,7 +2453,7 @@ def merge( ArrayObject, cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"], ) - trslat = self._id_translated[id(reader)] + trslat = self._id_translated[reader] try: for f in reader.root_object["/AcroForm"]["/Fields"]: # type: ignore try: @@ -2564,7 +2566,7 @@ def add_filtered_articles( else: thr = thr.get_object() if thr.indirect_reference.idnum not in self._id_translated[ - id(reader) + reader ] and fltr.search((thr["/I"] if "/I" in thr else {}).get("/Title", "")): self._add_articles_thread(thr, pages, reader) @@ -2780,15 +2782,15 @@ def reset_translation( if set to None or omitted, all tables will be reset. """ if reader is None: - self._id_translated = {} + self._id_translated = WeakKeyDictionary() elif isinstance(reader, PdfReader): try: - del self._id_translated[id(reader)] + del self._id_translated[reader] except Exception: pass elif isinstance(reader, IndirectObject): try: - del self._id_translated[id(reader.pdf)] + del self._id_translated[reader.pdf] except Exception: pass else: diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 3fde174b0..80f8a041c 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -124,19 +124,19 @@ def _reference_clone( return clone i = len(pdf_dest._objects) + 1 if ind is not None: - if id(ind.pdf) not in pdf_dest._id_translated: - pdf_dest._id_translated[id(ind.pdf)] = {} - pdf_dest._id_translated[id(ind.pdf)]["PreventGC"] = ind.pdf # type: ignore + if ind.pdf not in pdf_dest._id_translated: + pdf_dest._id_translated[ind.pdf] = {} + pdf_dest._id_translated[ind.pdf]["PreventGC"] = ind.pdf # type: ignore if ( not force_duplicate - and ind.idnum in pdf_dest._id_translated[id(ind.pdf)] + and ind.idnum in pdf_dest._id_translated[ind.pdf] ): obj = pdf_dest.get_object( - pdf_dest._id_translated[id(ind.pdf)][ind.idnum] + pdf_dest._id_translated[ind.pdf][ind.idnum] ) assert obj is not None return obj - pdf_dest._id_translated[id(ind.pdf)][ind.idnum] = i + pdf_dest._id_translated[ind.pdf][ind.idnum] = i pdf_dest._objects.append(clone) clone.indirect_reference = IndirectObject(i, 0, pdf_dest) return clone @@ -250,11 +250,11 @@ def clone( if self.pdf == pdf_dest and not force_duplicate: # Already duplicated and no extra duplication required return self - if id(self.pdf) not in pdf_dest._id_translated: - pdf_dest._id_translated[id(self.pdf)] = {} + if self.pdf not in pdf_dest._id_translated: + pdf_dest._id_translated[self.pdf] = {} - if self.idnum in pdf_dest._id_translated[id(self.pdf)]: - dup = pdf_dest.get_object(pdf_dest._id_translated[id(self.pdf)][self.idnum]) + if self.idnum in pdf_dest._id_translated[self.pdf]: + dup = pdf_dest.get_object(pdf_dest._id_translated[self.pdf][self.idnum]) if force_duplicate: assert dup is not None assert dup.indirect_reference is not None From 14846104cca76bdf714a9862d2899b81109b69d9 Mon Sep 17 00:00:00 2001 From: Birgit Edel <67849440+biredel@users.noreply.github.com> Date: Sat, 30 Mar 2024 11:29:29 +0000 Subject: [PATCH 2/3] Python 3.7 (EoL) syntax --- pypdf/_protocols.py | 2 +- pypdf/_writer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py index 8a316cf03..c1a5b927d 100644 --- a/pypdf/_protocols.py +++ b/pypdf/_protocols.py @@ -79,7 +79,7 @@ def trailer(self) -> Dict[str, Any]: class PdfWriterProtocol(PdfCommonDocProtocol, Protocol): _objects: List[Any] - _id_translated: WeakKeyDictionary["PdfReaderProtocol", Dict[int, int]] + _id_translated: "WeakKeyDictionary[PdfReaderProtocol, Dict[int, int]]" @abstractmethod def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]: diff --git a/pypdf/_writer.py b/pypdf/_writer.py index cac51fa6a..6a2659865 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -180,7 +180,7 @@ def __init__( self._idnum_hash: Dict[bytes, IndirectObject] = {} """Maps hash values of indirect objects to their IndirectObject instances.""" - self._id_translated: WeakKeyDictionary[PdfReaderProtocol, dict[int, int]] = WeakKeyDictionary() + self._id_translated: "WeakKeyDictionary[PdfReaderProtocol, dict[int, int]]" = WeakKeyDictionary() # The root of our page tree node. pages = DictionaryObject() From 03e8b81e24b8b0a61f2d13503a4ae313c1678ab7 Mon Sep 17 00:00:00 2001 From: Birgit Edel <67849440+biredel@users.noreply.github.com> Date: Sat, 30 Mar 2024 11:35:08 +0000 Subject: [PATCH 3/3] Revert "ROB: Cope with garbage collector during cloning (#1841)" This reverts commit 818855335da85f98d87326c054ac744efc00a0ec. --- pypdf/generic/_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 80f8a041c..52de94643 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -126,7 +126,6 @@ def _reference_clone( if ind is not None: if ind.pdf not in pdf_dest._id_translated: pdf_dest._id_translated[ind.pdf] = {} - pdf_dest._id_translated[ind.pdf]["PreventGC"] = ind.pdf # type: ignore if ( not force_duplicate and ind.idnum in pdf_dest._id_translated[ind.pdf]