Skip to content

Commit

Permalink
ENH: Add full parameter to PdfWriter constructor (#2865)
Browse files Browse the repository at this point in the history
Allow to load huge files. Closes #2839.
  • Loading branch information
pubpub-zz authored Sep 25, 2024
1 parent 635a7c1 commit dcd15aa
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 7 deletions.
17 changes: 14 additions & 3 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,17 +156,22 @@ class PdfWriter(PdfDocCommon):
incremental: If true, loads the document and set the PdfWriter in incremental mode.
When writing incrementally, the original document is written first and new/modified
content is appended. To be used for signed document/forms to keep signature valid.
full: If true, loads all the objects (always full if incremental = True).
This parameter may allow loading very big PDFs.
"""

def __init__(
self,
fileobj: Union[None, PdfReader, StrByteType, Path] = "",
clone_from: Union[None, PdfReader, StrByteType, Path] = None,
incremental: bool = False,
full: bool = False,
) -> None:
self.incremental = incremental
self.incremental = incremental or full
"""
Returns if the PdfWriter object has been started in incremental mode.
"""
Expand Down Expand Up @@ -203,7 +208,7 @@ def __init__(
fileobj = BytesIO(f.read(-1))
if isinstance(fileobj, BytesIO):
fileobj = PdfReader(fileobj)
else:
if not isinstance(fileobj, PdfReader):
raise PyPdfError("Invalid type for incremental mode")
self._reader = fileobj # prev content is in _reader.stream
self._header = fileobj.pdf_header.encode()
Expand Down Expand Up @@ -273,6 +278,8 @@ def _get_clone_from(
}
)
self._add_object(self._root_object)
if full and not incremental:
self.incremental = False
if isinstance(self._ID, list):
if isinstance(self._ID[0], TextStringObject):
self._ID[0] = ByteStringObject(self._ID[0].get_original_bytes())
Expand Down Expand Up @@ -1177,11 +1184,15 @@ def clone_reader_document_root(self, reader: PdfReader) -> None:
Args:
reader: PdfReader from which the document root should be copied.
"""
self._info_obj = None
if self.incremental:
self._objects = [None] * cast(int, reader.trailer["/Size"])
for i in range(len(self._objects) - 1):
o = reader.get_object(i + 1)
if o is not None:
self._objects[i] = o.replicate(self)
else:
self._objects.clear()
self._info_obj = None
self._root_object = reader.root_object.clone(self)
self._pages = self._root_object.raw_get("/Pages")

Expand Down
22 changes: 22 additions & 0 deletions pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,22 @@ def hash_value(self) -> bytes:
)
).encode()

def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "PdfObject":
    """
    Copy this object into ``pdf_dest`` without ensuring links.

    The generic implementation simply delegates to :meth:`clone`.
    It is used when a writer copies a reader's objects in incremental mode.

    Args:
        pdf_dest: Target to copy into (PdfWriterProtocol, the interface
            implemented by PdfWriter).

    Returns:
        The cloned PdfObject.
    """
    replica = self.clone(pdf_dest)
    return replica

def clone(
self,
pdf_dest: PdfWriterProtocol,
Expand Down Expand Up @@ -298,6 +314,12 @@ def hash_bin(self) -> int:
"""
return hash((self.__class__, self.idnum, self.generation, id(self.pdf)))

def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "PdfObject":
    """
    Build an equivalent reference bound to ``pdf_dest``.

    Args:
        pdf_dest: Target the new reference is attached to.

    Returns:
        A new IndirectObject carrying this reference's ``idnum`` and
        ``generation``, with ``pdf_dest`` as its owner.
    """
    rebound = IndirectObject(self.idnum, self.generation, pdf_dest)
    return rebound

def clone(
self,
pdf_dest: PdfWriterProtocol,
Expand Down
86 changes: 85 additions & 1 deletion pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,21 @@


class ArrayObject(List[Any], PdfObject):
def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "ArrayObject":
    """
    Copy this array into ``pdf_dest``, element by element.

    Elements exposing a ``replicate`` method are replicated into
    ``pdf_dest``; any other element is appended unchanged.

    Args:
        pdf_dest: Target to copy into.

    Returns:
        The new ArrayObject.
    """
    replica = cast(
        "ArrayObject",
        self._reference_clone(ArrayObject(), pdf_dest, False),
    )
    for item in self:
        replica.append(
            item.replicate(pdf_dest) if hasattr(item, "replicate") else item
        )
    return replica

def clone(
self,
pdf_dest: PdfWriterProtocol,
Expand Down Expand Up @@ -248,6 +263,20 @@ def read_from_stream(


class DictionaryObject(Dict[Any, Any], PdfObject):
def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "DictionaryObject":
    """
    Copy this dictionary into ``pdf_dest``, entry by entry.

    Every key is replicated; a value is replicated when it exposes a
    ``replicate`` method and stored unchanged otherwise. The new object is
    created from ``self.__class__`` so subclasses keep their type.

    Args:
        pdf_dest: Target to copy into.

    Returns:
        The new dictionary object.
    """
    replica = cast(
        "DictionaryObject",
        self._reference_clone(self.__class__(), pdf_dest, False),
    )
    for key, value in self.items():
        # The value is handled before the key, as in a plain
        # ``target[key_expr] = value_expr`` assignment.
        if hasattr(value, "replicate"):
            new_value = value.replicate(pdf_dest)
        else:
            new_value = value
        replica[key.replicate(pdf_dest)] = new_value
    return replica

def clone(
self,
pdf_dest: PdfWriterProtocol,
Expand Down Expand Up @@ -864,6 +893,31 @@ def __init__(self) -> None:
self._data: bytes = b""
self.decoded_self: Optional[DecodedStreamObject] = None

def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "StreamObject":
    """
    Copy this stream (raw data, decoded form and dictionary entries)
    into ``pdf_dest``.

    The source stream is left untouched: the replicated decoded form is
    stored on the new object, not written back onto ``self``.

    Args:
        pdf_dest: Target to copy into.

    Returns:
        The new stream object, created from ``self.__class__``.
    """
    d__ = cast(
        "StreamObject",
        self._reference_clone(self.__class__(), pdf_dest, False),
    )
    d__._data = self._data
    try:
        decoded_self = self.decoded_self
        if decoded_self is None:
            d__.decoded_self = None
        else:
            d__.decoded_self = cast(
                "DecodedStreamObject", decoded_self.replicate(pdf_dest)
            )
    except Exception:
        # Deliberately best effort (e.g. ``decoded_self`` may have been
        # deleted from the source): the replica is then left without a
        # decoded form instead of aborting the whole replication.
        d__.decoded_self = None
    for k, v in self.items():
        d__[k.replicate(pdf_dest)] = (
            v.replicate(pdf_dest) if hasattr(v, "replicate") else v
        )
    return d__

def _clone(
self,
src: DictionaryObject,
Expand Down Expand Up @@ -1105,7 +1159,37 @@ def __init__(
stream_data = stream.get_data()
assert stream_data is not None
super().set_data(stream_data)
self.forced_encoding = forced_encoding
self.forced_encoding = forced_encoding

def replicate(
    self,
    pdf_dest: PdfWriterProtocol,
) -> "ContentStream":
    """
    Copy this content stream into ``pdf_dest``.

    Besides the raw data, decoded form and dictionary entries, the replica
    receives a copy of the operations list, the forced encoding and
    ``pdf_dest`` as its ``pdf``. The source stream is left untouched.

    Args:
        pdf_dest: Target to copy into.

    Returns:
        The new content stream, created from ``self.__class__(None, None)``.
    """
    d__ = cast(
        "ContentStream",
        self._reference_clone(self.__class__(None, None), pdf_dest, False),
    )
    d__._data = self._data
    try:
        decoded_self = self.decoded_self
        if decoded_self is None:
            d__.decoded_self = None
        else:
            d__.decoded_self = cast(
                "DecodedStreamObject", decoded_self.replicate(pdf_dest)
            )
    except Exception:
        # Deliberately best effort (e.g. ``decoded_self`` may have been
        # deleted from the source): the replica is then left without a
        # decoded form instead of aborting the whole replication.
        d__.decoded_self = None
    for k, v in self.items():
        d__[k.replicate(pdf_dest)] = (
            v.replicate(pdf_dest) if hasattr(v, "replicate") else v
        )
    # Propagate the ContentStream-specific state. ``_data`` is already
    # assigned directly above, so no ``set_data`` call is needed here.
    d__.pdf = pdf_dest
    d__._operations = list(self._operations)  # shallow copy, not shared
    d__.forced_encoding = self.forced_encoding
    return d__

def clone(
self,
Expand Down
40 changes: 40 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1155,3 +1155,43 @@ def test_is_null_or_none():
writer = PdfWriter(reader)
writer.pages[0]["/Contents"].append(writer._add_object(NullObject()))
assert is_null_or_none(writer.pages[0]["/Contents"][-1])


def test_coverage_arrayobject():
    """Exercise the ``replicate`` and ``clone`` code paths of ArrayObject."""
    writer = PdfWriter()

    # An array holding a plain int keeps an int in the copy, with and
    # without an indirect reference attached to the source array.
    plain = ArrayObject([1])
    assert isinstance(plain.replicate(writer)[0], int)
    assert isinstance(plain.clone(writer)[0], int)
    plain.indirect_reference = IndirectObject(1, 0, writer)
    assert isinstance(plain.clone(writer)[0], int)

    # An array holding an object read from a file clones to a reference.
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    first_content = reader.pages[0]["/Contents"][0].get_object()
    cloned = ArrayObject([first_content]).clone(writer)
    assert isinstance(cloned[0], IndirectObject)
    for index, item in cloned.items():
        assert isinstance(index, int)
        assert isinstance(item, PdfObject)


def test_coverage_streamobject():
    """Exercise ``replicate``/``clone`` on StreamObject and ContentStream."""
    writer = PdfWriter()

    # Both copy flavours are run on a stream whose ``decoded_self``
    # attribute has been removed.
    stream = StreamObject()
    del stream.decoded_self
    stream.replicate(writer)
    stream.clone(writer)

    empty_cs = ContentStream(None, None)
    empty_cs.replicate(writer)
    empty_cs.clone(writer, False, None)
    empty_cs.indirect_reference = IndirectObject(1, 0, writer)
    assert empty_cs == empty_cs.clone(writer)

    # Dictionary entries must survive replicate() whether or not a decoded
    # form is attached to the content stream.
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    for make_decoded in (lambda: None, DecodedStreamObject):
        contents = reader.pages[0].get_contents()
        contents[NameObject("/testkey")] = NameObject("/test")
        contents.decoded_self = make_decoded()
        assert "/testkey" in contents.replicate(writer)
5 changes: 2 additions & 3 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1140,8 +1140,7 @@ def test_set_page_label(pdf_file_path):

# Tests full length with labels assigned at first and last elements
# Tests different labels assigned to consecutive ranges
writer = PdfWriter()
writer.clone_document_from_reader(reader)
writer = PdfWriter(reader, full=True)
writer.set_page_label(0, 1, "/r")
writer.set_page_label(4, 5, "/A")
writer.set_page_label(10, 10, "/A")
Expand Down Expand Up @@ -2428,7 +2427,7 @@ def test_increment_writer(caplog):
)
assert "/ForTestOnly" in reader.get_object(5)
with pytest.raises(PyPdfError):
writer = PdfWriter(reader, incremental=True)
writer = PdfWriter(1, incremental=True)
b.seek(0)
writer = PdfWriter(b, incremental=True)
assert writer.list_objects_in_increment() == [] # no flowdown of properties
Expand Down

0 comments on commit dcd15aa

Please sign in to comment.