Skip to content

Commit

Permalink
refactor: make factory mandatory
Browse files Browse the repository at this point in the history
Keeping the backward-compatible call signatures of `read` and `readall` led users to keep calling them the old way without understanding the changes.
The `factory` argument is now mandatory.
All users of `read` and `readall` must update their applications to benefit from the improvement.
It also updates a manual example.
  • Loading branch information
miurahr committed Oct 14, 2024
1 parent 73c03d6 commit 9d686af
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 40 deletions.
43 changes: 37 additions & 6 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,23 +235,54 @@ SevenZipFile Object
zip.extract(targets=targets, recursive=True)
.. py:method:: SevenZipFile.readall()
.. py:method:: SevenZipFile.readall(factory: WriterFactory)
Extract all members from the archive to memory and returns dictionary object.
Returned dictionary has a form of Dict[filename: str, BinaryIO: io.BytesIO object].
Extract all members from the archive into objects that implement the Py7zIO interface,
and return a dictionary whose keys are the archived filenames and whose values are the
objects created by the given factory.

Returned dictionary has a form of Dict[str, Py7zIO].
Once readall() has been called, the SevenZipFile object is exhausted and in the EOF state.
If you want to call read(), readall(), extract(), extractall() again,
you should call reset() before it.
You can get extracted data from dictionary value as such

.. code-block:: python
class MyWriter(Py7zIO):
    """Example writer that remembers only the first 10 bytes written to it."""

    def __init__(self):
        # Start with an empty bytes object so read() and size() are safe to
        # call even when nothing was ever written (e.g. a zero-length member).
        self.buf = b""
        self.empty = True

    def write(self, s: bytes) -> int:
        """Keep only the first 10 bytes of the first chunk; ignore later chunks.

        Returns the length of ``s`` so the caller sees the whole chunk as consumed.
        """
        if self.empty:
            self.buf = bytes(s[:10])
            self.empty = False
        return len(s)

    def read(self, length: int = 0) -> bytes:
        """Return up to ``length`` of the remembered bytes (empty if nothing was written)."""
        if self.empty:
            return b""
        return self.buf[:length]

    def seek(self, offset: int, whence: int = 0) -> int:
        """Seeking is not supported by this example; always report position 0."""
        return 0

    def flush(self) -> None:
        """Nothing is buffered outside ``self.buf``, so there is nothing to flush."""
        # NOTE(review): HashIO in py7zr/io.py implements flush(); presumably the
        # Py7zIO interface requires it -- confirm against the abstract class.

    def size(self) -> int:
        """Return the number of bytes currently remembered (at most 10)."""
        return len(self.buf)
class MyFactory(WriterFactory):
    """Example factory that supplies a fresh MyWriter for each archive member."""

    def create(self, fname) -> Py7zIO:
        # The member name is not used: every file gets the same kind of writer.
        writer = MyWriter()
        return writer
with SevenZipFile('archive.7z', 'r') as zip:
for fname, bio in zip.readall().items():
print(f'{fname}: {bio.read(10)}...')
for fname, py7zio in zip.readall(MyFactory()).items():
print(f'{fname}: {py7zio.read(10)}...')
.. py:method:: SevenZipFile.read(targets=None)
.. py:method:: SevenZipFile.read(factory: WriterFactory, targets=None)
Extract specified list of target archived files to dictionary object.

Expand Down
3 changes: 3 additions & 0 deletions py7zr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
PRESET_EXTREME,
)
from py7zr.py7zr import ArchiveInfo, FileInfo, SevenZipFile, is_7zfile, pack_7zarchive, unpack_7zarchive
from py7zr.io import Py7zIO, WriterFactory
from py7zr.version import __version__

__copyright__ = "Copyright (C) 2019-2021 Hiroshi Miura"
Expand All @@ -60,6 +61,8 @@
"UnsupportedCompressionMethodError",
"Bad7zFile",
"DecompressionError",
"Py7zIO",
"WriterFactory",
"FILTER_LZMA",
"FILTER_LZMA2",
"FILTER_DELTA",
Expand Down
60 changes: 56 additions & 4 deletions py7zr/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
import hashlib
import io
from abc import ABC, abstractmethod
from typing import Optional, Union
Expand Down Expand Up @@ -44,12 +45,41 @@ def size(self) -> int:
pass


class HashIO(Py7zIO):
    """Py7zIO sink that keeps a running SHA-256 of written data, not the data itself."""

    def __init__(self, filename):
        self.filename = filename
        self.hash = hashlib.sha256()
        # Kept under a private name: an instance attribute called ``size``
        # would shadow the size() method and make it uncallable.
        self._size = 0

    def write(self, s: Union[bytes, bytearray]) -> int:
        """Feed ``s`` into the hash, count its bytes, and return its length."""
        written = len(s)
        self._size += written
        self.hash.update(s)
        return written

    def read(self, size: Optional[int] = None) -> bytes:
        """Return the SHA-256 digest of everything written so far.

        ``size`` is accepted for interface compatibility and is ignored.
        """
        return self.hash.digest()

    def seek(self, offset: int, whence: int = 0) -> int:
        """No-op: the hash has no position; always returns 0."""
        return 0

    def flush(self) -> None:
        """No-op: nothing is buffered."""
        pass

    def size(self) -> int:
        """Return the total number of bytes passed to write()."""
        return self._size


class Py7zBytesIO(Py7zIO):
def __init__(self):
def __init__(self, filename: str, limit: int):
self.filename = filename
self.limit = limit
self._buffer = io.BytesIO()

def write(self, s: Union[bytes, bytearray]) -> int:
return self._buffer.write(s)
if self.size() < self.limit:
return self._buffer.write(s)
else:
return 0

def read(self, size: Optional[int] = None) -> bytes:
return self._buffer.read(size)
Expand All @@ -72,13 +102,29 @@ def create(self, filename: str) -> Py7zIO:
pass


class HashIOFactory(WriterFactory):
    """WriterFactory that produces a HashIO for each requested filename."""

    def __init__(self):
        """Create the factory; it carries no configuration."""

    def create(self, filename: str) -> Py7zIO:
        """Return a new HashIO bound to ``filename``."""
        hasher = HashIO(filename)
        return hasher


class BytesIOFactory(WriterFactory):
    """WriterFactory that produces Py7zBytesIO objects sharing one ``limit`` value."""

    def __init__(self, limit: int):
        # Stored as given and forwarded unchanged to every Py7zBytesIO created.
        self.limit = limit

    def create(self, filename: str) -> Py7zIO:
        """Return a new Py7zBytesIO for ``filename`` using this factory's limit."""
        limit = self.limit
        return Py7zBytesIO(filename, limit)


class NullIOFactory(WriterFactory):
def __init__(self):
pass

def create(self, filename: str) -> Py7zIO:
return Py7zBytesIO()
return NullIO()


class MemIO:
Expand Down Expand Up @@ -160,7 +206,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
pass


class NullIO:
class NullIO(Py7zIO):
"""pathlib.Path-like IO class of /dev/null"""

def __init__(self):
Expand Down Expand Up @@ -191,6 +237,12 @@ def parent(self):
def mkdir(self):
return None

def seek(self, offset: int, whence: int = 0) -> int:
    """Ignore the request and report position 0.

    Returns an int, as the annotation promises, instead of falling
    through and implicitly returning None.
    """
    return 0

def size(self) -> int:
    """Report a size of zero."""
    nothing_stored = 0
    return nothing_stored

def __enter__(self):
return self

Expand Down
7 changes: 4 additions & 3 deletions py7zr/py7zr.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from threading import Thread
from typing import IO, Any, BinaryIO, Optional, Union

import deprecated
import multivolumefile

from py7zr.archiveinfo import Folder, Header, SignatureHeader
Expand Down Expand Up @@ -1016,7 +1017,7 @@ def list(self) -> list[FileInfo]:
)
return alist

def readall(self, factory: Optional[WriterFactory] = None) -> Optional[dict[str, MemIO]]:
def readall(self, factory: WriterFactory) -> Optional[dict[str, MemIO]]:
self._dict = {}
return self._extract(path=None, return_dict=True, writer_factory=factory)

Expand All @@ -1029,8 +1030,7 @@ def extractall(self, path: Optional[Any] = None, callback: Optional[ExtractCallb
self._extract(path=path, return_dict=False, callback=callback)

def read(
self, targets: Optional[Collection[str]] = None, factory: Optional[WriterFactory] = None
) -> Optional[dict[str, MemIO]]:
self, factory: WriterFactory, targets: Optional[Collection[str]] = None) -> Optional[dict[str, MemIO]]:
if not self._is_none_or_collection(targets):
raise TypeError("Wrong argument type given.")
# For interoperability with ZipFile, we strip any trailing slashes
Expand Down Expand Up @@ -1106,6 +1106,7 @@ def write(self, file: Union[pathlib.Path, str], arcname: Optional[str] = None):
self.files.append(file_info)
self.worker.archive(self.fp, self.files, folder, deref=self.dereference)

# ``import deprecated`` binds the module; the decorator itself is the
# ``deprecated`` function inside it. Decorating with the bare module would
# raise TypeError ('module' object is not callable) at class-definition time.
@deprecated.deprecated
def writed(self, targets: dict[str, IO[Any]]) -> None:
    """Write every ``name -> stream`` pair in ``targets`` via writef().

    Deprecated: calling it emits a DeprecationWarning.
    """
    # ``stream`` rather than ``input`` so the builtin is not shadowed.
    for target, stream in targets.items():
        self.writef(stream, target)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies = [
"pybcj>=1.0.0,<1.1.0",
"multivolumefile>=0.2.3",
"inflate64>=1.0.0,<1.1.0",
"deprecated",
]
keywords = ['compression', '7zip', 'lzma', 'zstandard', 'ppmd', 'lzma2', 'bcj', 'archive']
dynamic = ["readme", "version"]
Expand Down
13 changes: 7 additions & 6 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def test_py7zr_read_and_reset(tmp_path):
archive = py7zr.SevenZipFile(open(os.path.join(testdata_path, "read_reset.7z"), "rb"))
iterations = archive.getnames()
for target in iterations:
_dict = archive.read(targets=[target])
_dict = archive.read(factory=py7zr.io.NullIOFactory(), targets=[target])
assert len(_dict) == 1
archive.reset()
archive.close()
Expand All @@ -283,7 +283,7 @@ def test_py7zr_read_with_trailing_slash_and_reset(tmp_path):
archive = py7zr.SevenZipFile(open(os.path.join(testdata_path, "read_reset.7z"), "rb"))
iterations = archive.getnames()
for target in iterations:
_dict = archive.read(targets=[f"{target}/"])
_dict = archive.read(factory=py7zr.io.NullIOFactory(), targets=[f"{target}/"])
assert len(_dict) == 1
archive.reset()
archive.close()
Expand Down Expand Up @@ -348,21 +348,22 @@ def test_read_collection_argument():
"HmmmTaSI/atXtuwiN5mGrqyFZTC/V2VEohWua1Yk1K+jXy+32hBwnK2clyr3rN5L"
"Abv5g2wXBiABCYCFAAcLAQABIwMBAQVdABAAAAyAlgoBouB4BAAA"
)
factory = py7zr.io.BytesIOFactory(64)
with py7zr.SevenZipFile(BytesIO(data), password="boom") as arc:
result = arc.read(["bar.txt"]) # list -> ok
result = arc.read(factory, ["bar.txt"]) # list -> ok
assert "bar.txt" in result
bina = result.get("bar.txt")
assert isinstance(bina, MemIO)
assert bina.read() == b"refinery"
with py7zr.SevenZipFile(BytesIO(data), password="boom") as arc:
result = arc.read({"bar.txt"}) # set -> ok
result = arc.read(factory, {"bar.txt"}) # set -> ok
assert result.get("bar.txt").read() == b"refinery"
with pytest.raises(TypeError):
with py7zr.SevenZipFile(BytesIO(data), password="boom") as arc:
arc.read(("bar.txt",)) # tuple -> bad
arc.read(factory, ("bar.txt",)) # tuple -> bad
with pytest.raises(TypeError):
with py7zr.SevenZipFile(BytesIO(data), password="boom") as arc:
arc.read("bar.txt") # str -> bad
arc.read(factory, "bar.txt") # str -> bad
with pytest.raises(TypeError):
with py7zr.SevenZipFile(BytesIO(data), password="boom") as arc:
arc.extract(targets="bar.txt") # str -> bad
2 changes: 1 addition & 1 deletion tests/test_encryption.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_extract_encrypted_1(tmp_path):
@pytest.mark.files
def test_extract_encrypted_1_mem():
archive = py7zr.SevenZipFile(testdata_path.joinpath("encrypted_1.7z").open(mode="rb"), password="secret")
_dict = archive.readall()
archive.readall(py7zr.io.NullIOFactory())
archive.close()


Expand Down
34 changes: 16 additions & 18 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def check_archive(archive, tmp_path, return_dict: bool):
assert tmp_path.joinpath("test/test2.txt").open("rb").read() == bytes("This file is located in a folder.", "ascii")
assert tmp_path.joinpath("test1.txt").open("rb").read() == bytes("This file is located in the root.", "ascii")
else:
_dict = archive.readall()
_dict = archive.readall(py7zr.io.BytesIOFactory(64))
actual = _dict["test/test2.txt"].read()
assert actual == bytes("This file is located in a folder.", "ascii")
actual = _dict["test1.txt"].read()
Expand Down Expand Up @@ -91,7 +91,7 @@ def test_github_14(tmp_path):
@pytest.mark.files
def test_github_14_mem(tmp_path):
archive = py7zr.SevenZipFile(testdata_path.joinpath("github_14.7z").open(mode="rb"))
_dict = archive.readall()
_dict = archive.readall(py7zr.io.BytesIOFactory(limit=32))
actual = _dict["github_14"].read()
assert actual == bytes("Hello GitHub issue #14.\n", "ascii")

Expand All @@ -105,7 +105,7 @@ def _test_umlaut_archive(filename: str, target: pathlib.Path, return_dict: bool)
actual = target.joinpath("t\xe4st.txt").open().read()
assert actual == "This file contains a german umlaut in the filename."
else:
_dict = archive.readall()
_dict = archive.readall(py7zr.io.BytesIOFactory(64))
actual = _dict["t\xe4st.txt"].read()
assert actual == b"This file contains a german umlaut in the filename."
archive.close()
Expand Down Expand Up @@ -188,7 +188,7 @@ def test_extract_symlink(tmp_path):
@pytest.mark.files
def test_extract_symlink_mem():
with py7zr.SevenZipFile(testdata_path.joinpath("symlink.7z").open(mode="rb")) as archive:
_dict = archive.readall()
archive.readall(py7zr.io.NullIOFactory())


@pytest.mark.files
Expand Down Expand Up @@ -248,10 +248,9 @@ def test_lzma2bcj_mem():
"mingw64/share/doc/szip/RELEASE.txt",
"mingw64/bin/libszip-0.dll",
]
_dict = archive.readall()
m = hashlib.sha256()
m.update(_dict["mingw64/bin/libszip-0.dll"].read())
assert m.digest() == binascii.unhexlify("13926e3f080c9ca557165864ce5722acc4f832bb52a92d8d86c7f6e583708c4d")
_dict = archive.readall(py7zr.io.HashIOFactory())
digest = _dict["mingw64/bin/libszip-0.dll"].read()
assert digest == binascii.unhexlify("13926e3f080c9ca557165864ce5722acc4f832bb52a92d8d86c7f6e583708c4d")
archive.close()


Expand Down Expand Up @@ -286,7 +285,7 @@ def test_extract_lzma_1(tmp_path):
def test_extract_lzma2_1(tmp_path):
with testdata_path.joinpath("lzma2_1.7z").open(mode="rb") as target:
with py7zr.SevenZipFile(target) as ar:
_dict = ar.readall()
_dict = ar.readall(py7zr.io.NullIOFactory())


@pytest.mark.files
Expand All @@ -301,7 +300,7 @@ def test_zerosize(tmp_path):
def test_zerosize_mem():
with testdata_path.joinpath("zerosize.7z").open(mode="rb") as target:
archive = py7zr.SevenZipFile(target)
_dict = archive.readall()
archive.readall(py7zr.io.NullIOFactory())
archive.close()


Expand Down Expand Up @@ -359,7 +358,7 @@ def test_github_14_multi_mem():
"""multiple unnamed objects."""
archive = py7zr.SevenZipFile(str(testdata_path.joinpath("github_14_multi.7z")), "r")
assert archive.getnames() == ["github_14_multi", "github_14_multi"]
_dict = archive.readall()
_dict = archive.readall(py7zr.io.BytesIOFactory(32))
actual_1 = _dict["github_14_multi"].read()
assert actual_1 == bytes("Hello GitHub issue #14 1/2.\n", "ascii")
actual_2 = _dict["github_14_multi_0"].read()
Expand All @@ -380,10 +379,9 @@ def test_multiblock(tmp_path):
@pytest.mark.files
def test_multiblock_mem():
archive = py7zr.SevenZipFile(testdata_path.joinpath("mblock_1.7z").open(mode="rb"))
_dict = archive.readall()
m = hashlib.sha256()
m.update(_dict["bin/7zdec.exe"].read())
assert m.digest() == binascii.unhexlify("e14d8201c5c0d1049e717a63898a3b1c7ce4054a24871daebaa717da64dcaff5")
_dict = archive.readall(py7zr.io.HashIOFactory())
digest = _dict["bin/7zdec.exe"].read()
assert digest == binascii.unhexlify("e14d8201c5c0d1049e717a63898a3b1c7ce4054a24871daebaa717da64dcaff5")
archive.close()


Expand Down Expand Up @@ -451,7 +449,7 @@ def test_no_main_streams(tmp_path):
@pytest.mark.files
def test_no_main_streams_mem():
archive = py7zr.SevenZipFile(testdata_path.joinpath("test_folder.7z").open(mode="rb"))
_dict = archive.readall()
archive.readall(py7zr.io.NullIOFactory())
archive.close()


Expand Down Expand Up @@ -537,7 +535,7 @@ def test_decompress_small_files(tmp_path):
@pytest.mark.files
def test_extract_lzma_bcj_x86(tmp_path):
with py7zr.SevenZipFile(testdata_path.joinpath("lzma_bcj_x86.7z").open(mode="rb")) as ar:
_dict = ar.readall()
ar.readall(py7zr.io.NullIOFactory())


@pytest.mark.files
Expand Down Expand Up @@ -604,7 +602,7 @@ def test_extract_root_path_arcname(tmp_path):
iterations = archive.getnames()
assert len(iterations) == 1

_dict = archive.read(targets=iterations)
_dict = archive.read(factory=py7zr.io.BytesIOFactory(32), targets=iterations)
if _dict is None:
# fix typing errors
raise RuntimeError("Failed to read archive")
Expand Down
Loading

0 comments on commit 9d686af

Please sign in to comment.