From 8620d9fda40ea242ba024990c4d6f4cf4e36415d Mon Sep 17 00:00:00 2001 From: Kamil Mankowski Date: Tue, 9 Jul 2024 11:49:29 +0200 Subject: [PATCH 1/2] FIX: Support for extracting data from archives with dirs When a zip or tar archive contains directories, they appear in the default listings in addition to the files they contain. This causes exceptions or the extraction of empty data, which eventually causes issues when creating a report message. --- CHANGELOG.md | 2 ++ intelmq/lib/utils.py | 4 ++-- intelmq/tests/assets/subdir.tar.gz | Bin 0 -> 183 bytes intelmq/tests/assets/subdir.tar.gz.license | 3 +++ intelmq/tests/assets/subdir.zip | Bin 0 -> 430 bytes intelmq/tests/assets/subdir.zip.license | 3 +++ .../bots/collectors/http/test_collector.py | 19 ++++++++++++++++++ intelmq/tests/lib/test_utils.py | 16 +++++++++++++++ 8 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 intelmq/tests/assets/subdir.tar.gz create mode 100644 intelmq/tests/assets/subdir.tar.gz.license create mode 100644 intelmq/tests/assets/subdir.zip create mode 100644 intelmq/tests/assets/subdir.zip.license diff --git a/CHANGELOG.md b/CHANGELOG.md index f6fb896a73..8de5546380 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ ### Core - `intelmq.lib.utils.drop_privileges`: When IntelMQ is called as `root` and dropping the privileges to user `intelmq`, also set the non-primary groups associated with the `intelmq` user. Makes the behaviour of running intelmqctl as `root` closer to the behaviour of `sudo -u intelmq ...` (PR#2507 by Mikk Margus Möll). +- `intelmq.lib.utils.unzip`: Filter out directory entries when extracting data fixing the issue that + archives with directories causes extracting empty data for a directory entry (PR# by Kamil Mankowski). 
### Development diff --git a/intelmq/lib/utils.py b/intelmq/lib/utils.py index 42d551ad98..de59a223a6 100644 --- a/intelmq/lib/utils.py +++ b/intelmq/lib/utils.py @@ -538,7 +538,7 @@ def extract_tar(file): def extract(filename): return tar.extractfile(filename).read() - return tuple(file.name for file in tar.getmembers()), tar, extract + return tuple(file.name for file in tar.getmembers() if file.isfile()), tar, extract def extract_gzip(file): @@ -547,7 +547,7 @@ def extract_gzip(file): def extract_zip(file): zfp = zipfile.ZipFile(io.BytesIO(file), "r") - return zfp.namelist(), zfp, zfp.read + return [member.filename for member in zfp.infolist() if not member.is_dir()], zfp, zfp.read def unzip(file: bytes, extract_files: Union[bool, list], logger=None, diff --git a/intelmq/tests/assets/subdir.tar.gz b/intelmq/tests/assets/subdir.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..03daf10d51d353d2180e5c9cddb1d997e625abcd GIT binary patch literal 183 zcmV;o07(BIiwFS6_l#x$1MQSS3c@fDMYHx4a|36RNphYR6^crsNx|bg(x9#?E@F|) z+x$fa^26|w=Ke5D`_nj@Y9L}@21$w@-?KmjoE_VnVqG13I000N?S_%LF literal 0 HcmV?d00001 diff --git a/intelmq/tests/assets/subdir.tar.gz.license b/intelmq/tests/assets/subdir.tar.gz.license new file mode 100644 index 0000000000..056d32ec61 --- /dev/null +++ b/intelmq/tests/assets/subdir.tar.gz.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2024 CERT.at GmbH + +SPDX-License-Identifier: AGPL-3.0-or-later diff --git a/intelmq/tests/assets/subdir.zip b/intelmq/tests/assets/subdir.zip new file mode 100644 index 0000000000000000000000000000000000000000..5fba87a8563e7317c513e6f8284e59156355245b GIT binary patch literal 430 zcmWIWW@Zs#W&nZUml0qZO0Waz;?ks)%p&~&pdv1?qNlY^-pjvuuyO(=L6{4nC@HZB zh!jdvD@wQ!8Xf>OO!rdv#-brDKOcx-8W@@EnQ=Kp1!xNh2sFHP1kq4uaWN=>cnpjT z5 Date: Wed, 10 Jul 2024 09:09:45 +0200 Subject: [PATCH 2/2] Improve descriptions Co-authored-by: Sebastian --- CHANGELOG.md | 3 +-- intelmq/tests/lib/test_utils.py | 4 ++-- 2 files changed, 3 
insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8de5546380..ff3d2ee5a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,8 +13,7 @@ ### Core - `intelmq.lib.utils.drop_privileges`: When IntelMQ is called as `root` and dropping the privileges to user `intelmq`, also set the non-primary groups associated with the `intelmq` user. Makes the behaviour of running intelmqctl as `root` closer to the behaviour of `sudo -u intelmq ...` (PR#2507 by Mikk Margus Möll). -- `intelmq.lib.utils.unzip`: Filter out directory entries when extracting data fixing the issue that - archives with directories causes extracting empty data for a directory entry (PR# by Kamil Mankowski). +- `intelmq.lib.utils.unzip`: Ignore directories themselves when extracting data to prevent the extraction of empty data for directory entries (PR#2512 by Kamil Mankowski). ### Development diff --git a/intelmq/tests/lib/test_utils.py b/intelmq/tests/lib/test_utils.py index daba629960..ddb34408a3 100644 --- a/intelmq/tests/lib/test_utils.py +++ b/intelmq/tests/lib/test_utils.py @@ -261,7 +261,7 @@ def test_unzip_tar_gz_return_names(self): ('foo', b'foo text\n'))) def test_unzip_tar_gz_with_subdir(self): - """ Test the unzip function with a tar gz file and return_names. """ + """ Test the unzip function with a tar gz file containing a subdirectory and return_names. Test that the directories themselves are ignored. """ filename = os.path.join(os.path.dirname(__file__), '../assets/subdir.tar.gz') with open(filename, 'rb') as fh: result = utils.unzip(fh.read(), extract_files=True, return_names=True) @@ -298,7 +298,7 @@ def test_unzip_zip_return_names(self): ('foo', b'foo text\n'))) def test_unzip_zip_with_subdir(self): - """ Test the unzip function with a zip containing a subdirectory and returning names.""" + """ Test the unzip function with a zip containing a subdirectory and returning names. 
Test that the directories themselves are ignored.""" filename = os.path.join(os.path.dirname(__file__), '../assets/subdir.zip') with open(filename, 'rb') as fh: result = utils.unzip(fh.read(), extract_files=True, return_names=True)