diff --git a/CHANGELOG.md b/CHANGELOG.md index 195d9dffd..def51e473 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,11 @@ # HDMF Changelog -## Upcoming - -### Bug fixes -- Fixed an issue with the `data_utils.GenericDataChunkIterator` where if the underlying dataset was such that the `numpy.product` of the `maxshape` exceeded the range of the default `int32`, buffer overflow would occur and cause the true buffer shape to exceed available memory. This has been resolved by upcasting all shape attributes and operations to use the `uint64` data type. @codycbakerphd ([#780](https://github.com/hdmf-dev/hdmf/pull/780)) - - -## HDMF 3.4.7 (October 26, 2022) +## HDMF 3.4.7 (November 9, 2022) ### Bug fixes - Fix an issue where not providing an optional argument to `__init__` of an auto-generated `MultiContainerInterface` class raised an error. @rly ([#779](https://github.com/hdmf-dev/hdmf/pull/779)) +- Fixed an issue with the `data_utils.GenericDataChunkIterator` where if the underlying dataset was such that the `numpy.product` of the `maxshape` exceeded the range of the default `int32`, buffer overflow would occur and cause the true buffer shape to exceed available memory. This has been resolved by dropping all `numpy` operations (which forced casting within the passed data type) in favor of the unlimited precision of Python builtin integer types @codycbakerphd ([#780](https://github.com/hdmf-dev/hdmf/pull/780)) ([#781](https://github.com/hdmf-dev/hdmf/pull/781)) ## HDMF 3.4.6 (October 4, 2022) diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index 953392512..04c66c251 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -1,4 +1,7 @@ import copy +import math +import functools # TODO: remove when Python 3.7 support is dropped +import operator # TODO: remove when Python 3.7 support is dropped from abc import ABCMeta, abstractmethod from collections.abc import Iterable from warnings import warn @@ -206,37 +209,49 @@ def __init__(self, **kwargs): ), "Only one of 'chunk_mb' or 'chunk_shape' can be specified!" self._dtype = self._get_dtype() - self._maxshape = tuple(np.array(self._get_maxshape(), dtype="uint64")) # Upcast for safer numpy operations - self.chunk_shape = tuple( - np.asarray(chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb), dtype="uint64") - ) # Upcast for safer numpy operations - self.buffer_shape = tuple( - np.asarray(buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb), dtype="uint64") - ) # Upcast for safer numpy operations + self._maxshape = tuple(int(x) for x in self._get_maxshape()) + chunk_shape = tuple(int(x) for x in chunk_shape) if chunk_shape else chunk_shape + self.chunk_shape = chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb) + buffer_shape = tuple(int(x) for x in buffer_shape) if buffer_shape else buffer_shape + self.buffer_shape = buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb) # Shape assertions - array_chunk_shape = np.array(self.chunk_shape) - array_buffer_shape = np.array(self.buffer_shape) - array_maxshape = np.array(self.maxshape) assert all( - array_chunk_shape <= array_maxshape + buffer_axis > 0 for buffer_axis in self.buffer_shape + ), f"Some dimensions of buffer_shape ({self.buffer_shape}) are less than zero!" + assert all( + chunk_axis <= maxshape_axis for chunk_axis, maxshape_axis in zip(self.chunk_shape, self.maxshape) ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the data dimensions ({self.maxshape})!" assert all( - array_buffer_shape <= array_maxshape + buffer_axis <= maxshape_axis for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape) ), f"Some dimensions of buffer_shape ({self.buffer_shape}) exceed the data dimensions ({self.maxshape})!" assert all( - array_chunk_shape <= array_buffer_shape + (chunk_axis <= buffer_axis for chunk_axis, buffer_axis in zip(self.chunk_shape, self.buffer_shape)) ), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the buffer shape ({self.buffer_shape})!" - assert all((array_buffer_shape % array_chunk_shape == 0)[array_buffer_shape != array_maxshape]), ( + assert all( + buffer_axis % chunk_axis == 0 + for chunk_axis, buffer_axis, maxshape_axis in zip(self.chunk_shape, self.buffer_shape, self.maxshape) + if buffer_axis != maxshape_axis + ), ( f"Some dimensions of chunk_shape ({self.chunk_shape}) do not " f"evenly divide the buffer shape ({self.buffer_shape})!" ) - self.num_buffers = np.prod( - np.ceil(array_maxshape / array_buffer_shape).astype("uint64") # np.ceil casts as float + self.num_buffers = functools.reduce( # TODO: replace with math.prod when Python 3.7 support is dropped + operator.mul, + [ + math.ceil(maxshape_axis / buffer_axis) + for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape) + ], + 1, ) self.buffer_selection_generator = ( - tuple([slice(lower_bound, upper_bound) for lower_bound, upper_bound in zip(lower_bounds, upper_bounds)]) + tuple( + [ + slice(lower_bound, upper_bound) + for lower_bound, upper_bound in zip(lower_bounds, upper_bounds) + ] + ) for lower_bounds, upper_bounds in zip( product( *[ @@ -279,7 +294,7 @@ def __init__(self, **kwargs): default=None, ) ) - def _get_default_chunk_shape(self, **kwargs): + def _get_default_chunk_shape(self, **kwargs) -> Tuple[int, ...]: """ Select chunk shape with size in MB less than the threshold of chunk_mb. @@ -291,15 +306,17 @@ def _get_default_chunk_shape(self, **kwargs): n_dims = len(self.maxshape) itemsize = self.dtype.itemsize chunk_bytes = chunk_mb * 1e6 - v = np.floor(np.array(self.maxshape) / np.min(self.maxshape)).astype("uint64") # np.floor casts to float - prod_v = np.prod(v) + + min_maxshape = min(self.maxshape) + v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in self.maxshape) + prod_v = functools.reduce(operator.mul, v, 1) # TODO: replace with math.prod when Python 3.7 support is dropped while prod_v * itemsize > chunk_bytes and prod_v != 1: - v_ind = v != 1 - next_v = v[v_ind] - v[v_ind] = np.floor(next_v / np.min(next_v)).astype("uint64") # np.floor casts to float - prod_v = np.prod(v) - k = np.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)).astype("uint64") # np.floor casts to float - return tuple([min(x, self.maxshape[dim]) for dim, x in enumerate(k * v)]) + non_unit_min_v = min(x for x in v if x != 1) + v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v) + # TODO: replace with math.prod when Python 3.7 support is dropped + prod_v = functools.reduce(operator.mul, v, 1) + k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)) + return tuple([min(k * x, self.maxshape[dim]) for dim, x in enumerate(v)]) @docval( dict( @@ -309,7 +326,7 @@ def _get_default_chunk_shape(self, **kwargs): default=None, ) ) - def _get_default_buffer_shape(self, **kwargs): + def _get_default_buffer_shape(self, **kwargs) -> Tuple[int, ...]: """ Select buffer shape with size in GB less than the threshold of buffer_gb. @@ -318,21 +335,27 @@ def _get_default_buffer_shape(self, **kwargs): """ buffer_gb = getargs("buffer_gb", kwargs) assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!" + assert all(chunk_axis > 0 for chunk_axis in self.chunk_shape), ( + f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!" + ) - k = np.floor( - (buffer_gb * 1e9 / (np.prod(self.chunk_shape) * self.dtype.itemsize)) ** (1 / len(self.chunk_shape)) - ).astype("uint64") # np.floor casts to float + # TODO: replace with math.prod when Python 3.7 support is dropped + k = math.floor( + ( + buffer_gb * 1e9 / (functools.reduce(operator.mul, self.chunk_shape, 1) * self.dtype.itemsize) + ) ** (1 / len(self.chunk_shape)) + ) return tuple( [ - min(max(x, self.chunk_shape[j]), self.maxshape[j]) - for j, x in enumerate(k * np.array(self.chunk_shape)) + min(max(k * x, self.chunk_shape[j]), self.maxshape[j]) + for j, x in enumerate(self.chunk_shape) ] ) - def recommended_chunk_shape(self) -> tuple: + def recommended_chunk_shape(self) -> Tuple[int, ...]: return self.chunk_shape - def recommended_data_shape(self) -> tuple: + def recommended_data_shape(self) -> Tuple[int, ...]: return self.maxshape def __iter__(self): @@ -376,16 +399,16 @@ def _get_data(self, selection: Tuple[slice]) -> np.ndarray: raise NotImplementedError("The data fetching method has not been built for this DataChunkIterator!") @property - def maxshape(self): + def maxshape(self) -> Tuple[int, ...]: return self._maxshape @abstractmethod - def _get_maxshape(self) -> tuple: + def _get_maxshape(self) -> Tuple[int, ...]: """Retrieve the maximum bounds of the data shape using minimal I/O.""" raise NotImplementedError("The setter for the maxshape property has not been built for this DataChunkIterator!") @property - def dtype(self): + def dtype(self) -> np.dtype: return self._dtype @abstractmethod diff --git a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py index 392d0566a..076260b55 100644 --- a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py +++ b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py @@ -1,8 +1,9 @@ +import unittest import numpy as np from pathlib import Path from tempfile import mkdtemp from shutil import rmtree -import unittest +from typing import Tuple, Iterable import h5py @@ -22,22 +23,37 @@ def __init__(self, array: np.ndarray, **kwargs): self.array = array super().__init__(**kwargs) - def _get_data(self, selection): + def _get_data(self, selection) -> np.ndarray: return self.array[selection] - def _get_maxshape(self): + def _get_maxshape(self) -> Tuple[int, ...]: return self.array.shape - def _get_dtype(self): + def _get_dtype(self) -> np.dtype: + return self.array.dtype + + class TestNumpyArrayDataChunkIteratorWithNumpyDtypeShape(GenericDataChunkIterator): + def __init__(self, array: np.ndarray, **kwargs): + self.array = array + super().__init__(**kwargs) + + def _get_data(self, selection) -> np.ndarray: + return self.array[selection] + + def _get_maxshape(self) -> Tuple[np.uint64, ...]: # Undesirable return type, but can be handled + return tuple(np.uint64(x) for x in self.array.shape) + + def _get_dtype(self) -> np.dtype: return self.array.dtype - def setUp(self): - np.random.seed(seed=0) - self.test_dir = Path(mkdtemp()) - self.test_array = np.empty(shape=(2000, 384), dtype="int16") + @classmethod + def setUpClass(cls): + cls.test_dir = Path(mkdtemp()) + cls.test_array = np.empty(shape=(2000, 384), dtype="int16") - def tearDown(self): - rmtree(self.test_dir) + @classmethod + def tearDownClass(cls): + rmtree(cls.test_dir) def check_first_data_chunk_call(self, expected_selection, iterator_options): test = self.TestNumpyArrayDataChunkIterator(array=self.test_array, **iterator_options) @@ -58,6 +74,13 @@ def check_direct_hdf5_write(self, iterator_options): np.testing.assert_array_equal(np.array(dset), self.test_array) self.assertEqual(dset.chunks, iterator.chunk_shape) + def check_all_of_iterable_is_python_int(self, iterable: Iterable): + assert all( + tuple( # Easier to visualize failures in pytest with tuple vs. generator + isinstance(x, int) for x in iterable + ) + ) + def test_abstract_assertions(self): class TestGenericDataChunkIterator(GenericDataChunkIterator): pass @@ -122,7 +145,7 @@ def test_joint_option_assertions(self): array=self.test_array, buffer_shape=buffer_shape, chunk_shape=chunk_shape ) - def test_buffer_option_assertions(self): + def test_buffer_option_assertion_negative_buffer_gb(self): buffer_gb = -1 with self.assertRaisesWith( exc_type=AssertionError, @@ -130,6 +153,7 @@ def test_buffer_option_assertions(self): ): self.TestNumpyArrayDataChunkIterator(array=self.test_array, buffer_gb=buffer_gb) + def test_buffer_option_assertion_exceed_maxshape(self): buffer_shape = (2001, 384) with self.assertRaisesWith( exc_type=AssertionError, @@ -140,7 +164,15 @@ def test_buffer_option_assertions(self): ): self.TestNumpyArrayDataChunkIterator(array=self.test_array, buffer_shape=buffer_shape) - def test_chunk_option_assertions(self): + def test_buffer_option_assertion_negative_shape(self): + buffer_shape = (-1, 384) + with self.assertRaisesWith( + exc_type=AssertionError, + exc_msg=f"Some dimensions of buffer_shape ({buffer_shape}) are less than zero!" + ): + self.TestNumpyArrayDataChunkIterator(array=self.test_array, buffer_shape=buffer_shape) + + def test_chunk_option_assertion_negative_chunk_mb(self): chunk_mb = -1 with self.assertRaisesWith( exc_type=AssertionError, @@ -148,6 +180,14 @@ def test_chunk_option_assertions(self): ): self.TestNumpyArrayDataChunkIterator(array=self.test_array, chunk_mb=chunk_mb) + def test_chunk_option_assertion_negative_shape(self): + chunk_shape = (-1, 384) + with self.assertRaisesWith( + exc_type=AssertionError, + exc_msg=f"Some dimensions of chunk_shape ({chunk_shape}) are less than zero!" + ): + self.TestNumpyArrayDataChunkIterator(array=self.test_array, chunk_shape=chunk_shape) + @unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed") def test_progress_bar_assertion(self): with self.assertWarnsWith( @@ -160,50 +200,69 @@ def test_progress_bar_assertion(self): progress_bar_options=dict(total=5), ) - def test_maxshape_attribute_uint64_type(self): - assert all( - [ - x.dtype is np.dtype("uint64") - for x in self.TestNumpyArrayDataChunkIterator(array=self.test_array).maxshape - ] + def test_maxshape_attribute_contains_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + self.check_all_of_iterable_is_python_int( + iterable=self.TestNumpyArrayDataChunkIterator(array=self.test_array).maxshape ) - def test_chunk_shape_attribute_automated_uint64_type(self): - assert all( - [ - x.dtype is np.dtype("uint64") - for x in self.TestNumpyArrayDataChunkIterator(array=self.test_array).chunk_shape - ] + def test_automated_buffer_shape_attribute_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + self.check_all_of_iterable_is_python_int( + iterable=self.TestNumpyArrayDataChunkIterator(array=self.test_array).buffer_shape ) - def test_buffer_shape_attribute_automated_uint64_type(self): - assert all( - [ - x.dtype is np.dtype("uint64") - for x in self.TestNumpyArrayDataChunkIterator(array=self.test_array).buffer_shape - ] + def test_automated_chunk_shape_attribute_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + self.check_all_of_iterable_is_python_int( + iterable=self.TestNumpyArrayDataChunkIterator(array=self.test_array).chunk_shape + ) + + def test_np_dtype_maxshape_attribute_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + self.check_all_of_iterable_is_python_int( + iterable=self.TestNumpyArrayDataChunkIteratorWithNumpyDtypeShape(array=self.test_array).maxshape + ) + + def test_manual_buffer_shape_attribute_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + self.check_all_of_iterable_is_python_int( + iterable=self.TestNumpyArrayDataChunkIterator( + array=self.test_array, + chunk_shape=(np.uint64(100), np.uint64(2)), + buffer_shape=(np.uint64(200), np.uint64(4)), + ).buffer_shape ) - def test_chunk_shape_attribute_manual_uint64_type(self): - assert all([x.dtype is np.dtype("uint64") for x in self.TestNumpyArrayDataChunkIterator( - array=self.test_array, - chunk_shape=(100, 2) - ).chunk_shape]) + def test_manual_chunk_shape_attribute_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + self.check_all_of_iterable_is_python_int( + iterable=self.TestNumpyArrayDataChunkIterator( + array=self.test_array, + chunk_shape=(np.uint64(100), np.uint64(2)) + ).chunk_shape + ) - def test_buffer_shape_attribute_manual_uint64_type(self): - assert all([x.dtype is np.dtype("uint64") for x in self.TestNumpyArrayDataChunkIterator( - array=self.test_array, - chunk_shape=(100, 2), - buffer_shape=(200, 4), - ).buffer_shape]) + def test_selection_slices_int_type(self): + """Motivated by issues described in https://github.com/hdmf-dev/hdmf/pull/780 & 781 regarding return types.""" + iterator = self.TestNumpyArrayDataChunkIterator(array=self.test_array) + first_chunk = next(iterator) + stop_0 = first_chunk.selection[0].stop + start_0 = first_chunk.selection[0].start + stop_1 = first_chunk.selection[1].stop + start_1 = first_chunk.selection[1].start + + self.check_all_of_iterable_is_python_int(iterable=(stop_0, start_0, stop_1, start_1)) def test_num_buffers(self): buffer_shape = (950, 190) chunk_shape = (50, 38) + expected_num_buffers = 9 + test = self.TestNumpyArrayDataChunkIterator( array=self.test_array, buffer_shape=buffer_shape, chunk_shape=chunk_shape ) - self.assertEqual(first=test.num_buffers, second=9) + self.assertEqual(first=test.num_buffers, second=expected_num_buffers) def test_numpy_array_chunk_iterator(self): iterator_options = dict() @@ -213,10 +272,10 @@ def test_numpy_array_chunk_iterator(self): self.check_direct_hdf5_write(iterator_options=iterator_options) def test_buffer_shape_option(self): - test_buffer_shape = (1580, 316) - iterator_options = dict(buffer_shape=test_buffer_shape) + expected_buffer_shape = (1580, 316) + iterator_options = dict(buffer_shape=expected_buffer_shape) self.check_first_data_chunk_call( - expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in test_buffer_shape]), + expected_selection=tuple([slice(0, buffer_shape_axis) for buffer_shape_axis in expected_buffer_shape]), iterator_options=iterator_options, ) self.check_direct_hdf5_write(iterator_options=iterator_options) @@ -257,20 +316,29 @@ def test_chunk_shape_option(self): self.assertEqual(iterator.chunk_shape, test_chunk_shape) def test_chunk_mb_option(self): - test_chunk_shape = (1115, 223) + expected_chunk_shape = (1115, 223) iterator = self.TestNumpyArrayDataChunkIterator(array=self.test_array, chunk_mb=0.5) - self.assertEqual(iterator.chunk_shape, test_chunk_shape) + self.assertEqual(iterator.chunk_shape, expected_chunk_shape) - # chunk is larger than total data size; should collapse to maxshape - test_chunk_shape = (2000, 384) + def test_chunk_mb_option_larger_than_total_size(self): + """Chunk is larger than total data size; should collapse to maxshape.""" + expected_chunk_shape = (2000, 384) iterator = self.TestNumpyArrayDataChunkIterator(array=self.test_array, chunk_mb=2) - self.assertEqual(iterator.chunk_shape, test_chunk_shape) + self.assertEqual(iterator.chunk_shape, expected_chunk_shape) - # test to evoke while condition of default shaping method - test_chunk_shape = (1, 79, 79) + def test_chunk_mb_option_while_condition(self): + """Test to evoke while condition of default shaping method.""" + expected_chunk_shape = (2, 79, 79) + special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(2, 2000, 2000), dtype="int16") + iterator = self.TestNumpyArrayDataChunkIterator(array=special_array) + self.assertEqual(iterator.chunk_shape, expected_chunk_shape) + + def test_chunk_mb_option_while_condition_unit_maxshape_axis(self): + """Test to evoke while condition of default shaping method.""" + expected_chunk_shape = (1, 79, 79) special_array = np.random.randint(low=-(2 ** 15), high=2 ** 15 - 1, size=(1, 2000, 2000), dtype="int16") iterator = self.TestNumpyArrayDataChunkIterator(array=special_array) - self.assertEqual(iterator.chunk_shape, test_chunk_shape) + self.assertEqual(iterator.chunk_shape, expected_chunk_shape) @unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed") def test_progress_bar(self):