Commit 35dab0f

Ensure slices of GenericDataChunkIterator have consistent data type (#781)

Co-authored-by: Ryan Ly <[email protected]>
CodyCBakerPhD and rly authored Nov 9, 2022
1 parent e7d1c8e commit 35dab0f
Showing 3 changed files with 183 additions and 97 deletions.
9 changes: 2 additions & 7 deletions CHANGELOG.md
@@ -1,16 +1,11 @@
 # HDMF Changelog
 
-## Upcoming
-
-### Bug fixes
-- Fixed an issue with the `data_utils.GenericDataChunkIterator` where if the underlying dataset was such that the `numpy.product` of the `maxshape` exceeded the range of the default `int32`, buffer overflow would occur and cause the true buffer shape to exceed available memory. This has been resolved by upcasting all shape attributes and operations to use the `uint64` data type. @codycbakerphd ([#780](https://github.com/hdmf-dev/hdmf/pull/780))
-
-
-## HDMF 3.4.7 (October 26, 2022)
+## HDMF 3.4.7 (November 9, 2022)
 
 ### Bug fixes
 - Fix an issue where not providing an optional argument to `__init__` of an auto-generated `MultiContainerInterface`
   class raised an error. @rly ([#779](https://github.com/hdmf-dev/hdmf/pull/779))
+- Fixed an issue with the `data_utils.GenericDataChunkIterator` where if the underlying dataset was such that the `numpy.product` of the `maxshape` exceeded the range of the default `int32`, buffer overflow would occur and cause the true buffer shape to exceed available memory. This has been resolved by dropping all `numpy` operations (which forced casting within the passed data type) in favor of the unlimited precision of Python builtin integer types @codycbakerphd ([#780](https://github.com/hdmf-dev/hdmf/pull/780)) ([#781](https://github.com/hdmf-dev/hdmf/pull/781))
 
 ## HDMF 3.4.6 (October 4, 2022)

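A minimal sketch, not part of this commit, reproducing the wraparound the changelog entries describe. The shape is hypothetical, and `dtype=np.int32` is forced so the old failure mode is visible on any platform:

```python
import numpy as np

# Hypothetical dataset shape whose element count (2,400,000,000) exceeds
# the int32 maximum of 2**31 - 1 = 2,147,483,647.
maxshape = (40_000, 60_000)

# Accumulating the product in a fixed-width int32 wraps around silently;
# this is the buffer-shape overflow described above.
wrapped = np.prod(maxshape, dtype=np.int32)
print(wrapped)  # -1894967296

# Python builtin integers have unlimited precision, which is why the fix
# below computes shape products without numpy.
exact = 1
for axis in maxshape:
    exact *= axis
print(exact)  # 2400000000
```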
97 changes: 60 additions & 37 deletions src/hdmf/data_utils.py
@@ -1,4 +1,7 @@
 import copy
+import math
+import functools  # TODO: remove when Python 3.7 support is dropped
+import operator  # TODO: remove when Python 3.7 support is dropped
 from abc import ABCMeta, abstractmethod
 from collections.abc import Iterable
 from warnings import warn
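Note: `math.prod` was added in Python 3.8, which is why the hunks below fall back to `functools.reduce(operator.mul, ...)` and carry TODO markers to switch over once Python 3.7 support is dropped.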
@@ -206,37 +209,49 @@ def __init__(self, **kwargs):
), "Only one of 'chunk_mb' or 'chunk_shape' can be specified!"

self._dtype = self._get_dtype()
self._maxshape = tuple(np.array(self._get_maxshape(), dtype="uint64")) # Upcast for safer numpy operations
self.chunk_shape = tuple(
np.asarray(chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb), dtype="uint64")
) # Upcast for safer numpy operations
self.buffer_shape = tuple(
np.asarray(buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb), dtype="uint64")
) # Upcast for safer numpy operations
self._maxshape = tuple(int(x) for x in self._get_maxshape())
chunk_shape = tuple(int(x) for x in chunk_shape) if chunk_shape else chunk_shape
self.chunk_shape = chunk_shape or self._get_default_chunk_shape(chunk_mb=chunk_mb)
buffer_shape = tuple(int(x) for x in buffer_shape) if buffer_shape else buffer_shape
self.buffer_shape = buffer_shape or self._get_default_buffer_shape(buffer_gb=buffer_gb)

# Shape assertions
array_chunk_shape = np.array(self.chunk_shape)
array_buffer_shape = np.array(self.buffer_shape)
array_maxshape = np.array(self.maxshape)
assert all(
array_chunk_shape <= array_maxshape
buffer_axis > 0 for buffer_axis in self.buffer_shape
), f"Some dimensions of buffer_shape ({self.buffer_shape}) are less than zero!"
assert all(
chunk_axis <= maxshape_axis for chunk_axis, maxshape_axis in zip(self.chunk_shape, self.maxshape)
), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the data dimensions ({self.maxshape})!"
assert all(
array_buffer_shape <= array_maxshape
buffer_axis <= maxshape_axis for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape)
), f"Some dimensions of buffer_shape ({self.buffer_shape}) exceed the data dimensions ({self.maxshape})!"
assert all(
array_chunk_shape <= array_buffer_shape
(chunk_axis <= buffer_axis for chunk_axis, buffer_axis in zip(self.chunk_shape, self.buffer_shape))
), f"Some dimensions of chunk_shape ({self.chunk_shape}) exceed the buffer shape ({self.buffer_shape})!"
assert all((array_buffer_shape % array_chunk_shape == 0)[array_buffer_shape != array_maxshape]), (
assert all(
buffer_axis % chunk_axis == 0
for chunk_axis, buffer_axis, maxshape_axis in zip(self.chunk_shape, self.buffer_shape, self.maxshape)
if buffer_axis != maxshape_axis
), (
f"Some dimensions of chunk_shape ({self.chunk_shape}) do not "
f"evenly divide the buffer shape ({self.buffer_shape})!"
)

self.num_buffers = np.prod(
np.ceil(array_maxshape / array_buffer_shape).astype("uint64") # np.ceil casts as float
self.num_buffers = functools.reduce( # TODO: replace with math.prod when Python 3.7 support is dropped
operator.mul,
[
math.ceil(maxshape_axis / buffer_axis)
for buffer_axis, maxshape_axis in zip(self.buffer_shape, self.maxshape)
],
1,
)
self.buffer_selection_generator = (
tuple([slice(lower_bound, upper_bound) for lower_bound, upper_bound in zip(lower_bounds, upper_bounds)])
tuple(
[
slice(lower_bound, upper_bound)
for lower_bound, upper_bound in zip(lower_bounds, upper_bounds)
]
)
for lower_bounds, upper_bounds in zip(
product(
*[
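As a standalone illustration of the `num_buffers` arithmetic introduced above (hypothetical shapes, not code from the commit), every step stays in Python builtin integers:

```python
import functools
import math
import operator

maxshape = (10_000, 384)     # hypothetical full dataset shape
buffer_shape = (1_024, 384)  # hypothetical per-iteration buffer shape

# Ceil-divide each axis, then multiply the per-axis counts. Plain Python
# ints cannot overflow the way a fixed-width numpy dtype can.
num_buffers = functools.reduce(
    operator.mul,
    [math.ceil(m / b) for b, m in zip(buffer_shape, maxshape)],
    1,
)
print(num_buffers)  # 10: ceil(10000 / 1024) = 10, ceil(384 / 384) = 1

# On Python >= 3.8 this reduces to math.prod, as the TODO comments note.
assert num_buffers == math.prod(math.ceil(m / b) for b, m in zip(buffer_shape, maxshape))
```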
@@ -279,7 +294,7 @@ def __init__(self, **kwargs):
             default=None,
         )
     )
-    def _get_default_chunk_shape(self, **kwargs):
+    def _get_default_chunk_shape(self, **kwargs) -> Tuple[int, ...]:
         """
         Select chunk shape with size in MB less than the threshold of chunk_mb.
@@ -291,15 +306,17 @@ def _get_default_chunk_shape(self, **kwargs):
         n_dims = len(self.maxshape)
         itemsize = self.dtype.itemsize
         chunk_bytes = chunk_mb * 1e6
-        v = np.floor(np.array(self.maxshape) / np.min(self.maxshape)).astype("uint64")  # np.floor casts to float
-        prod_v = np.prod(v)
+
+        min_maxshape = min(self.maxshape)
+        v = tuple(math.floor(maxshape_axis / min_maxshape) for maxshape_axis in self.maxshape)
+        prod_v = functools.reduce(operator.mul, v, 1)  # TODO: replace with math.prod when Python 3.7 support is dropped
         while prod_v * itemsize > chunk_bytes and prod_v != 1:
-            v_ind = v != 1
-            next_v = v[v_ind]
-            v[v_ind] = np.floor(next_v / np.min(next_v)).astype("uint64")  # np.floor casts to float
-            prod_v = np.prod(v)
-        k = np.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims)).astype("uint64")  # np.floor casts to float
-        return tuple([min(x, self.maxshape[dim]) for dim, x in enumerate(k * v)])
+            non_unit_min_v = min(x for x in v if x != 1)
+            v = tuple(math.floor(x / non_unit_min_v) if x != 1 else x for x in v)
+            # TODO: replace with math.prod when Python 3.7 support is dropped
+            prod_v = functools.reduce(operator.mul, v, 1)
+        k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims))
+        return tuple([min(k * x, self.maxshape[dim]) for dim, x in enumerate(v)])
 
     @docval(
         dict(
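A worked pass through the heuristic above, under assumed values (not code from the commit): `v` captures the dataset's aspect ratio relative to its smallest axis, and `k` rescales it to fit under `chunk_mb`:

```python
import math

maxshape = (10_000, 64)  # hypothetical dataset shape
itemsize = 8             # e.g. float64
chunk_bytes = 1 * 1e6    # chunk_mb = 1

n_dims = len(maxshape)
min_maxshape = min(maxshape)
v = tuple(math.floor(m / min_maxshape) for m in maxshape)  # (156, 1)
prod_v = 156  # 156 * 8 bytes < 1 MB, so the while loop never runs here

k = math.floor((chunk_bytes / (prod_v * itemsize)) ** (1 / n_dims))  # 28
chunk_shape = tuple(min(k * x, maxshape[dim]) for dim, x in enumerate(v))
print(chunk_shape)  # (4368, 28) -> 4368 * 28 * 8 bytes, about 0.98 MB
```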
@@ -309,7 +326,7 @@ def _get_default_chunk_shape(self, **kwargs):
             default=None,
         )
     )
-    def _get_default_buffer_shape(self, **kwargs):
+    def _get_default_buffer_shape(self, **kwargs) -> Tuple[int, ...]:
         """
         Select buffer shape with size in GB less than the threshold of buffer_gb.
@@ -318,21 +335,27 @@ def _get_default_buffer_shape(self, **kwargs):
"""
buffer_gb = getargs("buffer_gb", kwargs)
assert buffer_gb > 0, f"buffer_gb ({buffer_gb}) must be greater than zero!"
assert all(chunk_axis > 0 for chunk_axis in self.chunk_shape), (
f"Some dimensions of chunk_shape ({self.chunk_shape}) are less than zero!"
)

k = np.floor(
(buffer_gb * 1e9 / (np.prod(self.chunk_shape) * self.dtype.itemsize)) ** (1 / len(self.chunk_shape))
).astype("uint64") # np.floor casts to float
# TODO: replace with math.prod when Python 3.7 support is dropped
k = math.floor(
(
buffer_gb * 1e9 / (functools.reduce(operator.mul, self.chunk_shape, 1) * self.dtype.itemsize)
) ** (1 / len(self.chunk_shape))
)
return tuple(
[
min(max(x, self.chunk_shape[j]), self.maxshape[j])
for j, x in enumerate(k * np.array(self.chunk_shape))
min(max(k * x, self.chunk_shape[j]), self.maxshape[j])
for j, x in enumerate(self.chunk_shape)
]
)

def recommended_chunk_shape(self) -> tuple:
def recommended_chunk_shape(self) -> Tuple[int, ...]:
return self.chunk_shape

def recommended_data_shape(self) -> tuple:
def recommended_data_shape(self) -> Tuple[int, ...]:
return self.maxshape

def __iter__(self):
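Continuing the same assumed numbers, the buffer heuristic above scales the whole chunk shape by a single factor `k` and clips each axis to `[chunk_shape[j], maxshape[j]]` (again a sketch, not code from the commit):

```python
import math

maxshape = (10_000, 64)
chunk_shape = (4_368, 28)  # from the previous sketch
itemsize = 8
buffer_gb = 1

chunk_elements = 4_368 * 28  # product of chunk_shape, in Python ints
k = math.floor(
    (buffer_gb * 1e9 / (chunk_elements * itemsize)) ** (1 / len(chunk_shape))
)  # 31
buffer_shape = tuple(
    min(max(k * x, chunk_shape[j]), maxshape[j]) for j, x in enumerate(chunk_shape)
)
print(buffer_shape)  # (10000, 64): both axes clip to maxshape
```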
@@ -376,16 +399,16 @@ def _get_data(self, selection: Tuple[slice]) -> np.ndarray:
         raise NotImplementedError("The data fetching method has not been built for this DataChunkIterator!")
 
     @property
-    def maxshape(self):
+    def maxshape(self) -> Tuple[int, ...]:
         return self._maxshape
 
     @abstractmethod
-    def _get_maxshape(self) -> tuple:
+    def _get_maxshape(self) -> Tuple[int, ...]:
         """Retrieve the maximum bounds of the data shape using minimal I/O."""
         raise NotImplementedError("The setter for the maxshape property has not been built for this DataChunkIterator!")
 
     @property
-    def dtype(self):
+    def dtype(self) -> np.dtype:
         return self._dtype
 
     @abstractmethod
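For context on the abstract methods annotated above, a minimal subclass sketch (the wrapper class and array are hypothetical, not part of this commit; method names follow this diff). Each yielded chunk's selection is now built from plain-int slices:

```python
import numpy as np
from hdmf.data_utils import GenericDataChunkIterator

class InMemoryDataChunkIterator(GenericDataChunkIterator):
    """Hypothetical iterator over an in-memory array, for illustration."""

    def __init__(self, array, **kwargs):
        self._array = array  # must be set before super().__init__ runs
        super().__init__(**kwargs)

    def _get_data(self, selection):
        return self._array[selection]

    def _get_maxshape(self):
        return self._array.shape

    def _get_dtype(self):
        return self._array.dtype

data = np.random.rand(2_000, 64)
for data_chunk in InMemoryDataChunkIterator(array=data, buffer_gb=0.001):
    pass  # data_chunk.selection is a tuple of slices with int bounds
```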