WIP: Dynamic rechunking option for StoreToZarr #546

Closed · wants to merge 63 commits
Changes from 7 commits
Commits (63)
6346d41
first implementation not working
jbusecke Jul 15, 2023
089dc7a
Save progress from the hack
jbusecke Jul 15, 2023
5fadc03
New even divisor algo + passing tests
jbusecke Jul 16, 2023
d0f3bcb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2023
1903ca7
Remove commented out old version
jbusecke Jul 16, 2023
62d46ad
merge commit
jbusecke Jul 16, 2023
243c5c3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2023
4ba4066
Bugfix now scales with ds size
jbusecke Jul 18, 2023
9375136
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 18, 2023
6b89214
rename difference to similarity
jbusecke Jul 18, 2023
f6e86d6
Merge branch 'dynamic_chunks_2' of github.com:jbusecke/pangeo-forge-r…
jbusecke Jul 18, 2023
b2c48bf
implemented + tested dask.utils.parse_bytes + docstring
jbusecke Jul 18, 2023
b99c4ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 18, 2023
bc980f7
Implemented review + renaming + test pass locally
jbusecke Jul 25, 2023
29097e8
Merge branch 'dynamic_chunks_2' of github.com:jbusecke/pangeo-forge-r…
jbusecke Jul 25, 2023
f3aeaa5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 25, 2023
c913d3b
nightly commit
jbusecke Jul 25, 2023
9a845a1
Added dynamic chunking and checks to StoreToZarr
jbusecke Jul 26, 2023
39ff7e6
Merge branch 'dynamic_chunks_2' of github.com:jbusecke/pangeo-forge-r…
jbusecke Jul 26, 2023
f608686
some cleaning up + docstring adds
jbusecke Jul 26, 2023
e3d61eb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 26, 2023
d0fce99
Fix flake8 for new tests
jbusecke Jul 26, 2023
ee45258
Restored default behavior if no chunking input is given
jbusecke Jul 26, 2023
e97c492
fixed mypy issues
jbusecke Jul 26, 2023
f5bcea8
Add mypy cast comment
jbusecke Jul 26, 2023
427dbe7
Add final test
jbusecke Jul 26, 2023
91eb6a6
Update dynamic_target_chunks.py
jbusecke Jul 28, 2023
a5a182a
assign target chunks as singelton
jbusecke Jul 29, 2023
707a9ca
simplify AsSingleton call
jbusecke Aug 2, 2023
4ad2a69
Go back to the old logic, but fix typo
jbusecke Aug 2, 2023
8df16ba
Print dynamically determined chunks
jbusecke Aug 3, 2023
11de660
Update transforms.py
jbusecke Aug 5, 2023
b8ead76
implement logic for extra and missing dims + tests
jbusecke Aug 5, 2023
eecd8ed
Add default ratio and allow extra dims input on storetozarr
jbusecke Aug 6, 2023
c28ac56
Merge branch 'dynamic_chunks_2' of github.com:jbusecke/pangeo-forge-r…
jbusecke Aug 6, 2023
7347c1b
Fill docstring of StoreToZarr
jbusecke Aug 6, 2023
96c57e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 6, 2023
edb4099
Update transforms.py
jbusecke Aug 6, 2023
377133b
Rework docstring for dynamic_target_chunks_from_schema
jbusecke Aug 6, 2023
575bec2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 6, 2023
9334e61
Update pangeo_forge_recipes/dynamic_target_chunks.py
jbusecke Aug 6, 2023
5062318
Update pangeo_forge_recipes/transforms.py
jbusecke Aug 6, 2023
d0f2cf6
Update pangeo_forge_recipes/transforms.py
jbusecke Aug 6, 2023
44aceaa
Update pangeo_forge_recipes/transforms.py
jbusecke Aug 6, 2023
ca680c3
Update test_transforms.py
jbusecke Aug 6, 2023
8d32404
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 6, 2023
53b0ae9
Fix tests
jbusecke Aug 6, 2023
3de3168
fix store to zarr call to _get_target_chunks
jbusecke Aug 6, 2023
db0551f
fix flake 8 issues
jbusecke Aug 6, 2023
27bfd36
fix matched error message in test
jbusecke Aug 6, 2023
3ca76df
Fix another test
jbusecke Aug 6, 2023
4838c5f
Add logging step printing the schema
jbusecke Aug 18, 2023
59432fd
Merge branch 'main' into dynamic_chunks_2
jbusecke Aug 18, 2023
d9115f7
Restor cast import
jbusecke Aug 18, 2023
ec8ab95
Try to force test in-line after finished store
jbusecke Aug 18, 2023
25823e6
more tinkering with store return
jbusecke Aug 18, 2023
ad79726
Reverting changes to target_store return
jbusecke Aug 18, 2023
2cc6700
Added fallback algorithm for non-even divisions
jbusecke Aug 22, 2023
db6767b
Attempt to add logging to dyn chunk logic
jbusecke Aug 22, 2023
fa05c9c
Increase range of scaling factors
jbusecke Aug 22, 2023
082979b
More tests + slight algo refactor
jbusecke Aug 22, 2023
e40eb55
Merge remote-tracking branch 'upstream/main' into dynamic_chunks_2
jbusecke Aug 23, 2023
774fff8
Merge remote-tracking branch 'upstream/main' into dynamic_chunks_2
jbusecke Sep 1, 2023
107 changes: 107 additions & 0 deletions pangeo_forge_recipes/dynamic_target_chunks.py
@@ -0,0 +1,107 @@
import itertools
from typing import Dict, List

import numpy as np
import xarray as xr

from pangeo_forge_recipes.aggregation import XarraySchema, schema_to_template_ds


def get_memory_size(ds: xr.Dataset, chunks: Dict[str, int]) -> int:
"""Returns an estimate of memory size based on input chunks.
Currently this applies the chunks input to the dataset, then
iterates through the variables and returns the maximum.
"""
ds_single_chunk = ds.isel({dim: slice(0, chunk) for dim, chunk in chunks.items()})
mem_size = max([ds_single_chunk[var].nbytes for var in ds_single_chunk.data_vars])
return mem_size


def difference(a: np.ndarray, b: np.ndarray) -> np.ndarray:
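"""Euclidean distance between the two vectors."""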
return np.sqrt(np.sum((a - b) ** 2))


def normalize(a: np.ndarray) -> np.ndarray:
"""Convert to a unit vector"""
return a / np.sqrt(np.sum(a**2))


def even_divisor_chunks(n: int) -> List[int]:
"""Returns values that evenly divide n"""
divisors = []
for i in range(1, n + 1):
if n % i == 0:
divisors.append(n // i)
return divisors


def dynamic_target_chunks_from_schema(
schema: XarraySchema,
target_chunk_nbytes: int, # TODO: Accept a str like `100MB`
target_chunk_ratio: Dict[str, int],
nbytes_tolerance: float = 0.2,
) -> Dict[str, int]:
Review thread on this function:

Contributor: I attempted to review this PR but realized I was missing some key context. Could we provide a docstring for this function which explains what this function does and what these parameters are? In particular, I don't understand target_chunk_ratio.

Member: FWIW, I was also confused by that term. So +1 on additional context. 🙏

Reviewer: Would it be slightly clearer to call it target_chunk_aspect_ratio?

Contributor Author: I added a docstring explaining the inputs.


ds = schema_to_template_ds(schema)

if set(target_chunk_ratio.keys()) != set(ds.dims):
raise ValueError(
f"target_chunk_ratio must contain all dimensions in dataset. "
f"Got {target_chunk_ratio.keys()} but expected {list(ds.dims.keys())}"
)

dims, shape = zip(*ds.dims.items())
ratio = [target_chunk_ratio[dim] for dim in dims]
ratio_normalized = normalize(np.array(ratio))

possible_chunks = []
for s, r, dim in zip(shape, ratio, dims):
if r > 0:
# Get a list of all the even divisors
possible_chunks.append(even_divisor_chunks(s))
elif r == -1:
# Always keep this dimension unchunked
possible_chunks.append([s])
else:
raise ValueError(
f"Ratio value can only be larger than 0 or -1. Got {r} for dimension {dim}"
)

combinations = [p for p in itertools.product(*possible_chunks)]
# Check the size of each combination on the dataset
combination_sizes = [
get_memory_size(ds, {dim: chunk for dim, chunk in zip(dims, c)}) for c in combinations
]

# And select a subset with some form of tolerance based on the size requirement
tolerance = nbytes_tolerance * target_chunk_nbytes
combinations_filtered = [
c
for c, s in zip(combinations, combination_sizes)
if abs(s - target_chunk_nbytes) < tolerance
]

# If there are no matches in the range, the user has to increase the tolerance for this to work.
if len(combinations_filtered) == 0:
raise ValueError(
"Could not find any chunk combinations satisfying the size constraint. Consider increasing tolerance"
)

# Now that we have combinations in the desired memory size range, check which one is
# closest to the target chunk ratio. We can think of this as comparing the angle of two
# vectors, so we normalize both (we don't care about the amplitude here).

# Convert each combination into a normalized inverse (a larger ratio means smaller chunks).
ratio_combinations = [normalize(1 / np.array(c)) for c in combinations_filtered]

# Find the closest fit of chunk ratio to the target ratio via Euclidean distance.
ratio_difference = [difference(ratio_normalized, r) for r in ratio_combinations]

combinations_sorted = [c for _, c in sorted(zip(ratio_difference, combinations_filtered))]

# Return the chunk combination with the closest fit
optimal_combination = combinations_sorted[0]

return {dim: chunk for dim, chunk in zip(dims, optimal_combination)}
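
For orientation, here is a minimal usage sketch of the new helper, mirroring the call pattern in the tests added below; the dimension names, sizes, and the 1e6-byte target are illustrative values, not part of this PR.

import dask.array as dsa
import xarray as xr

from pangeo_forge_recipes.aggregation import dataset_to_schema
from pangeo_forge_recipes.dynamic_target_chunks import dynamic_target_chunks_from_schema

# Build a small example dataset and convert it to a schema.
ds = xr.DataArray(
    dsa.random.random([100, 300, 400]), dims=["x", "y", "z"]
).to_dataset(name="data")
schema = dataset_to_schema(ds)

# Ask for ~1 MB chunks, chunking relatively more finely along "z" than along
# "x" and "y" (a ratio of -1 would keep a dimension unchunked).
target_chunks = dynamic_target_chunks_from_schema(
    schema,
    target_chunk_nbytes=1e6,
    target_chunk_ratio={"x": 1, "y": 1, "z": 10},
)
print(target_chunks)  # e.g. {"x": 100, "y": 150, "z": 8}, per the tests below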
1 change: 0 additions & 1 deletion pangeo_forge_recipes/transforms.py
@@ -54,7 +54,6 @@
T = TypeVar("T")
Indexed = Tuple[Index, T]


# TODO: replace with beam.MapTuple?
def _add_keys(func):
"""Convenience decorator to remove and re-add keys to items in a Map"""
63 changes: 63 additions & 0 deletions tests/test_dynamic_target_chunks.py
@@ -0,0 +1,63 @@
import dask.array as dsa
import pytest
import xarray as xr

from pangeo_forge_recipes.aggregation import dataset_to_schema
from pangeo_forge_recipes.dynamic_target_chunks import dynamic_target_chunks_from_schema


class TestDynamicTargetChunks:

Review thread on these tests:

Reviewer: This is potentially a good use case for hypothesis - you could parameterize a unit test with a hypothesis strategy that generates arbitrary (regular) chunks, then assert the property that the returned target chunk size is within the specified tolerance.

Contributor Author: Yeah that is a good idea. I will have to dig into hypothesis a bit to understand how to encode the logic.

Reviewer: This PR to dask might give you what you need (I can also finish it if that would help): dask/dask#9374

Member: Thanks for the thorough tests.

Question: is there a reason you chose to use pytest's test class approach here? I'm not categorically opposed to it, but AFAICT the tests added by this PR would be the only class-based tests in pangeo-forge-recipes, so for stylistic consistency maybe better to write them in the function-based style, unless there's a specific reason not to?

@pytest.fixture
def ds(self) -> xr.Dataset:
return xr.DataArray(dsa.random.random([100, 300, 400]), dims=["x", "y", "z"]).to_dataset(
name="data"
)

@pytest.mark.parametrize(
("target_chunk_ratio", "expected_target_chunks"),
[
# make sure that for the same dataset we get smaller chunksize along a dimension if the ratio is larger
({"x": 1, "y": 1, "z": 10}, {"x": 100, "y": 150, "z": 8}),
({"x": 10, "y": 1, "z": 1}, {"x": 10, "y": 150, "z": 80}),
# test the special case where we want to just chunk along a single dimension
({"x": -1, "y": -1, "z": 1}, {"x": 100, "y": 300, "z": 4}),
],
)
def test_dynamic_rechunking1(self, ds, target_chunk_ratio, expected_target_chunks):
schema = dataset_to_schema(ds)
target_chunks = dynamic_target_chunks_from_schema(
schema, 1e6, target_chunk_ratio=target_chunk_ratio
)
print(target_chunks)
for dim, chunks in expected_target_chunks.items():
assert target_chunks[dim] == chunks

@pytest.mark.parametrize(
"target_chunk_ratio", [{"x": 1, "y": -1, "z": 10}, {"x": 6, "y": -1, "z": 2}]
) # always keep y unchunked, and vary the others
@pytest.mark.parametrize("target_chunk_nbytes", [1e6, 1e7])
def test_dynamic_skip_dimension(self, ds, target_chunk_ratio, target_chunk_nbytes):
# Mark dimension as 'not-to-chunk' with -1
schema = dataset_to_schema(ds)
target_chunks = dynamic_target_chunks_from_schema(
schema, target_chunk_nbytes, target_chunk_ratio=target_chunk_ratio
)
assert target_chunks["y"] == len(ds["y"])

def test_dynamic_rechunking_error_dimension_missing(self, ds):
# make sure that an error is raised if some dimension is not specified
schema = dataset_to_schema(ds)

with pytest.raises(
ValueError, match="target_chunk_ratio must contain all dimensions in dataset."
):
dynamic_target_chunks_from_schema(schema, 1e6, target_chunk_ratio={"x": 1, "z": 10})

def test_dynamic_rechunking_error_dimension_wrong(self, ds):
schema = dataset_to_schema(ds)
with pytest.raises(
ValueError, match="target_chunk_ratio must contain all dimensions in dataset."
):
dynamic_target_chunks_from_schema(
schema, 1e6, target_chunk_ratio={"x": 1, "y_wrong": 1, "z": 10}
)
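
Following up on the hypothesis suggestion in the review thread above, here is a rough sketch of what such a property-based test could look like. It assumes hypothesis is available as a test dependency and hand-rolls a strategy over dimension sizes (rather than using the dask chunk strategy mentioned in dask/dask#9374); the test name and bounds are illustrative.

import dask.array as dsa
import xarray as xr
from hypothesis import given, settings
from hypothesis import strategies as st

from pangeo_forge_recipes.aggregation import dataset_to_schema
from pangeo_forge_recipes.dynamic_target_chunks import dynamic_target_chunks_from_schema


@settings(deadline=None, max_examples=25)
@given(
    nx=st.integers(min_value=10, max_value=200),
    ny=st.integers(min_value=10, max_value=200),
    nz=st.integers(min_value=10, max_value=200),
)
def test_dynamic_rechunking_within_tolerance(nx, ny, nz):
    ds = xr.DataArray(
        dsa.random.random([nx, ny, nz]), dims=["x", "y", "z"]
    ).to_dataset(name="data")
    schema = dataset_to_schema(ds)
    target_chunk_nbytes = 1e6
    nbytes_tolerance = 0.2
    try:
        target_chunks = dynamic_target_chunks_from_schema(
            schema,
            target_chunk_nbytes,
            target_chunk_ratio={"x": 1, "y": 1, "z": 1},
            nbytes_tolerance=nbytes_tolerance,
        )
    except ValueError:
        # Some shapes have no even-divisor combination within the tolerance;
        # the function raises in that case, which is acceptable here.
        return
    # Property: the chosen chunks stay within the requested size tolerance
    # (one float64 data variable, so 8 bytes per element).
    chunk_nbytes = 8 * target_chunks["x"] * target_chunks["y"] * target_chunks["z"]
    assert abs(chunk_nbytes - target_chunk_nbytes) < nbytes_tolerance * target_chunk_nbytes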