ConsolidatedDimensionCoords Transform -- cleaned up version of PR #556 #671

Merged

Changes from 4 commits
40 changes: 40 additions & 0 deletions pangeo_forge_recipes/rechunking.py
@@ -6,6 +6,7 @@

import numpy as np
import xarray as xr
import zarr

from .aggregation import XarraySchema, determine_target_chunks
from .chunk_grid import ChunkGrid
@@ -238,3 +239,42 @@ def _sort_by_speed_of_varying(item):
ds_combined = xr.combine_nested(dsets_to_concat, concat_dim=concat_dims_sorted)

return first_index, ds_combined


def _gather_coordinate_dimensions(group: zarr.Group) -> List[str]:
    return list(
        set(itertools.chain(*(group[var].attrs.get("_ARRAY_DIMENSIONS", []) for var in group)))

Contributor:
I imagine that since this is a zarr.Group, this iteration isn't very expensive, because it's over summary data and never really that large?

Contributor Author:
Ehh, I think you're right? It's just scanning across the vars in the group and looking at their .attrs.

Member:
I don't fully understand this function, but the fact that it's the same as what we had in the pre-Beam release checks out to me!


    )
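
For orientation, here is a minimal sketch of what the helper collects: the union of every member array's `_ARRAY_DIMENSIONS` attribute. The in-memory group below is hypothetical (it mirrors the new test further down), and importing the private helper directly is assumed to be acceptable for illustration.

import zarr

from pangeo_forge_recipes.rechunking import _gather_coordinate_dimensions

# Hypothetical in-memory group with two arrays carrying xarray's
# _ARRAY_DIMENSIONS convention.
group = zarr.group()
group.create(name="temp", shape=(4, 3), chunks=(2, 3), dtype="f8")
group.create(name="time", shape=(4,), chunks=(2,), dtype="i4")
group["temp"].attrs["_ARRAY_DIMENSIONS"] = ["time", "lat"]
group["time"].attrs["_ARRAY_DIMENSIONS"] = ["time"]

# The helper returns the union of all dimension names; order is unspecified
# because a set is used internally.
print(sorted(_gather_coordinate_dimensions(group)))  # ['lat', 'time']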


def consolidate_dimension_coordinates(
    singleton_target_store: zarr.storage.FSStore,
) -> zarr.storage.FSStore:
    """Consolidate each dimension coordinate variable into a single chunk."""
    group = zarr.open_group(singleton_target_store)

    dims = (dim for dim in _gather_coordinate_dimensions(group) if dim in group)
    for dim in dims:
        arr = group[dim]
        attrs = dict(arr.attrs)
        data = arr[:]

        # This will generally use bulk-delete API calls
        # config.storage_config.target.rm(dim, recursive=True)

        singleton_target_store.fs.rm(singleton_target_store.path + "/" + dim, recursive=True)

        new = group.array(
            dim,
            data,
            chunks=arr.shape,
            dtype=arr.dtype,
            compressor=arr.compressor,
            fill_value=arr.fill_value,
            order=arr.order,
            filters=arr.filters,
            overwrite=True,
        )

        new.attrs.update(attrs)
    return singleton_target_store
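
As a usage sketch (mirroring the new test in tests/test_rechunking.py below, with a hypothetical temporary path and a local filesystem target), the function rewrites each dimension coordinate as one chunk in place and returns the same store:

import os
import tempfile

import numpy as np
import zarr

from pangeo_forge_recipes.rechunking import consolidate_dimension_coordinates

# Build a small on-disk store whose "time" coordinate is split into ten chunks.
tmpdir = tempfile.mkdtemp()
store_path = os.path.join(tmpdir, "example.zarr")
group = zarr.group(store=store_path, overwrite=True)
group.create(name="time", shape=100, chunks=10, dtype="i4")
group["time"][:] = np.arange(100)
group["time"].attrs["_ARRAY_DIMENSIONS"] = ["time"]

# After consolidation, the coordinate is stored as a single 100-element chunk.
consolidated = consolidate_dimension_coordinates(zarr.storage.FSStore(store_path))
assert zarr.open(consolidated)["time"].chunks == (100,)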
15 changes: 12 additions & 3 deletions pangeo_forge_recipes/transforms.py
@@ -21,7 +21,7 @@
from .combiners import CombineMultiZarrToZarr, CombineXarraySchemas
from .openers import open_url, open_with_kerchunk, open_with_xarray
from .patterns import CombineOp, Dimension, FileType, Index, augment_index_with_start_stop
from .rechunking import combine_fragments, split_fragment
from .rechunking import combine_fragments, consolidate_dimension_coordinates, split_fragment
from .storage import CacheFSSpecTarget, FSSpecTarget
from .writers import ZarrWriterMixin, store_dataset_fragment, write_combined_reference

@@ -412,6 +412,11 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
return new_fragments


class ConsolidateDimensionCoordinates(beam.PTransform):

Member:
Can we add a docstring here?

    def expand(self, pcoll: beam.PCollection[zarr.storage.FSStore]) -> beam.PCollection:
        return pcoll | beam.Map(consolidate_dimension_coordinates)
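
Regarding the docstring request above, one possible wording (an editor's sketch, not part of the PR) that describes what the transform expects and returns:

class ConsolidateDimensionCoordinates(beam.PTransform):
    """Rechunk each dimension coordinate of a Zarr store into a single chunk.

    Expects a PCollection containing a single ``zarr.storage.FSStore``
    (e.g. the singleton emitted by ``StoreToZarr``) and returns the same
    store after its dimension coordinates have been consolidated.
    """

    def expand(self, pcoll: beam.PCollection[zarr.storage.FSStore]) -> beam.PCollection:
        return pcoll | beam.Map(consolidate_dimension_coordinates)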


@dataclass
class CombineReferences(beam.PTransform):
"""Combines Kerchunk references into a single reference dataset.
@@ -572,6 +577,9 @@ class StoreToZarr(beam.PTransform, ZarrWriterMixin):
        `store_name` will be appended to this prefix to create a full path.
    :param target_chunks: Dictionary mapping dimension names to chunk sizes.
        If a dimension is not named, the chunks will be inferred from the data.
    :param consolidate_dimension_coordinates: Whether to rewrite each dimension coordinate
        variable as a single chunk. We recommend consolidating coordinate variables to avoid
        many small read requests when xarray fetches the coordinates. Defaults to ``False``.
Comment on lines +588 to +590

Member:
Point of clarification for my own understanding: do we know for sure that this makes a meaningful difference for kerchunk datasets?

Contributor Author:
Good question! I don't actually know. Maybe @abarciauskas-bgse has some insight into this?

Contributor:
I don't know whether there is a significant difference if we consolidate dimension coordinates in kerchunk datasets. I experienced slow performance when working with zarr stores and saw a relationship between performance (specifically for creating image tiles with rio_tiler's XarrayReader) and the number of coordinate chunks. The notebook that demonstrates this is unfortunately not a brief one (https://github.com/developmentseed/tile-benchmarking/blob/9758c732597c154d1cf7cd796b21c858c3130046/profiling/profile.ipynb), but I could streamline it if we need to do more investigation.

For kerchunk, I would guess the performance impact depends on whether a coordinate's chunks are read all at once or in multiple reads. If it's multiple reads, it might still hurt performance. Do we know which it is? Or should I investigate and run some more tests?
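
One way to answer that question empirically would be to count how many chunk references a coordinate has in the generated kerchunk reference file. A rough sketch, assuming a reference JSON at a hypothetical path and a coordinate named "time":

import json

# Load a kerchunk reference file (the path is illustrative).
with open("reference.json") as f:
    refs = json.load(f)["refs"]

# Chunk keys look like "time/0", "time/1", ...; metadata keys end in
# ".zarray"/".zattrs" and start with a dot after the slash.
time_chunk_keys = [
    key for key in refs
    if key.startswith("time/") and not key.split("/")[-1].startswith(".")
]

# One entry means the coordinate can be fetched in a single read;
# many entries mean many small reads.
print(f"time coordinate has {len(time_chunk_keys)} chunk reference(s)")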

    :param dynamic_chunking_fn: Optionally provide a function that takes an ``xarray.Dataset``
        template dataset as its first argument and returns a dynamically generated chunking dict.
        If provided, ``target_chunks`` cannot also be passed. You can use this to determine chunking
@@ -593,6 +601,7 @@ class StoreToZarr(beam.PTransform, ZarrWriterMixin):
    dynamic_chunking_fn: Optional[Callable[[xr.Dataset], dict]] = None
    dynamic_chunking_fn_kwargs: Optional[dict] = field(default_factory=dict)
    attrs: Dict[str, str] = field(default_factory=dict)
    consolidate_dimension_coordinates: bool = False

    def __post_init__(self):
        if self.target_chunks and self.dynamic_chunking_fn:
@@ -625,7 +634,7 @@ def expand(
            | beam.combiners.Sample.FixedSizeGlobally(1)
            | beam.FlatMap(lambda x: x)  # https://stackoverflow.com/a/47146582
        )
        # TODO: optionally use `singleton_target_store` to
        # consolidate metadata and/or coordinate dims here
        if self.consolidate_dimension_coordinates:
            singleton_target_store = singleton_target_store | ConsolidateDimensionCoordinates()

Member:
Suggested change
        # TODO: optionally use `singleton_target_store` to
        # consolidate metadata and/or coordinate dims here
        if self.consolidate_dimension_coordinates:
            singleton_target_store = singleton_target_store | ConsolidateDimensionCoordinates()

I went back and forth on this, but IMHO I think the right path here is to not do any consolidation by default, and leave it to users to compose modularly, i.e.:

recipe = (
    ...
    | StoreToZarr()
    | ConsolidateCoordinateDimensions()
    | ConsolidateMetadata()
)

This is more verbose for users, but it is also more explicit about what they can expect, it adheres to Beam's composable ethos, and it is less API creep for us to maintain in StoreToZarr.

Happy to discuss dissenting views here, but as of now this feels like the right path to me.
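
For concreteness, a sketch of the composition being proposed. `ConsolidateDimensionCoordinates` is the transform this PR adds (the comment above spells it `ConsolidateCoordinateDimensions`), `ConsolidateMetadata` is only proposed and is omitted here, and `pattern`/`target_root` are assumed to be a FilePattern and a target path as in the tests below:

import apache_beam as beam

from pangeo_forge_recipes.transforms import (
    ConsolidateDimensionCoordinates,
    OpenWithXarray,
    StoreToZarr,
)

# `pattern` and `target_root` are assumed to be defined as in the tests below.
with beam.Pipeline() as p:
    (
        p
        | beam.Create(pattern.items())
        | OpenWithXarray(file_type=pattern.file_type)
        | StoreToZarr(
            target_root=target_root,
            store_name="output.zarr",
            combine_dims=pattern.combine_dim_keys,
        )
        # StoreToZarr emits a singleton FSStore PCollection, which the
        # consolidation transform consumes and passes through.
        | ConsolidateDimensionCoordinates()
    )

With the flag-based approach in this PR, the same effect comes from passing `consolidate_dimension_coordinates=True` to `StoreToZarr` instead.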

Contributor Author:
Would love to chat more about this! I also like the composability, but I wonder about the trade-off for transformations that should be done on almost every recipe but might get overlooked by a newer recipe developer.

Contributor:
If you have a chat, I would love to attend if it fits on my schedule.

Member:
Thanks for weighing in! I am not 100% sure which design is best here, and appreciate the lively discussion so we can land on the best solution. (Here and in the comment thread above.)

Quoting the comment above: "…done on almost every recipe, but might get overlooked by a newer recipe developer."

This is a small point, but one that just came to mind: in my experience, for smaller zarr stores, consolidation doesn't really matter / have a performance impact. Therefore, in some sense do we care if newer recipe developers (which presumably implies smaller-scale as well?) overlook this?

We absolutely should have a design discussion on this, possibly before the next coordination meeting (to expedite the process). I will try to first outline what I see as the various options in a comment on this thread to solicit async feedback.

Contributor Author:
Awesome! Happy to chat before. I won't be able to make it to the next coordination meeting, so this week or next before Friday would be ideal for me.


        return singleton_target_store
29 changes: 29 additions & 0 deletions tests/test_end_to_end.py
@@ -8,6 +8,7 @@
import numpy as np
import pytest
import xarray as xr
import zarr
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_pipeline import TestPipeline
from fsspec.implementations.reference import ReferenceFileSystem
@@ -172,3 +173,31 @@ def test_reference_grib(
    # various inconsistencies (of dtype casting int to float, etc.). With the right combination of
    # options passed to the pipeline, seems like these should pass?
    # xr.testing.assert_equal(ds.load(), ds2)


@pytest.mark.parametrize("consolidate_dimension_coordinates", [False, True])
def test_xarray_zarr_consolidate_dimension_coordinates(
    netcdf_local_file_pattern_sequential,
    pipeline,
    tmp_target_url,
    consolidate_dimension_coordinates,
):
    pattern = netcdf_local_file_pattern_sequential
    with pipeline as p:
        (
            p
            | beam.Create(pattern.items())
            | OpenWithXarray(file_type=pattern.file_type)
            | StoreToZarr(
                target_root=tmp_target_url,
                store_name="subpath",
                combine_dims=pattern.combine_dim_keys,
                consolidate_dimension_coordinates=consolidate_dimension_coordinates,
            )
        )

    store = zarr.open(os.path.join(tmp_target_url, "subpath"))
    if consolidate_dimension_coordinates:
        assert store.time.chunks[0] == store.time.shape[0]
    else:
        assert store.time.chunks[0] != store.time.shape[0]
33 changes: 32 additions & 1 deletion tests/test_rechunking.py
@@ -1,11 +1,20 @@
import itertools
import os
import random
from collections import namedtuple
from tempfile import TemporaryDirectory

import numpy as np
import pytest
import xarray as xr
import zarr

from pangeo_forge_recipes.rechunking import GroupKey, combine_fragments, split_fragment
from pangeo_forge_recipes.rechunking import (
    GroupKey,
    combine_fragments,
    consolidate_dimension_coordinates,
    split_fragment,
)
from pangeo_forge_recipes.types import CombineOp, Dimension, Index, IndexedPosition, Position

from .conftest import split_up_files_by_variable_and_day
@@ -258,3 +267,25 @@ def test_combine_fragments_errors():
    index1 = Index({Dimension("time", CombineOp.CONCAT): IndexedPosition(2)})
    with pytest.raises(ValueError, match="are not consistent"):
        _ = combine_fragments(group, [(index0, ds), (index1, ds)])


def test_consolidate_dimension_coordinates():
    td = TemporaryDirectory()
    store_path = os.path.join(td.name, "tmp.zarr")
    group = zarr.group(store=store_path, overwrite=True)
    group.create(name="data", shape=100, chunks=10, dtype="i4")
    group.create(name="time", shape=100, chunks=10, dtype="i4")
    group.data[:] = np.random.randn(*group.data.shape)
    group.time[:] = np.arange(100)

    # Without these attrs, consolidate_dimension_coordinates does not
    # raise an error, although xarray would when opening the store.
    group.attrs["_ARRAY_DIMENSIONS"] = ["time"]
    group.data.attrs["_ARRAY_DIMENSIONS"] = ["time"]
    group.time.attrs["_ARRAY_DIMENSIONS"] = ["time"]

    consolidated_zarr = consolidate_dimension_coordinates(zarr.storage.FSStore(store_path))
    store = zarr.open(consolidated_zarr)
    assert store.time.chunks[0] == 100
    assert store.data.chunks[0] == 10
1 change: 1 addition & 0 deletions tests/test_transforms.py
@@ -310,6 +310,7 @@ def dynamic_chunking_fn(template_ds: xr.Dataset, divisor: int = 1):
combine_dims=pattern.combine_dim_keys,
attrs={},
dynamic_chunking_fn=dynamic_chunking_fn,
consolidate_dimension_coordinates=False,
**kws,
)
open_store = target_store | OpenZarrStore()