Skip to content

Commit

Permalink
Merge pull request #429 from alxmrs/cache-hash
Browse files Browse the repository at this point in the history
Avoid serialization blues by computing + caching the hash.
  • Loading branch information
alxmrs authored Oct 26, 2022
2 parents 871c5da + 744fd44 commit beeb15a
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 16 deletions.
1 change: 1 addition & 0 deletions pangeo_forge_recipes/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ def items(self):
for key in self:
yield key, self[key]

@property
def sha256(self):
"""Compute a sha256 hash for the instance."""

Expand Down
13 changes: 7 additions & 6 deletions pangeo_forge_recipes/recipes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from abc import ABC
from dataclasses import dataclass, field, replace
from typing import Callable, ClassVar
from typing import Callable, ClassVar, Optional

import pkg_resources # type: ignore

Expand All @@ -16,6 +16,10 @@
class BaseRecipe(ABC):
_compiler: ClassVar[RecipeCompiler]
_hash_exclude_ = ["storage_config"]
sha256: Optional[bytes] = None

def __post_init__(self):
self.sha256 = dataclass_sha256(self, ignore_keys=self._hash_exclude_)

def to_function(self):
from ..executors import FunctionPipelineExecutor
Expand Down Expand Up @@ -49,15 +53,12 @@ def to_beam(self):

return BeamPipelineExecutor.compile(self._compiler())

def sha256(self):
return dataclass_sha256(self, ignore_keys=self._hash_exclude_)

def get_execution_context(self):
return dict(
# See https://stackoverflow.com/a/2073599 re: version
version=pkg_resources.require("pangeo-forge-recipes")[0].version,
recipe_hash=self.sha256().hex(),
inputs_hash=self.file_pattern.sha256().hex(),
recipe_hash=self.sha256.hex(),
inputs_hash=self.file_pattern.sha256.hex(),
)


Expand Down
1 change: 1 addition & 0 deletions pangeo_forge_recipes/recipes/reference_hdf_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ class HDFReferenceRecipe(BaseRecipe, StorageMixin, FilePatternMixin):
postprocess: Optional[Callable] = None

def __post_init__(self):
super().__post_init__()
self._validate_file_pattern()

def _validate_file_pattern(self):
Expand Down
1 change: 1 addition & 0 deletions pangeo_forge_recipes/recipes/xarray_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,7 @@ class XarrayZarrRecipe(BaseRecipe, StorageMixin, FilePatternMixin):
"""How many items per input along concat_dim."""

def __post_init__(self):
super().__post_init__()
self._validate_file_pattern()

# from here on we know there is at most one merge dim and one concat dim
Expand Down
4 changes: 3 additions & 1 deletion pangeo_forge_recipes/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ def either_encode_or_hash(obj: Any):
if isinstance(obj, Enum): # custom serializer for FileType, CombineOp, etc.
return obj.value
elif hasattr(obj, "sha256"):
return obj.sha256().hex()
return obj.sha256.hex()
elif inspect.isfunction(obj):
return inspect.getsource(obj)
elif isinstance(obj, bytes):
return obj.hex()
raise TypeError(f"object of type {type(obj).__name__} not serializable")


Expand Down
18 changes: 9 additions & 9 deletions tests/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_match_pattern_blockchain(base_pattern, end_date, new_pattern_nitems_per
for k, v in kwargs[i].items():
setattr(pattern, k, v)

matching_key = match_pattern_blockchain(base_pattern.sha256(), new_pattern)
matching_key = match_pattern_blockchain(base_pattern.sha256, new_pattern)

if kwargs[0] == kwargs[1] and new_pattern_nitems_per_file == 1:
assert new_pattern[matching_key] == next_url
Expand All @@ -92,14 +92,14 @@ def test_recipe_sha256_hash_exclude(base_pattern, recipe_cls, tmpdir_factory):
recipe_0 = recipe_cls(base_pattern)
recipe_1 = recipe_cls(base_pattern)

assert recipe_0.sha256() == recipe_1.sha256()
assert recipe_0.sha256 == recipe_1.sha256

local_fs = LocalFileSystem()
custom_target_path = tmpdir_factory.mktemp("custom_target")
custom_storage_config = StorageConfig(target=FSSpecTarget(local_fs, custom_target_path))
recipe_1.storage_config = custom_storage_config

assert recipe_0.sha256() == recipe_1.sha256()
assert recipe_0.sha256 == recipe_1.sha256


@pytest.mark.parametrize(
Expand All @@ -117,9 +117,9 @@ def test_xarray_zarr_sha265(pattern_pair, kwargs):
recipe_1 = XarrayZarrRecipe(pattern_pair[1], **kwargs[1])

if pattern_pair[0] == pattern_pair[1] and kwargs[0] == kwargs[1]:
assert recipe_0.sha256() == recipe_1.sha256()
assert recipe_0.sha256 == recipe_1.sha256
else:
assert recipe_0.sha256() != recipe_1.sha256()
assert recipe_0.sha256 != recipe_1.sha256


@pytest.mark.parametrize(
Expand All @@ -136,9 +136,9 @@ def test_kerchunk_sha265(pattern_pair, kwargs):
recipe_1 = HDFReferenceRecipe(pattern_pair[1], **kwargs[1])

if pattern_pair[0] == pattern_pair[1] and kwargs[0] == kwargs[1]:
assert recipe_0.sha256() == recipe_1.sha256()
assert recipe_0.sha256 == recipe_1.sha256
else:
assert recipe_0.sha256() != recipe_1.sha256()
assert recipe_0.sha256 != recipe_1.sha256


@pytest.mark.parametrize("cls", [XarrayZarrRecipe, HDFReferenceRecipe])
Expand All @@ -165,9 +165,9 @@ class NewRelease(cls):
new_release_obj = NewRelease(base_pattern, **kwargs)

if not kwargs:
assert old_release_obj.sha256() == new_release_obj.sha256()
assert old_release_obj.sha256 == new_release_obj.sha256
else:
assert old_release_obj.sha256() != new_release_obj.sha256()
assert old_release_obj.sha256 != new_release_obj.sha256


def test_either_encode_or_hash_raises():
Expand Down

0 comments on commit beeb15a

Please sign in to comment.