Skip to content

Commit

Permalink
Merge pull request #167 from csiro-coasts/166-multifile-datasets-dont…
Browse files Browse the repository at this point in the history
…-work-with-cache-key-generation

Updated hash key generation to handle missing encoding dtype.
  • Loading branch information
david-sh-csiro authored Jan 21, 2025
2 parents 0bbb470 + 38926af commit 0579a7c
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 2 deletions.
3 changes: 2 additions & 1 deletion docs/releases/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
Next release (in development)
=============================

* ...
* Fix dataset hash_key generation when the geometry encoding
  is missing a dtype (:issue:`166`, :pr:`167`).
4 changes: 3 additions & 1 deletion src/emsarray/conventions/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1973,7 +1973,9 @@ def hash_geometry(self, hash: "hashlib._Hash") -> None:
# Include the dtype of the data array.
# A float array and an int array mean very different things,
# but could have identical byte patterns.
hash_string(hash, data_array.encoding['dtype'].name)
# Checking for encoding dtype and falling back to values.dtype due to
# xarray multifile dataset bug - https://github.com/pydata/xarray/issues/2436
hash_string(hash, data_array.encoding.get('dtype', data_array.values.dtype).name)

# Include the size and shape of the data.
# 1D coordinate arrays are very different to 2D coordinate arrays,
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
39 changes: 39 additions & 0 deletions tests/operations/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib

import pytest
import xarray

import emsarray
import emsarray.operations.cache
Expand All @@ -12,6 +13,8 @@
int_hash = '7b08e025e311c3dfcf5179b67c0fdc08e73de261'
attr_hash_lat = "2cb433979fc2d9c3884eea8569dd6a44406950f3"
cache_key_hash_cf1d_sha1 = "2b006999273225ed70d4810357b6a06e6bebe9a6"
cache_key_hash_multifile_cf2d_sha1 = "ea2d2e6131f1e499f622e83ed4fc2415649def06"
cache_key_hash_multifile_ugrid_mesh2d_sha1 = "1d72e01b159135208324ae9a643166f85aecba27"

# Blake2b
cache_key_hash_cf1d = "1a3226072f08441ee79f727b0775709209ff2965299539c898ecc401cf17e23f"
Expand Down Expand Up @@ -200,3 +203,39 @@ def test_cache_key_cfgrid1d_sha1(datasets: pathlib.Path):
assert result_cache_key_cf is not None

assert result_cache_key_cf == cache_key_hash_cf1d_sha1


def test_cache_key_with_multifile_dataset_ugrid_mesh2d(datasets: pathlib.Path):
    """
    Hash the geometry of a UGRID mesh2d dataset opened from multiple files.

    Regression test for issue 166: the geometry hash of a dataset opened
    with :func:`xarray.open_mfdataset` must be computable
    and must match the known SHA-1 digest.
    """
    dataset_dir = datasets / 'multifile_datasets/ugrid_mesh2d'
    dataset_paths = [
        dataset_dir / 'ugrid_mesh2d_2024-01-01.nc',
        dataset_dir / 'ugrid_mesh2d_2024-01-02.nc',
    ]

    multifile_ds_hash = hashlib.sha1()

    # Open the dataset as a context manager so the underlying files are
    # closed once hashing is done, rather than leaking open file handles.
    # The hashing must happen inside the block while the files are still open.
    with xarray.open_mfdataset(dataset_paths, data_vars=['values']) as multifile_dataset:
        multifile_dataset.ems.hash_geometry(multifile_ds_hash)

    multifile_ds_digest = multifile_ds_hash.hexdigest()

    assert multifile_ds_digest == cache_key_hash_multifile_ugrid_mesh2d_sha1


def test_cache_key_with_multifile_dataset_cfgrid2d(datasets: pathlib.Path):
    """
    Hash the geometry of a CFGrid2D dataset opened from multiple files.

    Regression test for issue 166: the geometry hash of a dataset opened
    with :func:`xarray.open_mfdataset` must be computable
    and must match the known SHA-1 digest.
    """
    dataset_dir = datasets / 'multifile_datasets/cfgrid2d'
    dataset_paths = [
        dataset_dir / 'cfgrid2d_2024-01-01.nc',
        dataset_dir / 'cfgrid2d_2024-01-02.nc',
    ]

    multifile_ds_hash = hashlib.sha1()

    # Open the dataset as a context manager so the underlying files are
    # closed once hashing is done, rather than leaking open file handles.
    # The hashing must happen inside the block while the files are still open.
    with xarray.open_mfdataset(dataset_paths, data_vars=['values']) as multifile_dataset:
        multifile_dataset.ems.hash_geometry(multifile_ds_hash)

    multifile_ds_digest = multifile_ds_hash.hexdigest()

    assert multifile_ds_digest == cache_key_hash_multifile_cf2d_sha1

0 comments on commit 0579a7c

Please sign in to comment.