From 39722e7019e3af5f8079c8f3d4d734dd8c866aeb Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Fri, 4 Oct 2024 16:37:44 -0400
Subject: [PATCH 01/40] Save progress for next week

---
 kerchunk/combine.py                   |  8 ++++----
 kerchunk/fits.py                      |  2 +-
 kerchunk/grib2.py                     |  4 ++--
 kerchunk/hdf4.py                      |  2 +-
 kerchunk/netCDF3.py                   |  2 +-
 kerchunk/tests/test_combine.py        |  6 +++---
 kerchunk/tests/test_combine_concat.py | 20 ++++++++++----------
 kerchunk/tests/test_fits.py           | 10 +++++-----
 kerchunk/tests/test_grib.py           | 10 +++++-----
 kerchunk/tests/test_hdf.py            | 20 ++++++++++----------
 kerchunk/tests/test_tiff.py           |  4 ++--
 kerchunk/tests/test_utils.py          |  8 ++++----
 kerchunk/utils.py                     |  2 +-
 pyproject.toml                        |  2 +-
 14 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/kerchunk/combine.py b/kerchunk/combine.py
index eb891de1..155ba4c9 100644
--- a/kerchunk/combine.py
+++ b/kerchunk/combine.py
@@ -203,7 +203,7 @@ def append(
        ds = xr.open_dataset(
            fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
        )
-        z = zarr.open(fs.get_mapper())
+        z = zarr.open(fs.get_mapper(), zarr_version=2)
        mzz = MultiZarrToZarr(
            path,
            out=fs.references,  # dict or parquet/lazy
@@ -360,7 +360,7 @@ def first_pass(self):
                fs._dircache_from_items()

            logger.debug("First pass: %s", i)
-            z = zarr.open_group(fs.get_mapper(""))
+            z = zarr.open_group(fs.get_mapper(""), zarr_version=2)
            for var in self.concat_dims:
                value = self._get_value(i, z, var, fn=self._paths[i])
                if isinstance(value, np.ndarray):
@@ -387,7 +387,7 @@ def store_coords(self):
        """
        kv = {}
        store = zarr.storage.KVStore(kv)
-        group = zarr.open(store)
+        group = zarr.open(store, zarr_version=2)
        m = self.fss[0].get_mapper("")
        z = zarr.open(m)
        for k, v in self.coos.items():
@@ -461,7 +461,7 @@ def second_pass(self):
        for i, fs in enumerate(self.fss):
            to_download = {}
            m = fs.get_mapper("")
-            z = zarr.open(m)
+            z = zarr.open(m, zarr_version=2)

            if no_deps is None:
                # done first time only
diff --git a/kerchunk/fits.py b/kerchunk/fits.py
index 18729a9b..f714af97 100644
--- a/kerchunk/fits.py
+++ b/kerchunk/fits.py
@@ -72,7 +72,7 @@ def process_file(
    storage_options = storage_options or {}
    out = out or {}
-    g = zarr.open(out)
+    g = zarr.open(out, zarr_version=2)

    with fsspec.open(url, mode="rb", **storage_options) as f:
        infile = fits.open(f, do_not_scale_image_data=True)
diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py
index f105fe8b..06108db5 100644
--- a/kerchunk/grib2.py
+++ b/kerchunk/grib2.py
@@ -191,7 +191,7 @@ def scan_grib(
            if good is False:
                continue

-            z = zarr.open_group(store)
+            z = zarr.open_group(store, zarr_version=2)
            global_attrs = {
                f"GRIB_{k}": m[k]
                for k in cfgrib.dataset.GLOBAL_ATTRIBUTES_KEYS
@@ -398,7 +398,7 @@ def grib_tree(
    # TODO allow passing a LazyReferenceMapper as output?
    zarr_store = {}
-    zroot = zarr.open_group(store=zarr_store)
+    zroot = zarr.open_group(store=zarr_store, zarr_version=2)

    aggregations: Dict[str, List] = defaultdict(list)
    aggregation_dims: Dict[str, Set] = defaultdict(set)
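The change repeated across this first patch is uniform: every `zarr.open`/`zarr.open_group` call gains an explicit `zarr_version=2` pin, because kerchunk references carry zarr v2 metadata (`.zarray`/`.zattrs` keys) that the zarr-python 3.x alphas no longer assume by default. A minimal sketch of the reading-side pattern being protected, assuming any kerchunk reference dict:

    import fsspec
    import zarr

    # a minimal but valid kerchunk reference set (placeholder content)
    refs = {"version": 1, "refs": {".zgroup": '{"zarr_format": 2}'}}
    fs = fsspec.filesystem("reference", fo=refs)
    # Without the pin, zarr-python 3.x would look for v3 zarr.json metadata
    # instead of the v2 .zarray/.zattrs keys that kerchunk writes.
    z = zarr.open_group(fs.get_mapper(""), zarr_version=2)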
diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py
index 483ffba7..4235d139 100644
--- a/kerchunk/hdf4.py
+++ b/kerchunk/hdf4.py
@@ -144,7 +144,7 @@ def translate(self, filename=None, storage_options=None):
            remote_protocol=prot,
            remote_options=self.st,
        )
-        g = zarr.open_group("reference://", storage_options=dict(fs=fs))
+        g = zarr.open_group("reference://", storage_options=dict(fs=fs), zarr_version=2)
        refs = {}
        for k, v in output.items():
            if isinstance(v, dict):
diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py
index d43b6b97..8e0994ca 100644
--- a/kerchunk/netCDF3.py
+++ b/kerchunk/netCDF3.py
@@ -167,7 +167,7 @@ def translate(self):
        import zarr

        out = self.out
-        z = zarr.open(out, mode="w")
+        z = zarr.open(out, mode="w", zarr_version=2)
        for dim, var in self.variables.items():
            if dim in self.chunks:
                shape = self.chunks[dim][-1]
diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py
index 13994921..1b5713b2 100644
--- a/kerchunk/tests/test_combine.py
+++ b/kerchunk/tests/test_combine.py
@@ -133,14 +133,14 @@

 # simple time arrays - xarray can't make these!
 m = fs.get_mapper("time1.zarr")
-z = zarr.open(m, mode="w")
+z = zarr.open(m, mode="w", zarr_version=2)
 ar = z.create_dataset("time", data=np.array([1], dtype="M8[s]"))
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]})
 ar = z.create_dataset("data", data=arr)
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]})

 m = fs.get_mapper("time2.zarr")
-z = zarr.open(m, mode="w")
+z = zarr.open(m, mode="w", zarr_version=2)
 ar = z.create_dataset("time", data=np.array([2], dtype="M8[s]"))
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]})
 ar = z.create_dataset("data", data=arr)
@@ -272,7 +272,7 @@ def test_get_coos(refs, selector, expected):
    mzz.first_pass()
    assert mzz.coos["time"].tolist() == expected
    mzz.store_coords()
-    g = zarr.open(mzz.out)
+    g = zarr.open(mzz.out, zarr_version=2)
    assert g["time"][:].tolist() == expected
    assert dict(g.attrs)
diff --git a/kerchunk/tests/test_combine_concat.py b/kerchunk/tests/test_combine_concat.py
index 3f7ff823..f51f10e8 100644
--- a/kerchunk/tests/test_combine_concat.py
+++ b/kerchunk/tests/test_combine_concat.py
@@ -51,7 +51,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
    refs = []
    for i, x in enumerate(arrays):
        fn = f"{tmpdir}/out{i}.zarr"
-        g = zarr.open(fn)
+        g = zarr.open(fn, zarr_version=2)
        g.create_dataset("x", data=x, chunks=chunks)
        fns.append(fn)
        ref = kerchunk.zarr.single_zarr(fn, inline=0)
@@ -62,7 +62,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
    )

    mapper = fsspec.get_mapper("reference://", fo=out)
-    g = zarr.open(mapper)
+    g = zarr.open(mapper, zarr_version=2)
    assert (g.x[:] == np.concatenate(arrays, axis=axis)).all()

    try:
@@ -76,7 +76,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
        remote_protocol="file",
        skip_instance_cache=True,
    )
-    g = zarr.open(mapper)
+    g = zarr.open(mapper, zarr_version=2)
    assert (g.x[:] == np.concatenate(arrays, axis=axis)).all()

    kerchunk.df.refs_to_dataframe(out, "memory://out.parq", record_size=1)
@@ -86,7 +86,7 @@ def test_success(tmpdir, arrays, chunks, axis, m):
        remote_protocol="file",
        skip_instance_cache=True,
    )
-    g = zarr.open(mapper)
+    g = zarr.open(mapper, zarr_version=2)
    assert (g.x[:] == np.concatenate(arrays, axis=axis)).all()

@@ -95,9 +95,9 @@ def test_fail_chunks(tmpdir):
    fn2 = f"{tmpdir}/out2.zarr"
    x1 = np.arange(10)
    x2 = np.arange(10, 20)
-    g = zarr.open(fn1)
+    g = zarr.open(fn1, zarr_version=2)
    g.create_dataset("x", data=x1, chunks=(2,))
-    g = zarr.open(fn2)
+    g = zarr.open(fn2, zarr_version=2)
    g.create_dataset("x", data=x2, chunks=(3,))

    ref1 = kerchunk.zarr.single_zarr(fn1, inline=0)
@@ -112,9 +112,9 @@ def test_fail_shape(tmpdir):
    fn2 = f"{tmpdir}/out2.zarr"
    x1 = np.arange(12).reshape(6, 2)
    x2 = np.arange(12, 24)
-    g = zarr.open(fn1)
+    g = zarr.open(fn1, zarr_version=2)
    g.create_dataset("x", data=x1, chunks=(2,))
-    g = zarr.open(fn2)
+    g = zarr.open(fn2, zarr_version=2)
    g.create_dataset("x", data=x2, chunks=(2,))

    ref1 = kerchunk.zarr.single_zarr(fn1, inline=0)
@@ -129,9 +129,9 @@ def test_fail_irregular_chunk_boundaries(tmpdir):
    fn2 = f"{tmpdir}/out2.zarr"
    x1 = np.arange(10)
    x2 = np.arange(10, 24)
-    g = zarr.open(fn1)
+    g = zarr.open(fn1, zarr_version=2)
    g.create_dataset("x", data=x1, chunks=(4,))
-    g = zarr.open(fn2)
+    g = zarr.open(fn2, zarr_version=2)
    g.create_dataset("x", data=x2, chunks=(4,))

    ref1 = kerchunk.zarr.single_zarr(fn1, inline=0)
diff --git a/kerchunk/tests/test_fits.py b/kerchunk/tests/test_fits.py
index 14ea6fc0..e7211479 100644
--- a/kerchunk/tests/test_fits.py
+++ b/kerchunk/tests/test_fits.py
@@ -18,7 +18,7 @@ def test_ascii_table():
    url = "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits"
    out = kerchunk.fits.process_file(url, extension=1)
    m = fsspec.get_mapper("reference://", fo=out, remote_protocol="https")
-    g = zarr.open(m)
+    g = zarr.open(m, zarr_version=2)
    arr = g["u5780205r_cvt.c0h.tab"][:]
    with fsspec.open(
        "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits"
@@ -31,7 +31,7 @@ def test_binary_table():
    out = kerchunk.fits.process_file(btable, extension=1)
    m = fsspec.get_mapper("reference://", fo=out)
-    z = zarr.open(m)
+    z = zarr.open(m, zarr_version=2)
    arr = z["1"]
    with open(btable, "rb") as f:
        hdul = fits.open(f)
@@ -48,7 +48,7 @@ def test_cube():
    out = kerchunk.fits.process_file(range_im)
    m = fsspec.get_mapper("reference://", fo=out)
-    z = zarr.open(m)
+    z = zarr.open(m, zarr_version=2)
    arr = z["PRIMARY"]
    with open(range_im, "rb") as f:
        hdul = fits.open(f)
@@ -61,7 +61,7 @@ def test_with_class():
    out = ftz.translate()
    assert "fits" in repr(ftz)
    m = fsspec.get_mapper("reference://", fo=out)
-    z = zarr.open(m)
+    z = zarr.open(m, zarr_version=2)
    arr = z["PRIMARY"]
    with open(range_im, "rb") as f:
        hdul = fits.open(f)
@@ -76,7 +76,7 @@ def test_var():
    ftz = kerchunk.fits.FitsToZarr(var)
    out = ftz.translate()
    m = fsspec.get_mapper("reference://", fo=out)
-    z = zarr.open(m)
+    z = zarr.open(m, zarr_version=2)
    arr = z["1"]
    vars = [_.tolist() for _ in arr["var"]]
diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py
index 32092ced..91ae9ac7 100644
--- a/kerchunk/tests/test_grib.py
+++ b/kerchunk/tests/test_grib.py
@@ -119,7 +119,7 @@ def test_grib_tree():
    corrected_msg_groups = [correct_hrrr_subhf_step(msg) for msg in scanned_msg_groups]
    result = grib_tree(corrected_msg_groups)
    fs = fsspec.filesystem("reference", fo=result)
-    zg = zarr.open_group(fs.get_mapper(""))
+    zg = zarr.open_group(fs.get_mapper(""), zarr_version=2)
    assert isinstance(zg["refc/instant/atmosphere/refc"], zarr.Array)
    assert isinstance(zg["vbdsf/avg/surface/vbdsf"], zarr.Array)
    assert set(zg["vbdsf/avg/surface"].attrs["coordinates"].split()) == set(
@@ -147,14 +147,14 @@ def test_correct_hrrr_subhf_group_step():
        scanned_msgs = ujson.load(fobj)

    original_zg = [
-        zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""))
+        zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_version=2)
        for val in scanned_msgs
    ]

    corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs]

    corrected_zg = [
-        zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""))
+        zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_version=2)
        for val in corrected_msgs
    ]
@@ -177,7 +177,7 @@ def test_hrrr_subhf_corrected_grib_tree():
    corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs]
    merged = grib_tree(corrected_msgs)
-    zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""))
+    zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_version=2)
    # Check the values and shape of the time coordinates
    assert zg.u.instant.heightAboveGround.step[:].tolist() == [
        0.0,
@@ -220,7 +220,7 @@ def test_hrrr_sfcf_grib_tree():
    with open(fpath, "rb") as fobj:
        scanned_msgs = ujson.load(fobj)
    merged = grib_tree(scanned_msgs)
-    zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""))
+    zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_version=2)
    # Check the heightAboveGround level shape of the time coordinates
    assert zg.u.instant.heightAboveGround.heightAboveGround[()] == 80.0
    assert zg.u.instant.heightAboveGround.heightAboveGround.shape == ()
diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py
index 69fd22b5..2f825e6d 100644
--- a/kerchunk/tests/test_hdf.py
+++ b/kerchunk/tests/test_hdf.py
@@ -193,7 +193,7 @@ def test_string_embed():
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
    assert txt in fs.references["vlen_str/0"]
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str.dtype == "O"
    assert z.vlen_str[0] == txt
    assert (z.vlen_str[1:] == "").all()
@@ -204,7 +204,7 @@ def test_string_null():
    h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="null", inline_threshold=0)
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str.dtype == "O"
    assert (z.vlen_str[:] == None).all()
@@ -217,7 +217,7 @@ def test_string_leave():
    )
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str.dtype == "S16"
    assert z.vlen_str[0]  # some obscured ID
    assert (z.vlen_str[1:] == b"").all()
@@ -232,7 +232,7 @@ def test_string_decode():
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
    assert txt in fs.cat("vlen_str/.zarray").decode()  # stored in filter def
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str[0] == txt
    assert (z.vlen_str[1:] == "").all()
@@ -243,7 +243,7 @@ def test_compound_string_null():
    h = kerchunk.hdf.SingleHdf5ToZarr(f, fn, vlen_encode="null", inline_threshold=0)
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str[0].tolist() == (10, None)
    assert (z.vlen_str["ints"][1:] == 0).all()
    assert (z.vlen_str["strs"][1:] == None).all()
@@ -257,7 +257,7 @@ def test_compound_string_leave():
    )
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str["ints"][0] == 10
    assert z.vlen_str["strs"][0]  # random ID
    assert (z.vlen_str["ints"][1:] == 0).all()
@@ -272,7 +272,7 @@ def test_compound_string_encode():
    )
    out = h.translate()
    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    assert z.vlen_str["ints"][0] == 10
    assert z.vlen_str["strs"][0] == "water"
    assert (z.vlen_str["ints"][1:] == 0).all()
@@ -303,7 +303,7 @@ def test_compress():
            continue
        out = h.translate()
        m = fsspec.get_mapper("reference://", fo=out)
-        g = zarr.open(m)
+        g = zarr.open(m, zarr_version=2)
        assert np.mean(g.data) == 49.5
@@ -313,7 +313,7 @@ def test_embed():
    out = h.translate()

    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)
    data = z["Domain_10"]["STER"]["min_1"]["boom_1"]["temperature"][:]
    assert data[0].tolist() == [
        "2014-04-01 00:00:00.0",
@@ -348,7 +348,7 @@ def test_translate_links():
        preserve_linked_dsets=True
    )
    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper())
+    z = zarr.open(fs.get_mapper(), zarr_version=2)

    # 1. Test the hard linked datasets were translated correctly
    # 2. Test the soft linked datasets were translated correctly
diff --git a/kerchunk/tests/test_tiff.py b/kerchunk/tests/test_tiff.py
index 3cc52471..4011a67a 100644
--- a/kerchunk/tests/test_tiff.py
+++ b/kerchunk/tests/test_tiff.py
@@ -16,7 +16,7 @@ def test_one():
    fn = files[0]
    out = kerchunk.tiff.tiff_to_zarr(fn)
    m = fsspec.get_mapper("reference://", fo=out)
-    z = zarr.open(m)
+    z = zarr.open(m, zarr_version=2)
    assert list(z) == ["0", "1", "2"]
    assert z.attrs["multiscales"] == [
        {
@@ -34,7 +34,7 @@ def test_coord():
    fn = files[0]
    out = kerchunk.tiff.tiff_to_zarr(fn)
    m = fsspec.get_mapper("reference://", fo=out)
-    z = zarr.open(m)  # highest res is the one xarray picks
+    z = zarr.open(m, zarr_version=2)  # highest res is the one xarray picks
    out = kerchunk.tiff.generate_coords(z.attrs, z[0].shape)

    ds = xr.open_dataset(fn)
diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py
index a1bb094d..8e4502c1 100644
--- a/kerchunk/tests/test_utils.py
+++ b/kerchunk/tests/test_utils.py
@@ -79,13 +79,13 @@ def test_inline_array():
    assert "data/1" not in out2
    assert json.loads(out2["data/.zattrs"]) == json.loads(refs["data/.zattrs"])
    fs = fsspec.filesystem("reference", fo=out2)
-    g = zarr.open(fs.get_mapper())
+    g = zarr.open(fs.get_mapper(), zarr_version=2)
    assert g.data[:].tolist() == [1, 2]

    out3 = kerchunk.utils.inline_array(refs, threshold=1000)  # inlines because of size
    assert "data/1" not in out3
    fs = fsspec.filesystem("reference", fo=out3)
-    g = zarr.open(fs.get_mapper())
+    g = zarr.open(fs.get_mapper(), zarr_version=2)
    assert g.data[:].tolist() == [1, 2]

@@ -99,7 +99,7 @@ def test_json():
@pytest.mark.parametrize("chunks", [[10, 10], [5, 10]])
def test_subchunk_exact(m, chunks):
    store = m.get_mapper("test.zarr")
-    g = zarr.open_group(store, mode="w")
+    g = zarr.open_group(store, mode="w", zarr_version=2)
    data = np.arange(100).reshape(10, 10)
    arr = g.create_dataset("data", data=data, chunks=chunks, compression=None)
    ref = kerchunk.zarr.single_zarr("memory://test.zarr")["refs"]
@@ -114,7 +114,7 @@ def test_subchunk_exact(m, chunks):
    ]

    g2 = zarr.open_group(
-        "reference://", storage_options={"fo": out, "remote_protocol": "memory"}
+        "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_version=2
    )
    assert (g2.data[:] == data).all()
diff --git a/kerchunk/utils.py b/kerchunk/utils.py
index 838c3cb1..4049ee63 100644
--- a/kerchunk/utils.py
+++ b/kerchunk/utils.py
@@ -226,7 +226,7 @@ def inline_array(store, threshold=1000, names=None, remote_options=None):
    fs = fsspec.filesystem(
        "reference", fo=store, **(remote_options or {}), skip_instance_cache=True
    )
-    g = zarr.open_group(fs.get_mapper(), mode="r+")
+    g = zarr.open_group(fs.get_mapper(), mode="r+", zarr_version=2)
    _inline_array(g, threshold, names=names or [])
    return fs.references
diff --git a/pyproject.toml b/pyproject.toml
index 415c3cbd..680f4c2f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
    "numcodecs",
    "numpy",
    "ujson",
-    "zarr<3",
+    "zarr==3.0.0a6",
 ]

 [project.optional-dependencies]

From d3c7e372cfa6f6822361441df79e872c9b68ee4c Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Sat, 5 Oct 2024 09:49:38 -0400
Subject: [PATCH 02/40] Bump zarr python version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 680f4c2f..6e57e223 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ dependencies = [
    "numcodecs",
    "numpy",
    "ujson",
-    "zarr==3.0.0a6",
+    "zarr==3.0.0a7",
 ]

 [project.optional-dependencies]

From 25d7d14e5fb6e563012d1547013d92f28834bcec Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Sat, 5 Oct 2024 09:58:35 -0400
Subject: [PATCH 03/40] Get some tests working, others failing

---
 kerchunk/hdf.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py
index 549923d4..777201b5 100644
--- a/kerchunk/hdf.py
+++ b/kerchunk/hdf.py
@@ -21,11 +21,11 @@
    "for more details."
 )

-try:
-    from zarr.meta import encode_fill_value
-except ModuleNotFoundError:
-    # https://github.com/zarr-developers/zarr-python/issues/2021
-    from zarr.v2.meta import encode_fill_value
+# try:
+#     from zarr.meta import encode_fill_value
+# except ModuleNotFoundError:
+#     # https://github.com/zarr-developers/zarr-python/issues/2021
+#     from zarr.v2.meta import encode_fill_value

 lggr = logging.getLogger("h5-to-zarr")
 _HIDDEN_ATTRS = {  # from h5netcdf.attrs
@@ -465,9 +465,10 @@ def _translator(
            if h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return
            if h5obj.attrs.get("_FillValue") is not None:
-                fill = encode_fill_value(
-                    h5obj.attrs.get("_FillValue"), dt or h5obj.dtype
-                )
+                fill = h5obj.attrs.get("_FillValue")
+                # fill = encode_fill_value(
+                #     h5obj.attrs.get("_FillValue"), dt or h5obj.dtype
+                # )

            # Create a Zarr array equivalent to this HDF5 dataset...
            za = self._zroot.require_dataset(
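PATCH 03 stubs out zarr's `encode_fill_value` because its import location disappeared in the 3.x alphas; PATCH 04 vendors a copy into kerchunk/hdf.py, and PATCH 09 later moves it to kerchunk/utils.py. For orientation, a hedged sketch of what that vendored helper returns, matching the branches of the copy shown in kerchunk/utils.py further down (assuming that module is importable):

    import base64
    import numpy as np
    from kerchunk.utils import encode_fill_value  # vendored in PATCH 09

    # float fills become JSON-safe strings
    assert encode_fill_value(float("nan"), np.dtype("f4")) == "NaN"
    assert encode_fill_value(float("inf"), np.dtype("f8")) == "Infinity"
    # integer fills become plain Python scalars
    assert encode_fill_value(np.int16(7), np.dtype("i2")) == 7
    # bytes fills are base64-encoded, e.g. b"\x00" -> "AA=="
    assert encode_fill_value(b"\x00", np.dtype("S1")) == base64.standard_b64encode(b"\x00").decode("ascii")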
From ffe5f9d906381be23b41496e167d1d44835a5486 Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Tue, 8 Oct 2024 17:07:53 -0400
Subject: [PATCH 04/40] get through single hdf to zarr

---
 kerchunk/combine.py                   |  8 +--
 kerchunk/fits.py                      |  2 +-
 kerchunk/grib2.py                     |  4 +-
 kerchunk/hdf.py                       | 94 ++++++++++++++++++++++-----
 kerchunk/hdf4.py                      |  2 +-
 kerchunk/netCDF3.py                   |  2 +-
 kerchunk/tests/test_combine.py        |  6 +-
 kerchunk/tests/test_combine_concat.py | 20 +++---
 kerchunk/tests/test_fits.py           | 10 +--
 kerchunk/tests/test_grib.py           | 10 +--
 kerchunk/tests/test_hdf.py            | 23 ++++---
 kerchunk/tests/test_tiff.py           |  4 +-
 kerchunk/tests/test_utils.py          |  8 +--
 kerchunk/utils.py                     |  2 +-
 14 files changed, 129 insertions(+), 66 deletions(-)

diff --git a/kerchunk/combine.py b/kerchunk/combine.py
index 155ba4c9..b02fa395 100644
--- a/kerchunk/combine.py
+++ b/kerchunk/combine.py
@@ -203,7 +203,7 @@ def append(
        ds = xr.open_dataset(
            fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
        )
-        z = zarr.open(fs.get_mapper(), zarr_version=2)
+        z = zarr.open(fs.get_mapper(), zarr_format=2)
        mzz = MultiZarrToZarr(
            path,
            out=fs.references,  # dict or parquet/lazy
@@ -360,7 +360,7 @@ def first_pass(self):
                fs._dircache_from_items()

            logger.debug("First pass: %s", i)
-            z = zarr.open_group(fs.get_mapper(""), zarr_version=2)
+            z = zarr.open_group(fs.get_mapper(""), zarr_format=2)
            for var in self.concat_dims:
                value = self._get_value(i, z, var, fn=self._paths[i])
                if isinstance(value, np.ndarray):
@@ -387,7 +387,7 @@ def store_coords(self):
        """
        kv = {}
        store = zarr.storage.KVStore(kv)
-        group = zarr.open(store, zarr_version=2)
+        group = zarr.open(store, zarr_format=2)
        m = self.fss[0].get_mapper("")
        z = zarr.open(m)
        for k, v in self.coos.items():
@@ -461,7 +461,7 @@ def second_pass(self):
        for i, fs in enumerate(self.fss):
            to_download = {}
            m = fs.get_mapper("")
-            z = zarr.open(m, zarr_version=2)
+            z = zarr.open(m, zarr_format=2)

            if no_deps is None:
                # done first time only
diff --git a/kerchunk/fits.py b/kerchunk/fits.py
index f714af97..f50bef64 100644
--- a/kerchunk/fits.py
+++ b/kerchunk/fits.py
@@ -72,7 +72,7 @@ def process_file(
    storage_options = storage_options or {}
    out = out or {}
-    g = zarr.open(out, zarr_version=2)
+    g = zarr.open(out, zarr_format=2)

    with fsspec.open(url, mode="rb", **storage_options) as f:
        infile = fits.open(f, do_not_scale_image_data=True)
diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py
index 06108db5..7d75786f 100644
--- a/kerchunk/grib2.py
+++ b/kerchunk/grib2.py
@@ -191,7 +191,7 @@ def scan_grib(
            if good is False:
                continue

-            z = zarr.open_group(store, zarr_version=2)
+            z = zarr.open_group(store, zarr_format=2)
            global_attrs = {
                f"GRIB_{k}": m[k]
                for k in cfgrib.dataset.GLOBAL_ATTRIBUTES_KEYS
@@ -398,7 +398,7 @@ def grib_tree(
    # TODO allow passing a LazyReferenceMapper as output?
    zarr_store = {}
-    zroot = zarr.open_group(store=zarr_store, zarr_version=2)
+    zroot = zarr.open_group(store=zarr_store, zarr_format=2)

    aggregations: Dict[str, List] = defaultdict(list)
    aggregation_dims: Dict[str, Set] = defaultdict(set)
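From this commit on, the pin is spelled `zarr_format=2`: after the PATCH 02 bump to the 3.0.0a7 alpha, the keyword that PATCH 01 introduced as `zarr_version` is chased through the codebase under its new name. A minimal before/after sketch, using the in-memory store construction this patch adopts in hdf.py:

    import zarr

    store = zarr.storage.MemoryStore(mode="a", store_dict={})  # 3.0.0a7-era signature
    # zarr-python 2.x / 3.0.0a6:
    #   g = zarr.open_group(store=store, zarr_version=2)
    # zarr-python >= 3.0.0a7:
    g = zarr.open_group(store=store, zarr_format=2)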
diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py
index 777201b5..4073a2b3 100644
--- a/kerchunk/hdf.py
+++ b/kerchunk/hdf.py
@@ -1,7 +1,8 @@
 import base64
 import io
 import logging
-from typing import Union, BinaryIO
+from typing import Union, BinaryIO, Any, cast
+from packaging.version import Version

 import fsspec.core
 from fsspec.implementations.reference import LazyReferenceMapper
@@ -111,8 +112,13 @@ def __init__(
        if vlen_encode not in ["embed", "null", "leave", "encode"]:
            raise NotImplementedError
        self.vlen = vlen_encode
-        self.store = out or {}
-        self._zroot = zarr.group(store=self.store, overwrite=True)
+        self.store_dict = out or {}
+        if Version(zarr.__version__) < Version("3.0.0.a0"):
+            self.store = zarr.storage.KVStore(self.store_dict)
+        else:
+            self.store = zarr.storage.MemoryStore(mode="a", store_dict=self.store_dict)
+        # self.store = out or {}
+        self._zroot = zarr.group(store=self.store, zarr_format=2, overwrite=True)

        self._uri = url
        self.error = error
@@ -141,8 +147,12 @@ def translate(self, preserve_linked_dsets=False):
        lggr.debug("Translation begins")
        self._transfer_attrs(self._h5f, self._zroot)

+        print('transfer done')
+
        self._h5f.visititems(self._translator)

+        print('visit done')
+
        if preserve_linked_dsets:
            if not has_visititems_links():
                raise RuntimeError(
@@ -157,7 +167,10 @@ def translate(self, preserve_linked_dsets=False):
            self.store.flush()
            return self.store
        else:
+            for k, v in self.store_dict.items():
+                if isinstance(v, zarr.core.buffer.cpu.Buffer):
+                    self.store_dict[k] = v.to_bytes()
-            store = _encode_for_JSON(self.store)
+            store = _encode_for_JSON(self.store_dict)
            return {"version": 1, "refs": store}

    def _unref(self, ref):
@@ -465,28 +478,36 @@ def _translator(
            if h5py.h5ds.is_scale(h5obj.id) and not cinfo:
                return
            if h5obj.attrs.get("_FillValue") is not None:
                fill = h5obj.attrs.get("_FillValue")
-                # fill = encode_fill_value(
-                #     h5obj.attrs.get("_FillValue"), dt or h5obj.dtype
-                # )
+                fill = encode_fill_value(
+                    h5obj.attrs.get("_FillValue"), dt or h5obj.dtype
+                )
+
+            adims = self._get_array_dims(h5obj)

-            # Create a Zarr array equivalent to this HDF5 dataset...
-            za = self._zroot.require_dataset(
-                h5obj.name,
+            # Create a Zarr array equivalent to this HDF5 dataset..
+            za = self._zroot.require_array(
+                name=h5obj.name,
                shape=h5obj.shape,
                dtype=dt or h5obj.dtype,
                chunks=h5obj.chunks or False,
                fill_value=fill,
-                compression=None,
+                compressor=None,
                filters=filters,
-                overwrite=True,
+                attributes={
+                    "_ARRAY_DIMENSIONS": adims,
+                },
                **kwargs,
            )
            lggr.debug(f"Created Zarr array: {za}")
-            self._transfer_attrs(h5obj, za)
-            adims = self._get_array_dims(h5obj)
-            za.attrs["_ARRAY_DIMENSIONS"] = adims
+            #self._transfer_attrs(h5obj, za)
+
+            # za.attrs["_ARRAY_DIMENSIONS"] = adims
            lggr.debug(f"_ARRAY_DIMENSIONS = {adims}")

            if "data" in kwargs:
@@ -509,9 +530,11 @@ def _translator(
                    if h5obj.fletcher32:
                        logging.info("Discarding fletcher32 checksum")
                        v["size"] -= 4
+                    key = ".".join(map(str, k))
                    if (
                        self.inline
                        and isinstance(v, dict)
                        and v["size"] < self.inline
                    ):
                        data.decode("ascii")
                    except UnicodeDecodeError:
                        data = b"base64:" + base64.b64encode(data)
-                        self.store[za._chunk_key(k)] = data
+
+                        self.store_dict[key] = data
                    else:
-                        self.store[za._chunk_key(k)] = [
+                        self.store_dict[key] = [
                            self._uri,
                            v["offset"],
                            v["size"],
                        ]
@@ -536,6 +559,7 @@ def _translator(
                self._transfer_attrs(h5obj, zgrp)
        except Exception as e:
            import traceback
+            raise e

            msg = "\n".join(
                [
@@ -682,3 +706,43 @@ def _is_netcdf_variable(dataset: h5py.Dataset):
def has_visititems_links():
    return hasattr(h5py.Group, "visititems_links")

+def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
+    # early out
+    if v is None:
+        return v
+    if dtype.kind == "V" and dtype.hasobject:
+        if object_codec is None:
+            raise ValueError("missing object_codec for object array")
+        v = object_codec.encode(v)
+        v = str(base64.standard_b64encode(v), "ascii")
+        return v
+    if dtype.kind == "f":
+        if np.isnan(v):
+            return "NaN"
+        elif np.isposinf(v):
+            return "Infinity"
+        elif np.isneginf(v):
+            return "-Infinity"
+        else:
+            return float(v)
+    elif dtype.kind in "ui":
+        return int(v)
+    elif dtype.kind == "b":
+        return bool(v)
+    elif dtype.kind in "c":
+        c = cast(np.complex128, np.dtype(complex).type())
+        v = (
+            encode_fill_value(v.real, c.real.dtype, object_codec),
+            encode_fill_value(v.imag, c.imag.dtype, object_codec),
+        )
+        return v
+    elif dtype.kind in "SV":
+        v = str(base64.standard_b64encode(v), "ascii")
+        return v
+    elif dtype.kind == "U":
+        return v
+    elif dtype.kind in "mM":
+        return int(v.view("i8"))
+    else:
+        return v
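The hdf.py rewrite above is where most of the zarr 3 breakage lands: `require_dataset` becomes `require_array`, `compression=` becomes `compressor=`, attributes are passed at creation time, and the values read back out of a v3 `MemoryStore` are `Buffer` objects that must be converted to bytes before `_encode_for_JSON`. A hedged sketch of that last normalization step, with a hypothetical helper name, assuming the `zarr.core.buffer.cpu.Buffer` type used in the diff:

    import zarr

    def buffers_to_bytes(store_dict: dict) -> dict:
        # v3 memory stores hold Buffer objects; kerchunk's JSON output needs raw bytes
        return {
            key: val.to_bytes() if isinstance(val, zarr.core.buffer.cpu.Buffer) else val
            for key, val in store_dict.items()
        }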
m = fs.get_mapper("time1.zarr") -z = zarr.open(m, mode="w", zarr_version=2) +z = zarr.open(m, mode="w", zarr_format=2) ar = z.create_dataset("time", data=np.array([1], dtype="M8[s]")) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]}) ar = z.create_dataset("data", data=arr) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]}) m = fs.get_mapper("time2.zarr") -z = zarr.open(m, mode="w", zarr_version=2) +z = zarr.open(m, mode="w", zarr_format=2) ar = z.create_dataset("time", data=np.array([2], dtype="M8[s]")) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]}) ar = z.create_dataset("data", data=arr) @@ -272,7 +272,7 @@ def test_get_coos(refs, selector, expected): mzz.first_pass() assert mzz.coos["time"].tolist() == expected mzz.store_coords() - g = zarr.open(mzz.out, zarr_version=2) + g = zarr.open(mzz.out, zarr_format=2) assert g["time"][:].tolist() == expected assert dict(g.attrs) diff --git a/kerchunk/tests/test_combine_concat.py b/kerchunk/tests/test_combine_concat.py index f51f10e8..23e785df 100644 --- a/kerchunk/tests/test_combine_concat.py +++ b/kerchunk/tests/test_combine_concat.py @@ -51,7 +51,7 @@ def test_success(tmpdir, arrays, chunks, axis, m): refs = [] for i, x in enumerate(arrays): fn = f"{tmpdir}/out{i}.zarr" - g = zarr.open(fn, zarr_version=2) + g = zarr.open(fn, zarr_format=2) g.create_dataset("x", data=x, chunks=chunks) fns.append(fn) ref = kerchunk.zarr.single_zarr(fn, inline=0) @@ -62,7 +62,7 @@ def test_success(tmpdir, arrays, chunks, axis, m): ) mapper = fsspec.get_mapper("reference://", fo=out) - g = zarr.open(mapper, zarr_version=2) + g = zarr.open(mapper, zarr_format=2) assert (g.x[:] == np.concatenate(arrays, axis=axis)).all() try: @@ -76,7 +76,7 @@ def test_success(tmpdir, arrays, chunks, axis, m): remote_protocol="file", skip_instance_cache=True, ) - g = zarr.open(mapper, zarr_version=2) + g = zarr.open(mapper, zarr_format=2) assert (g.x[:] == np.concatenate(arrays, axis=axis)).all() kerchunk.df.refs_to_dataframe(out, "memory://out.parq", record_size=1) @@ -86,7 +86,7 @@ def test_success(tmpdir, arrays, chunks, axis, m): remote_protocol="file", skip_instance_cache=True, ) - g = zarr.open(mapper, zarr_version=2) + g = zarr.open(mapper, zarr_format=2) assert (g.x[:] == np.concatenate(arrays, axis=axis)).all() @@ -95,9 +95,9 @@ def test_fail_chunks(tmpdir): fn2 = f"{tmpdir}/out2.zarr" x1 = np.arange(10) x2 = np.arange(10, 20) - g = zarr.open(fn1, zarr_version=2) + g = zarr.open(fn1, zarr_format=2) g.create_dataset("x", data=x1, chunks=(2,)) - g = zarr.open(fn2, zarr_version=2) + g = zarr.open(fn2, zarr_format=2) g.create_dataset("x", data=x2, chunks=(3,)) ref1 = kerchunk.zarr.single_zarr(fn1, inline=0) @@ -112,9 +112,9 @@ def test_fail_shape(tmpdir): fn2 = f"{tmpdir}/out2.zarr" x1 = np.arange(12).reshape(6, 2) x2 = np.arange(12, 24) - g = zarr.open(fn1, zarr_version=2) + g = zarr.open(fn1, zarr_format=2) g.create_dataset("x", data=x1, chunks=(2,)) - g = zarr.open(fn2, zarr_version=2) + g = zarr.open(fn2, zarr_format=2) g.create_dataset("x", data=x2, chunks=(2,)) ref1 = kerchunk.zarr.single_zarr(fn1, inline=0) @@ -129,9 +129,9 @@ def test_fail_irregular_chunk_boundaries(tmpdir): fn2 = f"{tmpdir}/out2.zarr" x1 = np.arange(10) x2 = np.arange(10, 24) - g = zarr.open(fn1, zarr_version=2) + g = zarr.open(fn1, zarr_format=2) g.create_dataset("x", data=x1, chunks=(4,)) - g = zarr.open(fn2, zarr_version=2) + g = zarr.open(fn2, zarr_format=2) g.create_dataset("x", data=x2, chunks=(4,)) ref1 = kerchunk.zarr.single_zarr(fn1, inline=0) diff --git a/kerchunk/tests/test_fits.py 
b/kerchunk/tests/test_fits.py index e7211479..5d7c3b6d 100644 --- a/kerchunk/tests/test_fits.py +++ b/kerchunk/tests/test_fits.py @@ -18,7 +18,7 @@ def test_ascii_table(): url = "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits" out = kerchunk.fits.process_file(url, extension=1) m = fsspec.get_mapper("reference://", fo=out, remote_protocol="https") - g = zarr.open(m, zarr_version=2) + g = zarr.open(m, zarr_format=2) arr = g["u5780205r_cvt.c0h.tab"][:] with fsspec.open( "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits" @@ -31,7 +31,7 @@ def test_ascii_table(): def test_binary_table(): out = kerchunk.fits.process_file(btable, extension=1) m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_version=2) + z = zarr.open(m, zarr_format=2) arr = z["1"] with open(btable, "rb") as f: hdul = fits.open(f) @@ -48,7 +48,7 @@ def test_binary_table(): def test_cube(): out = kerchunk.fits.process_file(range_im) m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_version=2) + z = zarr.open(m, zarr_format=2) arr = z["PRIMARY"] with open(range_im, "rb") as f: hdul = fits.open(f) @@ -61,7 +61,7 @@ def test_with_class(): out = ftz.translate() assert "fits" in repr(ftz) m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_version=2) + z = zarr.open(m, zarr_format=2) arr = z["PRIMARY"] with open(range_im, "rb") as f: hdul = fits.open(f) @@ -76,7 +76,7 @@ def test_var(): ftz = kerchunk.fits.FitsToZarr(var) out = ftz.translate() m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_version=2) + z = zarr.open(m, zarr_format=2) arr = z["1"] vars = [_.tolist() for _ in arr["var"]] diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py index 91ae9ac7..9102529e 100644 --- a/kerchunk/tests/test_grib.py +++ b/kerchunk/tests/test_grib.py @@ -119,7 +119,7 @@ def test_grib_tree(): corrected_msg_groups = [correct_hrrr_subhf_step(msg) for msg in scanned_msg_groups] result = grib_tree(corrected_msg_groups) fs = fsspec.filesystem("reference", fo=result) - zg = zarr.open_group(fs.get_mapper(""), zarr_version=2) + zg = zarr.open_group(fs.get_mapper(""), zarr_format=2) assert isinstance(zg["refc/instant/atmosphere/refc"], zarr.Array) assert isinstance(zg["vbdsf/avg/surface/vbdsf"], zarr.Array) assert set(zg["vbdsf/avg/surface"].attrs["coordinates"].split()) == set( @@ -147,14 +147,14 @@ def test_correct_hrrr_subhf_group_step(): scanned_msgs = ujson.load(fobj) original_zg = [ - zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_version=2) + zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_format=2) for val in scanned_msgs ] corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs] corrected_zg = [ - zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_version=2) + zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_format=2) for val in corrected_msgs ] @@ -177,7 +177,7 @@ def test_hrrr_subhf_corrected_grib_tree(): corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs] merged = grib_tree(corrected_msgs) - zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_version=2) + zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_format=2) # Check the values and shape of the time coordinates assert zg.u.instant.heightAboveGround.step[:].tolist() == [ 0.0, @@ -220,7 +220,7 @@ def test_hrrr_sfcf_grib_tree(): with open(fpath, "rb") as fobj: 
scanned_msgs = ujson.load(fobj) merged = grib_tree(scanned_msgs) - zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_version=2) + zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_format=2) # Check the heightAboveGround level shape of the time coordinates assert zg.u.instant.heightAboveGround.heightAboveGround[()] == 80.0 assert zg.u.instant.heightAboveGround.heightAboveGround.shape == () diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 2f825e6d..e140ca48 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -18,6 +18,7 @@ def test_single(): """Test creating references for a single HDF file""" url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" so = dict(anon=True, default_fill_cache=False, default_cache_type="none") + with fsspec.open(url, **so) as f: h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) test_dict = h5chunks.translate() @@ -25,6 +26,8 @@ def test_single(): m = fsspec.get_mapper( "reference://", fo=test_dict, remote_protocol="s3", remote_options=so ) + x = [(k, v) for (k, v) in m.items()] + raise ValueError("foo") ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False)) with fsspec.open(url, **so) as f: @@ -193,7 +196,7 @@ def test_string_embed(): out = h.translate() fs = fsspec.filesystem("reference", fo=out) assert txt in fs.references["vlen_str/0"] - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str.dtype == "O" assert z.vlen_str[0] == txt assert (z.vlen_str[1:] == "").all() @@ -204,7 +207,7 @@ def test_string_null(): h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="null", inline_threshold=0) out = h.translate() fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str.dtype == "O" assert (z.vlen_str[:] == None).all() @@ -217,7 +220,7 @@ def test_string_leave(): ) out = h.translate() fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str.dtype == "S16" assert z.vlen_str[0] # some obscured ID assert (z.vlen_str[1:] == b"").all() @@ -232,7 +235,7 @@ def test_string_decode(): out = h.translate() fs = fsspec.filesystem("reference", fo=out) assert txt in fs.cat("vlen_str/.zarray").decode() # stored in filter def - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str[0] == txt assert (z.vlen_str[1:] == "").all() @@ -243,7 +246,7 @@ def test_compound_string_null(): h = kerchunk.hdf.SingleHdf5ToZarr(f, fn, vlen_encode="null", inline_threshold=0) out = h.translate() fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str[0].tolist() == (10, None) assert (z.vlen_str["ints"][1:] == 0).all() assert (z.vlen_str["strs"][1:] == None).all() @@ -257,7 +260,7 @@ def test_compound_string_leave(): ) out = h.translate() fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str["ints"][0] == 10 assert z.vlen_str["strs"][0] # random ID assert (z.vlen_str["ints"][1:] == 0).all() @@ -272,7 +275,7 @@ def test_compound_string_encode(): ) out = h.translate() fs = fsspec.filesystem("reference", 
fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) assert z.vlen_str["ints"][0] == 10 assert z.vlen_str["strs"][0] == "water" assert (z.vlen_str["ints"][1:] == 0).all() @@ -303,7 +306,7 @@ def test_compress(): continue out = h.translate() m = fsspec.get_mapper("reference://", fo=out) - g = zarr.open(m, zarr_version=2) + g = zarr.open(m, zarr_format=2) assert np.mean(g.data) == 49.5 @@ -313,7 +316,7 @@ def test_embed(): out = h.translate() fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) data = z["Domain_10"]["STER"]["min_1"]["boom_1"]["temperature"][:] assert data[0].tolist() == [ "2014-04-01 00:00:00.0", @@ -348,7 +351,7 @@ def test_translate_links(): preserve_linked_dsets=True ) fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_version=2) + z = zarr.open(fs.get_mapper(), zarr_format=2) # 1. Test the hard linked datasets were translated correctly # 2. Test the soft linked datasets were translated correctly diff --git a/kerchunk/tests/test_tiff.py b/kerchunk/tests/test_tiff.py index 4011a67a..74ba59a4 100644 --- a/kerchunk/tests/test_tiff.py +++ b/kerchunk/tests/test_tiff.py @@ -16,7 +16,7 @@ def test_one(): fn = files[0] out = kerchunk.tiff.tiff_to_zarr(fn) m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_version=2) + z = zarr.open(m, zarr_format=2) assert list(z) == ["0", "1", "2"] assert z.attrs["multiscales"] == [ { @@ -34,7 +34,7 @@ def test_coord(): fn = files[0] out = kerchunk.tiff.tiff_to_zarr(fn) m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_version=2) # highest res is the one xarray picks + z = zarr.open(m, zarr_format=2) # highest res is the one xarray picks out = kerchunk.tiff.generate_coords(z.attrs, z[0].shape) ds = xr.open_dataset(fn) diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index 8e4502c1..a951c36c 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -79,13 +79,13 @@ def test_inline_array(): assert "data/1" not in out2 assert json.loads(out2["data/.zattrs"]) == json.loads(refs["data/.zattrs"]) fs = fsspec.filesystem("reference", fo=out2) - g = zarr.open(fs.get_mapper(), zarr_version=2) + g = zarr.open(fs.get_mapper(), zarr_format=2) assert g.data[:].tolist() == [1, 2] out3 = kerchunk.utils.inline_array(refs, threshold=1000) # inlines because of size assert "data/1" not in out3 fs = fsspec.filesystem("reference", fo=out3) - g = zarr.open(fs.get_mapper(), zarr_version=2) + g = zarr.open(fs.get_mapper(), zarr_format=2) assert g.data[:].tolist() == [1, 2] @@ -99,7 +99,7 @@ def test_json(): @pytest.mark.parametrize("chunks", [[10, 10], [5, 10]]) def test_subchunk_exact(m, chunks): store = m.get_mapper("test.zarr") - g = zarr.open_group(store, mode="w", zarr_version=2) + g = zarr.open_group(store, mode="w", zarr_format=2) data = np.arange(100).reshape(10, 10) arr = g.create_dataset("data", data=data, chunks=chunks, compression=None) ref = kerchunk.zarr.single_zarr("memory://test.zarr")["refs"] @@ -114,7 +114,7 @@ def test_subchunk_exact(m, chunks): ] g2 = zarr.open_group( - "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_version=2 + "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_format=2 ) assert (g2.data[:] == data).all() diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 4049ee63..b52a9c0b 100644 --- a/kerchunk/utils.py +++ 
b/kerchunk/utils.py @@ -226,7 +226,7 @@ def inline_array(store, threshold=1000, names=None, remote_options=None): fs = fsspec.filesystem( "reference", fo=store, **(remote_options or {}), skip_instance_cache=True ) - g = zarr.open_group(fs.get_mapper(), mode="r+", zarr_version=2) + g = zarr.open_group(fs.get_mapper(), mode="r+", zarr_format=2) _inline_array(g, threshold, names=names or []) return fs.references From 5aef233686c89dc9ca56325f1c654e35a80e8440 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 8 Oct 2024 17:13:36 -0400 Subject: [PATCH 05/40] Save progress --- kerchunk/tests/test_hdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index e140ca48..4135495b 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -6,6 +6,7 @@ import pytest import xarray as xr import zarr +from zarr.storage import MemoryStore import h5py from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links @@ -26,9 +27,8 @@ def test_single(): m = fsspec.get_mapper( "reference://", fo=test_dict, remote_protocol="s3", remote_options=so ) - x = [(k, v) for (k, v) in m.items()] - raise ValueError("foo") - ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False)) + store = MemoryStore(m) + ds = xr.open_dataset(store, engine="zarr", backend_kwargs=dict(consolidated=False)) with fsspec.open(url, **so) as f: expected = xr.open_dataset(f, engine="h5netcdf") From b9323d2e227bd7b163492afe2e7a1f5eec6bda91 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 8 Oct 2024 20:37:52 -0400 Subject: [PATCH 06/40] Cleanup, almost working with hdf --- kerchunk/hdf.py | 12 +++------- kerchunk/tests/test_hdf.py | 45 +++++++++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 4073a2b3..501de4f3 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -115,11 +115,11 @@ def __init__( self.store_dict = out or {} if Version(zarr.__version__) < Version("3.0.0.a0"): self.store = zarr.storage.KVStore(self.store_dict) + self._zroot = zarr.group(store=self.store, overwrite=True) else: self.store = zarr.storage.MemoryStore(mode="a", store_dict=self.store_dict) - # self.store = out or {} - self._zroot = zarr.group(store=self.store, zarr_format=2, overwrite=True) - + self._zroot = zarr.group(store=self.store, zarr_format=2, overwrite=True) + self._uri = url self.error = error lggr.debug(f"HDF5 file URI: {self._uri}") @@ -146,13 +146,8 @@ def translate(self, preserve_linked_dsets=False): """ lggr.debug("Translation begins") self._transfer_attrs(self._h5f, self._zroot) - - print('transfer done') - self._h5f.visititems(self._translator) - print('visit done') - if preserve_linked_dsets: if not has_visititems_links(): raise RuntimeError( @@ -542,7 +537,6 @@ def _translator( self._transfer_attrs(h5obj, zgrp) except Exception as e: import traceback - raise e msg = "\n".join( [ diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 4135495b..e2806545 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -1,6 +1,9 @@ import fsspec import os.path as osp +import fsspec.implementations +import fsspec.implementations.reference + import kerchunk.hdf import numpy as np import pytest @@ -9,6 +12,8 @@ from zarr.storage import MemoryStore import h5py +from packaging.version import Version + from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links from kerchunk.combine import MultiZarrToZarr, drop 
@@ -24,11 +29,15 @@ def test_single(): h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) test_dict = h5chunks.translate() - m = fsspec.get_mapper( - "reference://", fo=test_dict, remote_protocol="s3", remote_options=so - ) - store = MemoryStore(m) - ds = xr.open_dataset(store, engine="zarr", backend_kwargs=dict(consolidated=False)) + if Version(zarr.__version__) < Version("3.0.0.a0"): + store = fsspec.get_mapper( + "reference://", fo=test_dict, remote_protocol="s3", remote_options=so + ) + else: + fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict) + store = zarr.storage.RemoteStore(fs, mode="r") + + ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) with fsspec.open(url, **so) as f: expected = xr.open_dataset(f, engine="h5netcdf") @@ -45,22 +54,32 @@ def test_single_direct_open(): h5f=url, inline_threshold=300, storage_options=so ).translate() - m = fsspec.get_mapper( - "reference://", fo=test_dict, remote_protocol="s3", remote_options=so - ) + if Version(zarr.__version__) < Version("3.0.0.a0"): + store = fsspec.get_mapper( + "reference://", fo=test_dict, remote_protocol="s3", remote_options=so + ) + else: + fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict) + store = zarr.storage.RemoteStore(fs, mode="r") + ds_direct = xr.open_dataset( - m, engine="zarr", backend_kwargs=dict(consolidated=False) + store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) ) with fsspec.open(url, **so) as f: h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) test_dict = h5chunks.translate() - m = fsspec.get_mapper( - "reference://", fo=test_dict, remote_protocol="s3", remote_options=so - ) + if Version(zarr.__version__) < Version("3.0.0.a0"): + store = fsspec.get_mapper( + "reference://", fo=test_dict, remote_protocol="s3", remote_options=so + ) + else: + fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict) + store = zarr.storage.RemoteStore(fs, mode="r") + ds_from_file_opener = xr.open_dataset( - m, engine="zarr", backend_kwargs=dict(consolidated=False) + store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) ) xr.testing.assert_equal( From 0f1711944159edcbcce563cf5b7c8bde1e5e5348 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 8 Oct 2024 21:46:49 -0400 Subject: [PATCH 07/40] Closer... 
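
The remaining gap this commit closes is key layout: the v3 store records group members under keys with a leading "/", while fsspec reference sets are rooted without one, so both the copied metadata keys and the chunk keys built in _translator are normalized with str.removeprefix. A sketch of the key shape being produced, assuming a hypothetical HDF5 dataset named "/data" with chunk index (0, 0):

    h5_name = "/data"  # h5py object names are absolute
    chunk_index = (0, 0)
    key = str.removeprefix(h5_name, "/") + "/" + ".".join(map(str, chunk_index))
    assert key == "data/0.0"  # the form reference filesystems expect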
---
 kerchunk/hdf.py            | 14 +++++++++++---
 kerchunk/tests/test_hdf.py |  7 +++++--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py
index 501de4f3..5e4d2304 100644
--- a/kerchunk/hdf.py
+++ b/kerchunk/hdf.py
@@ -162,9 +162,16 @@ def translate(self, preserve_linked_dsets=False):
            self.store.flush()
            return self.store
        else:
+            keys_to_remove = []
+            new_keys = {}
            for k, v in self.store_dict.items():
                if isinstance(v, zarr.core.buffer.cpu.Buffer):
-                    self.store_dict[k] = v.to_bytes()
+                    key = str.removeprefix(k, "/")
+                    new_keys[key] = v.to_bytes()
+                    keys_to_remove.append(k)
+            for k in keys_to_remove:
+                del self.store_dict[k]
+            self.store_dict.update(new_keys)
            store = _encode_for_JSON(self.store_dict)
            return {"version": 1, "refs": store}
@@ -495,7 +502,7 @@ def _translator(
                **kwargs,
            )
            lggr.debug(f"Created Zarr array: {za}")
-            #self._transfer_attrs(h5obj, za)
+            self._transfer_attrs(h5obj, za)

            # za.attrs["_ARRAY_DIMENSIONS"] = adims
            lggr.debug(f"_ARRAY_DIMENSIONS = {adims}")
@@ -509,7 +516,8 @@ def _translator(
                    if h5obj.fletcher32:
                        logging.info("Discarding fletcher32 checksum")
                        v["size"] -= 4
-                    key = ".".join(map(str, k))
+                    key = str.removeprefix(h5obj.name, "/") + "/" + ".".join(map(str, k))
+
                    if (
                        self.inline
                        and isinstance(v, dict)
diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py
index e2806545..2fe4e1cf 100644
--- a/kerchunk/tests/test_hdf.py
+++ b/kerchunk/tests/test_hdf.py
@@ -1,3 +1,4 @@
+import asyncio
 import fsspec
 import os.path as osp
@@ -9,8 +10,6 @@
 import pytest
 import xarray as xr
 import zarr
-from zarr.storage import MemoryStore
-import h5py

 from packaging.version import Version
@@ -19,6 +19,10 @@
 here = osp.dirname(__file__)

+async def list_dir(store, path):
+    [x async for x in store.list_dir(path)]
+
+
 def test_single():
    """Test creating references for a single HDF file"""
    url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
    so = dict(anon=True, default_fill_cache=False, default_cache_type="none")

    with fsspec.open(url, **so) as f:
        h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
        test_dict = h5chunks.translate()

    if Version(zarr.__version__) < Version("3.0.0.a0"):
        store = fsspec.get_mapper(
            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
        )
    else:
        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
        store = zarr.storage.RemoteStore(fs, mode="r")

    ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))

From 5c8806bf272334b59cfdba13a9d746cef9e51329 Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Wed, 9 Oct 2024 14:18:17 -0400
Subject: [PATCH 08/40] Updating tests

---
 kerchunk/hdf.py            |  1 +
 kerchunk/tests/test_hdf.py | 63 ++++++++++++++------------------------
 2 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py
index 5e4d2304..6bb16922 100644
--- a/kerchunk/hdf.py
+++ b/kerchunk/hdf.py
@@ -705,6 +705,7 @@ def _is_netcdf_variable(dataset: h5py.Dataset):
def has_visititems_links():
    return hasattr(h5py.Group, "visititems_links")

+
def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
    # early out
    if v is None:
        return v
diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py
index 2fe4e1cf..ace45472 100644
--- a/kerchunk/tests/test_hdf.py
+++ b/kerchunk/tests/test_hdf.py
@@ -23,6 +23,16 @@ async def list_dir(store, path):
    [x async for x in store.list_dir(path)]

+def create_store(test_dict: dict):
+    if Version(zarr.__version__) < Version("3.0.0.a0"):
+        return fsspec.get_mapper(
+            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
+        )
+    else:
+        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
+        return zarr.storage.RemoteStore(fs, mode="r")
+
+
 def test_single():
    """Test creating references for a single HDF file"""
    url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
@@ -32,13 +42,7 @@ def test_single():
        h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
        test_dict = h5chunks.translate()

-    if Version(zarr.__version__) < Version("3.0.0.a0"):
-        store = fsspec.get_mapper(
-            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
-        )
-    else:
-        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
-        store = zarr.storage.RemoteStore(fs, mode="r")
+    store = create_store(test_dict)

    ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))
@@ -57,13 +61,7 @@ def test_single_direct_open():
        h5f=url, inline_threshold=300, storage_options=so
    ).translate()

-    if Version(zarr.__version__) < Version("3.0.0.a0"):
-        store = fsspec.get_mapper(
-            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
-        )
-    else:
-        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
-        store = zarr.storage.RemoteStore(fs, mode="r")
+    store = create_store(test_dict)

    ds_direct = xr.open_dataset(
        store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
    )

    with fsspec.open(url, **so) as f:
        h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
        test_dict = h5chunks.translate()

-    if Version(zarr.__version__) < Version("3.0.0.a0"):
-        store = fsspec.get_mapper(
-            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
-        )
-    else:
-        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
-        store = zarr.storage.RemoteStore(fs, mode="r")
+    store = create_store(test_dict)

    ds_from_file_opener = xr.open_dataset(
        store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
    )

    xr.testing.assert_equal(
@@ -97,11 +95,8 @@ def test_multizarr(generate_mzz):
    """Test creating a combined reference file with MultiZarrToZarr"""
    mzz = generate_mzz
    test_dict = mzz.translate()
-
-    m = fsspec.get_mapper(
-        "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
-    )
-    ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))
+    store = create_store(test_dict)
+    ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))

    with fsspec.open_files(urls, **so) as fs:
        expts = [xr.open_dataset(f, engine="h5netcdf") for f in fs]
@@ -172,11 +167,8 @@ def test_times(times_data):
        h5chunks = SingleHdf5ToZarr(f, url)
        test_dict = h5chunks.translate()

-    m = fsspec.get_mapper(
-        "reference://",
-        fo=test_dict,
-    )
-    result = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))
+    store = create_store(test_dict)
+    result = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))
    expected = x1.to_dataset()
    xr.testing.assert_equal(result, expected)
@@ -185,11 +177,8 @@ def test_times_str(times_data):
    h5chunks = SingleHdf5ToZarr(url)
    test_dict = h5chunks.translate()

-    m = fsspec.get_mapper(
-        "reference://",
-        fo=test_dict,
-    )
-    result = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))
+    store = create_store(test_dict)
+    result = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))
    expected = x1.to_dataset()
    xr.testing.assert_equal(result, expected)
@@ -310,8 +299,8 @@ def test_compress():
        h.translate()
        continue
    out = h.translate()
-    m = fsspec.get_mapper("reference://", fo=out)
-    g = zarr.open(m, zarr_format=2)
+    store = create_store(out)
+    g = zarr.open(store, zarr_format=2)
    assert np.mean(g.data) == 49.5
@@ -320,8 +309,8 @@ def test_embed():
    h = kerchunk.hdf.SingleHdf5ToZarr(fn, vlen_encode="embed")
    out = h.translate()

-    fs = fsspec.filesystem("reference", fo=out)
-    z = zarr.open(fs.get_mapper(), zarr_format=2)
+    store = create_store(out)
+    z = zarr.open(store, zarr_format=2)
    data = z["Domain_10"]["STER"]["min_1"]["boom_1"]["temperature"][:]
    assert data[0].tolist() == [
        "2014-04-01 00:00:00.0",

From 80fedcde9a6768761ee2f36bb2ae63b6310d4492 Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Thu, 10 Oct 2024 13:39:25 -0400
Subject: [PATCH 09/40] reorganize

---
 kerchunk/hdf.py            | 51 ++------------------------------------
 kerchunk/tests/test_hdf.py | 14 ++++++++---
 kerchunk/utils.py          | 44 ++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 53 deletions(-)

diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py
index 6bb16922..6b7b443d 100644
--- a/kerchunk/hdf.py
+++ b/kerchunk/hdf.py
@@ -1,7 +1,7 @@
 import base64
 import io
 import logging
-from typing import Union, BinaryIO, Any, cast
+from typing import Union, BinaryIO
 from packaging.version import Version

 import fsspec.core
 from fsspec.implementations.reference import LazyReferenceMapper
@@ -11,7 +11,7 @@
 import numcodecs

 from .codecs import FillStringsCodec
-from .utils import _encode_for_JSON
+from .utils import _encode_for_JSON, encode_fill_value

 try:
    import h5py
@@ -22,12 +22,6 @@
    "for more details."
 )

-# try:
-#     from zarr.meta import encode_fill_value
-# except ModuleNotFoundError:
-#     # https://github.com/zarr-developers/zarr-python/issues/2021
-#     from zarr.v2.meta import encode_fill_value
-
 lggr = logging.getLogger("h5-to-zarr")
 _HIDDEN_ATTRS = {  # from h5netcdf.attrs
@@ -504,7 +498,6 @@ def _translator(
            lggr.debug(f"Created Zarr array: {za}")
            self._transfer_attrs(h5obj, za)

-            # za.attrs["_ARRAY_DIMENSIONS"] = adims
            lggr.debug(f"_ARRAY_DIMENSIONS = {adims}")

            if "data" in kwargs:
@@ -705,43 +698,3 @@ def _is_netcdf_variable(dataset: h5py.Dataset):
def has_visititems_links():
    return hasattr(h5py.Group, "visititems_links")
-
-
-def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
-    ...  # moved verbatim to kerchunk/utils.py (see below)
diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py
index ace45472..665cd392 100644
--- a/kerchunk/tests/test_hdf.py
+++ b/kerchunk/tests/test_hdf.py
@@ -1,5 +1,6 @@
+from typing import Any
 import fsspec
+import json
 import os.path as osp

 import fsspec.implementations
 import fsspec.implementations.reference
@@ -24,11 +25,13 @@ async def list_dir(store, path):
    [x async for x in store.list_dir(path)]

-def create_store(test_dict: dict):
+def create_store(test_dict: dict, remote_options: Any = None):
    if Version(zarr.__version__) < Version("3.0.0.a0"):
        return fsspec.get_mapper(
            "reference://", fo=test_dict, remote_protocol="s3", remote_options=so
        )
    else:
-        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict)
+        fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options)
        return zarr.storage.RemoteStore(fs, mode="r")


def test_single():
    """Test creating references for a single HDF file"""
-    url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
+    #url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
+    url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc"
    so = dict(anon=True, default_fill_cache=False, default_cache_type="none")

    with fsspec.open(url, **so) as f:
        h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
        test_dict = h5chunks.translate()

+    with open("test_dict.json", "w") as f:
+        json.dump(test_dict, f)
+
    store = create_store(test_dict)
@@ -97,6 +102,7 @@ def test_multizarr(generate_mzz):
    """Test creating a combined reference file with MultiZarrToZarr"""
    mzz = generate_mzz
    test_dict = mzz.translate()
+
    store = create_store(test_dict)
    ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False))
diff --git a/kerchunk/utils.py b/kerchunk/utils.py
index b52a9c0b..a0f9e96e 100644
--- a/kerchunk/utils.py
+++ b/kerchunk/utils.py
@@ -1,11 +1,13 @@
 import base64
 import copy
 import itertools
+from typing import Any, cast
 import warnings

 import ujson

 import fsspec
+import numpy as np
 import zarr
@@ -134,6 +136,48 @@ def _encode_for_JSON(store):
    return store


+def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
+    # early out
+    if v is None:
+        return v
+    if dtype.kind == "V" and dtype.hasobject:
+        if object_codec is None:
+            raise ValueError("missing object_codec for object array")
+        v = object_codec.encode(v)
+        v = str(base64.standard_b64encode(v), "ascii")
+        return v
+    if dtype.kind == "f":
+        if np.isnan(v):
+            return "NaN"
+        elif np.isposinf(v):
+            return "Infinity"
+        elif np.isneginf(v):
+            return "-Infinity"
+        else:
+            return float(v)
+    elif dtype.kind in "ui":
+        return int(v)
+    elif dtype.kind == "b":
+        return bool(v)
+    elif dtype.kind in "c":
+        c = cast(np.complex128, np.dtype(complex).type())
+        v = (
+            encode_fill_value(v.real, c.real.dtype, object_codec),
+            encode_fill_value(v.imag, c.imag.dtype, object_codec),
+        )
+        return v
+    elif dtype.kind in "SV":
+        v = str(base64.standard_b64encode(v), "ascii")
+        return v
+    elif dtype.kind == "U":
+        return v
+    elif dtype.kind in "mM":
+        return int(v.view("i8"))
+    else:
+        return v
+
+
def do_inline(store, threshold, remote_options=None, remote_protocol=None):
    """Replace short chunks with the value of that chunk and inline metadata

From 1f69a0b129455ed712b1513ebf362c1c3be17b2f Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Thu, 10 Oct 2024 13:48:28 -0400
Subject: [PATCH 10/40] Save progress

---
 kerchunk/netCDF3.py           | 13 ++++++++++---
 kerchunk/tests/test_hdf.py    |  2 +-
 kerchunk/tests/test_netcdf.py | 20 ++++++++++++++++++--
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py
index d44fc808..b9d47063 100644
--- a/kerchunk/netCDF3.py
+++ b/kerchunk/netCDF3.py
@@ -1,4 +1,5 @@
 from functools import reduce
+from packaging.version import Version
 from operator import mul

 import numpy as np
@@ -167,7 +168,13 @@ def translate(self):
        import zarr

        out = self.out
-        z = zarr.open(out, mode="w", zarr_format=2)
+        if Version(zarr.__version__) < Version("3.0.0.a0"):
+            store = zarr.storage.KVStore(out)
+            z = zarr.group(store=store, overwrite=True)
zarr.group(store=store, overwrite=True) + else: + store = zarr.storage.MemoryStore(mode="a", store_dict=out) + z = zarr.open(store, mode="w", zarr_format=2) + for dim, var in self.variables.items(): if dim in self.chunks: shape = self.chunks[dim][-1] @@ -197,7 +204,7 @@ def translate(self): dtype=var.data.dtype, fill_value=fill, chunks=shape, - compression=None, + compressor=None, ) part = ".".join(["0"] * len(shape)) or "0" k = f"{dim}/{part}" @@ -251,7 +258,7 @@ def translate(self): dtype=base, fill_value=fill, chunks=(1,) + dtype.shape, - compression=None, + compressor=None, ) arr.attrs.update( { diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 665cd392..233a58e4 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -27,7 +27,7 @@ async def list_dir(store, path): def create_store(test_dict: dict, remote_options: Any = None): if Version(zarr.__version__) < Version("3.0.0.a0"): return fsspec.get_mapper( - "reference://", fo=test_dict, remote_protocol="s3", remote_options=so + "reference://", fo=test_dict, remote_protocol="s3", remote_options=remote_options ) else: fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options) diff --git a/kerchunk/tests/test_netcdf.py b/kerchunk/tests/test_netcdf.py index 43b6021b..0036c0a3 100644 --- a/kerchunk/tests/test_netcdf.py +++ b/kerchunk/tests/test_netcdf.py @@ -1,4 +1,5 @@ import os +from typing import Any import fsspec @@ -7,6 +8,8 @@ import pytest from kerchunk import netCDF3 +import zarr + xr = pytest.importorskip("xarray") @@ -24,16 +27,29 @@ ) +def create_store(test_dict: dict, remote_options: Any = None): + if Version(zarr.__version__) < Version("3.0.0.a0"): + return fsspec.get_mapper( + "reference://", fo=test_dict, remote_protocol="s3", remote_options=remote_options + ) + else: + fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options) + return zarr.storage.RemoteStore(fs, mode="r") + + def test_one(m): m.pipe("data.nc3", bdata) h = netCDF3.netcdf_recording_file("memory://data.nc3") out = h.translate() + + store = create_store(out, remote_options={"remote_protocol": "memory"}) + ds = xr.open_dataset( - "reference://", + store, engine="zarr", backend_kwargs={ "consolidated": False, - "storage_options": {"fo": out, "remote_protocol": "memory"}, + "zarr_format": "2", }, ) assert (ds.data == data).all() From d556e528ab7f012afef68a9ec70f5bfd96c4470a Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 10 Oct 2024 15:30:11 -0400 Subject: [PATCH 11/40] Refactor to clean things up --- kerchunk/hdf.py | 11 ++--- kerchunk/netCDF3.py | 4 +- kerchunk/tests/test_hdf.py | 90 +++++++++++++++++--------------------- kerchunk/utils.py | 37 +++++++++++++--- kerchunk/zarr.py | 35 +++++++++++++++ 5 files changed, 112 insertions(+), 65 deletions(-) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 6b7b443d..7d416f83 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -10,6 +10,8 @@ import zarr import numcodecs +from kerchunk.zarr import dict_to_store + from .codecs import FillStringsCodec from .utils import _encode_for_JSON, encode_fill_value @@ -107,13 +109,8 @@ def __init__( raise NotImplementedError self.vlen = vlen_encode self.store_dict = out or {} - if Version(zarr.__version__) < Version("3.0.0.a0"): - self.store = zarr.storage.KVStore(self.store_dict) - self._zroot = zarr.group(store=self.store, overwrite=True) - else: - self.store = zarr.storage.MemoryStore(mode="a", store_dict=self.store_dict) - self._zroot 
= zarr.group(store=self.store, zarr_format=2, overwrite=True) - + self.store = dict_to_store(self.store_dict) + self._zroot = zarr.group(store=self.store, zarr_format=2, overwrite=True) self._uri = url self.error = error lggr.debug(f"HDF5 file URI: {self._uri}") diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py index b9d47063..078a5f7b 100644 --- a/kerchunk/netCDF3.py +++ b/kerchunk/netCDF3.py @@ -198,7 +198,7 @@ def translate(self): fill = float(fill) if fill is not None and var.data.dtype.kind == "i": fill = int(fill) - arr = z.create_dataset( + arr = z.create_array( name=dim, shape=shape, dtype=var.data.dtype, @@ -252,7 +252,7 @@ def translate(self): fill = float(fill) if fill is not None and base.kind == "i": fill = int(fill) - arr = z.create_dataset( + arr = z.create_array( name=name, shape=shape, dtype=base, diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 233a58e4..8e2117cc 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -1,42 +1,24 @@ -from typing import Any import fsspec import json import os.path as osp -import fsspec.implementations -import fsspec.implementations.reference - import kerchunk.hdf import numpy as np import pytest import xarray as xr import zarr -from packaging.version import Version - from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links from kerchunk.combine import MultiZarrToZarr, drop +from kerchunk.utils import refs_as_fs, refs_as_store +from kerchunk.zarr import fs_as_store here = osp.dirname(__file__) -async def list_dir(store, path): - [x async for x in store.list_dir(path)] - - -def create_store(test_dict: dict, remote_options: Any = None): - if Version(zarr.__version__) < Version("3.0.0.a0"): - return fsspec.get_mapper( - "reference://", fo=test_dict, remote_protocol="s3", remote_options=remote_options - ) - else: - fs = fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options) - return zarr.storage.RemoteStore(fs, mode="r") - - def test_single(): """Test creating references for a single HDF file""" - #url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" + # url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc" so = dict(anon=True, default_fill_cache=False, default_cache_type="none") @@ -47,9 +29,11 @@ def test_single(): with open("test_dict.json", "w") as f: json.dump(test_dict, f) - store = create_store(test_dict) + store = refs_as_store(test_dict) - ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) + ds = xr.open_dataset( + store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) + ) with fsspec.open(url, **so) as f: expected = xr.open_dataset(f, engine="h5netcdf") @@ -66,7 +50,7 @@ def test_single_direct_open(): h5f=url, inline_threshold=300, storage_options=so ).translate() - store = create_store(test_dict) + store = refs_as_store(test_dict) ds_direct = xr.open_dataset( store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) @@ -76,7 +60,7 @@ def test_single_direct_open(): h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) test_dict = h5chunks.translate() - store = create_store(test_dict) + store = refs_as_store(test_dict) ds_from_file_opener = xr.open_dataset( store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) @@ -103,8 +87,10 @@ def test_multizarr(generate_mzz): mzz = 
generate_mzz test_dict = mzz.translate() - store = create_store(test_dict) - ds = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) + store = refs_as_store(test_dict) + ds = xr.open_dataset( + store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) + ) with fsspec.open_files(urls, **so) as fs: expts = [xr.open_dataset(f, engine="h5netcdf") for f in fs] @@ -178,8 +164,10 @@ def test_times(times_data): h5chunks = SingleHdf5ToZarr(f, url) test_dict = h5chunks.translate() - store = create_store(test_dict) - result = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) + store = refs_as_store(test_dict) + result = xr.open_dataset( + store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) + ) expected = x1.to_dataset() xr.testing.assert_equal(result, expected) @@ -191,8 +179,10 @@ def test_times_str(times_data): h5chunks = SingleHdf5ToZarr(url) test_dict = h5chunks.translate() - store = create_store(test_dict) - result = xr.open_dataset(store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)) + store = refs_as_store(test_dict) + result = xr.open_dataset( + store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) + ) expected = x1.to_dataset() xr.testing.assert_equal(result, expected) @@ -205,9 +195,10 @@ def test_string_embed(): fn = osp.join(here, "vlen.h5") h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="embed") out = h.translate() - fs = fsspec.filesystem("reference", fo=out) + fs = refs_as_fs(out) assert txt in fs.references["vlen_str/0"] - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = fs_as_store(fs) + z = zarr.open(store, zarr_format=2) assert z.vlen_str.dtype == "O" assert z.vlen_str[0] == txt assert (z.vlen_str[1:] == "").all() @@ -217,8 +208,8 @@ def test_string_null(): fn = osp.join(here, "vlen.h5") h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="null", inline_threshold=0) out = h.translate() - fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) assert z.vlen_str.dtype == "O" assert (z.vlen_str[:] == None).all() @@ -230,8 +221,8 @@ def test_string_leave(): f, fn, vlen_encode="leave", inline_threshold=0 ) out = h.translate() - fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) assert z.vlen_str.dtype == "S16" assert z.vlen_str[0] # some obscured ID assert (z.vlen_str[1:] == b"").all() @@ -244,9 +235,10 @@ def test_string_decode(): f, fn, vlen_encode="encode", inline_threshold=0 ) out = h.translate() - fs = fsspec.filesystem("reference", fo=out) + fs = refs_as_fs(out) assert txt in fs.cat("vlen_str/.zarray").decode() # stored in filter def - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = fs_as_store(fs) + z = zarr.open(store, zarr_format=2) assert z.vlen_str[0] == txt assert (z.vlen_str[1:] == "").all() @@ -256,8 +248,8 @@ def test_compound_string_null(): with open(fn, "rb") as f: h = kerchunk.hdf.SingleHdf5ToZarr(f, fn, vlen_encode="null", inline_threshold=0) out = h.translate() - fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) assert z.vlen_str[0].tolist() == (10, None) assert (z.vlen_str["ints"][1:] == 0).all() assert (z.vlen_str["strs"][1:] == None).all() @@ -270,8 
+262,8 @@ def test_compound_string_leave(): f, fn, vlen_encode="leave", inline_threshold=0 ) out = h.translate() - fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) assert z.vlen_str["ints"][0] == 10 assert z.vlen_str["strs"][0] # random ID assert (z.vlen_str["ints"][1:] == 0).all() @@ -285,8 +277,8 @@ def test_compound_string_encode(): f, fn, vlen_encode="encode", inline_threshold=0 ) out = h.translate() - fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) assert z.vlen_str["ints"][0] == 10 assert z.vlen_str["strs"][0] == "water" assert (z.vlen_str["ints"][1:] == 0).all() @@ -316,7 +308,7 @@ def test_compress(): h.translate() continue out = h.translate() - store = create_store(out) + store = refs_as_store(out) g = zarr.open(store, zarr_format=2) assert np.mean(g.data) == 49.5 @@ -326,7 +318,7 @@ def test_embed(): h = kerchunk.hdf.SingleHdf5ToZarr(fn, vlen_encode="embed") out = h.translate() - store = create_store(out) + store = refs_as_store(out) z = zarr.open(store, zarr_format=2) data = z["Domain_10"]["STER"]["min_1"]["boom_1"]["temperature"][:] assert data[0].tolist() == [ @@ -361,8 +353,8 @@ def test_translate_links(): out = kerchunk.hdf.SingleHdf5ToZarr(fn, inline_threshold=50).translate( preserve_linked_dsets=True ) - fs = fsspec.filesystem("reference", fo=out) - z = zarr.open(fs.get_mapper(), zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) # 1. Test the hard linked datasets were translated correctly # 2. Test the soft linked datasets were translated correctly diff --git a/kerchunk/utils.py b/kerchunk/utils.py index a0f9e96e..59aad1af 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -10,6 +10,28 @@ import numpy as np import zarr +from kerchunk.zarr import fs_as_store + + +def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): + """Convert a reference set to an fsspec filesystem""" + fs = fsspec.filesystem( + "reference", + fo=refs, + remote_protocol=remote_protocol, + remote_options=remote_options, + **kwargs, + ) + return fs + + +def refs_as_store(refs, remote_protocol=None, remote_options=None): + """Convert a reference set to a zarr store""" + fs = refs_as_fs( + refs, remote_protocol=remote_protocol, remote_options=remote_options + ) + return fs_as_store(fs) + def class_factory(func): """Experimental uniform API across function-based file scanners""" @@ -74,7 +96,7 @@ def rename_target(refs, renames): ------- dict: the altered reference set, which can be saved """ - fs = fsspec.filesystem("reference", fo=refs) # to produce normalised refs + fs = refs_as_fs(refs) # to produce normalised refs refs = fs.references out = {} for k, v in refs.items(): @@ -136,7 +158,6 @@ def _encode_for_JSON(store): return store - def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: # early out if v is None: @@ -190,6 +211,9 @@ def do_inline(store, threshold, remote_options=None, remote_protocol=None): remote_options=remote_options, remote_protocol=remote_protocol, ) + fs = refs_as_fs( + store, remote_protocol=remote_protocol, remote_options=remote_options + ) out = fs.references.copy() # Inlining is done when one of two conditions are satisfied: @@ -267,10 +291,9 @@ def inline_array(store, threshold=1000, names=None, remote_options=None): ------- amended references set (simple style) """ - fs = 
fsspec.filesystem( - "reference", fo=store, **(remote_options or {}), skip_instance_cache=True - ) - g = zarr.open_group(fs.get_mapper(), mode="r+", zarr_format=2) + fs = refs_as_fs(store, remote_options=remote_options or {}) + zarr_store = fs_as_store(store, mode="r+", remote_options=remote_options or {}) + g = zarr.open_group(zarr_store, mode="r+", zarr_format=2) _inline_array(g, threshold, names=names or []) return fs.references @@ -293,7 +316,7 @@ def subchunk(store, variable, factor): ------- modified store """ - fs = fsspec.filesystem("reference", fo=store) + fs = refs_as_fs(store) store = fs.references meta_file = f"{variable}/.zarray" meta = ujson.loads(fs.cat(meta_file)) diff --git a/kerchunk/zarr.py b/kerchunk/zarr.py index ea0612de..5560ea99 100644 --- a/kerchunk/zarr.py +++ b/kerchunk/zarr.py @@ -1,9 +1,44 @@ +from packaging.version import Version + import fsspec from fsspec.implementations.reference import LazyReferenceMapper +import zarr import kerchunk.utils +def is_zarr3(): + """Check if the installed zarr version is version 3""" + return Version(zarr.__version__) >= Version("3.0.0.a0") + + +def dict_to_store(store_dict: dict): + """Create an in memory zarr store backed by the given dictionary""" + if is_zarr3(): + return zarr.storage.MemoryStore(mode="a", store_dict=store_dict) + else: + return zarr.storage.KVStore(store_dict) + + +def fs_as_store(fs, mode='r', remote_protocol=None, remote_options=None): + """Open the refs as a zarr store + + Parameters + ---------- + refs: dict-like + the references to open + mode: str + + Returns + ------- + zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem + """ + if is_zarr3(): + return zarr.storage.RemoteStore(fs, mode=mode) + else: + return fs.get_mapper() + + def single_zarr( uri_or_store, storage_options=None, From b27e64c5e0d0e13e83e9ae5adb297ec473d8eada Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 10 Oct 2024 16:06:03 -0400 Subject: [PATCH 12/40] Fix circular import --- kerchunk/hdf.py | 5 +---- kerchunk/tests/test_netcdf.py | 17 +++-------------- kerchunk/utils.py | 35 +++++++++++++++++++++++++++++++++-- kerchunk/zarr.py | 35 ----------------------------------- 4 files changed, 37 insertions(+), 55 deletions(-) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 7d416f83..bc00517f 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -2,7 +2,6 @@ import io import logging from typing import Union, BinaryIO -from packaging.version import Version import fsspec.core from fsspec.implementations.reference import LazyReferenceMapper @@ -10,10 +9,8 @@ import zarr import numcodecs -from kerchunk.zarr import dict_to_store - from .codecs import FillStringsCodec -from .utils import _encode_for_JSON, encode_fill_value +from .utils import _encode_for_JSON, encode_fill_value, dict_to_store try: import h5py diff --git a/kerchunk/tests/test_netcdf.py b/kerchunk/tests/test_netcdf.py index 0036c0a3..755823da 100644 --- a/kerchunk/tests/test_netcdf.py +++ b/kerchunk/tests/test_netcdf.py @@ -1,5 +1,4 @@ import os -from typing import Any import fsspec @@ -8,7 +7,7 @@ import pytest from kerchunk import netCDF3 -import zarr +from kerchunk.utils import refs_as_store xr = pytest.importorskip("xarray") @@ -27,29 +26,19 @@ ) -def create_store(test_dict: dict, remote_options: Any = None): - if Version(zarr.__version__) < Version("3.0.0.a0"): - return fsspec.get_mapper( - "reference://", fo=test_dict, remote_protocol="s3", remote_options=remote_options - ) - else: - fs = 
fsspec.implementations.reference.ReferenceFileSystem(fo=test_dict, remote_options=remote_options) - return zarr.storage.RemoteStore(fs, mode="r") - - def test_one(m): m.pipe("data.nc3", bdata) h = netCDF3.netcdf_recording_file("memory://data.nc3") out = h.translate() - store = create_store(out, remote_options={"remote_protocol": "memory"}) + store = refs_as_store(out, remote_protocol="memory") ds = xr.open_dataset( store, engine="zarr", backend_kwargs={ "consolidated": False, - "zarr_format": "2", + "zarr_format": 2, }, ) assert (ds.data == data).all() diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 59aad1af..c90f89fe 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -1,6 +1,7 @@ import base64 import copy import itertools +from packaging.version import Version from typing import Any, cast import warnings @@ -10,8 +11,6 @@ import numpy as np import zarr -from kerchunk.zarr import fs_as_store - def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): """Convert a reference set to an fsspec filesystem""" @@ -33,6 +32,38 @@ def refs_as_store(refs, remote_protocol=None, remote_options=None): return fs_as_store(fs) +def is_zarr3(): + """Check if the installed zarr version is version 3""" + return Version(zarr.__version__) >= Version("3.0.0.a0") + + +def dict_to_store(store_dict: dict): + """Create an in memory zarr store backed by the given dictionary""" + if is_zarr3(): + return zarr.storage.MemoryStore(mode="a", store_dict=store_dict) + else: + return zarr.storage.KVStore(store_dict) + + +def fs_as_store(fs, mode='r', remote_protocol=None, remote_options=None): + """Open the refs as a zarr store + + Parameters + ---------- + refs: dict-like + the references to open + mode: str + + Returns + ------- + zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem + """ + if is_zarr3(): + return zarr.storage.RemoteStore(fs, mode=mode) + else: + return fs.get_mapper() + + def class_factory(func): """Experimental uniform API across function-based file scanners""" diff --git a/kerchunk/zarr.py b/kerchunk/zarr.py index 5560ea99..ea0612de 100644 --- a/kerchunk/zarr.py +++ b/kerchunk/zarr.py @@ -1,44 +1,9 @@ -from packaging.version import Version - import fsspec from fsspec.implementations.reference import LazyReferenceMapper -import zarr import kerchunk.utils -def is_zarr3(): - """Check if the installed zarr version is version 3""" - return Version(zarr.__version__) >= Version("3.0.0.a0") - - -def dict_to_store(store_dict: dict): - """Create an in memory zarr store backed by the given dictionary""" - if is_zarr3(): - return zarr.storage.MemoryStore(mode="a", store_dict=store_dict) - else: - return zarr.storage.KVStore(store_dict) - - -def fs_as_store(fs, mode='r', remote_protocol=None, remote_options=None): - """Open the refs as a zarr store - - Parameters - ---------- - refs: dict-like - the references to open - mode: str - - Returns - ------- - zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem - """ - if is_zarr3(): - return zarr.storage.RemoteStore(fs, mode=mode) - else: - return fs.get_mapper() - - def single_zarr( uri_or_store, storage_options=None, From 41d6e8e2eb36b09df844755ea4cb7f38a8d3f818 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 10 Oct 2024 16:07:17 -0400 Subject: [PATCH 13/40] Iterate --- kerchunk/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index c90f89fe..5cab841d 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -323,7 +323,7 @@ def 
inline_array(store, threshold=1000, names=None, remote_options=None): amended references set (simple style) """ fs = refs_as_fs(store, remote_options=remote_options or {}) - zarr_store = fs_as_store(store, mode="r+", remote_options=remote_options or {}) + zarr_store = fs_as_store(fs, mode="r+", remote_options=remote_options or {}) g = zarr.open_group(zarr_store, mode="r+", zarr_format=2) _inline_array(g, threshold, names=names or []) return fs.references From 7ade1a6dc2369583869a2a6d34a6953b223a9e02 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 10 Oct 2024 17:08:19 -0400 Subject: [PATCH 14/40] Change zarr dep --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6e57e223..5eb7c0c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "numcodecs", "numpy", "ujson", - "zarr==3.0.0a7", + "zarr", ] [project.optional-dependencies] From 492ddeebac4d844ce63ee6aa93b14f5ce613efed Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 10 Oct 2024 17:31:49 -0400 Subject: [PATCH 15/40] More conversion --- kerchunk/fits.py | 7 ++++--- kerchunk/hdf.py | 13 ++----------- kerchunk/netCDF3.py | 11 ++++------- kerchunk/tests/test_fits.py | 22 ++++++++++++---------- kerchunk/tests/test_tiff.py | 10 ++++++---- kerchunk/utils.py | 37 ++++++++++++++++++++++++++++++++++--- 6 files changed, 62 insertions(+), 38 deletions(-) diff --git a/kerchunk/fits.py b/kerchunk/fits.py index f50bef64..f0d4fa8e 100644 --- a/kerchunk/fits.py +++ b/kerchunk/fits.py @@ -8,7 +8,7 @@ from fsspec.implementations.reference import LazyReferenceMapper -from kerchunk.utils import class_factory +from kerchunk.utils import class_factory, dict_to_store from kerchunk.codecs import AsciiTableCodec, VarArrCodec try: @@ -72,7 +72,8 @@ def process_file( storage_options = storage_options or {} out = out or {} - g = zarr.open(out, zarr_format=2) + store = dict_to_store(out) + g = zarr.open_group(store=store, zarr_format=2) with fsspec.open(url, mode="rb", **storage_options) as f: infile = fits.open(f, do_not_scale_image_data=True) @@ -164,7 +165,7 @@ def process_file( # TODO: we could sub-chunk on biggest dimension name = hdu.name or str(ext) arr = g.empty( - name, dtype=dtype, shape=shape, chunks=shape, compression=None, **kwargs + name=name, dtype=dtype, shape=shape, chunks=shape, compressor=None, zarr_format=2, **kwargs ) arr.attrs.update( { diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index bc00517f..7cb4b5f6 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -10,7 +10,7 @@ import numcodecs from .codecs import FillStringsCodec -from .utils import _encode_for_JSON, encode_fill_value, dict_to_store +from .utils import _encode_for_JSON, encode_fill_value, dict_to_store, translate_refs_serializable try: import h5py @@ -150,16 +150,7 @@ def translate(self, preserve_linked_dsets=False): self.store.flush() return self.store else: - keys_to_remove = [] - new_keys = {} - for k, v in self.store_dict.items(): - if isinstance(v, zarr.core.buffer.cpu.Buffer): - key = str.removeprefix(k, "/") - new_keys[key] = v.to_bytes() - keys_to_remove.append(k) - for k in keys_to_remove: - del self.store_dict[k] - self.store_dict.update(new_keys) + translate_refs_serializable(self.store_dict) store = _encode_for_JSON(self.store_dict) return {"version": 1, "refs": store} diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py index 078a5f7b..31438bb0 100644 --- a/kerchunk/netCDF3.py +++ b/kerchunk/netCDF3.py @@ -6,7 +6,7 @@ from 
fsspec.implementations.reference import LazyReferenceMapper import fsspec -from kerchunk.utils import _encode_for_JSON, inline_array +from kerchunk.utils import _encode_for_JSON, dict_to_store, inline_array, translate_refs_serializable try: from scipy.io._netcdf import ZERO, NC_VARIABLE, netcdf_file, netcdf_variable @@ -168,12 +168,8 @@ def translate(self): import zarr out = self.out - if Version(zarr.__version__) < Version("3.0.0.a0"): - store = zarr.storage.KVStore(out) - z = zarr.group(store=store, overwrite=True) - else: - store = zarr.storage.MemoryStore(mode="a", store_dict=out) - z = zarr.open(store, mode="w", zarr_format=2) + store = dict_to_store(out) + z = zarr.open(store, mode="w", zarr_format=2, overwrite=True) for dim, var in self.variables.items(): if dim in self.chunks: @@ -302,6 +298,7 @@ def translate(self): out.flush() return out else: + translate_refs_serializable(out) out = _encode_for_JSON(out) return {"version": 1, "refs": out} diff --git a/kerchunk/tests/test_fits.py b/kerchunk/tests/test_fits.py index 5d7c3b6d..de2cad5f 100644 --- a/kerchunk/tests/test_fits.py +++ b/kerchunk/tests/test_fits.py @@ -2,6 +2,8 @@ import fsspec import pytest +from kerchunk.utils import refs_as_store + fits = pytest.importorskip("astropy.io.fits") import kerchunk.fits @@ -17,8 +19,8 @@ def test_ascii_table(): # this one directly hits a remote server - should cache? url = "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits" out = kerchunk.fits.process_file(url, extension=1) - m = fsspec.get_mapper("reference://", fo=out, remote_protocol="https") - g = zarr.open(m, zarr_format=2) + store = refs_as_store(out, remote_protocol="https") + g = zarr.open(store, zarr_format=2) arr = g["u5780205r_cvt.c0h.tab"][:] with fsspec.open( "https://fits.gsfc.nasa.gov/samples/WFPC2u5780205r_c0fx.fits" @@ -30,8 +32,8 @@ def test_ascii_table(): def test_binary_table(): out = kerchunk.fits.process_file(btable, extension=1) - m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) arr = z["1"] with open(btable, "rb") as f: hdul = fits.open(f) @@ -47,8 +49,8 @@ def test_binary_table(): def test_cube(): out = kerchunk.fits.process_file(range_im) - m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) arr = z["PRIMARY"] with open(range_im, "rb") as f: hdul = fits.open(f) @@ -60,8 +62,8 @@ def test_with_class(): ftz = kerchunk.fits.FitsToZarr(range_im) out = ftz.translate() assert "fits" in repr(ftz) - m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) arr = z["PRIMARY"] with open(range_im, "rb") as f: hdul = fits.open(f) @@ -75,8 +77,8 @@ def test_var(): ftz = kerchunk.fits.FitsToZarr(var) out = ftz.translate() - m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) arr = z["1"] vars = [_.tolist() for _ in arr["var"]] diff --git a/kerchunk/tests/test_tiff.py b/kerchunk/tests/test_tiff.py index 74ba59a4..b81e7bab 100644 --- a/kerchunk/tests/test_tiff.py +++ b/kerchunk/tests/test_tiff.py @@ -5,6 +5,8 @@ import pytest import xarray as xr +from kerchunk.utils import refs_as_store + pytest.importorskip("tifffile") pytest.importorskip("rioxarray") import kerchunk.tiff @@ -15,8 +17,8 @@ def test_one(): fn = files[0] out = 
kerchunk.tiff.tiff_to_zarr(fn) - m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_format=2) + store = refs_as_store(out) + z = zarr.open(store, zarr_format=2) assert list(z) == ["0", "1", "2"] assert z.attrs["multiscales"] == [ { @@ -33,8 +35,8 @@ def test_one(): def test_coord(): fn = files[0] out = kerchunk.tiff.tiff_to_zarr(fn) - m = fsspec.get_mapper("reference://", fo=out) - z = zarr.open(m, zarr_format=2) # highest res is the one xarray picks + store = refs_as_store(out) + z = zarr.open(out, zarr_format=2) # highest res is the one xarray picks out = kerchunk.tiff.generate_coords(z.attrs, z[0].shape) ds = xr.open_dataset(fn) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 5cab841d..71cee56a 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -45,15 +45,15 @@ def dict_to_store(store_dict: dict): return zarr.storage.KVStore(store_dict) -def fs_as_store(fs, mode='r', remote_protocol=None, remote_options=None): +def fs_as_store(fs, mode="r", remote_protocol=None, remote_options=None): """Open the refs as a zarr store - + Parameters ---------- refs: dict-like the references to open mode: str - + Returns ------- zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem @@ -538,3 +538,34 @@ def templateize(strings, min_length=10, template_name="u"): else: template = {} return template, strings + + +def translate_refs_serializable(refs: dict): + """Translate a reference set to a serializable form, given that zarr + v3 memory stores store data in buffers by default. This modifies the + input dictionary in place, and returns a reference to it. + + It also fixes keys that have a leading slash, which is not appropriate for + zarr v3 keys + + Parameters + ---------- + refs: dict + The reference set + + Returns + ------- + dict + A serializable form of the reference set + """ + keys_to_remove = [] + new_keys = {} + for k, v in refs.items(): + if isinstance(v, zarr.core.buffer.cpu.Buffer): + key = k.removeprefix("/") + new_keys[key] = v.to_bytes() + keys_to_remove.append(k) + for k in keys_to_remove: + del refs[k] + refs.update(new_keys) + return refs \ No newline at end of file From 6e5741ca7d4fe25a9d37bbc3d72266e28c6695de Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 15 Oct 2024 09:48:05 -0400 Subject: [PATCH 16/40] Specify zarr version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5eb7c0c9..3c361a2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "numcodecs", "numpy", "ujson", - "zarr", + "zarr==3.0.0b0", ] [project.optional-dependencies] From c0316ace9b18455aece8d0910a33cd4791e083ce Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 23 Oct 2024 09:31:10 -0400 Subject: [PATCH 17/40] Working remote hdf tests --- kerchunk/hdf.py | 2 +- kerchunk/tests/test_hdf.py | 22 +++++++++++----------- kerchunk/utils.py | 37 ++++++++++++++++++++++++------------- kerchunk/xarray_backend.py | 4 +++- pyproject.toml | 2 +- 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 7cb4b5f6..1d4d0054 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -461,7 +461,7 @@ def _translator( if h5obj.attrs.get("_FillValue") is not None: fill = h5obj.attrs.get("_FillValue") fill = encode_fill_value( - h5obj.attrs.get("_FillValue"), dt or h5obj.dtype + fill, dt or h5obj.dtype ) adims = self._get_array_dims(h5obj) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 
8e2117cc..f600a127 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -1,7 +1,12 @@ +import asyncio import fsspec import json import os.path as osp +import zarr.core +import zarr.core.buffer +import zarr.core.group + import kerchunk.hdf import numpy as np import pytest @@ -11,33 +16,28 @@ from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links from kerchunk.combine import MultiZarrToZarr, drop from kerchunk.utils import refs_as_fs, refs_as_store -from kerchunk.zarr import fs_as_store +from kerchunk.utils import fs_as_store here = osp.dirname(__file__) def test_single(): """Test creating references for a single HDF file""" - # url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" - url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc" + url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" so = dict(anon=True, default_fill_cache=False, default_cache_type="none") with fsspec.open(url, **so) as f: - h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) + h5chunks = SingleHdf5ToZarr(f, url, storage_options=so, inline_threshold=1) test_dict = h5chunks.translate() with open("test_dict.json", "w") as f: json.dump(test_dict, f) - store = refs_as_store(test_dict) - - ds = xr.open_dataset( - store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) - ) + store = refs_as_store(test_dict, remote_options=dict(asynchronous=True, anon=True)) + ds = xr.open_zarr(store, zarr_format=2, consolidated=False) with fsspec.open(url, **so) as f: expected = xr.open_dataset(f, engine="h5netcdf") - xr.testing.assert_equal(ds.drop_vars("crs"), expected.drop_vars("crs")) @@ -164,7 +164,7 @@ def test_times(times_data): h5chunks = SingleHdf5ToZarr(f, url) test_dict = h5chunks.translate() - store = refs_as_store(test_dict) + store = refs_as_store(test_dict, remote_protocol="file") result = xr.open_dataset( store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) ) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 71cee56a..8cc2f765 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -1,6 +1,7 @@ import base64 import copy import itertools +import fsspec.asyn from packaging.version import Version from typing import Any, cast import warnings @@ -24,12 +25,23 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): return fs -def refs_as_store(refs, remote_protocol=None, remote_options=None): +def refs_as_store(refs, mode="r", remote_protocol=None, remote_options=None): """Convert a reference set to a zarr store""" + asynchronous = False + if is_zarr3(): + asynchronous = True + if remote_options is None: + remote_options = {"asynchronous": True} + else: + remote_options["asynchronous"] = True + fs = refs_as_fs( - refs, remote_protocol=remote_protocol, remote_options=remote_options + refs, + remote_protocol=remote_protocol, + remote_options=remote_options, + asynchronous=asynchronous, ) - return fs_as_store(fs) + return fs_as_store(fs, mode=mode) def is_zarr3(): @@ -40,18 +52,17 @@ def is_zarr3(): def dict_to_store(store_dict: dict): """Create an in memory zarr store backed by the given dictionary""" if is_zarr3(): - return zarr.storage.MemoryStore(mode="a", store_dict=store_dict) + return zarr.storage.MemoryStore(mode="w", store_dict=store_dict) else: return zarr.storage.KVStore(store_dict) -def fs_as_store(fs, mode="r", remote_protocol=None, remote_options=None): +def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, 
mode="r"): """Open the refs as a zarr store Parameters ---------- - refs: dict-like - the references to open + fs: fsspec.async.AsyncFileSystem mode: str Returns @@ -541,18 +552,18 @@ def templateize(strings, min_length=10, template_name="u"): def translate_refs_serializable(refs: dict): - """Translate a reference set to a serializable form, given that zarr - v3 memory stores store data in buffers by default. This modifies the + """Translate a reference set to a serializable form, given that zarr + v3 memory stores store data in buffers by default. This modifies the input dictionary in place, and returns a reference to it. - It also fixes keys that have a leading slash, which is not appropriate for - zarr v3 keys + It also fixes keys that have a leading slash, which is not appropriate for + zarr v3 keys Parameters ---------- refs: dict The reference set - + Returns ------- dict @@ -568,4 +579,4 @@ def translate_refs_serializable(refs: dict): for k in keys_to_remove: del refs[k] refs.update(new_keys) - return refs \ No newline at end of file + return refs diff --git a/kerchunk/xarray_backend.py b/kerchunk/xarray_backend.py index ca377f6d..dfbbafba 100644 --- a/kerchunk/xarray_backend.py +++ b/kerchunk/xarray_backend.py @@ -43,4 +43,6 @@ def open_reference_dataset( m = fsspec.get_mapper("reference://", fo=filename_or_obj, **storage_options) - return xr.open_dataset(m, engine="zarr", consolidated=False, **open_dataset_options) + return xr.open_dataset( + m, engine="zarr", zarr_format=2, consolidated=False, **open_dataset_options + ) diff --git a/pyproject.toml b/pyproject.toml index 3c361a2d..5eb7c0c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "numcodecs", "numpy", "ujson", - "zarr==3.0.0b0", + "zarr", ] [project.optional-dependencies] From 59bd36cafd33b9ec3c29ddf90e9041197e38dc30 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 23 Oct 2024 10:03:25 -0400 Subject: [PATCH 18/40] Working grib impl --- kerchunk/grib2.py | 27 ++++++++------ kerchunk/tests/test_grib.py | 73 ++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 50 deletions(-) diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py index 7d75786f..e4e64bf3 100644 --- a/kerchunk/grib2.py +++ b/kerchunk/grib2.py @@ -11,7 +11,7 @@ import xarray import numpy as np -from kerchunk.utils import class_factory, _encode_for_JSON +from kerchunk.utils import class_factory, _encode_for_JSON, dict_to_store, translate_refs_serializable from kerchunk.codecs import GRIBCodec from kerchunk.combine import MultiZarrToZarr, drop from kerchunk._grib_idx import parse_grib_idx, build_idx_grib_mapping, map_from_index @@ -71,13 +71,13 @@ def _store_array(store, z, data, var, inline_threshold, offset, size, attr): shape = tuple(data.shape or ()) if nbytes < inline_threshold: logger.debug(f"Store {var} inline") - d = z.create_dataset( + d = z.create_array( name=var, shape=shape, chunks=shape, dtype=data.dtype, fill_value=attr.get("missingValue", None), - compressor=False, + compressor=None, ) if hasattr(data, "tobytes"): b = data.tobytes() @@ -91,15 +91,14 @@ def _store_array(store, z, data, var, inline_threshold, offset, size, attr): store[f"{var}/0"] = b.decode("ascii") else: logger.debug(f"Store {var} reference") - d = z.create_dataset( + d = z.create_array( name=var, shape=shape, chunks=shape, dtype=data.dtype, fill_value=attr.get("missingValue", None), filters=[GRIBCodec(var=var, dtype=str(data.dtype))], - compressor=False, - overwrite=True, + compressor=None, ) store[f"{var}/" + 
".".join(["0"] * len(shape))] = ["{{u}}", offset, size] d.attrs.update(attr) @@ -153,7 +152,9 @@ def scan_grib( with fsspec.open(url, "rb", **storage_options) as f: logger.debug(f"File {url}") for offset, size, data in _split_file(f, skip=skip): - store = {} + store_dict = {} + store = dict_to_store(store_dict) + mid = eccodes.codes_new_from_message(data) m = cfgrib.cfmessage.CfMessage(mid) @@ -227,7 +228,7 @@ def scan_grib( varName = m["cfVarName"] if varName in ("undef", "unknown"): varName = m["shortName"] - _store_array(store, z, vals, varName, inline_threshold, offset, size, attrs) + _store_array(store_dict, z, vals, varName, inline_threshold, offset, size, attrs) if "typeOfLevel" in message_keys and "level" in message_keys: name = m["typeOfLevel"] coordinates.append(name) @@ -241,7 +242,7 @@ def scan_grib( attrs = {} attrs["_ARRAY_DIMENSIONS"] = [] _store_array( - store, z, data, name, inline_threshold, offset, size, attrs + store_dict, z, data, name, inline_threshold, offset, size, attrs ) dims = ( ["y", "x"] @@ -298,7 +299,7 @@ def scan_grib( dims = [coord] attrs = cfgrib.dataset.COORD_ATTRS[coord] _store_array( - store, + store_dict, z, x, coord, @@ -311,10 +312,11 @@ def scan_grib( if coordinates: z.attrs["coordinates"] = " ".join(coordinates) + translate_refs_serializable(store_dict) out.append( { "version": 1, - "refs": _encode_for_JSON(store), + "refs": _encode_for_JSON(store_dict), "templates": {"u": url}, } ) @@ -397,7 +399,8 @@ def grib_tree( filters = ["stepType", "typeOfLevel"] # TODO allow passing a LazyReferenceMapper as output? - zarr_store = {} + zarr_store_dict = {} + zarr_store = dict_to_store(zarr_store_dict) zroot = zarr.open_group(store=zarr_store, zarr_format=2) aggregations: Dict[str, List] = defaultdict(list) diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py index 9102529e..74f24a6d 100644 --- a/kerchunk/tests/test_grib.py +++ b/kerchunk/tests/test_grib.py @@ -6,7 +6,7 @@ import pandas as pd import pytest import xarray as xr -import datatree +#import datatree import zarr import ujson from kerchunk.grib2 import ( @@ -21,6 +21,7 @@ extract_dataset_chunk_index, extract_datatree_chunk_index, ) +from kerchunk.utils import refs_as_store eccodes_ver = tuple(int(i) for i in eccodes.__version__.split(".")) cfgrib = pytest.importorskip("cfgrib") @@ -68,17 +69,13 @@ def _fetch_first(url): def test_archives(tmpdir, url): grib = GribToZarr(url, storage_options={"anon": True}, skip=1) out = grib.translate()[0] - ours = xr.open_dataset( - "reference://", - engine="zarr", - backend_kwargs={ - "consolidated": False, - "storage_options": { - "fo": out, - "remote_protocol": "s3", - "remote_options": {"anon": True}, - }, - }, + + store = refs_as_store(out) + + ours = xr.open_zarr( + store, + zarr_format=2, + consolidated=False, ) data = _fetch_first(url) @@ -266,22 +263,22 @@ def test_hrrr_sfcf_grib_tree(): assert zg.u.instant.isobaricInhPa.time.shape == (1,) -def test_hrrr_sfcf_grib_datatree(): - fpath = os.path.join(here, "hrrr.wrfsfcf.subset.json") - with open(fpath, "rb") as fobj: - scanned_msgs = ujson.load(fobj) - merged = grib_tree(scanned_msgs) - dt = datatree.open_datatree( - fsspec.filesystem("reference", fo=merged).get_mapper(""), - engine="zarr", - consolidated=False, - ) - # Assert a few things... but if it loads we are mostly done. 
- np.testing.assert_array_equal( - dt.u.instant.heightAboveGround.step.values[:], - np.array([0, 3600 * 10**9], dtype="timedelta64[ns]"), - ) - assert dt.u.attrs == dict(name="U component of wind") +# def test_hrrr_sfcf_grib_datatree(): +# fpath = os.path.join(here, "hrrr.wrfsfcf.subset.json") +# with open(fpath, "rb") as fobj: +# scanned_msgs = ujson.load(fobj) +# merged = grib_tree(scanned_msgs) +# dt = datatree.open_datatree( +# fsspec.filesystem("reference", fo=merged).get_mapper(""), +# engine="zarr", +# consolidated=False, +# ) +# # Assert a few things... but if it loads we are mostly done. +# np.testing.assert_array_equal( +# dt.u.instant.heightAboveGround.step.values[:], +# np.array([0, 3600 * 10**9], dtype="timedelta64[ns]"), +# ) +# assert dt.u.attrs == dict(name="U component of wind") def test_parse_grib_idx_invalid_url(): @@ -345,17 +342,17 @@ def test_parse_grib_idx_content(idx_url, storage_options): assert idx_df.iloc[message_no]["length"] == output[message_no]["refs"][variable][2] -@pytest.fixture -def zarr_tree_and_datatree_instance(): - fn = os.path.join(here, "gfs.t00z.pgrb2.0p25.f006.test-limit-100") - tree_store = tree_store = grib_tree(scan_grib(fn)) - dt_instance = datatree.open_datatree( - fsspec.filesystem("reference", fo=tree_store).get_mapper(""), - engine="zarr", - consolidated=False, - ) +# @pytest.fixture +# def zarr_tree_and_datatree_instance(): +# fn = os.path.join(here, "gfs.t00z.pgrb2.0p25.f006.test-limit-100") +# tree_store = tree_store = grib_tree(scan_grib(fn)) +# dt_instance = datatree.open_datatree( +# fsspec.filesystem("reference", fo=tree_store).get_mapper(""), +# engine="zarr", +# consolidated=False, +# ) - return tree_store, dt_instance, fn +# return tree_store, dt_instance, fn def test_extract_dataset_chunk_index(zarr_tree_and_datatree_instance): From 187ced261feeda286fae65dbe8dda7e9b3da7c7c Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 23 Oct 2024 10:04:22 -0400 Subject: [PATCH 19/40] Add back commented out code --- kerchunk/tests/test_grib.py | 56 ++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py index 74f24a6d..f0e58f9d 100644 --- a/kerchunk/tests/test_grib.py +++ b/kerchunk/tests/test_grib.py @@ -6,7 +6,7 @@ import pandas as pd import pytest import xarray as xr -#import datatree +import datatree import zarr import ujson from kerchunk.grib2 import ( @@ -75,7 +75,7 @@ def test_archives(tmpdir, url): ours = xr.open_zarr( store, zarr_format=2, - consolidated=False, + consolidated=False ) data = _fetch_first(url) @@ -263,22 +263,22 @@ def test_hrrr_sfcf_grib_tree(): assert zg.u.instant.isobaricInhPa.time.shape == (1,) -# def test_hrrr_sfcf_grib_datatree(): -# fpath = os.path.join(here, "hrrr.wrfsfcf.subset.json") -# with open(fpath, "rb") as fobj: -# scanned_msgs = ujson.load(fobj) -# merged = grib_tree(scanned_msgs) -# dt = datatree.open_datatree( -# fsspec.filesystem("reference", fo=merged).get_mapper(""), -# engine="zarr", -# consolidated=False, -# ) -# # Assert a few things... but if it loads we are mostly done. 
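# Hedged sketch (URL, variable name and byte range assumed, not taken from
# this patch): each chunk key that _store_array writes maps to a
# [template, offset, length] triple, which fsspec's reference filesystem
# turns into a ranged read of the original GRIB file before GRIBCodec
# decodes the message bytes.
import fsspec

refs = {
    "version": 1,
    "templates": {"u": "s3://some-bucket/model-run.grib2"},  # assumed URL
    "refs": {
        ".zgroup": '{"zarr_format": 2}',
        "t2m/0.0": ["{{u}}", 0, 4096],  # 4096 bytes starting at offset 0
    },
}
fs = fsspec.filesystem(
    "reference", fo=refs, remote_protocol="s3", remote_options={"anon": True}
)
raw = fs.cat("t2m/0.0")  # raw GRIB message bytes for that one chunk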
-# np.testing.assert_array_equal( -# dt.u.instant.heightAboveGround.step.values[:], -# np.array([0, 3600 * 10**9], dtype="timedelta64[ns]"), -# ) -# assert dt.u.attrs == dict(name="U component of wind") +def test_hrrr_sfcf_grib_datatree(): + fpath = os.path.join(here, "hrrr.wrfsfcf.subset.json") + with open(fpath, "rb") as fobj: + scanned_msgs = ujson.load(fobj) + merged = grib_tree(scanned_msgs) + dt = datatree.open_datatree( + fsspec.filesystem("reference", fo=merged).get_mapper(""), + engine="zarr", + consolidated=False, + ) + # Assert a few things... but if it loads we are mostly done. + np.testing.assert_array_equal( + dt.u.instant.heightAboveGround.step.values[:], + np.array([0, 3600 * 10**9], dtype="timedelta64[ns]"), + ) + assert dt.u.attrs == dict(name="U component of wind") def test_parse_grib_idx_invalid_url(): @@ -342,17 +342,17 @@ def test_parse_grib_idx_content(idx_url, storage_options): assert idx_df.iloc[message_no]["length"] == output[message_no]["refs"][variable][2] -# @pytest.fixture -# def zarr_tree_and_datatree_instance(): -# fn = os.path.join(here, "gfs.t00z.pgrb2.0p25.f006.test-limit-100") -# tree_store = tree_store = grib_tree(scan_grib(fn)) -# dt_instance = datatree.open_datatree( -# fsspec.filesystem("reference", fo=tree_store).get_mapper(""), -# engine="zarr", -# consolidated=False, -# ) +@pytest.fixture +def zarr_tree_and_datatree_instance(): + fn = os.path.join(here, "gfs.t00z.pgrb2.0p25.f006.test-limit-100") + tree_store = tree_store = grib_tree(scan_grib(fn)) + dt_instance = datatree.open_datatree( + fsspec.filesystem("reference", fo=tree_store).get_mapper(""), + engine="zarr", + consolidated=False, + ) -# return tree_store, dt_instance, fn + return tree_store, dt_instance, fn def test_extract_dataset_chunk_index(zarr_tree_and_datatree_instance): From 690ed21922cd4255eb39a795674bf38372c87427 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 23 Oct 2024 11:28:58 -0400 Subject: [PATCH 20/40] Make grib codec a compressor since its bytes to array --- kerchunk/grib2.py | 4 +-- kerchunk/tests/test_grib.py | 54 ++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py index e4e64bf3..eb796e2e 100644 --- a/kerchunk/grib2.py +++ b/kerchunk/grib2.py @@ -97,8 +97,8 @@ def _store_array(store, z, data, var, inline_threshold, offset, size, attr): chunks=shape, dtype=data.dtype, fill_value=attr.get("missingValue", None), - filters=[GRIBCodec(var=var, dtype=str(data.dtype))], - compressor=None, + filters=[], + compressor=GRIBCodec(var=var, dtype=str(data.dtype)), ) store[f"{var}/" + ".".join(["0"] * len(shape))] = ["{{u}}", offset, size] d.attrs.update(attr) diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py index f0e58f9d..7d9cf32b 100644 --- a/kerchunk/tests/test_grib.py +++ b/kerchunk/tests/test_grib.py @@ -6,7 +6,7 @@ import pandas as pd import pytest import xarray as xr -import datatree +#import datatree import zarr import ujson from kerchunk.grib2 import ( @@ -263,22 +263,22 @@ def test_hrrr_sfcf_grib_tree(): assert zg.u.instant.isobaricInhPa.time.shape == (1,) -def test_hrrr_sfcf_grib_datatree(): - fpath = os.path.join(here, "hrrr.wrfsfcf.subset.json") - with open(fpath, "rb") as fobj: - scanned_msgs = ujson.load(fobj) - merged = grib_tree(scanned_msgs) - dt = datatree.open_datatree( - fsspec.filesystem("reference", fo=merged).get_mapper(""), - engine="zarr", - consolidated=False, - ) - # Assert a few things... but if it loads we are mostly done. 
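# Hedged sketch of what this commit toggles in the generated zarr v2
# metadata (the exact numcodecs config and the array sizes are assumed): a
# bytes-to-array codec can sit either in the "filters" list or in the
# "compressor" slot of .zarray, and GRIBCodec.decode() receives the raw
# message bytes in both layouts. Patch 21 below switches back to the
# filter form.
as_filter = {
    "shape": [1059, 1799], "chunks": [1059, 1799], "dtype": "<f4",
    "compressor": None,
    "filters": [{"id": "grib", "var": "t2m", "dtype": "float32"}],
}
as_compressor = {
    "shape": [1059, 1799], "chunks": [1059, 1799], "dtype": "<f4",
    "compressor": {"id": "grib", "var": "t2m", "dtype": "float32"},
    "filters": None,
}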
- np.testing.assert_array_equal( - dt.u.instant.heightAboveGround.step.values[:], - np.array([0, 3600 * 10**9], dtype="timedelta64[ns]"), - ) - assert dt.u.attrs == dict(name="U component of wind") +# def test_hrrr_sfcf_grib_datatree(): +# fpath = os.path.join(here, "hrrr.wrfsfcf.subset.json") +# with open(fpath, "rb") as fobj: +# scanned_msgs = ujson.load(fobj) +# merged = grib_tree(scanned_msgs) +# dt = datatree.open_datatree( +# fsspec.filesystem("reference", fo=merged).get_mapper(""), +# engine="zarr", +# consolidated=False, +# ) +# # Assert a few things... but if it loads we are mostly done. +# np.testing.assert_array_equal( +# dt.u.instant.heightAboveGround.step.values[:], +# np.array([0, 3600 * 10**9], dtype="timedelta64[ns]"), +# ) +# assert dt.u.attrs == dict(name="U component of wind") def test_parse_grib_idx_invalid_url(): @@ -342,17 +342,17 @@ def test_parse_grib_idx_content(idx_url, storage_options): assert idx_df.iloc[message_no]["length"] == output[message_no]["refs"][variable][2] -@pytest.fixture -def zarr_tree_and_datatree_instance(): - fn = os.path.join(here, "gfs.t00z.pgrb2.0p25.f006.test-limit-100") - tree_store = tree_store = grib_tree(scan_grib(fn)) - dt_instance = datatree.open_datatree( - fsspec.filesystem("reference", fo=tree_store).get_mapper(""), - engine="zarr", - consolidated=False, - ) +# @pytest.fixture +# def zarr_tree_and_datatree_instance(): +# fn = os.path.join(here, "gfs.t00z.pgrb2.0p25.f006.test-limit-100") +# tree_store = tree_store = grib_tree(scan_grib(fn)) +# dt_instance = datatree.open_datatree( +# fsspec.filesystem("reference", fo=tree_store).get_mapper(""), +# engine="zarr", +# consolidated=False, +# ) - return tree_store, dt_instance, fn +# return tree_store, dt_instance, fn def test_extract_dataset_chunk_index(zarr_tree_and_datatree_instance): From 5019b154903199514a0484f71f625971879defe6 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 23 Oct 2024 11:36:59 -0400 Subject: [PATCH 21/40] Switch back --- kerchunk/grib2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py index eb796e2e..e4e64bf3 100644 --- a/kerchunk/grib2.py +++ b/kerchunk/grib2.py @@ -97,8 +97,8 @@ def _store_array(store, z, data, var, inline_threshold, offset, size, attr): chunks=shape, dtype=data.dtype, fill_value=attr.get("missingValue", None), - filters=[], - compressor=GRIBCodec(var=var, dtype=str(data.dtype)), + filters=[GRIBCodec(var=var, dtype=str(data.dtype))], + compressor=None, ) store[f"{var}/" + ".".join(["0"] * len(shape))] = ["{{u}}", offset, size] d.attrs.update(attr) From d96cf469c3beca0ac28df23d2f96ec831d169069 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Sat, 26 Oct 2024 16:42:03 -0400 Subject: [PATCH 22/40] Add first pass at grib zarr 3 codec --- kerchunk/codecs.py | 87 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/kerchunk/codecs.py b/kerchunk/codecs.py index 852076ea..4804423e 100644 --- a/kerchunk/codecs.py +++ b/kerchunk/codecs.py @@ -1,11 +1,22 @@ import ast +from dataclasses import dataclass import io +from typing import TYPE_CHECKING import numcodecs from numcodecs.abc import Codec import numpy as np import threading import zlib +from zarr.abc.codec import ArrayBytesCodec +from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer +from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.registry import register_codec + +if TYPE_CHECKING: + from typing import Self + + from 
zarr.core.array_spec import ArraySpec class FillStringsCodec(Codec): @@ -115,6 +126,78 @@ def decode(self, buf, out=None): numcodecs.register_codec(GRIBCodec, "grib") +@dataclass(frozen=True) +class GRIBZarrCodec(ArrayBytesCodec): + eclock = threading.RLock() + + var: str + dtype: np.dtype + + def __init__(self, *, var: str, dtype: np.dtype) -> None: + object.__setattr__(self, "var", var) + object.__setattr__(self, "dtype", dtype) + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + _, configuration_parsed = parse_named_configuration( + data, "bytes", require_configuration=True + ) + configuration_parsed = configuration_parsed or {} + return cls(**configuration_parsed) # type: ignore[arg-type] + + def to_dict(self) -> dict[str, JSON]: + if self.endian is None: + return {"name": "grib"} + else: + return { + "name": "grib", + "configuration": {"var": self.var, "dtype": self.dtype}, + } + + async def _decode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + assert isinstance(chunk_bytes, Buffer) + import eccodes + + if self.var in ["latitude", "longitude"]: + var = self.var + "s" + dt = self.dtype or "float64" + else: + var = "values" + dt = self.dtype or "float32" + + with self.eclock: + mid = eccodes.codes_new_from_message(chunk_bytes.to_bytes()) + try: + data = eccodes.codes_get_array(mid, var) + missingValue = eccodes.codes_get_string(mid, "missingValue") + if var == "values" and missingValue: + data[data == float(missingValue)] = np.nan + return data.astype(dt, copy=False) + + finally: + eccodes.codes_release(mid) + + async def _encode_single( + self, + chunk_array: NDBuffer, + chunk_spec: ArraySpec, + ) -> Buffer | None: + # This is a one way codec + raise NotImplementedError + + def compute_encoded_size( + self, input_byte_length: int, _chunk_spec: ArraySpec + ) -> int: + raise NotImplementedError + + +register_codec("grib", GRIBZarrCodec) + + class AsciiTableCodec(numcodecs.abc.Codec): """Decodes ASCII-TABLE extensions in FITS files""" @@ -166,7 +249,6 @@ def decode(self, buf, out=None): arr2 = np.empty((self.nrow,), dtype=dt_out) heap = buf[arr.nbytes :] for name in dt_out.names: - if dt_out[name] == "O": dt = np.dtype(self.ftypes[self.types[name]]) counts = arr[name][:, 0] @@ -244,8 +326,7 @@ def encode(self, buf): class ZlibCodec(Codec): codec_id = "zlib" - def __init__(self): - ... + def __init__(self): ... 
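# Hedged sketch (assumes the zarr-python 3.x registry exposes
# get_codec_class; only register_codec is used by the patch itself): the
# v3 codec pipeline looks codecs up by name, so registering "grib" above
# is what lets v3 array metadata resolve to GRIBZarrCodec.
import numpy as np
from zarr.registry import get_codec_class

cls = get_codec_class("grib")  # -> the GRIBZarrCodec registered above
codec = cls(var="t2m", dtype=np.dtype("float32"))
# _decode_single() hands a chunk's raw GRIB message to eccodes and returns
# the decoded array; _encode_single() deliberately raises, since this
# codec is one-way (read-only references, no re-encoding to GRIB).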
def decode(self, data, out=None): if out: From cbcb7208576277351fd57e8746b57698e1b2899c Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 29 Oct 2024 13:30:18 -0700 Subject: [PATCH 23/40] Fix typing --- kerchunk/codecs.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kerchunk/codecs.py b/kerchunk/codecs.py index 4804423e..46b19072 100644 --- a/kerchunk/codecs.py +++ b/kerchunk/codecs.py @@ -1,23 +1,19 @@ import ast from dataclasses import dataclass import io -from typing import TYPE_CHECKING +from typing import Self, TYPE_CHECKING import numcodecs from numcodecs.abc import Codec import numpy as np import threading import zlib +from zarr.core.array_spec import ArraySpec from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.registry import register_codec -if TYPE_CHECKING: - from typing import Self - - from zarr.core.array_spec import ArraySpec - class FillStringsCodec(Codec): """Sets fixed-length string fields to empty From b88655f3c0d9789e09dee99afdcf245a652d9b73 Mon Sep 17 00:00:00 2001 From: Nathan Zimmerman Date: Wed, 6 Nov 2024 13:39:53 -0600 Subject: [PATCH 24/40] Fix some broken tests; use async filesystem wrapper --- kerchunk/tests/test_combine.py | 10 ++++++---- kerchunk/utils.py | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index 868a39ff..0cfb9505 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -134,16 +134,18 @@ # simple time arrays - xarray can't make these! m = fs.get_mapper("time1.zarr") z = zarr.open(m, mode="w", zarr_format=2) -ar = z.create_dataset("time", data=np.array([1], dtype="M8[s]")) +time1_array = np.array([1], dtype="M8[s]") +ar = z.create_array("time", data=time1_array, shape=time1_array.shape) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]}) -ar = z.create_dataset("data", data=arr) +ar = z.create_array("data", data=arr, shape=arr.shape) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]}) m = fs.get_mapper("time2.zarr") z = zarr.open(m, mode="w", zarr_format=2) -ar = z.create_dataset("time", data=np.array([2], dtype="M8[s]")) +time2_array = np.array([2], dtype="M8[s]") +ar = z.create_array("time", data=time2_array, shape=time2_array.shape) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]}) -ar = z.create_dataset("data", data=arr) +ar = z.create_array("data", data=arr, shape=arr.shape) ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]}) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 8cc2f765..5916ebef 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -9,6 +9,7 @@ import ujson import fsspec +from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper import numpy as np import zarr @@ -70,6 +71,8 @@ def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, mode="r"): zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem """ if is_zarr3(): + if not fs.async_impl: + fs = AsyncFileSystemWrapper(fs) return zarr.storage.RemoteStore(fs, mode=mode) else: return fs.get_mapper() From 73eaf33a80801d86afc2f289a33ee56de101f423 Mon Sep 17 00:00:00 2001 From: Nathan Zimmerman Date: Tue, 19 Nov 2024 18:02:01 -0600 Subject: [PATCH 25/40] Implement zarr3 compatibility for grib --- kerchunk/combine.py | 38 ++++++++------ kerchunk/grib2.py | 14 ++--- kerchunk/tests/test_grib.py | 100 ++++++++++++++++++------------------ kerchunk/utils.py | 15 +++--- 4 
files changed, 88 insertions(+), 79 deletions(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index b02fa395..777853d2 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -11,7 +11,7 @@ import ujson import zarr -from kerchunk.utils import consolidate +from kerchunk.utils import consolidate, fs_as_store, translate_refs_serializable logger = logging.getLogger("kerchunk.combine") @@ -199,6 +199,7 @@ def append( remote_protocol=remote_protocol, remote_options=remote_options, target_options=target_options, + asynchronous=True ) ds = xr.open_dataset( fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False} @@ -264,7 +265,7 @@ def fss(self): self._paths = [] for of in fsspec.open_files(self.path, **self.target_options): self._paths.append(of.full_name) - fs = fsspec.core.url_to_fs(self.path[0], **self.target_options)[0] + fs = fsspec.core.url_to_fs(self.path[0], asynchronous=True, **self.target_options)[0] try: # JSON path fo_list = fs.cat(self.path) @@ -360,7 +361,8 @@ def first_pass(self): fs._dircache_from_items() logger.debug("First pass: %s", i) - z = zarr.open_group(fs.get_mapper(""), zarr_format=2) + z_store = fs_as_store(fs, read_only=False) + z = zarr.open_group(z_store, zarr_format=2) for var in self.concat_dims: value = self._get_value(i, z, var, fn=self._paths[i]) if isinstance(value, np.ndarray): @@ -386,10 +388,10 @@ def store_coords(self): Write coordinate arrays into the output """ kv = {} - store = zarr.storage.KVStore(kv) - group = zarr.open(store, zarr_format=2) - m = self.fss[0].get_mapper("") - z = zarr.open(m) + store = zarr.storage.MemoryStore(kv) + group = zarr.open_group(store, zarr_format=2) + m = fs_as_store(self.fss[0], read_only=False) + z = zarr.open(m, zarr_format=2) for k, v in self.coos.items(): if k == "var": # The names of the variables to write in the second pass, not a coordinate @@ -420,10 +422,11 @@ def store_coords(self): elif k in z: # Fall back to existing fill value kw["fill_value"] = z[k].fill_value - arr = group.create_dataset( + arr = group.create_array( name=k, data=data, - overwrite=True, + shape=data.shape, + exists_ok=True, compressor=compression, dtype=self.coo_dtypes.get(k, data.dtype), **kw, @@ -443,8 +446,8 @@ def store_coords(self): logger.debug("Written coordinates") for fn in [".zgroup", ".zattrs"]: # top-level group attributes from first input - if fn in m: - self.out[fn] = ujson.dumps(ujson.loads(m[fn])) + if m.fs.exists(fn): + self.out[fn] = ujson.dumps(ujson.loads(m.fs.cat(fn))) logger.debug("Written global metadata") self.done.add(2) @@ -460,7 +463,7 @@ def second_pass(self): for i, fs in enumerate(self.fss): to_download = {} - m = fs.get_mapper("") + m = fs_as_store(fs, read_only=False) z = zarr.open(m, zarr_format=2) if no_deps is None: @@ -491,9 +494,9 @@ def second_pass(self): if f"{v}/.zgroup" in fns: # recurse into groups - copy meta, add to dirs to process and don't look # for references in this dir - self.out[f"{v}/.zgroup"] = m[f"{v}/.zgroup"] + self.out[f"{v}/.zgroup"] = m.fs.cat(f"{v}/.zgroup") if f"{v}/.zattrs" in fns: - self.out[f"{v}/.zattrs"] = m[f"{v}/.zattrs"] + self.out[f"{v}/.zattrs"] = m.fs.cat(f"{v}/.zattrs") dirs.extend([f for f in fns if not f.startswith(f"{v}/.z")]) continue if v in self.identical_dims: @@ -505,7 +508,7 @@ def second_pass(self): continue logger.debug("Second pass: %s, %s", i, v) - zarray = ujson.loads(m[f"{v}/.zarray"]) + zarray = ujson.loads(m.fs.cat(f"{v}/.zarray")) if v not in chunk_sizes: chunk_sizes[v] = zarray["chunks"] elif chunk_sizes[v] != 
zarray["chunks"]: @@ -516,7 +519,10 @@ def second_pass(self): chunks so far: {zarray["chunks"]}""" ) chunks = chunk_sizes[v] - zattrs = ujson.loads(m.get(f"{v}/.zattrs", "{}")) + if m.fs.exists(f"{v}/.zattrs"): + zattrs = ujson.loads(m.fs.cat(f"{v}/.zattrs")) + else: + zattrs = ujson.loads({}) coords = zattrs.get("_ARRAY_DIMENSIONS", []) if zarray["shape"] and not coords: coords = list("ikjlm")[: len(zarray["shape"])] diff --git a/kerchunk/grib2.py b/kerchunk/grib2.py index e4e64bf3..686a71a0 100644 --- a/kerchunk/grib2.py +++ b/kerchunk/grib2.py @@ -11,7 +11,7 @@ import xarray import numpy as np -from kerchunk.utils import class_factory, _encode_for_JSON, dict_to_store, translate_refs_serializable +from kerchunk.utils import class_factory, _encode_for_JSON, dict_to_store, fs_as_store, translate_refs_serializable from kerchunk.codecs import GRIBCodec from kerchunk.combine import MultiZarrToZarr, drop from kerchunk._grib_idx import parse_grib_idx, build_idx_grib_mapping, map_from_index @@ -520,17 +520,18 @@ def grib_tree( for key, value in group["refs"].items(): if key not in [".zattrs", ".zgroup"]: - zarr_store[f"{path}/{key}"] = value + zarr_store._store_dict[f"{path}/{key}"] = value # Force all stored values to decode as string, not bytes. String should be correct. # ujson will reject bytes values by default. # Using 'reject_bytes=False' one write would fail an equality check on read. - zarr_store = { + zarr_dict = { key: (val.decode() if isinstance(val, bytes) else val) - for key, val in zarr_store.items() + for key, val in zarr_store._store_dict.items() } # TODO handle other kerchunk reference spec versions? - result = dict(refs=zarr_store, version=1) + translate_refs_serializable(zarr_dict) + result = dict(refs=zarr_dict, version=1) return result @@ -571,7 +572,8 @@ def correct_hrrr_subhf_step(group: Dict) -> Dict: group["refs"][".zattrs"] = ujson.dumps(attrs) fo = fsspec.filesystem("reference", fo=group, mode="r") - xd = xarray.open_dataset(fo.get_mapper(), engine="zarr", consolidated=False) + fstore = fs_as_store(fo, read_only=True) + xd = xarray.open_dataset(fstore, engine="zarr", consolidated=False) correct_step = xd.valid_time.values - xd.time.values diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py index 7d9cf32b..9bc90b71 100644 --- a/kerchunk/tests/test_grib.py +++ b/kerchunk/tests/test_grib.py @@ -21,7 +21,7 @@ extract_dataset_chunk_index, extract_datatree_chunk_index, ) -from kerchunk.utils import refs_as_store +from kerchunk.utils import fs_as_store, refs_as_store eccodes_ver = tuple(int(i) for i in eccodes.__version__.split(".")) cfgrib = pytest.importorskip("cfgrib") @@ -70,7 +70,7 @@ def test_archives(tmpdir, url): grib = GribToZarr(url, storage_options={"anon": True}, skip=1) out = grib.translate()[0] - store = refs_as_store(out) + store = refs_as_store(out, remote_options={"anon": True}) ours = xr.open_zarr( store, @@ -116,7 +116,8 @@ def test_grib_tree(): corrected_msg_groups = [correct_hrrr_subhf_step(msg) for msg in scanned_msg_groups] result = grib_tree(corrected_msg_groups) fs = fsspec.filesystem("reference", fo=result) - zg = zarr.open_group(fs.get_mapper(""), zarr_format=2) + store = fs_as_store(fs) + zg = zarr.open_group(store, mode="r", zarr_format=2) assert isinstance(zg["refc/instant/atmosphere/refc"], zarr.Array) assert isinstance(zg["vbdsf/avg/surface/vbdsf"], zarr.Array) assert set(zg["vbdsf/avg/surface"].attrs["coordinates"].split()) == set( @@ -126,7 +127,7 @@ def test_grib_tree(): "atmosphere latitude longitude step time 
valid_time".split() ) # Assert that the fill value is set correctly - assert zg.refc.instant.atmosphere.step.fill_value is np.nan + assert np.isnan(zg['refc/instant/atmosphere/step'].fill_value) # The following two tests use json fixture data generated from calling scan grib @@ -144,14 +145,14 @@ def test_correct_hrrr_subhf_group_step(): scanned_msgs = ujson.load(fobj) original_zg = [ - zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_format=2) + zarr.open_group(fs_as_store(fsspec.filesystem("reference", fo=val)), mode="r", zarr_format=2) for val in scanned_msgs ] corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs] corrected_zg = [ - zarr.open_group(fsspec.filesystem("reference", fo=val).get_mapper(""), zarr_format=2) + zarr.open_group(fs_as_store(fsspec.filesystem("reference", fo=val)), mode="r", zarr_format=2) for val in corrected_msgs ] @@ -160,10 +161,10 @@ def test_correct_hrrr_subhf_group_step(): assert not all(["step" in zg.array_keys() for zg in original_zg]) # The step values are corrected to floating point hour - assert all([zg.step[()] <= 1.0 for zg in corrected_zg]) + assert all([zg["step"][()] <= 1.0 for zg in corrected_zg]) # The original seems to have values in minutes for some step variables! assert not all( - [zg.step[()] <= 1.0 for zg in original_zg if "step" in zg.array_keys()] + [zg["step"][()] <= 1.0 for zg in original_zg if "step" in zg.array_keys()] ) @@ -174,36 +175,32 @@ def test_hrrr_subhf_corrected_grib_tree(): corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs] merged = grib_tree(corrected_msgs) - zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_format=2) + z_fs = fsspec.filesystem("reference", fo=merged, asynchronous=True) + zstore = fs_as_store(z_fs) + zg = zarr.open_group(zstore, mode="r", zarr_format=2) # Check the values and shape of the time coordinates - assert zg.u.instant.heightAboveGround.step[:].tolist() == [ + assert zg['u/instant/heightAboveGround/step'][:].tolist() == [ 0.0, 0.25, 0.5, 0.75, 1.0, ] - assert zg.u.instant.heightAboveGround.step.shape == (5,) - - assert zg.u.instant.heightAboveGround.valid_time[:].tolist() == [ + assert zg['u/instant/heightAboveGround/step'].shape == (5,) + assert zg['u/instant/heightAboveGround/valid_time'][:].tolist() == [ [1695862800, 1695863700, 1695864600, 1695865500, 1695866400] ] - assert zg.u.instant.heightAboveGround.valid_time.shape == (1, 5) - - assert zg.u.instant.heightAboveGround.time[:].tolist() == [1695862800] - assert zg.u.instant.heightAboveGround.time.shape == (1,) - - assert zg.dswrf.avg.surface.step[:].tolist() == [0.0, 0.25, 0.5, 0.75, 1.0] - assert zg.dswrf.avg.surface.step.shape == (5,) - - assert zg.dswrf.avg.surface.valid_time[:].tolist() == [ + assert zg['u/instant/heightAboveGround/valid_time'].shape == (1, 5) + assert zg['u/instant/heightAboveGround/time'][:].tolist() == [1695862800] + assert zg['u/instant/heightAboveGround/time'].shape == (1,) + assert zg['dswrf/avg/surface/step'][:].tolist() == [0.0, 0.25, 0.5, 0.75, 1.0] + assert zg['dswrf/avg/surface/step'].shape == (5,) + assert zg['dswrf/avg/surface/valid_time'][:].tolist() == [ [1695862800, 1695863700, 1695864600, 1695865500, 1695866400] ] - assert zg.dswrf.avg.surface.valid_time.shape == (1, 5) - - assert zg.dswrf.avg.surface.time[:].tolist() == [1695862800] - assert zg.dswrf.avg.surface.time.shape == (1,) - + assert zg['dswrf/avg/surface/valid_time'].shape == (1, 5) + assert zg['dswrf/avg/surface/time'][:].tolist() == [1695862800] 
+ assert zg['dswrf/avg/surface/time'].shape == (1,) # The following two test use json fixture data generated from calling scan grib # scan_grib("testdata/hrrr.t01z.wrfsfcf00.grib2") @@ -217,24 +214,22 @@ def test_hrrr_sfcf_grib_tree(): with open(fpath, "rb") as fobj: scanned_msgs = ujson.load(fobj) merged = grib_tree(scanned_msgs) - zg = zarr.open_group(fsspec.filesystem("reference", fo=merged).get_mapper(""), zarr_format=2) + store = fs_as_store(fsspec.filesystem("reference", fo=merged)) + zg = zarr.open_group(store, mode="r", zarr_format=2) # Check the heightAboveGround level shape of the time coordinates - assert zg.u.instant.heightAboveGround.heightAboveGround[()] == 80.0 - assert zg.u.instant.heightAboveGround.heightAboveGround.shape == () - - assert zg.u.instant.heightAboveGround.step[:].tolist() == [0.0, 1.0] - assert zg.u.instant.heightAboveGround.step.shape == (2,) - - assert zg.u.instant.heightAboveGround.valid_time[:].tolist() == [ + assert zg['u/instant/heightAboveGround/heightAboveGround'][()] == 80.0 + assert zg['u/instant/heightAboveGround/heightAboveGround'].shape == () + assert zg['u/instant/heightAboveGround/step'][:].tolist() == [0.0, 1.0] + assert zg['u/instant/heightAboveGround/step'].shape == (2,) + assert zg['u/instant/heightAboveGround/valid_time'][:].tolist() == [ [1695862800, 1695866400] ] - assert zg.u.instant.heightAboveGround.valid_time.shape == (1, 2) - - assert zg.u.instant.heightAboveGround.time[:].tolist() == [1695862800] - assert zg.u.instant.heightAboveGround.time.shape == (1,) + assert zg['u/instant/heightAboveGround/valid_time'].shape == (1, 2) + assert zg['u/instant/heightAboveGround/time'][:].tolist() == [1695862800] + assert zg['u/instant/heightAboveGround/time'].shape == (1,) # Check the isobaricInhPa level shape and time coordinates - assert zg.u.instant.isobaricInhPa.isobaricInhPa[:].tolist() == [ + assert zg['u/instant/isobaricInhPa/isobaricInhPa'][:].tolist() == [ 250.0, 300.0, 500.0, @@ -243,10 +238,9 @@ def test_hrrr_sfcf_grib_tree(): 925.0, 1000.0, ] - assert zg.u.instant.isobaricInhPa.isobaricInhPa.shape == (7,) - - assert zg.u.instant.isobaricInhPa.step[:].tolist() == [0.0, 1.0] - assert zg.u.instant.isobaricInhPa.step.shape == (2,) + assert zg['u/instant/isobaricInhPa/isobaricInhPa'].shape == (7,) + assert zg['u/instant/isobaricInhPa/step'][:].tolist() == [0.0, 1.0] + assert zg['u/instant/isobaricInhPa/step'].shape == (2,) # Valid time values get exploded by isobaricInhPa aggregation # Is this a feature or a bug? @@ -256,11 +250,11 @@ def test_hrrr_sfcf_grib_tree(): [1695866400 for _ in range(7)], ] ] - assert zg.u.instant.isobaricInhPa.valid_time[:].tolist() == expected_valid_times - assert zg.u.instant.isobaricInhPa.valid_time.shape == (1, 2, 7) + assert zg['u/instant/isobaricInhPa/valid_time'][:].tolist() == expected_valid_times + assert zg['u/instant/isobaricInhPa/valid_time'].shape == (1, 2, 7) - assert zg.u.instant.isobaricInhPa.time[:].tolist() == [1695862800] - assert zg.u.instant.isobaricInhPa.time.shape == (1,) + assert zg['u/instant/isobaricInhPa/time'][:].tolist() == [1695862800] + assert zg['u/instant/isobaricInhPa/time'].shape == (1,) # def test_hrrr_sfcf_grib_datatree(): @@ -290,11 +284,14 @@ def test_parse_grib_idx_invalid_url(): def test_parse_grib_idx_no_file(): - with pytest.raises(FileNotFoundError): + # How did this ever work? 
403s are returned for anonymous calls to non-existent + # files iirc as a security measure to obscure results/avoid tests for existence + #with pytest.raises(FileNotFoundError): + with pytest.raises(PermissionError): # the url is spelled wrong parse_grib_idx( "s3://noaahrrr-bdp-pds/hrrr.20220804/conus/hrrr.t01z.wrfsfcf01.grib2", - storage_options=dict(anon=True), + storage_options={"anon": True}, ) @@ -355,6 +352,7 @@ def test_parse_grib_idx_content(idx_url, storage_options): # return tree_store, dt_instance, fn +@pytest.mark.skip(reason="datatree support should be updated to use xarray.Datatree") def test_extract_dataset_chunk_index(zarr_tree_and_datatree_instance): tree_store, dt_instance, fn = zarr_tree_and_datatree_instance @@ -385,6 +383,7 @@ def test_extract_dataset_chunk_index(zarr_tree_and_datatree_instance): ) +@pytest.mark.skip(reason="datatree support should be updated to use xarray.Datatree") def test_extract_datatree_chunk_index(zarr_tree_and_datatree_instance): tree_store, dt_instance, fn = zarr_tree_and_datatree_instance @@ -438,6 +437,7 @@ def test_extract_datatree_chunk_index(zarr_tree_and_datatree_instance): ).all() +@pytest.mark.skip(reason="datatree support should be updated to use xarray.Datatree") def test_extract_methods_grib_parameter(zarr_tree_and_datatree_instance): tree_store, dt_instance, _ = zarr_tree_and_datatree_instance diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 5916ebef..b918aa1d 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -22,11 +22,12 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): remote_protocol=remote_protocol, remote_options=remote_options, **kwargs, + asynchronous=True ) return fs -def refs_as_store(refs, mode="r", remote_protocol=None, remote_options=None): +def refs_as_store(refs, read_only=True, remote_protocol=None, remote_options=None): """Convert a reference set to a zarr store""" asynchronous = False if is_zarr3(): @@ -39,10 +40,9 @@ def refs_as_store(refs, mode="r", remote_protocol=None, remote_options=None): fs = refs_as_fs( refs, remote_protocol=remote_protocol, - remote_options=remote_options, - asynchronous=asynchronous, + remote_options=remote_options ) - return fs_as_store(fs, mode=mode) + return fs_as_store(fs, read_only=True) def is_zarr3(): @@ -53,12 +53,12 @@ def is_zarr3(): def dict_to_store(store_dict: dict): """Create an in memory zarr store backed by the given dictionary""" if is_zarr3(): - return zarr.storage.MemoryStore(mode="w", store_dict=store_dict) + return zarr.storage.MemoryStore(read_only=False, store_dict=store_dict) else: return zarr.storage.KVStore(store_dict) -def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, mode="r"): +def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=True): """Open the refs as a zarr store Parameters @@ -73,7 +73,8 @@ def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, mode="r"): if is_zarr3(): if not fs.async_impl: fs = AsyncFileSystemWrapper(fs) - return zarr.storage.RemoteStore(fs, mode=mode) + fs.asynchronous = True + return zarr.storage.RemoteStore(fs, read_only=read_only) else: return fs.get_mapper() From 37571995c70573613ead3c8cf0f1c14c54640f43 Mon Sep 17 00:00:00 2001 From: Nathan Zimmerman Date: Thu, 21 Nov 2024 16:24:05 -0600 Subject: [PATCH 26/40] Use zarr3 stores directly; avoid use of internal fs --- kerchunk/combine.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 777853d2..841b9e8a 100644 --- a/kerchunk/combine.py 
+++ b/kerchunk/combine.py
@@ -1,3 +1,4 @@
+import asyncio
 import collections.abc
 import logging
 import re
@@ -10,6 +11,7 @@
 import numcodecs
 import ujson
 import zarr
+from zarr.core.buffer.core import default_buffer_prototype
 
 from kerchunk.utils import consolidate, fs_as_store, translate_refs_serializable
 
@@ -349,6 +351,16 @@ def _get_value(self, index, z, var, fn=None):
         logger.debug("Decode: %s -> %s", (selector, index, var, fn), o)
         return o
 
+    async def _read_meta_files(self, m, files):
+        """Helper to load multiple metadata files asynchronously"""
+        res = {}
+        for fn in files:
+            exists = await m.exists(fn)
+            if exists:
+                content = await m.get(fn, prototype=default_buffer_prototype())
+                res[fn] = ujson.dumps(ujson.loads(content.to_bytes()))
+        return res
+
     def first_pass(self):
         """Accumulate the set of concat coords values across all inputs"""
 
@@ -444,10 +456,9 @@ def store_coords(self):
         # TODO: rewrite .zarray/.zattrs with ujson to save space. Maybe make them by hand anyway.
         self.out.update(kv)
         logger.debug("Written coordinates")
-        for fn in [".zgroup", ".zattrs"]:
-            # top-level group attributes from first input
-            if m.fs.exists(fn):
-                self.out[fn] = ujson.dumps(ujson.loads(m.fs.cat(fn)))
+
+        metadata = asyncio.run(self._read_meta_files(m, [".zgroup", ".zattrs"]))
+        self.out.update(metadata)
         logger.debug("Written global metadata")
         self.done.add(2)
 
@@ -494,9 +505,8 @@ def second_pass(self):
             if f"{v}/.zgroup" in fns:
                 # recurse into groups - copy meta, add to dirs to process and don't look
                 # for references in this dir
-                self.out[f"{v}/.zgroup"] = m.fs.cat(f"{v}/.zgroup")
-                if f"{v}/.zattrs" in fns:
-                    self.out[f"{v}/.zattrs"] = m.fs.cat(f"{v}/.zattrs")
+                metadata = asyncio.run(self._read_meta_files(m, [f"{v}/.zgroup", f"{v}/.zattrs"]))
+                self.out.update(metadata)
                 dirs.extend([f for f in fns if not f.startswith(f"{v}/.z")])
                 continue
             if v in self.identical_dims:
@@ -507,8 +517,9 @@ def second_pass(self):
                 self.out[k] = fs.references[k]
                 continue
             logger.debug("Second pass: %s, %s", i, v)
-
-            zarray = ujson.loads(m.fs.cat(f"{v}/.zarray"))
+            
+            zarray = asyncio.run(self._read_meta_files(m, [f"{v}/.zarray"]))[f"{v}/.zarray"]
+            zarray = ujson.loads(zarray)
             if v not in chunk_sizes:
                 chunk_sizes[v] = zarray["chunks"]
             elif chunk_sizes[v] != zarray["chunks"]:
@@ -519,10 +530,8 @@ def second_pass(self):
                     chunks so far: {zarray["chunks"]}"""
                 )
             chunks = chunk_sizes[v]
-            if m.fs.exists(f"{v}/.zattrs"):
-                zattrs = ujson.loads(m.fs.cat(f"{v}/.zattrs"))
-            else:
-                zattrs = ujson.loads({})
+            zattr_meta = asyncio.run(self._read_meta_files(m, [f"{v}/.zattrs"]))
+            zattrs = ujson.loads(zattr_meta.get(f"{v}/.zattrs", "{}"))
             coords = zattrs.get("_ARRAY_DIMENSIONS", [])
             if zarray["shape"] and not coords:
                 coords = list("ikjlm")[: len(zarray["shape"])]

From d8848ce5cb621493258efd468619e9eecfc10f4b Mon Sep 17 00:00:00 2001
From: Matthew Iannucci
Date: Tue, 26 Nov 2024 16:25:52 -0500
Subject: [PATCH 27/40] Forward

---
 kerchunk/fits.py             |  2 +-
 kerchunk/hdf.py              |  2 +-
 kerchunk/hdf4.py             |  1 -
 kerchunk/netCDF3.py          |  2 +-
 kerchunk/tests/test_utils.py | 24 +++++++++++++-----------
 kerchunk/utils.py            | 20 +++++++++----------
 6 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/kerchunk/fits.py b/kerchunk/fits.py
index f0d4fa8e..f4d181ad 100644
--- a/kerchunk/fits.py
+++ b/kerchunk/fits.py
@@ -249,7 +249,7 @@ def add_wcs_coords(hdu, zarr_group=None, dataset=None, dtype="float32"):
         }
         if zarr_group is not None:
             arr = zarr_group.empty(
-                name, shape=shape, chunks=shape, overwrite=True, dtype=dtype
+                name, shape=shape, chunks=shape, 
dtype=dtype ) arr.attrs.update(attrs) arr[:] = world_coord.value.reshape(shape) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 1d4d0054..f72bf8a2 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -107,7 +107,7 @@ def __init__( self.vlen = vlen_encode self.store_dict = out or {} self.store = dict_to_store(self.store_dict) - self._zroot = zarr.group(store=self.store, zarr_format=2, overwrite=True) + self._zroot = zarr.group(store=self.store, zarr_format=2) self._uri = url self.error = error lggr.debug(f"HDF5 file URI: {self._uri}") diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py index 8339659b..92b738c7 100644 --- a/kerchunk/hdf4.py +++ b/kerchunk/hdf4.py @@ -155,7 +155,6 @@ def translate(self, filename=None, storage_options=None): dtype=v["dtype"], chunks=v.get("chunks", v["dims"]), compressor=compression, - overwrite=True, ) arr.attrs.update( dict( diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py index 31438bb0..af410784 100644 --- a/kerchunk/netCDF3.py +++ b/kerchunk/netCDF3.py @@ -169,7 +169,7 @@ def translate(self): out = self.out store = dict_to_store(out) - z = zarr.open(store, mode="w", zarr_format=2, overwrite=True) + z = zarr.open_group(store, mode="w", zarr_format=2) for dim, var in self.variables.items(): if dim in self.chunks: diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index a951c36c..701427e2 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -72,21 +72,20 @@ def test_inline_array(): "data/1": b"\x02\x00\x00\x00", "data/.zattrs": '{"foo": "bar"}', } - fs = fsspec.filesystem("reference", fo=refs) out1 = kerchunk.utils.inline_array(refs, threshold=1) # does nothing assert out1 == refs out2 = kerchunk.utils.inline_array(refs, threshold=1, names=["data"]) # explicit - assert "data/1" not in out2 + assert "data/1" not in out2 # TODO: Is this wrong? I dont think zarr deletes existing chunks when overwriting assert json.loads(out2["data/.zattrs"]) == json.loads(refs["data/.zattrs"]) - fs = fsspec.filesystem("reference", fo=out2) - g = zarr.open(fs.get_mapper(), zarr_format=2) - assert g.data[:].tolist() == [1, 2] + store = kerchunk.utils.refs_as_store(out2) + g = zarr.open(store, mode='r', zarr_format=2) + assert g.data[:].tolist() == [1, 2] # What is g.data??? out3 = kerchunk.utils.inline_array(refs, threshold=1000) # inlines because of size assert "data/1" not in out3 - fs = fsspec.filesystem("reference", fo=out3) - g = zarr.open(fs.get_mapper(), zarr_format=2) - assert g.data[:].tolist() == [1, 2] + store = kerchunk.utils.refs_as_store(out3) + g = zarr.open(store, mode='r', zarr_format=2) + assert g.data[:].tolist() == [1, 2] # What is g.data??? 
def test_json(): @@ -113,9 +112,12 @@ def test_subchunk_exact(m, chunks): f"data/{_}.0" for _ in range(nchunk) ] - g2 = zarr.open_group( - "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_format=2 - ) + store = kerchunk.utils.refs_as_store(out, remote_protocol="memory") + g2 = zarr.open_group(store, mode='r', zarr_format=2) + + # g2 = zarr.open_group( + # "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_format=2 + # ) assert (g2.data[:] == data).all() diff --git a/kerchunk/utils.py b/kerchunk/utils.py index b918aa1d..9bc7686e 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -20,7 +20,7 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): "reference", fo=refs, remote_protocol=remote_protocol, - remote_options=remote_options, + # remote_options=remote_options, **kwargs, asynchronous=True ) @@ -29,9 +29,7 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): def refs_as_store(refs, read_only=True, remote_protocol=None, remote_options=None): """Convert a reference set to a zarr store""" - asynchronous = False if is_zarr3(): - asynchronous = True if remote_options is None: remote_options = {"asynchronous": True} else: @@ -40,14 +38,14 @@ def refs_as_store(refs, read_only=True, remote_protocol=None, remote_options=Non fs = refs_as_fs( refs, remote_protocol=remote_protocol, - remote_options=remote_options + remote_options=remote_options, ) - return fs_as_store(fs, read_only=True) + return fs_as_store(fs, read_only=read_only) def is_zarr3(): """Check if the installed zarr version is version 3""" - return Version(zarr.__version__) >= Version("3.0.0.a0") + return Version(zarr.__version__) >= Version("3.0.0.b2") def dict_to_store(store_dict: dict): @@ -71,6 +69,7 @@ def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=True): zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem """ if is_zarr3(): + print(fs.async_impl is None) if not fs.async_impl: fs = AsyncFileSystemWrapper(fs) fs.asynchronous = True @@ -288,7 +287,7 @@ def do_inline(store, threshold, remote_options=None, remote_protocol=None): def _inline_array(group, threshold, names, prefix=""): - for name, thing in group.items(): + for name, thing in group.members(): if prefix: prefix1 = f"{prefix}.{name}" else: @@ -306,9 +305,8 @@ def _inline_array(group, threshold, names, prefix=""): shape=thing.shape, data=thing[:], chunks=thing.shape, - compression=None, - overwrite=True, fill_value=thing.fill_value, + exists_ok=True, ) arr.attrs.update(original_attrs) @@ -338,8 +336,8 @@ def inline_array(store, threshold=1000, names=None, remote_options=None): amended references set (simple style) """ fs = refs_as_fs(store, remote_options=remote_options or {}) - zarr_store = fs_as_store(fs, mode="r+", remote_options=remote_options or {}) - g = zarr.open_group(zarr_store, mode="r+", zarr_format=2) + zarr_store = fs_as_store(fs, read_only=False) + g = zarr.open_group(zarr_store, zarr_format=2) _inline_array(g, threshold, names=names or []) return fs.references From 1fa294e145962ea6472bc53bdcbd69fedd66a69b Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 16:29:25 -0500 Subject: [PATCH 28/40] More --- kerchunk/fits.py | 2 +- kerchunk/hdf4.py | 1 + kerchunk/netCDF3.py | 1 + kerchunk/utils.py | 3 +-- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kerchunk/fits.py b/kerchunk/fits.py index f4d181ad..2e84120f 100644 --- a/kerchunk/fits.py +++ b/kerchunk/fits.py @@ -249,7 +249,7 @@ def 
add_wcs_coords(hdu, zarr_group=None, dataset=None, dtype="float32"): } if zarr_group is not None: arr = zarr_group.empty( - name, shape=shape, chunks=shape, dtype=dtype + name, shape=shape, chunks=shape, dtype=dtype, exists_ok=True ) arr.attrs.update(attrs) arr[:] = world_coord.value.reshape(shape) diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py index 92b738c7..16b08740 100644 --- a/kerchunk/hdf4.py +++ b/kerchunk/hdf4.py @@ -155,6 +155,7 @@ def translate(self, filename=None, storage_options=None): dtype=v["dtype"], chunks=v.get("chunks", v["dims"]), compressor=compression, + exists_ok=True, ) arr.attrs.update( dict( diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py index af410784..457aafbb 100644 --- a/kerchunk/netCDF3.py +++ b/kerchunk/netCDF3.py @@ -255,6 +255,7 @@ def translate(self): fill_value=fill, chunks=(1,) + dtype.shape, compressor=None, + exists_ok=True, ) arr.attrs.update( { diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 9bc7686e..bb9cd4cb 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -27,7 +27,7 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): return fs -def refs_as_store(refs, read_only=True, remote_protocol=None, remote_options=None): +def refs_as_store(refs, read_only=False, remote_protocol=None, remote_options=None): """Convert a reference set to a zarr store""" if is_zarr3(): if remote_options is None: @@ -69,7 +69,6 @@ def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=True): zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem """ if is_zarr3(): - print(fs.async_impl is None) if not fs.async_impl: fs = AsyncFileSystemWrapper(fs) fs.asynchronous = True From 543178d33eb62a73ac8f4ad184dee7d3fb941b9f Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 16:39:36 -0500 Subject: [PATCH 29/40] Figure out async wrapper --- kerchunk/tests/test_hdf.py | 7 +++++-- kerchunk/utils.py | 12 +++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index f600a127..122cced2 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -13,6 +13,7 @@ import xarray as xr import zarr +from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links from kerchunk.combine import MultiZarrToZarr, drop from kerchunk.utils import refs_as_fs, refs_as_store @@ -164,7 +165,8 @@ def test_times(times_data): h5chunks = SingleHdf5ToZarr(f, url) test_dict = h5chunks.translate() - store = refs_as_store(test_dict, remote_protocol="file") + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(test_dict, fs=localfs) result = xr.open_dataset( store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) ) @@ -179,7 +181,8 @@ def test_times_str(times_data): h5chunks = SingleHdf5ToZarr(url) test_dict = h5chunks.translate() - store = refs_as_store(test_dict) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(test_dict, fs=localfs) result = xr.open_dataset( store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) ) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index bb9cd4cb..667a8b74 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -14,20 +14,21 @@ import zarr -def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): +def refs_as_fs(refs, fs=None, remote_protocol=None, remote_options=None, **kwargs): """Convert a reference set 
to an fsspec filesystem""" fs = fsspec.filesystem( "reference", fo=refs, + fs=fs, remote_protocol=remote_protocol, - # remote_options=remote_options, + remote_options=remote_options, **kwargs, asynchronous=True ) return fs -def refs_as_store(refs, read_only=False, remote_protocol=None, remote_options=None): +def refs_as_store(refs, read_only=False, fs=None, remote_protocol=None, remote_options=None): """Convert a reference set to a zarr store""" if is_zarr3(): if remote_options is None: @@ -35,12 +36,13 @@ def refs_as_store(refs, read_only=False, remote_protocol=None, remote_options=No else: remote_options["asynchronous"] = True - fs = refs_as_fs( + fss = refs_as_fs( refs, + fs=fs, remote_protocol=remote_protocol, remote_options=remote_options, ) - return fs_as_store(fs, read_only=read_only) + return fs_as_store(fss, read_only=read_only) def is_zarr3(): From 96b56cd39e564817a7b31d988c4a9ad37f8ea615 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 16:55:06 -0500 Subject: [PATCH 30/40] Closer on hdf5 --- kerchunk/fits.py | 4 +- kerchunk/hdf.py | 13 +++---- kerchunk/tests/test_hdf.py | 76 ++++++++++++++++++++++---------------- kerchunk/utils.py | 14 +++---- 4 files changed, 59 insertions(+), 48 deletions(-) diff --git a/kerchunk/fits.py b/kerchunk/fits.py index 2e84120f..70f48d8a 100644 --- a/kerchunk/fits.py +++ b/kerchunk/fits.py @@ -151,7 +151,7 @@ def process_file( for name in dtype.names if hdu.columns[name].format.startswith(("P", "Q")) } - kwargs["object_codec"] = VarArrCodec( + kwargs["compressor"] = VarArrCodec( str(dtype), str(dt2), nrows, types ) dtype = dt2 @@ -165,7 +165,7 @@ def process_file( # TODO: we could sub-chunk on biggest dimension name = hdu.name or str(ext) arr = g.empty( - name=name, dtype=dtype, shape=shape, chunks=shape, compressor=None, zarr_format=2, **kwargs + name=name, dtype=dtype, shape=shape, chunks=shape, zarr_format=2, **kwargs ) arr.attrs.update( { diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index f72bf8a2..56ae958a 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -325,11 +325,11 @@ def _translator( for v in val ] kwargs["data"] = out - kwargs["object_codec"] = numcodecs.JSON() + kwargs["compressor"] = numcodecs.JSON() fill = None elif self.vlen == "null": dt = "O" - kwargs["object_codec"] = FillStringsCodec(dtype="S16") + kwargs["compressor"] = FillStringsCodec(dtype="S16") fill = " " elif self.vlen == "leave": dt = "S16" @@ -344,7 +344,7 @@ def _translator( index.decode(): label.decode() for index, label in zip(indexes, labels) } - kwargs["object_codec"] = FillStringsCodec( + kwargs["compressor"] = FillStringsCodec( dtype="S16", id_map=mapping ) fill = " " @@ -384,7 +384,7 @@ def _translator( ) } ) - kwargs["object_codec"] = FillStringsCodec( + kwargs["compressor"] = FillStringsCodec( dtype=str(dt), id_map=mapping ) dt = [ @@ -410,7 +410,7 @@ def _translator( ) for v in h5obj.dtype.names ] - kwargs["object_codec"] = FillStringsCodec(dtype=str(dt)) + kwargs["compressor"] = FillStringsCodec(dtype=str(dt)) dt = [ ( v, @@ -451,7 +451,7 @@ def _translator( ) dt = "O" kwargs["data"] = data2 - kwargs["object_codec"] = numcodecs.JSON() + kwargs["compressor"] = numcodecs.JSON() fill = None else: raise NotImplementedError @@ -473,7 +473,6 @@ def _translator( dtype=dt or h5obj.dtype, chunks=h5obj.chunks or False, fill_value=fill, - compressor=None, filters=filters, attributes={ "_ARRAY_DIMENSIONS": adims, diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 122cced2..ecfffa1a 100644 --- 
a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -198,23 +198,26 @@ def test_string_embed(): fn = osp.join(here, "vlen.h5") h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="embed") out = h.translate() - fs = refs_as_fs(out) - assert txt in fs.references["vlen_str/0"] + + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + fs = refs_as_fs(out, fs=localfs) + #assert txt in fs.references["vlen_str/0"] store = fs_as_store(fs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str.dtype == "O" - assert z.vlen_str[0] == txt - assert (z.vlen_str[1:] == "").all() + assert z["vlen_str"].dtype == "O" + assert z["vlen_str"][0] == txt + assert (z["vlen_str"][1:] == "").all() def test_string_null(): fn = osp.join(here, "vlen.h5") h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="null", inline_threshold=0) out = h.translate() - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str.dtype == "O" - assert (z.vlen_str[:] == None).all() + assert z["vlen_str"].dtype == "O" + assert (z["vlen_str"][:] == None).all() def test_string_leave(): @@ -224,11 +227,13 @@ def test_string_leave(): f, fn, vlen_encode="leave", inline_threshold=0 ) out = h.translate() - store = refs_as_store(out) + + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str.dtype == "S16" - assert z.vlen_str[0] # some obscured ID - assert (z.vlen_str[1:] == b"").all() + assert z["vlen_str"].dtype == "S16" + assert z["vlen_str"][0] # some obscured ID + assert (z["vlen_str"][1:] == b"").all() def test_string_decode(): @@ -238,12 +243,13 @@ def test_string_decode(): f, fn, vlen_encode="encode", inline_threshold=0 ) out = h.translate() - fs = refs_as_fs(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + fs = refs_as_fs(out, fs=localfs) assert txt in fs.cat("vlen_str/.zarray").decode() # stored in filter def store = fs_as_store(fs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str[0] == txt - assert (z.vlen_str[1:] == "").all() + assert z["vlen_str"][0] == txt + assert (z["vlen_str"][1:] == "").all() def test_compound_string_null(): @@ -251,11 +257,12 @@ def test_compound_string_null(): with open(fn, "rb") as f: h = kerchunk.hdf.SingleHdf5ToZarr(f, fn, vlen_encode="null", inline_threshold=0) out = h.translate() - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str[0].tolist() == (10, None) - assert (z.vlen_str["ints"][1:] == 0).all() - assert (z.vlen_str["strs"][1:] == None).all() + assert z["vlen_str"][0].tolist() == (10, None) + assert (z["vlen_str"]["ints"][1:] == 0).all() + assert (z["vlen_str"]["strs"][1:] == None).all() def test_compound_string_leave(): @@ -265,12 +272,13 @@ def test_compound_string_leave(): f, fn, vlen_encode="leave", inline_threshold=0 ) out = h.translate() - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str["ints"][0] == 10 - assert z.vlen_str["strs"][0] # random ID - assert (z.vlen_str["ints"][1:] == 0).all() - assert (z.vlen_str["strs"][1:] == b"").all() + assert z["vlen_str"]["ints"][0] == 10 + assert z["vlen_str"]["strs"][0] # random ID + assert 
(z["vlen_str"]["ints"][1:] == 0).all() + assert (z["vlen_str"]["strs"][1:] == b"").all() def test_compound_string_encode(): @@ -280,12 +288,13 @@ def test_compound_string_encode(): f, fn, vlen_encode="encode", inline_threshold=0 ) out = h.translate() - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) - assert z.vlen_str["ints"][0] == 10 - assert z.vlen_str["strs"][0] == "water" - assert (z.vlen_str["ints"][1:] == 0).all() - assert (z.vlen_str["strs"][1:] == "").all() + assert z["vlen_str"]["ints"][0] == 10 + assert z["vlen_str"]["strs"][0] == "water" + assert (z["vlen_str"]["ints"][1:] == 0).all() + assert (z["vlen_str"]["strs"][1:] == "").all() # def test_compact(): @@ -311,7 +320,8 @@ def test_compress(): h.translate() continue out = h.translate() - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) g = zarr.open(store, zarr_format=2) assert np.mean(g.data) == 49.5 @@ -321,7 +331,8 @@ def test_embed(): h = kerchunk.hdf.SingleHdf5ToZarr(fn, vlen_encode="embed") out = h.translate() - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) data = z["Domain_10"]["STER"]["min_1"]["boom_1"]["temperature"][:] assert data[0].tolist() == [ @@ -356,7 +367,8 @@ def test_translate_links(): out = kerchunk.hdf.SingleHdf5ToZarr(fn, inline_threshold=50).translate( preserve_linked_dsets=True ) - store = refs_as_store(out) + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) # 1. 
Test the hard linked datasets were translated correctly diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 667a8b74..773d5dd1 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -58,7 +58,7 @@ def dict_to_store(store_dict: dict): return zarr.storage.KVStore(store_dict) -def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=True): +def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=False): """Open the refs as a zarr store Parameters @@ -204,14 +204,14 @@ def _encode_for_JSON(store): return store -def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: +def encode_fill_value(v: Any, dtype: np.dtype, compressor: Any = None) -> Any: # early out if v is None: return v if dtype.kind == "V" and dtype.hasobject: - if object_codec is None: - raise ValueError("missing object_codec for object array") - v = object_codec.encode(v) + if compressor is None: + raise ValueError("missing compressor for object array") + v = compressor.encode(v) v = str(base64.standard_b64encode(v), "ascii") return v if dtype.kind == "f": @@ -230,8 +230,8 @@ def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: elif dtype.kind in "c": c = cast(np.complex128, np.dtype(complex).type()) v = ( - encode_fill_value(v.real, c.real.dtype, object_codec), - encode_fill_value(v.imag, c.imag.dtype, object_codec), + encode_fill_value(v.real, c.real.dtype, compressor), + encode_fill_value(v.imag, c.imag.dtype, compressor), ) return v elif dtype.kind in "SV": From 0808b05b64eb7d378f226d55297298c7fa2540c6 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 16:59:20 -0500 Subject: [PATCH 31/40] netcdf but failing --- kerchunk/tests/test_netcdf.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kerchunk/tests/test_netcdf.py b/kerchunk/tests/test_netcdf.py index 755823da..e6bfd066 100644 --- a/kerchunk/tests/test_netcdf.py +++ b/kerchunk/tests/test_netcdf.py @@ -1,12 +1,12 @@ import os - import fsspec import numpy as np from packaging.version import Version import pytest from kerchunk import netCDF3 +from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper from kerchunk.utils import refs_as_store xr = pytest.importorskip("xarray") @@ -31,7 +31,7 @@ def test_one(m): h = netCDF3.netcdf_recording_file("memory://data.nc3") out = h.translate() - store = refs_as_store(out, remote_protocol="memory") + store = refs_as_store(out) ds = xr.open_dataset( store, @@ -86,13 +86,14 @@ def test_unlimited(unlimited_dataset): expected = xr.open_dataset(fn, engine="scipy") h = netCDF3.NetCDF3ToZarr(fn) out = h.translate() - ds = xr.open_dataset( - "reference://", - engine="zarr", - backend_kwargs={ - "consolidated": False, - "storage_options": {"fo": out}, - }, + + fs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out, fs) + + ds = xr.open_zarr( + store, + zarr_format=2, + consolidated=False, ) assert ds.attrs["title"] == "testing" assert ds.temp.attrs["units"] == "K" From aef006e342e56aa03e771a79d6262cb9b999b105 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 17:06:34 -0500 Subject: [PATCH 32/40] grib passing --- kerchunk/tests/test_grib.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kerchunk/tests/test_grib.py b/kerchunk/tests/test_grib.py index 9bc90b71..5925abc6 100644 --- a/kerchunk/tests/test_grib.py +++ b/kerchunk/tests/test_grib.py @@ -9,6 +9,7 @@ #import datatree import zarr import ujson +from 
fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper from kerchunk.grib2 import ( scan_grib, _split_file, @@ -32,10 +33,13 @@ def test_one(): # from https://dd.weather.gc.ca/model_gem_regional/10km/grib2/00/000 fn = os.path.join(here, "CMC_reg_DEPR_ISBL_10_ps10km_2022072000_P000.grib2") out = scan_grib(fn) - ds = xr.open_dataset( - "reference://", - engine="zarr", - backend_kwargs={"consolidated": False, "storage_options": {"fo": out[0]}}, + + fs = AsyncFileSystemWrapper(fsspec.filesystem("file")) + store = refs_as_store(out[0], fs=fs) + ds = xr.open_zarr( + store, + zarr_format=2, + consolidated=False ) assert ds.attrs["GRIB_centre"] == "cwao" From d9bf0dd1f10463ee26b2558ef1fba6764d5609c4 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 17:17:52 -0500 Subject: [PATCH 33/40] Fix inline test --- kerchunk/tests/test_utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index 701427e2..5b556794 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -8,6 +8,8 @@ import pytest import zarr +from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + def test_rename(): old = {"version": 1, "refs": {"v0": ["oldpath", 0, 0], "bin": "data"}} @@ -75,17 +77,17 @@ def test_inline_array(): out1 = kerchunk.utils.inline_array(refs, threshold=1) # does nothing assert out1 == refs out2 = kerchunk.utils.inline_array(refs, threshold=1, names=["data"]) # explicit - assert "data/1" not in out2 # TODO: Is this wrong? I dont think zarr deletes existing chunks when overwriting assert json.loads(out2["data/.zattrs"]) == json.loads(refs["data/.zattrs"]) - store = kerchunk.utils.refs_as_store(out2) + + localfs = fsspec.filesystem("file") + store = kerchunk.utils.refs_as_store(out2, fs=localfs) g = zarr.open(store, mode='r', zarr_format=2) - assert g.data[:].tolist() == [1, 2] # What is g.data??? + assert g["data"][:].tolist() == [1, 2] # What is g.data??? out3 = kerchunk.utils.inline_array(refs, threshold=1000) # inlines because of size - assert "data/1" not in out3 - store = kerchunk.utils.refs_as_store(out3) + store = kerchunk.utils.refs_as_store(out3, localfs) g = zarr.open(store, mode='r', zarr_format=2) - assert g.data[:].tolist() == [1, 2] # What is g.data??? + assert g["data"][:].tolist() == [1, 2] # What is g.data??? 
def test_json(): From 884fc685ecff296cf8f677334a8990860fb0d9ae Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 17:27:03 -0500 Subject: [PATCH 34/40] More --- kerchunk/tests/test_zarr.py | 1 + kerchunk/xarray_backend.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/kerchunk/tests/test_zarr.py b/kerchunk/tests/test_zarr.py index 94af8939..3c02fc69 100644 --- a/kerchunk/tests/test_zarr.py +++ b/kerchunk/tests/test_zarr.py @@ -54,6 +54,7 @@ def test_zarr_in_zip(zarr_in_zip, ds): out = kerchunk.zarr.ZarrToZarr( url="zip://", storage_options={"fo": zarr_in_zip} ).translate() + ds2 = xr.open_dataset( out, engine="kerchunk", diff --git a/kerchunk/xarray_backend.py b/kerchunk/xarray_backend.py index dfbbafba..0620614b 100644 --- a/kerchunk/xarray_backend.py +++ b/kerchunk/xarray_backend.py @@ -3,6 +3,8 @@ import os import fsspec +from kerchunk.utils import refs_as_store + class KerchunkBackend(BackendEntrypoint): def open_dataset( @@ -41,8 +43,8 @@ def open_reference_dataset( if open_dataset_options is None: open_dataset_options = {} - m = fsspec.get_mapper("reference://", fo=filename_or_obj, **storage_options) + store = refs_as_store(filename_or_obj, remote_options=storage_options) - return xr.open_dataset( - m, engine="zarr", zarr_format=2, consolidated=False, **open_dataset_options + return xr.open_zarr( + store, zarr_format=2, consolidated=False, **open_dataset_options ) From 1145f454afd3ad663bdc0d55ae5003fc65ee5ae8 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 21:47:45 -0500 Subject: [PATCH 35/40] standardize compressor name --- kerchunk/combine.py | 4 ++-- kerchunk/hdf4.py | 4 ++-- kerchunk/tests/test_df.py | 2 +- kerchunk/tests/test_utils.py | 4 ++-- kerchunk/tests/test_zarr.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 841b9e8a..376a8003 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -409,7 +409,7 @@ def store_coords(self): # The names of the variables to write in the second pass, not a coordinate continue # parametrize the threshold value below? 
- compression = numcodecs.Zstd() if len(v) > 100 else None + compressor = numcodecs.Zstd() if len(v) > 100 else None kw = {} if self.cf_units and k in self.cf_units: if "M" not in self.coo_dtypes.get(k, ""): @@ -439,7 +439,7 @@ def store_coords(self): data=data, shape=data.shape, exists_ok=True, - compressor=compression, + compressor=compressor, dtype=self.coo_dtypes.get(k, data.dtype), **kw, ) diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py index 16b08740..030c33a0 100644 --- a/kerchunk/hdf4.py +++ b/kerchunk/hdf4.py @@ -148,13 +148,13 @@ def translate(self, filename=None, storage_options=None): refs = {} for k, v in output.items(): if isinstance(v, dict): - compression = ZlibCodec() if "refs" in v else None + compressor = ZlibCodec() if "refs" in v else None arr = g.create_dataset( name=k, shape=v["dims"], dtype=v["dtype"], chunks=v.get("chunks", v["dims"]), - compressor=compression, + compressor=compressor, exists_ok=True, ) arr.attrs.update( diff --git a/kerchunk/tests/test_df.py b/kerchunk/tests/test_df.py index 0d0fafb1..45bcb9bc 100644 --- a/kerchunk/tests/test_df.py +++ b/kerchunk/tests/test_df.py @@ -18,7 +18,7 @@ def test_1(m, url): "a/4": ["memory://url4.file"], "a/5": ["memory://url5.file"], "a/6": b"data", - "a/.zarray": b"""{"shape": [7], "chunks":[1], "filters": [], "compression": null}""", + "a/.zarray": b"""{"shape": [7], "chunks":[1], "filters": [], "compressor": null}""", ".zgroup": b'{"zarr_format": 2}', } u = "memory://myrefs.json" diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index 5b556794..a29e3b4f 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -102,7 +102,7 @@ def test_subchunk_exact(m, chunks): store = m.get_mapper("test.zarr") g = zarr.open_group(store, mode="w", zarr_format=2) data = np.arange(100).reshape(10, 10) - arr = g.create_dataset("data", data=data, chunks=chunks, compression=None) + arr = g.create_dataset("data", data=data, chunks=chunks, compressor=None) ref = kerchunk.zarr.single_zarr("memory://test.zarr")["refs"] extra = [] if chunks[0] == 10 else ["data/1.0"] @@ -162,7 +162,7 @@ def test_deflate_zip_archive(m): data = b"piece of data" with fsspec.open("memory://archive", "wb") as f: - arc = zipfile.ZipFile(file=f, mode="w", compression=zipfile.ZIP_DEFLATED) + arc = zipfile.ZipFile(file=f, mode="w", compressor=zipfile.ZIP_DEFLATED) arc.writestr("data1", data) arc.close() refs = { diff --git a/kerchunk/tests/test_zarr.py b/kerchunk/tests/test_zarr.py index 3c02fc69..b78baaaa 100644 --- a/kerchunk/tests/test_zarr.py +++ b/kerchunk/tests/test_zarr.py @@ -37,7 +37,7 @@ def _zip(file): filename = file + os.path.extsep + "zip" with zipfile.ZipFile( - filename, "w", compression=zipfile.ZIP_STORED, allowZip64=True + filename, "w", compressor=zipfile.ZIP_STORED, allowZip64=True ) as fh: for root, _, filenames in os.walk(file): for each_filename in filenames: From 94ec47938c8eed8319ddfc80c6cc36189579b973 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Tue, 26 Nov 2024 21:53:08 -0500 Subject: [PATCH 36/40] Fix one more hdf test --- kerchunk/tests/test_hdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index ecfffa1a..68961394 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -323,7 +323,7 @@ def test_compress(): localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) store = refs_as_store(out, fs=localfs) g = zarr.open(store, zarr_format=2) - assert np.mean(g.data) == 49.5 + assert 
np.mean(g["data"]) == 49.5 def test_embed(): From a9693d1b5be8c5752b63221beef7831ae0b5584b Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 27 Nov 2024 10:39:29 -0500 Subject: [PATCH 37/40] Small tweaks --- kerchunk/tests/test_netcdf.py | 2 ++ kerchunk/tests/test_utils.py | 2 +- kerchunk/tests/test_zarr.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kerchunk/tests/test_netcdf.py b/kerchunk/tests/test_netcdf.py index e6bfd066..b7143398 100644 --- a/kerchunk/tests/test_netcdf.py +++ b/kerchunk/tests/test_netcdf.py @@ -31,6 +31,8 @@ def test_one(m): h = netCDF3.netcdf_recording_file("memory://data.nc3") out = h.translate() + print(out) + store = refs_as_store(out) ds = xr.open_dataset( diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index a29e3b4f..5cbfb150 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -162,7 +162,7 @@ def test_deflate_zip_archive(m): data = b"piece of data" with fsspec.open("memory://archive", "wb") as f: - arc = zipfile.ZipFile(file=f, mode="w", compressor=zipfile.ZIP_DEFLATED) + arc = zipfile.ZipFile(file=f, mode="w", compression=zipfile.ZIP_DEFLATED) arc.writestr("data1", data) arc.close() refs = { diff --git a/kerchunk/tests/test_zarr.py b/kerchunk/tests/test_zarr.py index b78baaaa..3c02fc69 100644 --- a/kerchunk/tests/test_zarr.py +++ b/kerchunk/tests/test_zarr.py @@ -37,7 +37,7 @@ def _zip(file): filename = file + os.path.extsep + "zip" with zipfile.ZipFile( - filename, "w", compressor=zipfile.ZIP_STORED, allowZip64=True + filename, "w", compression=zipfile.ZIP_STORED, allowZip64=True ) as fh: for root, _, filenames in os.walk(file): for each_filename in filenames: From 7e9112ad7418fee0acde01a4fb5f2c91fc805121 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 27 Nov 2024 10:55:54 -0500 Subject: [PATCH 38/40] Hide fsspec import where necessary --- kerchunk/utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 773d5dd1..b8a53e3c 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -9,7 +9,6 @@ import ujson import fsspec -from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper import numpy as np import zarr @@ -23,12 +22,14 @@ def refs_as_fs(refs, fs=None, remote_protocol=None, remote_options=None, **kwarg remote_protocol=remote_protocol, remote_options=remote_options, **kwargs, - asynchronous=True + asynchronous=True, ) return fs -def refs_as_store(refs, read_only=False, fs=None, remote_protocol=None, remote_options=None): +def refs_as_store( + refs, read_only=False, fs=None, remote_protocol=None, remote_options=None +): """Convert a reference set to a zarr store""" if is_zarr3(): if remote_options is None: @@ -40,7 +41,7 @@ def refs_as_store(refs, read_only=False, fs=None, remote_protocol=None, remote_o refs, fs=fs, remote_protocol=remote_protocol, - remote_options=remote_options, + remote_options=remote_options, ) return fs_as_store(fss, read_only=read_only) @@ -72,7 +73,14 @@ def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=False): """ if is_zarr3(): if not fs.async_impl: - fs = AsyncFileSystemWrapper(fs) + try: + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + + fs = AsyncFileSystemWrapper(fs) + except ImportError: + raise ImportError( + "Only fsspec>2024.10.0 supports the async filesystem wrapper required for working with reference filesystems. 
" + ) fs.asynchronous = True return zarr.storage.RemoteStore(fs, read_only=read_only) else: From a7af691c2aea422783907362be834913648fe61d Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 16 Jan 2025 09:52:24 -0500 Subject: [PATCH 39/40] Update with many fixes - but stioll not complete --- kerchunk/__init__.py | 2 +- kerchunk/codecs.py | 7 ++-- kerchunk/combine.py | 20 +++++---- kerchunk/fits.py | 9 ++-- kerchunk/hdf.py | 75 ++++++++++++++++++++++------------ kerchunk/netCDF3.py | 12 ++++-- kerchunk/tests/test_combine.py | 26 +++++++----- kerchunk/tests/test_hdf.py | 14 +++---- kerchunk/tests/test_tiff.py | 4 +- kerchunk/tests/test_utils.py | 28 +++++++------ kerchunk/tests/test_zarr.py | 4 +- kerchunk/utils.py | 68 +++++++++++++++--------------- kerchunk/xarray_backend.py | 2 +- kerchunk/zarr.py | 17 +++++++- pytest.ini | 2 + 15 files changed, 173 insertions(+), 117 deletions(-) create mode 100644 pytest.ini diff --git a/kerchunk/__init__.py b/kerchunk/__init__.py index 21b4e540..85863c32 100644 --- a/kerchunk/__init__.py +++ b/kerchunk/__init__.py @@ -1,4 +1,4 @@ -from . import codecs +from kerchunk import codecs from importlib.metadata import version as _version diff --git a/kerchunk/codecs.py b/kerchunk/codecs.py index 46b19072..c0680da8 100644 --- a/kerchunk/codecs.py +++ b/kerchunk/codecs.py @@ -134,7 +134,7 @@ def __init__(self, *, var: str, dtype: np.dtype) -> None: object.__setattr__(self, "dtype", dtype) @classmethod - def from_dict(cls, data: dict[str, JSON]) -> Self: + def from_dict(cls, data: dict[str, JSON]) -> "GRIBZarrCodec": _, configuration_parsed = parse_named_configuration( data, "bytes", require_configuration=True ) @@ -149,7 +149,7 @@ def to_dict(self) -> dict[str, JSON]: "name": "grib", "configuration": {"var": self.var, "dtype": self.dtype}, } - + async def _decode_single( self, chunk_bytes: Buffer, @@ -322,7 +322,8 @@ def encode(self, buf): class ZlibCodec(Codec): codec_id = "zlib" - def __init__(self): ... + def __init__(self): + ... 
def decode(self, data, out=None): if out: diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 376a8003..ca3e488d 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -201,7 +201,7 @@ def append( remote_protocol=remote_protocol, remote_options=remote_options, target_options=target_options, - asynchronous=True + asynchronous=True, ) ds = xr.open_dataset( fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False} @@ -267,7 +267,9 @@ def fss(self): self._paths = [] for of in fsspec.open_files(self.path, **self.target_options): self._paths.append(of.full_name) - fs = fsspec.core.url_to_fs(self.path[0], asynchronous=True, **self.target_options)[0] + fs = fsspec.core.url_to_fs( + self.path[0], asynchronous=True, **self.target_options + )[0] try: # JSON path fo_list = fs.cat(self.path) @@ -436,13 +438,13 @@ def store_coords(self): kw["fill_value"] = z[k].fill_value arr = group.create_array( name=k, - data=data, shape=data.shape, - exists_ok=True, + overwrite=True, compressor=compressor, dtype=self.coo_dtypes.get(k, data.dtype), **kw, ) + arr[:] = data if k in z: # copy attributes if values came from an original variable arr.attrs.update(z[k].attrs) @@ -505,7 +507,9 @@ def second_pass(self): if f"{v}/.zgroup" in fns: # recurse into groups - copy meta, add to dirs to process and don't look # for references in this dir - metadata = asyncio.run(self._read_meta_files(m, [f"{v}/.zgroup", f"{v}/.zattrs"])) + metadata = asyncio.run( + self._read_meta_files(m, [f"{v}/.zgroup", f"{v}/.zattrs"]) + ) self.out.update(metadata) dirs.extend([f for f in fns if not f.startswith(f"{v}/.z")]) continue @@ -517,8 +521,10 @@ def second_pass(self): self.out[k] = fs.references[k] continue logger.debug("Second pass: %s, %s", i, v) - - zarray = asyncio.run(self._read_meta_files(m, [f"{v}/.zarray"]))[f"{v}/.zarray"] + + zarray = asyncio.run(self._read_meta_files(m, [f"{v}/.zarray"]))[ + f"{v}/.zarray" + ] zarray = ujson.loads(zarray) if v not in chunk_sizes: chunk_sizes[v] = zarray["chunks"] diff --git a/kerchunk/fits.py b/kerchunk/fits.py index 70f48d8a..7afadd6d 100644 --- a/kerchunk/fits.py +++ b/kerchunk/fits.py @@ -8,7 +8,7 @@ from fsspec.implementations.reference import LazyReferenceMapper -from kerchunk.utils import class_factory, dict_to_store +from kerchunk.utils import class_factory, dict_to_store, translate_refs_serializable from kerchunk.codecs import AsciiTableCodec, VarArrCodec try: @@ -94,7 +94,7 @@ def process_file( hdu.header.__str__() # causes fixing of invalid cards attrs = dict(hdu.header) - kwargs = {} + kwargs = {"compressor": None} if hdu.is_image: # for images/cubes (i.e., ndarrays with simple type) nax = hdu.header["NAXIS"] @@ -164,8 +164,8 @@ def process_file( # one chunk for whole thing. 
# TODO: we could sub-chunk on biggest dimension name = hdu.name or str(ext) - arr = g.empty( - name=name, dtype=dtype, shape=shape, chunks=shape, zarr_format=2, **kwargs + arr = g.create_array( + name=name, dtype=dtype, shape=shape, chunks=shape, **kwargs ) arr.attrs.update( { @@ -191,6 +191,7 @@ def process_file( ) if isinstance(out, LazyReferenceMapper): out.flush() + out = translate_refs_serializable(out) return out diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 56ae958a..e0d58951 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -10,7 +10,12 @@ import numcodecs from .codecs import FillStringsCodec -from .utils import _encode_for_JSON, encode_fill_value, dict_to_store, translate_refs_serializable +from .utils import ( + _encode_for_JSON, + encode_fill_value, + dict_to_store, + translate_refs_serializable, +) try: import h5py @@ -32,6 +37,7 @@ "_nc3_strict", "_NCProperties", } +fsspec.utils.setup_logging(lggr) class SingleHdf5ToZarr: @@ -173,6 +179,7 @@ def _transfer_attrs( An equivalent Zarr group or array to the HDF5 group or dataset with attributes. """ + upd = {} for n, v in h5obj.attrs.items(): if n in _HIDDEN_ATTRS: continue @@ -196,11 +203,19 @@ def _transfer_attrs( if v == "DIMENSION_SCALE": continue try: - zobj.attrs[n] = v + if isinstance(v, (str, int, float)): + upd[n] = v + elif isinstance(v, (tuple, set, list)) and all( + isinstance(_, (str, int, float)) for _ in v + ): + upd[n] = list(v) + else: + upd[n] = str(v) except TypeError: lggr.debug( f"TypeError transferring attr, skipping:\n {n}@{h5obj.name} = {v} ({type(v)})" ) + zobj.attrs.update(upd) def _decode_filters(self, h5obj: Union[h5py.Dataset, h5py.Group]): if h5obj.scaleoffset: @@ -272,7 +287,7 @@ def _translator( ): """Produce Zarr metadata for all groups and datasets in the HDF5 file.""" try: # method must not raise exception - kwargs = {} + kwargs = {"compressor": None} if isinstance(h5obj, (h5py.SoftLink, h5py.HardLink)): h5obj = self._h5f[name] @@ -289,9 +304,9 @@ def _translator( if h5obj.id.get_create_plist().get_layout() == h5py.h5d.COMPACT: # Only do if h5obj.nbytes < self.inline?? kwargs["data"] = h5obj[:] - filters = [] + kwargs["filters"] = [] else: - filters = self._decode_filters(h5obj) + kwargs["filters"] = self._decode_filters(h5obj) dt = None # Get storage info of this HDF5 dataset... 
cinfo = self._storage_info(h5obj) @@ -325,11 +340,11 @@ def _translator( for v in val ] kwargs["data"] = out - kwargs["compressor"] = numcodecs.JSON() + kwargs["filters"] = [numcodecs.JSON()] fill = None elif self.vlen == "null": dt = "O" - kwargs["compressor"] = FillStringsCodec(dtype="S16") + kwargs["filters"] = [FillStringsCodec(dtype="S16")] fill = " " elif self.vlen == "leave": dt = "S16" @@ -344,9 +359,9 @@ def _translator( index.decode(): label.decode() for index, label in zip(indexes, labels) } - kwargs["compressor"] = FillStringsCodec( - dtype="S16", id_map=mapping - ) + kwargs["filters"] = [ + FillStringsCodec(dtype="S16", id_map=mapping) + ] fill = " " else: raise NotImplementedError @@ -384,9 +399,9 @@ def _translator( ) } ) - kwargs["compressor"] = FillStringsCodec( - dtype=str(dt), id_map=mapping - ) + kwargs["filters"] = [ + FillStringsCodec(dtype=str(dt), id_map=mapping) + ] dt = [ ( v, @@ -410,7 +425,7 @@ def _translator( ) for v in h5obj.dtype.names ] - kwargs["compressor"] = FillStringsCodec(dtype=str(dt)) + kwargs["filters"] = [FillStringsCodec(dtype=str(dt))] dt = [ ( v, @@ -451,7 +466,7 @@ def _translator( ) dt = "O" kwargs["data"] = data2 - kwargs["compressor"] = numcodecs.JSON() + kwargs["filters"] = [numcodecs.JSON()] fill = None else: raise NotImplementedError @@ -460,20 +475,18 @@ def _translator( return if h5obj.attrs.get("_FillValue") is not None: fill = h5obj.attrs.get("_FillValue") - fill = encode_fill_value( - fill, dt or h5obj.dtype - ) + fill = encode_fill_value(fill, dt or h5obj.dtype) adims = self._get_array_dims(h5obj) - # Create a Zarr array equivalent to this HDF5 dataset.. - za = self._zroot.require_array( + # Create a Zarr array equivalent to this HDF5 dataset. + data = kwargs.pop("data", None) + za = self._zroot.create_array( name=h5obj.name, shape=h5obj.shape, dtype=dt or h5obj.dtype, - chunks=h5obj.chunks or False, + chunks=h5obj.chunks or h5obj.shape, fill_value=fill, - filters=filters, attributes={ "_ARRAY_DIMENSIONS": adims, }, @@ -483,9 +496,14 @@ def _translator( self._transfer_attrs(h5obj, za) lggr.debug(f"_ARRAY_DIMENSIONS = {adims}") - - if "data" in kwargs: - return # embedded bytes, no chunks to copy + if data is not None: + try: + za[:] = data + except (ValueError, TypeError): + self.store_dict[f"{za.path}/0"] = kwargs["filters"][0].encode( + data + ) + return # Store chunk location metadata... 
if cinfo: @@ -493,7 +511,11 @@ def _translator( if h5obj.fletcher32: logging.info("Discarding fletcher32 checksum") v["size"] -= 4 - key = str.removeprefix(h5obj.name, "/") + "/" + ".".join(map(str, k)) + key = ( + str.removeprefix(h5obj.name, "/") + + "/" + + ".".join(map(str, k)) + ) if ( self.inline @@ -681,4 +703,3 @@ def _is_netcdf_variable(dataset: h5py.Dataset): def has_visititems_links(): return hasattr(h5py.Group, "visititems_links") - diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py index 457aafbb..d5356876 100644 --- a/kerchunk/netCDF3.py +++ b/kerchunk/netCDF3.py @@ -6,7 +6,13 @@ from fsspec.implementations.reference import LazyReferenceMapper import fsspec -from kerchunk.utils import _encode_for_JSON, dict_to_store, inline_array, translate_refs_serializable +import kerchunk.utils +from kerchunk.utils import ( + _encode_for_JSON, + dict_to_store, + inline_array, + translate_refs_serializable, +) try: from scipy.io._netcdf import ZERO, NC_VARIABLE, netcdf_file, netcdf_variable @@ -255,7 +261,7 @@ def translate(self): fill_value=fill, chunks=(1,) + dtype.shape, compressor=None, - exists_ok=True, + overwrite=True, ) arr.attrs.update( { @@ -288,13 +294,13 @@ def translate(self): if k != "filename" # special "attribute" } ) + out = kerchunk.utils.translate_refs_serializable(out) if self.threshold: out = inline_array( out, self.threshold, remote_options=dict(remote_options=self.storage_options), ) - if isinstance(out, LazyReferenceMapper): out.flush() return out diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index 0cfb9505..054291a4 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -4,7 +4,7 @@ import dask.array as da import pytest import xarray as xr -import zarr +import zarr.storage import kerchunk.combine from kerchunk.zarr import single_zarr @@ -132,20 +132,23 @@ xr.Dataset({"data": data}).to_zarr("memory://quad_2chunk2.zarr") # simple time arrays - xarray can't make these! 
-m = fs.get_mapper("time1.zarr")
-z = zarr.open(m, mode="w", zarr_format=2)
+z = zarr.open("memory://time1.zarr", mode="w", zarr_format=2)
 time1_array = np.array([1], dtype="M8[s]")
-ar = z.create_array("time", data=time1_array, shape=time1_array.shape)
+ar = z.create_array("time", shape=time1_array.shape, dtype=time1_array.dtype)
+ar[:] = time1_array
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]})
-ar = z.create_array("data", data=arr, shape=arr.shape)
+ar = z.create_array("data", dtype=arr.dtype, shape=arr.shape)
+ar[:] = arr
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]})
 
-m = fs.get_mapper("time2.zarr")
-z = zarr.open(m, mode="w", zarr_format=2)
+
+z = zarr.open("memory://time2.zarr", mode="w", zarr_format=2)
 time2_array = np.array([2], dtype="M8[s]")
-ar = z.create_array("time", data=time2_array, shape=time2_array.shape)
+ar = z.create_array("time", dtype=time2_array.dtype, shape=time2_array.shape)
+ar[:] = time2_array
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time"]})
-ar = z.create_array("data", data=arr, shape=arr.shape)
+ar = z.create_array("data", dtype=arr.dtype, shape=arr.shape)
+ar[:] = arr
 ar.attrs.update({"_ARRAY_DIMENSIONS": ["time", "x", "y"]})
 
 
@@ -228,8 +231,9 @@ def refs():
 def test_fixture(refs):
     # effectively checks that single_zarr works
     assert "single1" in refs
-    m = fsspec.get_mapper("reference://", fo=refs["single1"], remote_protocol="memory")
-    g = xr.open_dataset(m, engine="zarr", backend_kwargs={"consolidated": False})
+    fs = fsspec.filesystem("reference", fo=refs["single1"], remote_protocol="memory")
+    store = zarr.storage.FsspecStore(fs)
+    g = xr.open_dataset(store, engine="zarr", backend_kwargs={"consolidated": False})
     assert g.time.values.tolist() == [1]
     assert (g.data.values == arr).all()
     assert g.attrs["attr1"] == 5
diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py
index 68961394..c8d6c678 100644
--- a/kerchunk/tests/test_hdf.py
+++ b/kerchunk/tests/test_hdf.py
@@ -51,7 +51,7 @@ def test_single_direct_open():
         h5f=url, inline_threshold=300, storage_options=so
     ).translate()
 
-    store = refs_as_store(test_dict)
+    store = refs_as_store(test_dict, remote_options=dict(asynchronous=True, anon=True))
 
     ds_direct = xr.open_dataset(
         store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
@@ -61,7 +61,7 @@ def test_single_direct_open():
         h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
         test_dict = h5chunks.translate()
 
-    store = refs_as_store(test_dict)
+    store = refs_as_store(test_dict, remote_options=dict(asynchronous=True, anon=True))
 
     ds_from_file_opener = xr.open_dataset(
         store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
@@ -88,7 +88,7 @@ def test_multizarr(generate_mzz):
     mzz = generate_mzz
     test_dict = mzz.translate()
 
-    store = refs_as_store(test_dict)
+    store = refs_as_store(test_dict, remote_options=dict(asynchronous=True, anon=True))
     ds = xr.open_dataset(
         store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
     )
@@ -196,12 +196,12 @@ def test_times_str(times_data):
 def test_string_embed():
     fn = osp.join(here, "vlen.h5")
-    h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="embed")
+    h = kerchunk.hdf.SingleHdf5ToZarr(fn, fn, vlen_encode="embed", error="pdb")
     out = h.translate()
 
     localfs = AsyncFileSystemWrapper(fsspec.filesystem("file"))
     fs = refs_as_fs(out, fs=localfs)
-    #assert txt in fs.references["vlen_str/0"]
+    # assert txt in fs.references["vlen_str/0"]
     store = fs_as_store(fs)
     z = zarr.open(store, zarr_format=2)
     assert z["vlen_str"].dtype == "O"
@@ -227,7 
+227,7 @@ def test_string_leave(): f, fn, vlen_encode="leave", inline_threshold=0 ) out = h.translate() - + localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) store = refs_as_store(out, fs=localfs) z = zarr.open(store, zarr_format=2) @@ -328,7 +328,7 @@ def test_compress(): def test_embed(): fn = osp.join(here, "NEONDSTowerTemperatureData.hdf5") - h = kerchunk.hdf.SingleHdf5ToZarr(fn, vlen_encode="embed") + h = kerchunk.hdf.SingleHdf5ToZarr(fn, vlen_encode="embed", error="pdb") out = h.translate() localfs = AsyncFileSystemWrapper(fsspec.filesystem("file")) diff --git a/kerchunk/tests/test_tiff.py b/kerchunk/tests/test_tiff.py index b81e7bab..3e4ea1c7 100644 --- a/kerchunk/tests/test_tiff.py +++ b/kerchunk/tests/test_tiff.py @@ -36,8 +36,8 @@ def test_coord(): fn = files[0] out = kerchunk.tiff.tiff_to_zarr(fn) store = refs_as_store(out) - z = zarr.open(out, zarr_format=2) # highest res is the one xarray picks - out = kerchunk.tiff.generate_coords(z.attrs, z[0].shape) + z = zarr.open(store, zarr_format=2) # highest res is the one xarray picks + out = kerchunk.tiff.generate_coords(z.attrs, z["0"].shape) ds = xr.open_dataset(fn) assert (ds.x == out["x"]).all() diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index 5cbfb150..f6c7e5ef 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -81,13 +81,13 @@ def test_inline_array(): localfs = fsspec.filesystem("file") store = kerchunk.utils.refs_as_store(out2, fs=localfs) - g = zarr.open(store, mode='r', zarr_format=2) - assert g["data"][:].tolist() == [1, 2] # What is g.data??? + g = zarr.open(store, mode="r", zarr_format=2) + assert g["data"][:].tolist() == [1, 2] # What is g.data??? out3 = kerchunk.utils.inline_array(refs, threshold=1000) # inlines because of size store = kerchunk.utils.refs_as_store(out3, localfs) - g = zarr.open(store, mode='r', zarr_format=2) - assert g["data"][:].tolist() == [1, 2] # What is g.data??? + g = zarr.open(store, mode="r", zarr_format=2) + assert g["data"][:].tolist() == [1, 2] # What is g.data??? 
def test_json(): @@ -99,28 +99,30 @@ def test_json(): @pytest.mark.parametrize("chunks", [[10, 10], [5, 10]]) def test_subchunk_exact(m, chunks): - store = m.get_mapper("test.zarr") - g = zarr.open_group(store, mode="w", zarr_format=2) + g = zarr.open_group("memory://test.zarr", mode="w", zarr_format=2) data = np.arange(100).reshape(10, 10) - arr = g.create_dataset("data", data=data, chunks=chunks, compressor=None) + arr = g.create_array( + "data", dtype=data.dtype, shape=data.shape, chunks=chunks, compressor=None + ) + arr[:] = data ref = kerchunk.zarr.single_zarr("memory://test.zarr")["refs"] extra = [] if chunks[0] == 10 else ["data/1.0"] - assert list(ref) == [".zgroup", "data/.zarray", "data/0.0"] + extra + ref2 = list(_ for _ in ref if not _.endswith("zattrs")) # ignore empty attrs + assert ref2 == [".zgroup", "data/.zarray", "data/0.0"] + extra out = kerchunk.utils.subchunk(ref, "data", 5) nchunk = 10 // chunks[0] * 5 - assert list(out) == [".zgroup", "data/.zarray"] + [ - f"data/{_}.0" for _ in range(nchunk) - ] + out2 = list(_ for _ in out if not _.endswith("zattrs")) + assert out2 == [".zgroup", "data/.zarray"] + [f"data/{_}.0" for _ in range(nchunk)] store = kerchunk.utils.refs_as_store(out, remote_protocol="memory") - g2 = zarr.open_group(store, mode='r', zarr_format=2) + g2 = zarr.open_group(store, mode="r", zarr_format=2) # g2 = zarr.open_group( # "reference://", storage_options={"fo": out, "remote_protocol": "memory"}, zarr_format=2 # ) - assert (g2.data[:] == data).all() + assert (g2["data"][:] == data).all() @pytest.mark.parametrize("archive", ["zip", "tar"]) diff --git a/kerchunk/tests/test_zarr.py b/kerchunk/tests/test_zarr.py index 3c02fc69..27063541 100644 --- a/kerchunk/tests/test_zarr.py +++ b/kerchunk/tests/test_zarr.py @@ -46,7 +46,7 @@ def _zip(file): return filename fn = f"{tmpdir}/test.zarr" - ds.to_zarr(fn, mode="w") + ds.to_zarr(fn, mode="w", zarr_format=2) return _zip(fn) @@ -90,7 +90,7 @@ def test_zarr_combine(tmpdir, ds): def test_zarr_json_dump_succeeds(tmpdir, ds): fn1 = f"{tmpdir}/test1.zarr" - ds.to_zarr(fn1) + ds.to_zarr(fn1, zarr_format=2) one = kerchunk.zarr.ZarrToZarr( fn1, diff --git a/kerchunk/utils.py b/kerchunk/utils.py index b8a53e3c..9bdce3f8 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -8,12 +8,19 @@ import ujson -import fsspec +import fsspec.implementations.asyn_wrapper import numpy as np -import zarr +import zarr.storage -def refs_as_fs(refs, fs=None, remote_protocol=None, remote_options=None, **kwargs): +def refs_as_fs( + refs, + fs=None, + remote_protocol=None, + remote_options=None, + asynchronous=True, + **kwargs, +): """Convert a reference set to an fsspec filesystem""" fs = fsspec.filesystem( "reference", @@ -22,7 +29,7 @@ def refs_as_fs(refs, fs=None, remote_protocol=None, remote_options=None, **kwarg remote_protocol=remote_protocol, remote_options=remote_options, **kwargs, - asynchronous=True, + asynchronous=asynchronous, ) return fs @@ -31,11 +38,8 @@ def refs_as_store( refs, read_only=False, fs=None, remote_protocol=None, remote_options=None ): """Convert a reference set to a zarr store""" - if is_zarr3(): - if remote_options is None: - remote_options = {"asynchronous": True} - else: - remote_options["asynchronous"] = True + remote_options = remote_options or {} + remote_options["asynchronous"] = True fss = refs_as_fs( refs, @@ -65,26 +69,23 @@ def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, read_only=False): Parameters ---------- fs: fsspec.async.AsyncFileSystem - mode: str + read_only: bool Returns ------- 
zarr.storage.Store or zarr.storage.Mapper, fsspec.AbstractFileSystem """ - if is_zarr3(): - if not fs.async_impl: - try: - from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper + if not fs.async_impl: + try: + from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper - fs = AsyncFileSystemWrapper(fs) - except ImportError: - raise ImportError( - "Only fsspec>2024.10.0 supports the async filesystem wrapper required for working with reference filesystems. " - ) - fs.asynchronous = True - return zarr.storage.RemoteStore(fs, read_only=read_only) - else: - return fs.get_mapper() + fs = AsyncFileSystemWrapper(fs) + except ImportError: + raise ImportError( + "Only fsspec>2024.10.0 supports the async filesystem wrapper required for working with reference filesystems. " + ) + fs.asynchronous = True + return zarr.storage.FsspecStore(fs, read_only=read_only) def class_factory(func): @@ -259,14 +260,11 @@ def do_inline(store, threshold, remote_options=None, remote_protocol=None): The chunk may need encoding with base64 if not ascii, so actual length may be larger than threshold. """ - fs = fsspec.filesystem( - "reference", - fo=store, - remote_options=remote_options, - remote_protocol=remote_protocol, - ) fs = refs_as_fs( - store, remote_protocol=remote_protocol, remote_options=remote_options + store, + remote_protocol=remote_protocol, + remote_options=remote_options, + asynchronous=False, ) out = fs.references.copy() @@ -308,15 +306,15 @@ def _inline_array(group, threshold, names, prefix=""): cond2 = prefix1 in names if cond1 or cond2: original_attrs = dict(thing.attrs) - arr = group.create_dataset( + arr = group.create_array( name=name, dtype=thing.dtype, shape=thing.shape, - data=thing[:], chunks=thing.shape, fill_value=thing.fill_value, - exists_ok=True, + overwrite=True, ) + arr[:] = thing[:] arr.attrs.update(original_attrs) @@ -369,7 +367,7 @@ def subchunk(store, variable, factor): ------- modified store """ - fs = refs_as_fs(store) + fs = fsspec.filesystem("reference", fo=store) store = fs.references meta_file = f"{variable}/.zarray" meta = ujson.loads(fs.cat(meta_file)) @@ -419,7 +417,7 @@ def subchunk(store, variable, factor): else: (url,) = v offset = 0 - size = fs.size(k) + size = fs.info(k)["size"] for subpart in range(factor): new_index = ( chunk_index[:ind] diff --git a/kerchunk/xarray_backend.py b/kerchunk/xarray_backend.py index 0620614b..79976d57 100644 --- a/kerchunk/xarray_backend.py +++ b/kerchunk/xarray_backend.py @@ -43,7 +43,7 @@ def open_reference_dataset( if open_dataset_options is None: open_dataset_options = {} - store = refs_as_store(filename_or_obj, remote_options=storage_options) + store = refs_as_store(filename_or_obj, **storage_options) return xr.open_zarr( store, zarr_format=2, consolidated=False, **open_dataset_options diff --git a/kerchunk/zarr.py b/kerchunk/zarr.py index ea0612de..083e0f48 100644 --- a/kerchunk/zarr.py +++ b/kerchunk/zarr.py @@ -2,6 +2,7 @@ from fsspec.implementations.reference import LazyReferenceMapper import kerchunk.utils +import ujson def single_zarr( @@ -35,11 +36,20 @@ def single_zarr( """ if isinstance(uri_or_store, str): mapper = fsspec.get_mapper(uri_or_store, **(storage_options or {})) + protocol = mapper.fs.unstrip_protocol("").rstrip("://") else: mapper = uri_or_store if isinstance(mapper, fsspec.FSMap) and storage_options is None: storage_options = mapper.fs.storage_options + protocol = mapper.fs.unstrip_protocol("").rstrip("://") + else: + protocol = None + try: + check = ujson.loads(mapper[".zgroup"]) + 
assert check["zarr_format"] == 2 + except (KeyError, ValueError, TypeError) as e: + raise ValueError("Failed to load dataset as V2 zarr") from e refs = out or {} for k in mapper: if k.startswith("."): @@ -50,7 +60,12 @@ def single_zarr( inline_threshold = inline or inline_threshold if inline_threshold: - refs = do_inline(refs, inline_threshold, remote_options=storage_options) + refs = do_inline( + refs, + inline_threshold, + remote_options=storage_options, + remote_protocol=protocol, + ) if isinstance(refs, LazyReferenceMapper): refs.flush() refs = kerchunk.utils.consolidate(refs) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..e83bb177 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +asyncio_default_fixture_loop_scope=session From 95f340fa874c0c3ed6ccf8f9f98bc2f7a692c1cc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 16 Jan 2025 10:24:28 -0500 Subject: [PATCH 40/40] min python --- .github/workflows/tests.yml | 2 +- ci/environment-py310.yml | 36 ----------------- pyproject.toml | 2 +- tests/test_grib.py | 80 ++++++++++++++++++------------------- 4 files changed, 40 insertions(+), 80 deletions(-) delete mode 100644 ci/environment-py310.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90d8bb9d..0a31f183 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [310, 311, 312] + python-version: [311, 312] steps: - uses: actions/checkout@v4 diff --git a/ci/environment-py310.yml b/ci/environment-py310.yml deleted file mode 100644 index 970acd42..00000000 --- a/ci/environment-py310.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: test_env -channels: - - conda-forge - - nodefaults -dependencies: - - python=3.10 - - dask - - zarr - - xarray>=2024.10.0 - - h5netcdf - - h5py - - pandas - - cfgrib - - cftime - # Temporary workaround for #508 - - eccodes <2.38 - - - astropy - - requests - - aiohttp - - pytest-cov - - fsspec - - dask - - scipy - - s3fs - - python-blosc - - flake8 - - black - - fastparquet - - pip - - pyopenssl - - tifffile - - netCDF4 - - pip: - - git+https://github.com/fsspec/filesystem_spec - - ipfsspec diff --git a/pyproject.toml b/pyproject.toml index 4d3f9832..ca28f8ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "kerchunk" description = "Functions to make reference descriptions for ReferenceFileSystem" readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.11" dynamic = ["version"] license = {text = "MIT"} authors = [ diff --git a/tests/test_grib.py b/tests/test_grib.py index 5edb42d9..2c5387fd 100644 --- a/tests/test_grib.py +++ b/tests/test_grib.py @@ -6,7 +6,6 @@ import pandas as pd import pytest import xarray as xr -#import datatree import zarr import ujson from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper @@ -36,11 +35,7 @@ def test_one(): fs = AsyncFileSystemWrapper(fsspec.filesystem("file")) store = refs_as_store(out[0], fs=fs) - ds = xr.open_zarr( - store, - zarr_format=2, - consolidated=False - ) + ds = xr.open_zarr(store, zarr_format=2, consolidated=False) assert ds.attrs["GRIB_centre"] == "cwao" ds2 = xr.open_dataset(fn, engine="cfgrib", backend_kwargs={"indexpath": ""}) @@ -76,11 +71,7 @@ def test_archives(tmpdir, url): store = refs_as_store(out, remote_options={"anon": True}) - ours = xr.open_zarr( - store, - zarr_format=2, - consolidated=False - ) + ours = xr.open_zarr(store, zarr_format=2, consolidated=False) data = _fetch_first(url) 
fn = os.path.join(tmpdir, "grib.grib2") @@ -131,7 +122,7 @@ def test_grib_tree(): "atmosphere latitude longitude step time valid_time".split() ) # Assert that the fill value is set correctly - assert np.isnan(zg['refc/instant/atmosphere/step'].fill_value) + assert np.isnan(zg["refc/instant/atmosphere/step"].fill_value) # The following two tests use json fixture data generated from calling scan grib @@ -149,14 +140,18 @@ def test_correct_hrrr_subhf_group_step(): scanned_msgs = ujson.load(fobj) original_zg = [ - zarr.open_group(fs_as_store(fsspec.filesystem("reference", fo=val)), mode="r", zarr_format=2) + zarr.open_group( + fs_as_store(fsspec.filesystem("reference", fo=val)), mode="r", zarr_format=2 + ) for val in scanned_msgs ] corrected_msgs = [correct_hrrr_subhf_step(msg) for msg in scanned_msgs] corrected_zg = [ - zarr.open_group(fs_as_store(fsspec.filesystem("reference", fo=val)), mode="r", zarr_format=2) + zarr.open_group( + fs_as_store(fsspec.filesystem("reference", fo=val)), mode="r", zarr_format=2 + ) for val in corrected_msgs ] @@ -183,28 +178,29 @@ def test_hrrr_subhf_corrected_grib_tree(): zstore = fs_as_store(z_fs) zg = zarr.open_group(zstore, mode="r", zarr_format=2) # Check the values and shape of the time coordinates - assert zg['u/instant/heightAboveGround/step'][:].tolist() == [ + assert zg["u/instant/heightAboveGround/step"][:].tolist() == [ 0.0, 0.25, 0.5, 0.75, 1.0, ] - assert zg['u/instant/heightAboveGround/step'].shape == (5,) - assert zg['u/instant/heightAboveGround/valid_time'][:].tolist() == [ + assert zg["u/instant/heightAboveGround/step"].shape == (5,) + assert zg["u/instant/heightAboveGround/valid_time"][:].tolist() == [ [1695862800, 1695863700, 1695864600, 1695865500, 1695866400] ] - assert zg['u/instant/heightAboveGround/valid_time'].shape == (1, 5) - assert zg['u/instant/heightAboveGround/time'][:].tolist() == [1695862800] - assert zg['u/instant/heightAboveGround/time'].shape == (1,) - assert zg['dswrf/avg/surface/step'][:].tolist() == [0.0, 0.25, 0.5, 0.75, 1.0] - assert zg['dswrf/avg/surface/step'].shape == (5,) - assert zg['dswrf/avg/surface/valid_time'][:].tolist() == [ + assert zg["u/instant/heightAboveGround/valid_time"].shape == (1, 5) + assert zg["u/instant/heightAboveGround/time"][:].tolist() == [1695862800] + assert zg["u/instant/heightAboveGround/time"].shape == (1,) + assert zg["dswrf/avg/surface/step"][:].tolist() == [0.0, 0.25, 0.5, 0.75, 1.0] + assert zg["dswrf/avg/surface/step"].shape == (5,) + assert zg["dswrf/avg/surface/valid_time"][:].tolist() == [ [1695862800, 1695863700, 1695864600, 1695865500, 1695866400] ] - assert zg['dswrf/avg/surface/valid_time'].shape == (1, 5) - assert zg['dswrf/avg/surface/time'][:].tolist() == [1695862800] - assert zg['dswrf/avg/surface/time'].shape == (1,) + assert zg["dswrf/avg/surface/valid_time"].shape == (1, 5) + assert zg["dswrf/avg/surface/time"][:].tolist() == [1695862800] + assert zg["dswrf/avg/surface/time"].shape == (1,) + # The following two test use json fixture data generated from calling scan grib # scan_grib("testdata/hrrr.t01z.wrfsfcf00.grib2") @@ -221,19 +217,19 @@ def test_hrrr_sfcf_grib_tree(): store = fs_as_store(fsspec.filesystem("reference", fo=merged)) zg = zarr.open_group(store, mode="r", zarr_format=2) # Check the heightAboveGround level shape of the time coordinates - assert zg['u/instant/heightAboveGround/heightAboveGround'][()] == 80.0 - assert zg['u/instant/heightAboveGround/heightAboveGround'].shape == () - assert zg['u/instant/heightAboveGround/step'][:].tolist() == [0.0, 1.0] - 
assert zg['u/instant/heightAboveGround/step'].shape == (2,) - assert zg['u/instant/heightAboveGround/valid_time'][:].tolist() == [ + assert zg["u/instant/heightAboveGround/heightAboveGround"][()] == 80.0 + assert zg["u/instant/heightAboveGround/heightAboveGround"].shape == () + assert zg["u/instant/heightAboveGround/step"][:].tolist() == [0.0, 1.0] + assert zg["u/instant/heightAboveGround/step"].shape == (2,) + assert zg["u/instant/heightAboveGround/valid_time"][:].tolist() == [ [1695862800, 1695866400] ] - assert zg['u/instant/heightAboveGround/valid_time'].shape == (1, 2) - assert zg['u/instant/heightAboveGround/time'][:].tolist() == [1695862800] - assert zg['u/instant/heightAboveGround/time'].shape == (1,) + assert zg["u/instant/heightAboveGround/valid_time"].shape == (1, 2) + assert zg["u/instant/heightAboveGround/time"][:].tolist() == [1695862800] + assert zg["u/instant/heightAboveGround/time"].shape == (1,) # Check the isobaricInhPa level shape and time coordinates - assert zg['u/instant/isobaricInhPa/isobaricInhPa'][:].tolist() == [ + assert zg["u/instant/isobaricInhPa/isobaricInhPa"][:].tolist() == [ 250.0, 300.0, 500.0, @@ -242,9 +238,9 @@ def test_hrrr_sfcf_grib_tree(): 925.0, 1000.0, ] - assert zg['u/instant/isobaricInhPa/isobaricInhPa'].shape == (7,) - assert zg['u/instant/isobaricInhPa/step'][:].tolist() == [0.0, 1.0] - assert zg['u/instant/isobaricInhPa/step'].shape == (2,) + assert zg["u/instant/isobaricInhPa/isobaricInhPa"].shape == (7,) + assert zg["u/instant/isobaricInhPa/step"][:].tolist() == [0.0, 1.0] + assert zg["u/instant/isobaricInhPa/step"].shape == (2,) # Valid time values get exploded by isobaricInhPa aggregation # Is this a feature or a bug? @@ -254,11 +250,11 @@ def test_hrrr_sfcf_grib_tree(): [1695866400 for _ in range(7)], ] ] - assert zg['u/instant/isobaricInhPa/valid_time'][:].tolist() == expected_valid_times - assert zg['u/instant/isobaricInhPa/valid_time'].shape == (1, 2, 7) + assert zg["u/instant/isobaricInhPa/valid_time"][:].tolist() == expected_valid_times + assert zg["u/instant/isobaricInhPa/valid_time"].shape == (1, 2, 7) - assert zg['u/instant/isobaricInhPa/time'][:].tolist() == [1695862800] - assert zg['u/instant/isobaricInhPa/time'].shape == (1,) + assert zg["u/instant/isobaricInhPa/time"][:].tolist() == [1695862800] + assert zg["u/instant/isobaricInhPa/time"].shape == (1,) # def test_hrrr_sfcf_grib_datatree():
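
Editor's note: the recurring flow this series converges on is scan -> reference
set -> zarr store -> xarray. A minimal end-to-end sketch of that flow, assuming
this branch of kerchunk together with zarr-python 3 and fsspec>2024.10.0 (which
provides AsyncFileSystemWrapper); the file name "example.h5" is hypothetical:

    import fsspec
    import xarray as xr
    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper

    import kerchunk.hdf
    from kerchunk.utils import refs_as_store

    # Scan a local HDF5 file into a dict of zarr v2 metadata plus byte ranges.
    refs = kerchunk.hdf.SingleHdf5ToZarr("example.h5", inline_threshold=300).translate()

    # Local filesystems are synchronous, so wrap one before building the store,
    # as the tests in this series do.
    localfs = AsyncFileSystemWrapper(fsspec.filesystem("file"))
    store = refs_as_store(refs, fs=localfs)

    # The references describe zarr v2 data, so the reader must be told so.
    ds = xr.open_zarr(store, zarr_format=2, consolidated=False)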
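
Editor's note: the other mechanical change patch 39 applies throughout is the
zarr-python 3 array-creation migration: create_dataset(data=...) becomes
create_array(...) followed by an explicit write, and exists_ok=True becomes
overwrite=True. A standalone sketch of the pattern under the same assumptions
(the store URL "memory://example.zarr" is arbitrary, and compressor=None simply
mirrors the calls in the diffs above):

    import numpy as np
    import zarr

    z = zarr.open_group("memory://example.zarr", mode="w", zarr_format=2)
    data = np.arange(100).reshape(10, 10)

    # Create the array first (zarr 3 accepts no data= keyword) ...
    arr = z.create_array(
        "data", shape=data.shape, dtype=data.dtype, chunks=(5, 10), compressor=None
    )
    # ... then write values and attributes in separate steps.
    arr[:] = data
    arr.attrs.update({"_ARRAY_DIMENSIONS": ["x", "y"]})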