Skip to content

Commit

Permalink
Merge pull request #310 from davidhassell/hdf5-chunks
Browse files Browse the repository at this point in the history
Extension to the HDF5 chunks API
  • Loading branch information
davidhassell authored Aug 31, 2024
2 parents 9ceb39e + beb4e0b commit 8d3b7d2
Show file tree
Hide file tree
Showing 41 changed files with 1,563 additions and 133 deletions.
2 changes: 2 additions & 0 deletions Changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Version NEXTVERSION

* Upgrades to allow cfdm to work with Python 3.12
(https://github.com/NCAS-CMS/cfdm/issues/302)
* Extension to the HDF5 chunks API
(https://github.com/NCAS-CMS/cfdm/issues/309)
* New function `cfdm.netcdf_flattener` that replaces the import of
`netcdf_flattener` (https://github.com/NCAS-CMS/cfdm/issues/286)
* New function `cfdm.netcdf_indexer` that applies netCDF masking and
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ The ``cfdm`` package can:

* read field and domain constructs from netCDF and CDL datasets with a
choice of netCDF backends,
* be fully flexible with respect to HDF5 chunking,
* create new field and domain constructs in memory,
* write and append field and domain constructs to netCDF datasets on disk,
* read, write, and manipulate UGRID mesh topologies,
Expand Down
2 changes: 2 additions & 0 deletions cfdm/auxiliarycoordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class AuxiliaryCoordinate(
{{netCDF UGRID node coordinate}}
{{netCDF HDF5 chunks}}
.. versionadded:: (cfdm) 1.7.0
"""
Expand Down
2 changes: 2 additions & 0 deletions cfdm/bounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class Bounds(
{{netCDF variable group}}
{{netCDF HDF5 chunks}}
.. versionadded:: (cfdm) 1.7.0
"""
Expand Down
2 changes: 2 additions & 0 deletions cfdm/cellmeasure.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class CellMeasure(
{{netCDF global attributes}}
{{netCDF HDF5 chunks}}
.. versionadded:: (cfdm) 1.7.0
"""
Expand Down
34 changes: 25 additions & 9 deletions cfdm/cfdmimplementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,25 +1045,21 @@ def nc_get_variable_groups(self, field):
return field.nc_variable_groups()

def nc_get_hdf5_chunksizes(self, data):
"""Return the HDF5 chunksizes for the data.
"""Get the HDF5 chunking strategy for the data.
.. versionadded:: (cfdm) 1.7.2
:Parameters:
data: Data instance
data: `Data`
:Returns:
`tuple` or `None`
The HDF5 chunksizes, or `None` if they haven't been set.
`tuple` or `int` or `str` or `None`
The HDF5 chunking strategy.
"""
out = data.nc_hdf5_chunksizes()
if not out:
out = None

return out
return data.nc_hdf5_chunksizes()

def nc_get_sample_dimension(self, count, default=None):
"""Return the name of the netCDF sample dimension.
Expand Down Expand Up @@ -1173,6 +1169,26 @@ def nc_set_group_attributes(self, field, attributes):
for attr, value in attributes.items():
field.nc_set_group_attribute(attr, value)

def nc_set_hdf5_chunksizes(self, data, chunksizes):
"""Set the HDF5 chunking strategy for the data.
.. versionadded:: (cfdm) NEXTVERSION
:Parameters:
data: `Data`
chunksizes: `int` or `str` or `None` or `dict` or a sequence
Set the chunking strategy when writing to a netCDF4
file.
:Returns:
`None`
"""
return data.nc_set_hdf5_chunksizes(chunksizes)

def equal_components(self, construct0, construct1, ignore_type=False):
"""Whether or not two field construct components are equal.
Expand Down
2 changes: 2 additions & 0 deletions cfdm/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class Count(
`nc_get_sample_dimension`, `nc_del_sample_dimension` and
`nc_has_sample_dimension` methods.
{{netCDF HDF5 chunks}}
.. versionadded:: (cfdm) 1.7.0
"""
Expand Down
106 changes: 80 additions & 26 deletions cfdm/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,10 @@ def __getitem__(self, indices):
out = self.copy(array=False)
out._set_Array(array, copy=False)

if out.shape != self.shape:
# Delete hdf5 chunksizes
out.nc_clear_hdf5_chunksizes()
# Update the HDF5 chunking strategy
chunksizes = out.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple) and out.shape != self.shape:
out.nc_set_hdf5_chunksizes(chunksizes)

return out

Expand Down Expand Up @@ -652,6 +653,7 @@ def _binary_operation(self, other, method):
[0 2 4 6]
"""
original_shape = self.shape
inplace = method[2] == "i"
if inplace:
d = self
Expand All @@ -662,6 +664,13 @@ def _binary_operation(self, other, method):

d._set_Array(array, copy=False)

# Update the HDF5 chunking strategy
if (
isinstance(self.nc_hdf5_chunksizes(), tuple)
and d.shape != original_shape
):
d.nc_clear_hdf5_chunksizes()

return d

def _item(self, index):
Expand Down Expand Up @@ -1791,11 +1800,11 @@ def insert_dimension(self, position=0, inplace=False):
>>> d.shape
(19, 73, 96)
>>> d.insert_dimension('domainaxis3').shape
>>> d.insert_dimension(0).shape
(1, 19, 73, 96)
>>> d.insert_dimension('domainaxis3', position=3).shape
>>> d.insert_dimension(3).shape
(19, 73, 96, 1)
>>> d.insert_dimension('domainaxis3', position=-1, inplace=True)
>>> d.insert_dimension(-1, inplace=True)
>>> d.shape
(19, 73, 1, 96)
Expand All @@ -1815,8 +1824,12 @@ def insert_dimension(self, position=0, inplace=False):

d._set_Array(array, copy=False)

# Delete hdf5 chunksizes
d.nc_clear_hdf5_chunksizes()
# Update the HDF5 chunking strategy
chunksizes = d.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple):
chunksizes = list(chunksizes)
chunksizes.insert(position, 1)
d.nc_set_hdf5_chunksizes(chunksizes)

return d

Expand Down Expand Up @@ -2262,15 +2275,25 @@ def maximum(self, axes=None, squeeze=False):
except ValueError as error:
raise ValueError(f"Can't find maximum of data: {error}")

if axes is None:
axes = tuple(range(self.ndim))

keepdims = not squeeze
array = self.array
array = np.amax(array, axis=axes, keepdims=not squeeze)
array = np.amax(array, axis=axes, keepdims=keepdims)

out = self.copy(array=False)
out._set_Array(array, copy=False)

if out.shape != self.shape:
# Delete hdf5 chunksizes
out.nc_clear_hdf5_chunksizes()
# Update the HDF5 chunking strategy
chunksizes = out.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple) and out.shape != self.shape:
if not keepdims:
chunksizes = [
size for i, size in enumerate(chunksizes) if i not in axes
]

out.nc_set_hdf5_chunksizes(chunksizes)

return out

Expand Down Expand Up @@ -2334,15 +2357,25 @@ def minimum(self, axes=None):
except ValueError as error:
raise ValueError(f"Can't find minimum of data: {error}")

if axes is None:
axes = tuple(range(self.ndim))

keepdims = True
array = self.array
array = np.amin(array, axis=axes, keepdims=True)
array = np.amin(array, axis=axes, keepdims=keepdims)

out = self.copy(array=False)
out._set_Array(array, copy=False)

if out.shape != self.shape:
# Delete hdf5 chunksizes
out.nc_clear_hdf5_chunksizes()
# Update the HDF5 chunking strategy
chunksizes = out.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple) and out.shape != self.shape:
if not keepdims:
chunksizes = [
size for i, size in enumerate(chunksizes) if i not in axes
]

out.nc_set_hdf5_chunksizes(chunksizes)

return out

Expand Down Expand Up @@ -2417,8 +2450,13 @@ def squeeze(self, axes=None, inplace=False):

d._set_Array(array, copy=False)

# Delete hdf5 chunksizes
d.nc_clear_hdf5_chunksizes()
# Update the HDF5 chunking strategy
chunksizes = d.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple):
chunksizes = [
size for i, size in enumerate(chunksizes) if i not in axes
]
d.nc_set_hdf5_chunksizes(chunksizes)

return d

Expand Down Expand Up @@ -2488,15 +2526,25 @@ def sum(self, axes=None, squeeze=False):
except ValueError as error:
raise ValueError(f"Can't sum data: {error}")

if axes is None:
axes = tuple(range(self.ndim))

keepdims = not squeeze
array = self.array
array = np.sum(array, axis=axes, keepdims=not squeeze)
array = np.sum(array, axis=axes, keepdims=keepdims)

d = self.copy(array=False)
d._set_Array(array, copy=False)

if d.shape != self.shape:
# Delete hdf5 chunksizes
d.nc_clear_hdf5_chunksizes()
# Update the HDF5 chunking strategy
chunksizes = d.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple) and d.shape != self.shape:
if not keepdims:
chunksizes = [
size for i, size in enumerate(chunksizes) if i not in axes
]

d.nc_set_hdf5_chunksizes(chunksizes)

return d

Expand Down Expand Up @@ -2566,6 +2614,12 @@ def transpose(self, axes=None, inplace=False):

d._set_Array(array, copy=False)

# Update the HDF5 chunking strategy
chunksizes = d.nc_hdf5_chunksizes()
if isinstance(chunksizes, tuple):
chunksizes = [chunksizes[i] for i in axes]
d.nc_set_hdf5_chunksizes(chunksizes)

return d

def get_compressed_axes(self):
Expand Down Expand Up @@ -3279,18 +3333,18 @@ def unique(self):
d = self.copy(array=False)
d._set_Array(array, copy=False)

if d.shape != self.shape:
# Delete hdf5 chunksizes
# Update the HDF5 chunking strategy
if isinstance(d.nc_hdf5_chunksizes(), tuple) and d.shape != self.shape:
d.nc_clear_hdf5_chunksizes()

return d

# ----------------------------------------------------------------
# Aliases
# ----------------------------------------------------------------
def max(self, axes=None):
def max(self, axes=None, squeeze=False):
"""Alias for `maximum`."""
return self.maximum(axes=axes)
return self.maximum(axes=axes, squeeze=squeeze)

def min(self, axes=None):
"""Alias for `minimum`."""
Expand Down
2 changes: 2 additions & 0 deletions cfdm/dimensioncoordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ class DimensionCoordinate(
{{netCDF variable}}
{{netCDF HDF5 chunks}}
.. versionadded:: (cfdm) 1.7.0
"""
Expand Down
Loading

0 comments on commit 8d3b7d2

Please sign in to comment.