
Newbettersaving #150

Merged
merged 41 commits into from Aug 19, 2022
Commits
41 commits
167e3d2
First iteration of new savedataset
meggart May 2, 2022
86cbd22
New model to save cubes
meggart May 4, 2022
d9b2892
Add some saving examples
meggart May 5, 2022
e81ea39
More docs
meggart May 5, 2022
e73767a
Even more examples
meggart May 6, 2022
191165e
All tests pass
meggart May 6, 2022
8ff1027
Add save tests
meggart May 6, 2022
83a191d
Bug in test
meggart May 6, 2022
c1c0556
Fixing test
meggart May 6, 2022
8225d1f
Penalize buffers larger than the dimlength
felixcremer May 19, 2022
06c9c3e
Add docs for savecube function
felixcremer May 19, 2022
5f8be60
fix
meggart Jun 3, 2022
41010e6
fix bugs
meggart Jun 27, 2022
2d8bdef
remove loadorgenerate for now
meggart Jun 27, 2022
eb9da50
Never allow missing when writing data
meggart Jul 20, 2022
362335c
Bunp YAXArrayBase version
meggart Jul 20, 2022
6f2965c
fix missings
meggart Jul 22, 2022
d09772f
Fix tests
meggart Jul 22, 2022
0b512b6
Remove unnecessary show statements
meggart Aug 16, 2022
e79bc8e
Require YAXArrayBase 0.6
meggart Aug 17, 2022
b017b33
Update docs/src/examples/Saving and rechunking.md
felixcremer Aug 17, 2022
4e5012a
Update docs/src/examples/Saving and rechunking.md
meggart Aug 17, 2022
ff383f3
Update docs/src/examples/Saving and rechunking.md
meggart Aug 17, 2022
bdf7e39
Update docs/src/examples/Saving and rechunking.md
meggart Aug 17, 2022
4ef9baa
Update docs/src/examples/Saving and rechunking.md
meggart Aug 17, 2022
d153278
Update docs/src/examples/Saving and rechunking.md
meggart Aug 17, 2022
d816b19
Update docs/src/examples/Saving and rechunking.md
meggart Aug 17, 2022
e84162e
Update src/Cubes/Cubes.jl
meggart Aug 17, 2022
85b3ca8
Update src/Cubes/Cubes.jl
meggart Aug 17, 2022
bb8469c
Update src/Cubes/Rechunker.jl
meggart Aug 17, 2022
7b30831
Update src/Cubes/Rechunker.jl
meggart Aug 17, 2022
9ab251e
Update src/Cubes/Rechunker.jl
meggart Aug 17, 2022
54d604d
Rename the skeleton_only keyword to skeleton
felixcremer Aug 18, 2022
b598825
Put the skeleton keyword back into append_dataset
felixcremer Aug 18, 2022
7ef3bb8
Use eachchunk(c) instead of c.chunks
felixcremer Aug 18, 2022
b1aba71
Use the max_cache from YAXDefaults in the Rechunker scripts
felixcremer Aug 18, 2022
c0adda1
Switch to explicit using for ProgressMeter
felixcremer Aug 18, 2022
b317239
Add test for non-missing append_dataset
felixcremer Aug 18, 2022
24ef5ed
Fully remove lines related to @loadorgenerate
felixcremer Aug 18, 2022
a34b149
Update src/DatasetAPI/Datasets.jl
meggart Aug 19, 2022
0ee0d8e
Update src/DatasetAPI/Datasets.jl
meggart Aug 19, 2022
5 changes: 3 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "YAXArrays"
uuid = "c21b50f5-aa40-41ea-b809-c0f5e47bfa5c"
authors = ["Fabian Gans <[email protected]>"]
version = "0.3.0"
version = "0.4.0"

[deps]
CFTime = "179af706-886a-5703-950a-314cd64e0468"
@@ -21,6 +21,7 @@ IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
ParallelUtilities = "fad6cfc8-4f83-11e9-06cc-151124046ad0"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
@@ -55,5 +56,5 @@ Requires = "1"
StatsBase = "0.32, 0.33"
Tables = "0.2, 1.0"
WeightedOnlineStats = "0.3, 0.4, 0.5, 0.6"
YAXArrayBase = "0.4"
YAXArrayBase = "0.5"
julia = "1.6"
185 changes: 185 additions & 0 deletions docs/src/examples/Saving and rechunking.md
@@ -0,0 +1,185 @@
# Saving and Rechunking Datasets and YAXArrays

## Saving

### Saving a YAXArray to Zarr

One can save any `YAXArray` using the `savecube` function. Simply add a path as an argument and the cube will be saved.

````@jldoctest
julia> using YAXArrays, Zarr, NetCDF

julia> a = YAXArray(rand(10,20));

julia> f = tempname();

julia> savecube(a,f,driver=:zarr)
YAXArray with the following dimensions
Dim_1 Axis with 10 Elements from 1 to 10
Dim_2 Axis with 20 Elements from 1 to 20
Total size: 1.56 KB
````


In case the path name ends with ".zarr", the driver argument can be omitted.
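As a minimal sketch of relying on the extension (not a doctest; the printed output is omitted here):

````julia
using YAXArrays, Zarr

a = YAXArray(rand(10, 20))
# The ".zarr" suffix lets savecube choose the Zarr backend,
# equivalent to passing driver=:zarr explicitly.
savecube(a, tempname() * ".zarr")
````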

### Saving a YAXArray to NetCDF

Saving to NetCDF works exactly the same way. The `driver` argument can be omitted when the filename ends with ".nc"

````@jldoctest
julia> using YAXArrays, Zarr, NetCDF

julia> a = YAXArray(rand(10,20));

julia> f = tempname();

julia> savecube(a,f,driver=:netcdf)
YAXArray with the following dimensions
Dim_1 Axis with 10 Elements from 1 to 10
Dim_2 Axis with 20 Elements from 1 to 20
Total size: 1.56 KB
````

### Saving a Dataset

Saving Datasets can be done using the `savedataset` function.

````@jldoctest saveds
julia> using YAXArrays, Zarr

julia> ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)));

julia> f = tempname();

julia> savedataset(ds,path=f,driver=:zarr)
YAXArray Dataset
Dimensions:
Dim_2 Axis with 20 Elements from 1 to 20
Dim_1 Axis with 10 Elements from 1 to 10
Variables: x y
````

### Overwriting a Dataset

If a path already exists, an error will be thrown. Set `overwrite=true` to delete the existing dataset:

````@jldoctest saveds
julia> savedataset(ds,path=f,driver=:zarr, overwrite=true)
YAXArray Dataset
Dimensions:
Dim_2 Axis with 20 Elements from 1 to 20
Dim_1 Axis with 10 Elements from 1 to 10
Variables: x y
````

### Appending to a Dataset

New variables can be added to an existing dataset using the `append=true` keyword.

````@jldoctest
julia> ds2 = Dataset(z = YAXArray(rand(10,20,5)));

julia> savedataset(ds2,path=f,backend=:zarr,append=true);

julia> open_dataset(f, driver=:zarr)
YAXArray Dataset
Dimensions:
Dim_2 Axis with 20 Elements from 1 to 20
Dim_1 Axis with 10 Elements from 1 to 10
Dim_3 Axis with 5 Elements from 1 to 5
Variables: x z y
````

### Creating a Dataset without writing the actual data

Sometimes one merely wants to create a Dataset "skeleton" on disk and gradually fill it with data.
Here we create a Dataset and write only the axis data and array metadata, while no actual array data is
copied:

````@jldoctest
julia> using YAXArrays, Zarr

julia> a = YAXArray(zeros(Union{Missing, Int32},10,20))
YAXArray with the following dimensions
Dim_1 Axis with 10 Elements from 1 to 10
Dim_2 Axis with 20 Elements from 1 to 20
Total size: 800.0 bytes


julia> f = tempname();

julia> r = savecube(a,f,driver=:zarr,skeleton_only=true);

julia> all(ismissing,r[:,:])
true
````

The `skeleton_only` argument is also available for `savedataset`.
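A hedged sketch of the same pattern for a whole Dataset, assuming `savedataset` accepts the keyword under the same name (not a doctest):

````julia
using YAXArrays, Zarr

ds = Dataset(x = YAXArray(zeros(Union{Missing, Int32}, 10, 20)))
f = tempname()
# Writes only axes and metadata; the array chunks remain unwritten,
# so reading the variable back should yield all-missing values.
savedataset(ds, path=f, driver=:zarr, skeleton_only=true)
````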

## Rechunking

### Saving a YAXArray with user-defined chunks

To determine the chunk size of the array representation on disk, call the `setchunks` function prior to saving:

````@jldoctest chunks1
julia> using YAXArrays, Zarr, NetCDF

julia> a = YAXArray(rand(10,20));

julia> f = tempname();

julia> a_chunked = setchunks(a,(5,10));

julia> savecube(a_chunked,f,backend=:zarr);

julia> Cube(f).chunks
2×2 DiskArrays.GridChunks{2}:
(1:5, 1:10) (1:5, 11:20)
(6:10, 1:10) (6:10, 11:20)
````

Alternatively chunk sizes can be given by dimension name, so the following results in the same chunks:

````@jldoctest chunks1
a_chunked = setchunks(a,(Dim_2=10, Dim_1=5));
````
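To check that the named form produces the same chunk grid, one can save it and compare against the tuple-based result from above (a sketch reusing `a` and `f` from the previous snippets; not a doctest):

````julia
a_named = setchunks(a, (Dim_2=10, Dim_1=5))
f2 = tempname()
savecube(a_named, f2, backend=:zarr)
# Expected to match the 2×2 grid of (5,10)-sized chunks shown above:
Cube(f2).chunks == Cube(f).chunks
````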

## Rechunking Datasets

### Set Chunks by Axis

Set the chunk size for each axis occurring in a dataset. This will be applied to all variables in the dataset:

````@jldoctest
using YAXArrays, Zarr
ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)), z = YAXArray(rand(10,20,5)));
dschunked = setchunks(ds,Dict("Dim_1"=>5, "Dim_2"=>10, "Dim_3"=>2));
f = tempname();
savedataset(dschunked,path=f,driver=:zarr)
````

### Set chunking by Variable

The following will set the chunk size for each variable separately and results in exactly the same chunking as the example above:

````@jldoctest
using YAXArrays, Zarr
ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)), z = YAXArray(rand(10,20,5)));
dschunked = setchunks(ds,(x = (5,10), y = Dict("Dim_1"=>5), z = (Dim_1 = 5, Dim_2 = 10, Dim_3 = 2)));
f = tempname();
savedataset(dschunked,path=f,driver=:zarr)
````

### Set chunking for all variables

The following code snippet only works when all member variables of the dataset have the same shape and sets the output chunks for all arrays.

````@jldoctest
using YAXArrays, Zarr
ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10,20)), z = YAXArray(rand(10,20)));
dschunked = setchunks(ds,(5,10));
f = tempname();
savedataset(dschunked,path=f,driver=:zarr)
````
80 changes: 59 additions & 21 deletions src/Cubes/Cubes.jl
@@ -3,7 +3,7 @@ The functions provided by YAXArrays are supposed to work on different types of c
Data types that
"""
module Cubes
using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset
using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks
using Distributed: myid
using Dates: TimeType
using IntervalSets: Interval, (..)
@@ -14,7 +14,7 @@ import YAXArrayBase: getattributes, iscontdim, dimnames, dimvals, getdata
using DiskArrayTools: CFDiskArray
using DocStringExtensions

export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray
export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks

"""
This function calculates a subset of a cube's data
@@ -88,12 +88,13 @@ It can wrap normal arrays or, more typically DiskArrays.
* `axes` a `Vector{CubeAxis}` containing the Axes of the Cube
* `data` N-D array containing the data
"""
struct YAXArray{TypeOfData,NumberOfAxes,A<:AbstractArray{TypeOfData,NumberOfAxes},AxesTypes}
struct YAXArray{T,N,A<:AbstractArray{T,N},AxesTypes}
axes::AxesTypes
data::A
properties::Dict{String}
chunks::GridChunks{N}
cleaner::Vector{CleanMe}
function YAXArray(axes, data, properties, cleaner)
function YAXArray(axes, data, properties, chunks, cleaner)
if ndims(data) != length(axes) # case: mismatched Arguments
throw(
ArgumentError(
@@ -106,22 +107,28 @@ struct YAXArray{TypeOfData,NumberOfAxes,A<:AbstractArray{TypeOfData,NumberOfAxe
"Can not construct YAXArray, supplied data size is $(size(data)) while axis lenghts are $(ntuple(i->length(axes[i]),ndims(data)))",
),
)
else # case: create new YAXArray
elseif ndims(chunks) != ndims(data)
throw(ArgumentError("Can not construct YAXArray, supplied chunk dimension is $(ndims(chunks)) while the number of dims is $(length(axes))"))
else
return new{eltype(data),ndims(data),typeof(data),typeof(axes)}(
axes,
data,
properties,
chunks,
cleaner,
)
end
end
end
YAXArray(axes, data, properties=Dict{String,Any}(); cleaner=CleanMe[]) =
YAXArray(axes, data, properties, cleaner)

YAXArray(axes, data, properties = Dict{String,Any}(); cleaner = CleanMe[], chunks = eachchunk(data)) =
YAXArray(axes, data, properties, chunks, cleaner)
YAXArray(axes,data,properties,cleaner) = YAXArray(axes,data,properties,eachchunk(data),cleaner)
function YAXArray(x::AbstractArray)
ax = caxes(x)
props = getattributes(x)
return YAXArray(ax, x, props)
chunks = eachchunk(x)
YAXArray(ax, x, props,chunks=chunks)
end

# Base utility overloads
@@ -146,19 +153,15 @@ function Base.propertynames(a::YAXArray, private::Bool=false)
(axsym.(caxes(a))..., :axes, :data)
end
end
Base.ndims(a::YAXArray{<:Any,NumberOfAxes}) where {NumberOfAxes} = NumberOfAxes
Base.eltype(a::YAXArray{TypeOfData}) where {TypeOfData} = TypeOfData
# really needed? it sounds like bad performance to permute the raw data?
Base.permutedims(c::YAXArray, p) =
YAXArray(caxes(c)[collect(p)], permutedims(getdata(c), p), c.properties, c.cleaner)
Base.getindex(x::YAXArray, i...) = getdata(x)[i...]

"""
caxes(x)

returns the axes of a cube
"""
#TODO: is the general version really needed?
Base.ndims(a::YAXArray{<:Any,N}) where {N} = N
Base.eltype(a::YAXArray{T}) where {T} = T
function Base.permutedims(c::YAXArray, p)
newaxes = caxes(c)[collect(p)]
newchunks = DiskArrays.GridChunks(c.chunks.chunks[collect(p)])
YAXArray(newaxes, permutedims(getdata(c), p), c.properties, newchunks, c.cleaner)
end
function caxes(x)
map(enumerate(dimnames(x))) do a
index, symbol = a
@@ -177,9 +180,41 @@ function readcubedata(x)
YAXArray(collect(CubeAxis, caxes(x)), getindex_all(x), getattributes(x))
end

cubechunks(c) = approx_chunksize(eachchunk(getdata(c)))
interpret_cubechunks(cs::NTuple{N,Int},cube) where N = DiskArrays.GridChunks(cube.data,cs)
interpret_cubechunks(cs::DiskArrays.GridChunks,_) = cs
interpret_dimchunk(cs::Integer,s) = DiskArrays.RegularChunks(cs,0,s)
interpret_dimchunk(cs::DiskArrays.ChunkType, _) = cs

function interpret_cubechunks(cs,cube)
oldchunks = DiskArrays.eachchunk(cube).chunks
for k in keys(cs)
i = findAxis(k,cube)
if i !== nothing
dimchunk = interpret_dimchunk(cs[k],size(cube.data,i))
oldchunks = Base.setindex(oldchunks,dimchunk,i)
end
end
GridChunks(oldchunks)
end

"""
setchunks(c::YAXArray,chunks)

Resets the chunks of a YAXArray and returns a new YAXArray. Note that this will not change the chunking of the underlying data itself,
it will just make the data "look" like it had a different chunking. If you need a persistent on-disk representation
of this chunking, use `savecube` on the resulting array. The `chunks` argument can take one of the following forms:

- a `DiskArrays.GridChunks` object
- a tuple specifying the chunk size along each dimension
- an AbstractDict or NamedTuple mapping one or more axis names to chunk sizes

"""
setchunks(c::YAXArray,chunks) = YAXArray(c.axes,c.data,c.properties,interpret_cubechunks(chunks,c),c.cleaner)
cubechunks(c) = approx_chunksize(c.chunks)
DiskArrays.eachchunk(c) = c.chunks
getindex_all(a) = getindex(a, ntuple(_ -> Colon(), ndims(a))...)
chunkoffset(c) = grid_offset(eachchunk(getdata(c)))
Base.getindex(x::YAXArray, i...) = getdata(x)[i...]
chunkoffset(c) = grid_offset(c.chunks)

# Implementation for YAXArrayBase interface
YAXArrayBase.dimvals(x::YAXArray, i) = caxes(x)[i].values
@@ -312,6 +347,9 @@ function show_yax(io::IO, c)
println(io, "Total size: ", formatbytes(cubesize(c)))
end



include("TransformedCubes.jl")
include("Slices.jl")
include("Rechunker.jl")
end #module