
Original variable encodings are retained #471

Merged
28 changes: 20 additions & 8 deletions pangeo_forge_recipes/aggregation.py

@@ -23,11 +23,17 @@ class XarraySchema(TypedDict):


 def dataset_to_schema(ds: xr.Dataset) -> XarraySchema:
-    """Convert the output of `dataset.to_dict(data=False)` to a schema
+    """Convert the output of `dataset.to_dict(data=False, encoding=True)` to a schema
     (Basically just adds chunks, which is not part of the Xarray output).
     """

-    d = ds.to_dict(data=False)
+    # Remove redundant encoding options
+    for v in ds.variables:
+        for option in ["_FillValue", "source"]:
Contributor:
Could you explain the rationale for special casing these two options?

Contributor Author:
I excluded these as they were causing certain test failures where expected schemas were compared with actual ones, e.g. in any combiner tests using has_correct_schema():

https://github.com/pangeo-forge/pangeo-forge-recipes/blob/beam-refactor/tests/test_combiners.py#L98-L102

def has_correct_schema(expected_schema):
    def _check_results(actual):
        assert len(actual) == 1
        schema = actual[0]
        assert schema == expected_schema

The source will be unique to each original source data product, and the _FillValue appeared to be added automatically (I can't recall the specific issue with the latter though).

Contributor Author:
I checked the latter again, and when _FillValue is retained, it's being set to nan in expected_schema (as generated by the original ds.to_dict(data=False, encoding=True)), but only for the lat and lon coords. However, the actual schema doesn't contain _FillValue for lat/lon, and the assert fails.
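
As a minimal sketch of that failure mode (assuming float coords pick up a NaN fill value during decoding of the source file; the nan-injection line below is illustrative, not part of this PR):

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"foo": (("lat", "lon"), np.zeros((2, 2)))},
    coords={"lat": [0.0, 1.0], "lon": [10.0, 20.0]},
)
# Mimic what decoding a source file can do: a float coord picks up a fill value
ds.lat.encoding["_FillValue"] = np.nan  # illustrative only

expected = ds.to_dict(data=False, encoding=True)
print(expected["coords"]["lat"]["encoding"])  # {'_FillValue': nan}
# A schema built from a dataset without that entry won't compare equal,
# hence stripping _FillValue before the comparison.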

+            # TODO: should be okay to remove _FillValue?
+            if option in ds[v].encoding:
+                del ds[v].encoding[option]
+    d = ds.to_dict(data=False, encoding=True)
Contributor:

I think that when I first started working on this, this option didn't even exist yet! See pydata/xarray#6634

Nice when things come together. 😄
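
For reference, a small sketch of what that option surfaces (the dataset and values here are made up for illustration):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2010-01-01", periods=3)
ds = xr.Dataset({"foo": ("time", np.arange(3.0))}, coords={"time": time})
ds.time.encoding = {"units": "days since 2010-01-01", "calendar": "proleptic_gregorian"}

d = ds.to_dict(data=False, encoding=True)
# Each variable entry now carries its encoding alongside dims/attrs/dtype/shape
print(d["coords"]["time"]["encoding"])
# {'units': 'days since 2010-01-01', 'calendar': 'proleptic_gregorian'}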

     return XarraySchema(
         attrs=d.get("attrs"),
         coords=d.get("coords"),
@@ -164,6 +170,8 @@ def _combine_vars(v1, v2, concat_dim, allow_both=False):
             raise DatasetCombineError(f"Can't merge datasets with the same variable {vname}")
         attrs = _combine_attrs(v1[vname]["attrs"], v2[vname]["attrs"])
         dtype = _combine_dtype(v1[vname]["dtype"], v2[vname]["dtype"])
+        # Can combine encoding using the same approach as attrs
+        encoding = _combine_attrs(v1[vname]["encoding"], v2[vname]["encoding"])
Comment on lines +173 to +174
Contributor:

Brilliant!
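
For readers skimming the diff: reusing _combine_attrs means the two encoding dicts are merged key by key, with conflicting values rejected. A sketch of that assumed contract (illustrative only, not the actual implementation):

def combine_dicts(a1, a2):
    # Assumed behavior of _combine_attrs: union of keys, but refuse to merge
    # when the same key carries different values in the two inputs.
    for key in set(a1) & set(a2):
        if a1[key] != a2[key]:
            raise ValueError(f"Conflicting values for {key!r}")
    return {**a1, **a2}

# Two fragments with the same time units merge cleanly:
merged = combine_dicts(
    {"units": "days since 2010-01-01"},
    {"calendar": "proleptic_gregorian", "units": "days since 2010-01-01"},
)
assert merged == {"units": "days since 2010-01-01", "calendar": "proleptic_gregorian"}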

         (d1, s1), (d2, s2) = (
             (v1[vname]["dims"], v1[vname]["shape"]),
             (v2[vname]["dims"], v2[vname]["shape"]),

@@ -182,7 +190,14 @@ def _combine_vars(v1, v2, concat_dim, allow_both=False):
                 )
             else:
                 shape.append(l1)
-        new_vars[vname] = {"dims": dims, "attrs": attrs, "dtype": dtype, "shape": tuple(shape)}
+        new_vars[vname] = {
+            "dims": dims,
+            "attrs": attrs,
+            "dtype": dtype,
+            "shape": tuple(shape),
+            "encoding": encoding,
+        }

     return new_vars


@@ -195,13 +210,10 @@ def _to_variable(template, target_chunks):
     chunks = tuple(target_chunks[dim] for dim in dims)
     # we pick zeros as the safest value to initialize empty data with
     # will only be used for dimension coordinates
-    # WARNING: there are lots of edge cases around time!
-    # Xarray will pick a time encoding for the dataset (e.g. "days since 1970-01-01")
-    # and this may not be compatible with the actual values in the time coordinate
-    # (which we don't know yet)
     data = dsa.zeros(shape=shape, chunks=chunks, dtype=dtype)
-    # TODO: add more encoding
-    encoding = {"chunks": chunks}
+    encoding = template.get("encoding", {})
+    encoding["chunks"] = chunks
     return xr.Variable(dims=dims, data=data, attrs=template["attrs"], encoding=encoding)
8 changes: 8 additions & 0 deletions tests/data_generation.py
@@ -34,4 +34,12 @@ def make_ds(nt=10, non_dim_coords=False):
         coords=coords,
         attrs={"conventions": "CF 1.6"},
     )
+
+    # Add time coord encoding
+    # Omit "%H:%M:%S" as it will be dropped when the time is 0:0:0
+    ds.time.encoding = {
+        "units": f"days since {time[0].strftime('%Y-%m-%d')}",
+        "calendar": "proleptic_gregorian",
+    }
+
     return ds
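
A hedged usage sketch of what this fixture change enables (the import path and store name are illustrative): the explicit time units survive a Zarr round-trip, so tests can compare them directly.

import xarray as xr
from data_generation import make_ds  # import path is illustrative

ds = make_ds(nt=3)
ds.to_zarr("example.zarr", mode="w")  # illustrative store path
rt = xr.open_dataset("example.zarr", engine="zarr")
assert rt.time.encoding["units"] == ds.time.encoding["units"]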
2 changes: 2 additions & 0 deletions tests/test_aggregation.py
@@ -31,6 +31,8 @@ def test_schema_to_template_ds(specified_chunks):
             chunksize = var.chunksizes[dim]
             expected_chunksize = _expected_chunks(size, specified_chunks.get(dim, None))
             assert chunksize == expected_chunksize
+    # Confirm original time units have been preserved
+    assert ds.time.encoding.get("units") == dst.time.encoding.get("units")
     schema2 = dataset_to_schema(dst)
     assert schema == schema2

5 changes: 4 additions & 1 deletion tests/test_writers.py
@@ -17,7 +17,7 @@ def temp_store(tmp_path):
 def test_store_dataset_fragment(temp_store):

     ds = make_ds(non_dim_coords=True)
-    schema = ds.to_dict(data=False)
+    schema = ds.to_dict(data=False, encoding=True)
     schema["chunks"] = {}

     ds.to_zarr(temp_store)
@@ -138,3 +138,6 @@ def test_store_dataset_fragment(temp_store):
     ds_target = xr.open_dataset(temp_store, engine="zarr").load()

     xr.testing.assert_identical(ds, ds_target)
+    # assert_identical() doesn't check encoding
+    # Checking the original time encoding units should be sufficient
+    assert ds.time.encoding.get("units") == ds_target.time.encoding.get("units")
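
To see why the extra assert is needed, a minimal demonstration (the dataset and variable names are made up):

import numpy as np
import xarray as xr

a = xr.Dataset({"x": ("t", np.arange(3))})
b = a.copy(deep=True)
b.x.encoding["dtype"] = "int16"  # encodings now differ

xr.testing.assert_identical(a, b)  # still passes: dims, values, attrs all match
assert a.x.encoding != b.x.encoding  # the difference assert_identical misses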