Skip to content
This repository has been archived by the owner on Aug 29, 2023. It is now read-only.

Commit

Permalink
closes #740
Browse files Browse the repository at this point in the history
  • Loading branch information
forman committed Sep 12, 2018
1 parent ba7655f commit cf424a9
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 35 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Values of scalar variables are now always shown in **VARIABLES** panel in Cate Desktop [#702](https://github.com/CCI-Tools/cate/issues/702)
* Added information about resources of type `GeoDataFrame` (Shapefiles, GeoJSON) in the details section
of the **WORKSPACE** panel in Cate Desktop [#705](https://github.com/CCI-Tools/cate/issues/705)
* Added new operation `merge()` [#740](https://github.com/CCI-Tools/cate/issues/740)
* Added new operation `data_frame_subset()` [#708](https://github.com/CCI-Tools/cate/issues/708)
* Fixed display of CCI Sea Level MSLAMPH data [#722](https://github.com/CCI-Tools/cate/issues/722)
* Improve indexers to first do a validation with respect to the available dimensions and the selected remaining_dims
Expand Down
64 changes: 63 additions & 1 deletion cate/ops/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,69 @@
from cate.util.monitor import Monitor


@op(tags=['utility', 'internal'])
@op(tags=['utility'])
@op_input('ds_1', data_type=DatasetLike)
@op_input('ds_2', data_type=DatasetLike)
@op_input('ds_3', data_type=DatasetLike)
@op_input('ds_4', data_type=DatasetLike)
@op_input('join', value_set=["outer", "inner", "left", "right", "exact"])
@op_input('compat', value_set=["identical", "equals", "broadcast_equals", "no_conflicts"])
def merge(ds_1: DatasetLike.TYPE,
          ds_2: DatasetLike.TYPE,
          ds_3: DatasetLike.TYPE = None,
          ds_4: DatasetLike.TYPE = None,
          join: str = 'outer',
          compat: str = 'no_conflicts') -> xr.Dataset:
    """
    Merge up to four datasets to produce a new dataset with combined variables from each input dataset.

    This is a wrapper for the ``xarray.merge()`` function.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.merge.html#xarray.merge

    The *compat* argument indicates how to compare variables of the same name for potential conflicts:

    * "broadcast_equals": all values must be equal when variables are broadcast
      against each other to ensure common dimensions.
    * "equals": all values and dimensions must be the same.
    * "identical": all values, dimensions and attributes must be the same.
    * "no_conflicts": only values which are not null in both datasets must be equal.
      The returned dataset then contains the combination of all non-null values.

    :param ds_1: The first input dataset.
    :param ds_2: The second input dataset.
    :param ds_3: An optional 3rd input dataset.
    :param ds_4: An optional 4th input dataset.
    :param join: How to combine objects with different indexes.
    :param compat: How to compare variables of the same name for potential conflicts.
    :return: A new dataset with combined variables from each input dataset.
    :raise ValidationError: If no input dataset is given.
    """
    datasets = []
    for ds in (ds_1, ds_2, ds_3, ds_4):
        ds = DatasetLike.convert(ds)
        if ds is None:
            continue
        # De-duplicate by object identity so that passing the same dataset
        # object several times does not lead to a redundant self-merge.
        if not any(ds is seen for seen in datasets):
            datasets.append(ds)

    if not datasets:
        # NOTE(review): the previous message claimed "At least two different
        # datasets must be given", but this branch only triggers when NO
        # dataset is given at all - a single unique dataset is accepted below.
        raise ValidationError('At least one dataset must be given')
    if len(datasets) == 1:
        # Only one unique dataset: nothing to merge, return it unchanged.
        return datasets[0]
    return xr.merge(datasets, compat=compat, join=join)


@op(tags=['utility'])
@op_input('ds', data_type=DatasetLike)
@op_input('point', data_type=PointLike, units='degree')
@op_input('time', data_type=TimeLike)
Expand Down
2 changes: 1 addition & 1 deletion test/cli/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ def test_op_list(self):
self.assert_main(['op', 'list'], expected_stdout=['operations found'])
self.assert_main(['op', 'list', '-n', 'read'], expected_stdout=['operations found'])
self.assert_main(['op', 'list', '-n', 'nevermatch'], expected_stdout=['No operations found'])
self.assert_main(['op', 'list', '--internal'], expected_stdout=['2 operations found'])
self.assert_main(['op', 'list', '--internal'], expected_stdout=['One operation found'])
self.assert_main(['op', 'list', '--tag', 'input'], expected_stdout=['9 operations found'])
self.assert_main(['op', 'list', '--tag', 'output'], expected_stdout=['6 operations found'])
self.assert_main(['op', 'list', '--deprecated'], expected_stdout=['2 operations found'])
Expand Down
123 changes: 90 additions & 33 deletions test/ops/test_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,67 @@
import xarray as xr

from cate.core.op import OP_REGISTRY
from cate.ops.utility import sel, from_dataframe, identity, literal, pandas_fillna
from cate.core.types import ValidationError
from cate.ops.utility import merge, sel, from_dataframe, identity, literal, pandas_fillna
from cate.util.misc import object_to_qualified_name


def new_ds():
    """
    Build a small random test dataset: 4 lon x 2 lat x 10 daily time steps
    with 'temperature' and 'precipitation' data variables (values rounded to
    one decimal) and a scalar 'reference_time' coordinate.
    """
    lon = [10.1, 10.2, 10.3, 10.4]
    lat = [34.5, 34.6]
    time = pd.date_range('2014-09-06', periods=10)
    reference_time = pd.Timestamp('2014-09-05')

    time_res = len(time)
    lon_res = len(lon)
    lat_res = len(lat)

    # Random values: temperature ~ N(15, 8), precipitation ~ U(0, 10).
    temperature = (15 + 8 * np.random.randn(lon_res, lat_res, time_res)).round(decimals=1)
    precipitation = (10 * np.random.rand(lon_res, lat_res, time_res)).round(decimals=1)

    ds = xr.Dataset({'temperature': (['lon', 'lat', 'time'], temperature),
                     'precipitation': (['lon', 'lat', 'time'], precipitation)
                     },
                    coords={'lon': lon,
                            'lat': lat,
                            'time': time,
                            'reference_time': reference_time
                            })
    return ds


def assert_dataset_equal(expected, actual):
    """Assert that two datasets are equal per ``Dataset.equals()``,
    reporting both operands on failure."""
    # this method is functionally equivalent to
    # `assert expected == actual`, but it checks each aspect
    # of equality separately for easier debugging
    assert expected.equals(actual), (expected, actual)


class TestSel(TestCase):
class MergeTest(TestCase):
    """Tests for the ``merge()`` operation (see #740)."""

    def test_nominal(self):
        """
        Test nominal execution
        """
        periods = 5
        time = pd.date_range('2000-01-01', periods=periods)

        ds_1 = xr.Dataset({'A': (['time'], np.random.randn(periods)),
                           'B': (['time'], np.random.randn(periods)),
                           'time': time})
        ds_2 = xr.Dataset({'C': (['time'], np.random.randn(periods)),
                           'D': (['time'], np.random.randn(periods)),
                           'time': time})
        new_ds = merge(ds_1=ds_1, ds_2=ds_2, ds_3=None, ds_4=None)
        self.assertIn('A', new_ds)
        self.assertIn('B', new_ds)
        self.assertIn('C', new_ds)
        self.assertIn('D', new_ds)

        # Duplicate inputs are ignored: (ds_1, ds_1, ds_1, ds_2)
        # merges the same as (ds_1, ds_2).
        new_ds = merge(ds_1=ds_1, ds_2=ds_1, ds_3=ds_1, ds_4=ds_2)
        self.assertIn('A', new_ds)
        self.assertIn('B', new_ds)
        self.assertIn('C', new_ds)
        self.assertIn('D', new_ds)

        # If all inputs are the same object, that very object is returned.
        new_ds = merge(ds_1=ds_1, ds_2=ds_1, ds_3=ds_1, ds_4=ds_1)
        self.assertIs(new_ds, ds_1)

        new_ds = merge(ds_1=ds_2, ds_2=ds_2, ds_3=ds_2, ds_4=ds_2)
        self.assertIs(new_ds, ds_2)

        ds_3 = xr.Dataset({'E': (['time'], np.random.randn(periods)),
                           'time': time})
        new_ds = merge(ds_1=ds_1, ds_2=ds_2, ds_3=ds_3, ds_4=None)
        self.assertIn('A', new_ds)
        self.assertIn('B', new_ds)
        self.assertIn('C', new_ds)
        self.assertIn('D', new_ds)
        self.assertIn('E', new_ds)

        ds_4 = xr.Dataset({'F': (['time'], np.random.randn(periods)),
                           'time': time})
        new_ds = merge(ds_1=ds_1, ds_2=ds_2, ds_3=ds_3, ds_4=ds_4)
        self.assertIn('A', new_ds)
        self.assertIn('B', new_ds)
        self.assertIn('C', new_ds)
        self.assertIn('D', new_ds)
        self.assertIn('E', new_ds)
        # Fix: the variable 'F' contributed by ds_4 was created but never
        # asserted, so the four-dataset case was effectively untested.
        self.assertIn('F', new_ds)

    def test_failures(self):
        """merge() must raise ValidationError when no dataset is given."""
        with self.assertRaises(ValidationError):
            merge(ds_1=None, ds_2=None, ds_3=None, ds_4=None)


class SelTest(TestCase):
def test_nominal(self):
ds = new_ds()

Expand Down Expand Up @@ -173,6 +198,7 @@ class TestFillna(TestCase):
"""
Test fillna operation
"""

def test_nominal(self):
"""
Test nominal operation
Expand Down Expand Up @@ -214,3 +240,34 @@ def test_registered(self):

actual = reg_op(df=df, method='ffill')
self.assertTrue(actual.equals(expected))


def new_ds():
    """
    Build a small random test dataset: 4 lon x 2 lat x 10 daily time steps
    with 'temperature' and 'precipitation' data variables (values rounded to
    one decimal) and a scalar 'reference_time' coordinate.
    """
    lon = [10.1, 10.2, 10.3, 10.4]
    lat = [34.5, 34.6]
    time = pd.date_range('2014-09-06', periods=10)
    reference_time = pd.Timestamp('2014-09-05')

    shape = (len(lon), len(lat), len(time))

    # Random values: temperature ~ N(15, 8), precipitation ~ U(0, 10).
    temperature = np.round(15 + 8 * np.random.randn(*shape), decimals=1)
    precipitation = np.round(10 * np.random.rand(*shape), decimals=1)

    data_vars = {'temperature': (['lon', 'lat', 'time'], temperature),
                 'precipitation': (['lon', 'lat', 'time'], precipitation)}
    coords = {'lon': lon,
              'lat': lat,
              'time': time,
              'reference_time': reference_time}
    return xr.Dataset(data_vars, coords=coords)


def assert_dataset_equal(expected, actual):
    """
    Assert that *expected* and *actual* are equal datasets.

    Functionally equivalent to ``assert expected == actual``, but uses
    ``Dataset.equals()`` and includes both operands in the failure message
    for easier debugging.
    """
    datasets_are_equal = expected.equals(actual)
    assert datasets_are_equal, (expected, actual)

0 comments on commit cf424a9

Please sign in to comment.