Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not sort on initialization or append #893

Merged
merged 14 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Next Release
# Next release

The next release must be bumped to v3.0.0.

- [#893](https://github.com/IAMconsortium/pyam/pull/893) No sorting of timeseries data on initialization or append
- [#879](https://github.com/IAMconsortium/pyam/pull/879) Add `read_netcdf()` function

# Release v2.3.0
Expand Down
10 changes: 6 additions & 4 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@ def append(

# merge extra columns in `data`
ret.extra_cols += [i for i in other.extra_cols if i not in ret.extra_cols]
ret._data = _data.sort_index()
ret._data = _data
ret._set_attributes()

if not inplace:
Expand Down Expand Up @@ -805,17 +805,19 @@ def timeseries(self, iamc_index=False):
reducing to IAMC-index yields an index with duplicates
"""
if self.empty:
raise ValueError("This IamDataFrame is empty!")
raise ValueError("This IamDataFrame is empty.")

s = self._data
if iamc_index:
if self.time_col == "time":
raise ValueError(
"Cannot use `iamc_index=True` with 'datetime' time-domain!"
"Cannot use `iamc_index=True` with 'datetime' time-domain."
)
s = s.droplevel(self.extra_cols)

return s.unstack(level=self.time_col).rename_axis(None, axis=1)
return (
s.unstack(level=self.time_col).sort_index(axis=1).rename_axis(None, axis=1)
)

def set_meta(self, meta, name=None, index=None): # noqa: C901
"""Add meta indicators as pandas.Series, list or value (int/float/str)
Expand Down
2 changes: 1 addition & 1 deletion pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ def format_data(df, index, **kwargs): # noqa: C901
if df.empty:
logger.warning("Formatted data is empty.")

return df.sort_index(), index, time_col, extra_cols
return df, index, time_col, extra_cols


def _validate_complete_index(df):
Expand Down
71 changes: 56 additions & 15 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from pyam.core import _meta_idx
from pyam.utils import IAMC_IDX, META_IDX

from .conftest import TEST_DF

df_filter_by_meta_matching_idx = pd.DataFrame(
[
["model_a", "scen_a", "region_1", 1],
Expand Down Expand Up @@ -42,7 +44,7 @@


@pytest.mark.parametrize("index", (None, META_IDX, ["model"]))
def test_init_df(test_pd_df, index):
def test_init_df_with_non_default_index(test_pd_df, index):
"""Casting to IamDataFrame and returning as `timeseries()` yields original frame"""

# set a value to `nan` to check that timeseries columns are ordered correctly
Expand All @@ -54,6 +56,19 @@ def test_init_df(test_pd_df, index):
pdt.assert_frame_equal(obs, test_pd_df.set_index(IAMC_IDX), check_column_type=False)


def test_init_df_unsorted(test_pd_df):
    """Initializing an IamDataFrame from unsorted data keeps the given order"""

    # take columns 6 and 5 (in that order) after the IAMC index columns
    # and shuffle the rows, so that the input is unsorted along both axes
    swapped_cols = list(test_pd_df.columns[[6, 5]])
    shuffled_rows = test_pd_df.iloc[[2, 0, 1]]
    obs = IamDataFrame(shuffled_rows[IAMC_IDX + swapped_cols])

    # the long-format `data` still follows the input order ...
    assert list(obs.data.scenario.unique()) == ["scen_b", "scen_a"]
    assert list(obs.data.year.unique()) == [2010, 2005]
    # ... and the internal index was not sorted either
    assert not obs._data.index.is_monotonic_increasing


def test_init_from_iamdf(test_df_year):
# casting an IamDataFrame instance again works
df = IamDataFrame(test_df_year)
Expand Down Expand Up @@ -510,30 +525,56 @@ def test_variable_depth_with_list_raises(test_df, filter_name):
pytest.raises(ValueError, test_df.filter, **{filter_name: [1, 2]})


def test_timeseries(test_df):
dct = {
"model": ["model_a"] * 2,
"scenario": ["scen_a"] * 2,
"years": [2005, 2010],
"value": [1, 6],
}
exp = pd.DataFrame(dct).pivot_table(
index=["model", "scenario"], columns=["years"], values="value"
)
obs = test_df.filter(scenario="scen_a", variable="Primary Energy").timeseries()
npt.assert_array_equal(obs, exp)
@pytest.mark.parametrize("unsort", [False, True])
def test_timeseries(test_df, unsort):
"""Assert that the timeseries is shown as expected even from unordered data"""
exp = TEST_DF.set_index(IAMC_IDX)

if unsort:
# reverse the order of `_data`, then check that the index and columns are sorted anyway
data = test_df.data
if test_df.time_col == "time":
time = test_df.time
data.time = data.time.replace(
dict([(year, time[i]) for i, year in enumerate([2005, 2010])])
)
test_df = IamDataFrame(data.iloc[[5, 4, 3, 2, 1, 0]])
# check that `data` is not sorted internally
unsorted_data = test_df.data
assert list(unsorted_data.scenario.unique()) == ["scen_b", "scen_a"]
if test_df.time_col == "year":
time = unsorted_data.year.unique()
else:
time = unsorted_data.time.unique()
assert time[0] > time[1]

if test_df.time_col == "time":
exp.columns = test_df.time
exp.columns.name = None

obs = test_df.timeseries()
pdt.assert_frame_equal(obs, exp, check_column_type=False)


def test_timeseries_wide_unsorted(test_pd_df):
    """Assert that `timeseries()` is ordered even if wide input columns are not"""

    expected = test_pd_df.set_index(IAMC_IDX)

    # pass the year columns in descending order; per the original note,
    # `unstack` behaves differently if columns or rows are not sorted
    wide_unsorted = test_pd_df[IAMC_IDX + [2010, 2005]]
    observed = IamDataFrame(wide_unsorted).timeseries()

    pdt.assert_frame_equal(observed, expected, check_column_type=False)


def test_timeseries_empty_raises(test_df_year):
"""Calling `timeseries()` on an empty IamDataFrame raises"""
_df = test_df_year.filter(model="foo")
with pytest.raises(ValueError, match="This IamDataFrame is empty!"):
with pytest.raises(ValueError, match="This IamDataFrame is empty."):
_df.timeseries()


def test_timeseries_time_iamc_raises(test_df_time):
"""Calling `timeseries(iamc_index=True)` on a continuous-time IamDataFrame raises"""
match = "Cannot use `iamc_index=True` with 'datetime' time-domain!"
match = "Cannot use `iamc_index=True` with 'datetime' time-domain."
with pytest.raises(ValueError, match=match):
test_df_time.timeseries(iamc_index=True)

Expand Down
4 changes: 2 additions & 2 deletions tests/test_feature_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def test_aggregate_skip_intermediate(recursive_df):
# make the data inconsistent, check (and then skip) validation

recursive_df._data.iloc[0] = recursive_df._data.iloc[0] + 2
recursive_df._data.iloc[3] = recursive_df._data.iloc[3] + 2
recursive_df._data.iloc[2] = recursive_df._data.iloc[2] + 2

# create object without variables to be aggregated, but with intermediate variables
v = "Secondary Energy|Electricity"
Expand Down Expand Up @@ -324,7 +324,7 @@ def test_aggregate_region_with_negative_weights(simple_df, caplog):

# dropping negative weights works as expected
neg_weights_df = simple_df.copy()
neg_weights_df._data.iloc[18] = -6
neg_weights_df._data.iloc[26] = -6
exp = simple_df.filter(variable=v, region="World", year=2010)
assert_iamframe_equal(neg_weights_df.aggregate_region(v, weight=w), exp)

Expand Down
30 changes: 27 additions & 3 deletions tests/test_feature_append_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ def test_concat(test_df, reverse, iterable):
assert test_df.scenario == ["scen_a", "scen_b"]
assert other.scenario == ["scen_c"]

# assert that merging of meta works as expected (reorder columns)
pdt.assert_frame_equal(result.meta[EXP_META.columns], EXP_META)
# assert that merging of meta works as expected
pdt.assert_frame_equal(result.meta[EXP_META.columns], EXP_META, check_like=True)

# assert that appending data works as expected
ts = result.timeseries()
Expand Down Expand Up @@ -188,7 +188,31 @@ def test_concat_all_pd_dataframe(test_df):
npt.assert_array_equal(ts.iloc[2].values, ts.iloc[3].values)


def test_append(test_df):
@pytest.mark.parametrize("inplace", (True, False))
def test_append_data_not_sorted(test_pd_df, inplace):
    """Appending timeseries data does not sort

    The test is run for both in-place appending (on a copy) and for
    appending that returns a new object.
    """

    # build unsorted input: columns 6 and 5 in that order, rows 2 and 1
    columns = IAMC_IDX + list(test_pd_df.columns[[6, 5]])
    unsorted_data = test_pd_df.iloc[[2, 1]][columns]
    df = IamDataFrame(unsorted_data)

    if inplace:
        obs = df.copy()
        obs.append(test_pd_df.iloc[[0]], inplace=True)
    else:
        obs = df.append(test_pd_df.iloc[[0]])
        # assert that original object was not modified
        assert len(df._data) == 4

    # `data` is not sorted, only applies to pandas >= 2.2
    # TODO remove this if-statement when dropping support for pandas < 2.2
    # Compare the numeric (major, minor) release instead of the raw string:
    # a lexicographic comparison would misorder e.g. "2.10.0" or "10.0.0".
    pandas_release = tuple(int(part) for part in pd.__version__.split(".")[:2])
    if pandas_release >= (2, 2):
        assert list(obs.data.scenario.unique()) == ["scen_b", "scen_a"]
        assert list(obs.data.year.unique()) == [2010, 2005]
        assert not obs._data.index.is_monotonic_increasing


def test_append_meta(test_df):
other = test_df.filter(scenario="scen_b").rename({"scenario": {"scen_b": "scen_c"}})

test_df.set_meta([0, 1], name="col1")
Expand Down
6 changes: 4 additions & 2 deletions tests/test_iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from pyam import IamDataFrame, iiasa, lazy_read_iiasa, read_iiasa
from pyam.testing import assert_iamframe_equal
from pyam.utils import META_IDX
from pyam.utils import IAMC_IDX, META_IDX

from .conftest import IIASA_UNAVAILABLE, META_COLS, TEST_API, TEST_API_NAME

Expand Down Expand Up @@ -375,7 +375,9 @@ def test_lazy_read(tmpdir):
assert df.model == ["model_a"]
# This is read from the file, so the filter is not applied.
df2 = lazy_read_iiasa(tmp_file, TEST_API)
assert df.data.equals(df2.data)
assert (
df.data.sort_values(by=IAMC_IDX).reset_index(drop=True).equals(df2.data)
)
# If requesting with an inconsistent filter, get nothing back. Strings and filters
# work interchangeably.
tmp_file = str(tmp_file)
Expand Down
Loading