Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: support multiindexed and arbitrarily-named dimensions for grouping #373

Merged
merged 1 commit into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Upcoming Version

* When writing out an LP file, large variables and constraints are now chunked to avoid memory issues. This is especially useful for large models with constraints with many terms. The chunk size can be set with the `slice_size` argument in the `solve` function.
* Constraints of the form `<= infinity` and `>= -infinity` are now automatically filtered out when solving. The `solve` function now has a new argument `sanitize_infinities` to control this feature. Default is set to `True`.
* Grouping expressions is now supported on dimensions called "group" and dimensions that have the same name as the grouping object.
* Grouping dimensions which have multiindexed coordinates is now supported.

Version 0.3.15
--------------
Expand Down
1 change: 1 addition & 0 deletions linopy/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
TERM_DIM = "_term"
STACKED_TERM_DIM = "_stacked_term"
GROUPED_TERM_DIM = "_grouped_term"
GROUP_DIM = "_group"
FACTOR_DIM = "_factor"
CONCAT_DIM = "_concat"
HELPER_DIMS = [TERM_DIM, STACKED_TERM_DIM, GROUPED_TERM_DIM, FACTOR_DIM, CONCAT_DIM]
Expand Down
36 changes: 21 additions & 15 deletions linopy/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
EQUAL,
FACTOR_DIM,
GREATER_EQUAL,
GROUP_DIM,
GROUPED_TERM_DIM,
HELPER_DIMS,
LESS_EQUAL,
Expand Down Expand Up @@ -218,42 +219,43 @@ def sum(self, use_fallback: bool = False, **kwargs) -> LinearExpression:
group: pd.Series | pd.DataFrame | xr.DataArray = self.group
if isinstance(group, pd.DataFrame):
# dataframes do not have a name, so we need to set it
group_name = "group"
final_group_name = "group"
else:
group_name = getattr(group, "name", "group") or "group"
final_group_name = getattr(group, "name", "group") or "group"

if isinstance(group, DataArray):
group = group.to_pandas()

int_map = None
if isinstance(group, pd.DataFrame):
index_name = group.index.name
group = group.reindex(self.data.indexes[group.index.name])
group.index.name = index_name # ensure name for multiindex
int_map = get_index_map(*group.values.T)
orig_group = group
group = group.apply(tuple, axis=1).map(int_map)

group_dim = group.index.name
if group_name == group_dim:
raise ValueError(
"Group name cannot be the same as group dimension in non-fallback mode."
)

arrays = [group, group.groupby(group).cumcount()]
idx = pd.MultiIndex.from_arrays(
arrays, names=[group_name, GROUPED_TERM_DIM]
)
coords = Coordinates.from_pandas_multiindex(idx, group_dim)
ds = self.data.assign_coords(coords)
idx = pd.MultiIndex.from_arrays(arrays, names=[GROUP_DIM, GROUPED_TERM_DIM])
new_coords = Coordinates.from_pandas_multiindex(idx, group_dim)
coords = self.data.indexes[group_dim]
names_to_drop = [coords.name]
if isinstance(coords, pd.MultiIndex):
names_to_drop += list(coords.names)
ds = self.data.drop_vars(names_to_drop).assign_coords(new_coords)
ds = ds.unstack(group_dim, fill_value=LinearExpression._fill_value)
ds = LinearExpression._sum(ds, dim=GROUPED_TERM_DIM)

if int_map is not None:
index = ds.indexes["group"].map({v: k for k, v in int_map.items()})
index = ds.indexes[GROUP_DIM].map({v: k for k, v in int_map.items()})
index.names = [str(col) for col in orig_group.columns]
index.name = group_name
coords = Coordinates.from_pandas_multiindex(index, group_name)
ds = xr.Dataset(ds.assign_coords(coords))
index.name = GROUP_DIM
new_coords = Coordinates.from_pandas_multiindex(index, GROUP_DIM)
ds = xr.Dataset(ds.assign_coords(new_coords))

ds = ds.rename({GROUP_DIM: final_group_name})
return LinearExpression(ds, self.model)

def func(ds):
Expand Down Expand Up @@ -1428,6 +1430,8 @@ def to_polars(self) -> pl.DataFrame:

drop = exprwrap(Dataset.drop)

drop_vars = exprwrap(Dataset.drop_vars)

drop_sel = exprwrap(Dataset.drop_sel)

drop_isel = exprwrap(Dataset.drop_isel)
Expand All @@ -1452,6 +1456,8 @@ def to_polars(self) -> pl.DataFrame:

rename = exprwrap(Dataset.rename)

reset_index = exprwrap(Dataset.reset_index)

rename_dims = exprwrap(Dataset.rename_dims)

roll = exprwrap(Dataset.roll)
Expand Down
71 changes: 66 additions & 5 deletions test/test_linear_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,17 @@ def test_linear_expression_diff(v):

# Grouped sum where the grouping DataArray is named after the dimension being
# grouped. Runs once with and once without the fallback implementation.
@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby(v, use_fallback):
expr = 1 * v
# Name of the first dimension of the fixture variable; reused below as the
# name of the grouping array.
dim = v.dims[0]
# Ten entries labelled 1 followed by ten labelled 2
# (assumes the fixture `v` has 20 entries along `dim` -- TODO confirm).
groups = xr.DataArray([1] * 10 + [2] * 10, coords=v.coords, name=dim)
grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
# The result keeps the grouping array's name as its dimension ...
assert dim in grouped.dims
# ... whose coordinate holds the two group labels ...
assert (grouped.data[dim] == [1, 2]).all()
# ... and each group collects ten terms.
assert grouped.nterm == 10


@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_on_same_name_as_target_dim(v, use_fallback):
expr = 1 * v
groups = xr.DataArray([1] * 10 + [2] * 10, coords=v.coords)
grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
Expand Down Expand Up @@ -719,20 +730,31 @@ def test_linear_expression_groupby_series_with_name(v, use_fallback):


@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_with_series_false(v, use_fallback):
def test_linear_expression_groupby_with_series_with_same_group_name(v, use_fallback):
"""
Test that the group by works with a series whose name is the same as
the dimension to group.
"""
expr = 1 * v
groups = pd.Series([1] * 10 + [2] * 10, index=v.indexes["dim_2"])
groups.name = "dim_2"
if not use_fallback:
with pytest.raises(ValueError):
expr.groupby(groups).sum(use_fallback=use_fallback)
return
grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
assert "dim_2" in grouped.dims
assert (grouped.data.dim_2 == [1, 2]).all()
assert grouped.nterm == 10


# Grouped sum with an unnamed pandas Series indexed by the "dim_3" index of the
# fixture `u` (presumably a multiindexed dimension, per the test name -- verify
# against the fixture). Runs with and without the fallback implementation.
@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_with_series_on_multiindex(u, use_fallback):
expr = 1 * u
len_grouped_dim = len(u.data["dim_3"])
# Every entry of dim_3 is assigned to the single group label 1.
groups = pd.Series([1] * len_grouped_dim, index=u.indexes["dim_3"])
grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
# The Series has no name, so the resulting dimension is called "group".
assert "group" in grouped.dims
assert (grouped.data.group == [1]).all()
# All entries of dim_3 end up as terms of the one group.
assert grouped.nterm == len_grouped_dim


@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_with_dataframe(v, use_fallback):
expr = 1 * v
Expand All @@ -751,6 +773,45 @@ def test_linear_expression_groupby_with_dataframe(v, use_fallback):
assert grouped.nterm == 3


# Runs with and without the fallback implementation; only the non-fallback
# path is expected to succeed for DataFrame groupers.
@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_with_dataframe_with_same_group_name(v, use_fallback):
"""
Test that the group by works with a dataframe whose column name is the same as
the dimension to group.
"""
expr = 1 * v
# Two grouping columns; the first one is deliberately called "dim_2", the
# same name as the index the frame is aligned on.
groups = pd.DataFrame(
{"dim_2": [1] * 10 + [2] * 10, "b": list(range(4)) * 5},
index=v.indexes["dim_2"],
)
# With the fallback enabled the call is expected to raise ValueError.
if use_fallback:
with pytest.raises(ValueError):
expr.groupby(groups).sum(use_fallback=use_fallback)
return

grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
# Expected group labels are the row tuples of the grouping frame.
index = pd.MultiIndex.from_frame(groups)
# A DataFrame carries no name, so the resulting dimension is "group".
assert "group" in grouped.dims
assert set(grouped.data.group.values) == set(index.values)
assert grouped.nterm == 3


# Grouped sum with a single-column DataFrame indexed by the "dim_3" index of
# the fixture `u` (presumably a multiindexed dimension, per the test name --
# verify against the fixture).
@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_with_dataframe_on_multiindex(u, use_fallback):
expr = 1 * u
len_grouped_dim = len(u.data["dim_3"])
# Every entry of dim_3 gets the same value in column "a", i.e. one group.
groups = pd.DataFrame({"a": [1] * len_grouped_dim}, index=u.indexes["dim_3"])

# With the fallback enabled the call is expected to raise ValueError.
if use_fallback:
with pytest.raises(ValueError):
expr.groupby(groups).sum(use_fallback=use_fallback)
return
grouped = expr.groupby(groups).sum(use_fallback=use_fallback)
# The resulting "group" dimension is itself backed by a MultiIndex.
assert "group" in grouped.dims
assert isinstance(grouped.indexes["group"], pd.MultiIndex)
# All entries of dim_3 end up as terms of the one group.
assert grouped.nterm == len_grouped_dim


@pytest.mark.parametrize("use_fallback", [True, False])
def test_linear_expression_groupby_with_dataarray(v, use_fallback):
expr = 1 * v
Expand Down