Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add is_column_selection() to expression meta, enhance expand_selector #16479

Merged
merged 6 commits into from
May 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions crates/polars-plan/src/dsl/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,26 @@ impl MetaNameSpace {
}
}

/// Report whether this expression is a pure column selection.
///
/// Each item yielded by iterating the wrapped expression is checked. Column,
/// multi-column, dtype-column, index-column, nth, wildcard, selector and
/// exclude nodes count as selection. Alias, keep-name and rename-alias nodes
/// are accepted only when `allow_aliasing` is `true`. Any other node makes
/// the check return `false`.
pub fn is_column_selection(&self, allow_aliasing: bool) -> bool {
    self.0.into_iter().all(|node| {
        let selects_columns = matches!(
            node,
            Expr::Column(_)
                | Expr::Columns(_)
                | Expr::DtypeColumn(_)
                | Expr::Exclude(_, _)
                | Expr::Nth(_)
                | Expr::IndexColumn(_)
                | Expr::Selector(_)
                | Expr::Wildcard
        );
        let renames_output = matches!(
            node,
            Expr::Alias(_, _) | Expr::KeepName(_) | Expr::RenameAlias { .. }
        );
        // Renaming nodes only pass when the caller explicitly permits aliasing.
        selects_columns || (allow_aliasing && renames_output)
    })
}

/// Indicate if this expression expands to multiple expressions with regex expansion.
pub fn is_regex_projection(&self) -> bool {
self.0.into_iter().any(|e| match e {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/src/logical_plan/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ impl fmt::Debug for Expr {
Columns(names) => write!(f, "cols({names:?})"),
DtypeColumn(dt) => write!(f, "dtype_columns({dt:?})"),
IndexColumn(idxs) => write!(f, "index_columns({idxs:?})"),
Selector(_) => write!(f, "SELECTOR"),
Selector(_) => write!(f, "selector"),
#[cfg(feature = "dtype-struct")]
Field(names) => write!(f, ".field({names:?})"),
}
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3485,7 +3485,7 @@ def write_database(
"""
Write the data in a Polars DataFrame to a database.

.. versionadded:: 0.20.26
.. versionchanged:: 0.20.26
Support for instantiated connection objects in addition to URI strings, and
a new `engine_options` parameter.

Expand Down
42 changes: 40 additions & 2 deletions py-polars/polars/expr/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def has_multiple_outputs(self) -> bool:

Examples
--------
>>> e = pl.col(["a", "b"]).alias("bar")
>>> e = pl.col(["a", "b"]).name.suffix("_foo")
>>> e.meta.has_multiple_outputs()
True
"""
Expand Down Expand Up @@ -100,12 +100,50 @@ def is_regex_projection(self) -> bool:

Examples
--------
>>> e = pl.col("^.*$").alias("bar")
>>> e = pl.col("^.*$").name.prefix("foo_")
>>> e.meta.is_regex_projection()
True
"""
return self._pyexpr.meta_is_regex_projection()

def is_column_selection(self, *, allow_aliasing: bool = False) -> bool:
    """
    Check whether this expression does nothing beyond selecting columns.

    Bare columns, columns matched by regex or dtype, selectors and exclude
    operations all count as column selection. Column/expression aliasing is
    only accepted when explicitly allowed.

    .. versionadded:: 0.20.30

    Parameters
    ----------
    allow_aliasing
        When False (the default), an expression that contains any aliasing is
        not treated as a pure column selection. Pass True to accept column
        selections that also rename their output.

    Examples
    --------
    >>> import polars.selectors as cs
    >>> e = pl.col("foo")
    >>> e.meta.is_column_selection()
    True
    >>> e = pl.col("foo").alias("bar")
    >>> e.meta.is_column_selection()
    False
    >>> e.meta.is_column_selection(allow_aliasing=True)
    True
    >>> e = pl.col("foo") * pl.col("bar")
    >>> e.meta.is_column_selection()
    False
    >>> e = cs.starts_with("foo")
    >>> e.meta.is_column_selection()
    True
    >>> e = cs.starts_with("foo").exclude("foo!")
    >>> e.meta.is_column_selection()
    True
    """
    # The check itself is delegated to the underlying expression object.
    only_selects_columns = self._pyexpr.meta_is_column_selection(allow_aliasing)
    return only_selects_columns

@overload
def output_name(self, *, raise_if_undetermined: Literal[True] = True) -> str: ...

Expand Down
6 changes: 3 additions & 3 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,11 @@ def read_excel(
"""
Read Excel spreadsheet data into a DataFrame.

.. versionadded:: 0.20.6
.. versionchanged:: 0.20.6
Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls).
.. versionadded:: 0.19.4
.. versionchanged:: 0.19.4
Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb).
.. versionadded:: 0.19.3
.. versionchanged:: 0.19.3
Added "openpyxl" engine, and added `schema_overrides` parameter.

Parameters
Expand Down
40 changes: 30 additions & 10 deletions py-polars/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,25 @@ def is_selector(obj: Any) -> bool:
def expand_selector(
target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
selector: SelectorType | Expr,
*,
strict: bool = True,
) -> tuple[str, ...]:
"""
Expand a selector to column names with respect to a specific frame or schema target.
Expand selector to column names, with respect to a specific frame or target schema.

.. versionchanged:: 0.20.30
The `strict` parameter was added.

Parameters
----------
target
A polars DataFrame, LazyFrame or schema.
selector
An arbitrary polars selector (or compound selector).
strict
Setting False will additionally allow for a broader range of column selection
expressions (such as bare columns or use of `.exclude()`) to be expanded, not
just the dedicated selectors.

Examples
--------
Expand Down Expand Up @@ -118,22 +127,33 @@ def expand_selector(
Expand selector with respect to a standalone schema:

>>> schema = {
... "colx": pl.Float32,
... "coly": pl.Float64,
... "colz": pl.Date,
... "id": pl.Int64,
... "desc": pl.String,
... "count": pl.UInt32,
... "value": pl.Float64,
... }
>>> cs.expand_selector(schema, cs.float())
('colx', 'coly')
"""
if not is_selector(selector):
msg = f"expected a selector; found {selector!r} instead."
raise TypeError(msg)
>>> cs.expand_selector(schema, cs.string() | cs.float())
('desc', 'value')

Allow for non-strict selection expressions (such as those
including use of an `.exclude()` constraint) to be expanded:

>>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False)
('count', 'value')
"""
if isinstance(target, Mapping):
from polars.dataframe import DataFrame

target = DataFrame(schema=target)

if not (
is_selector(selector)
if strict
else selector.meta.is_column_selection(allow_aliasing=False)
):
msg = f"expected a selector; found {selector!r} instead."
raise TypeError(msg)

return tuple(target.select(selector).columns)


Expand Down
7 changes: 7 additions & 0 deletions py-polars/src/expr/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ impl PyExpr {
self.inner.clone().meta().is_regex_projection()
}

// Forwards to the meta namespace of a clone of the inner expression and
// returns its column-selection check unchanged.
fn meta_is_column_selection(&self, allow_aliasing: bool) -> bool {
    let meta = self.inner.clone().meta();
    meta.is_column_selection(allow_aliasing)
}

fn _meta_selector_add(&self, other: PyExpr) -> PyResult<PyExpr> {
let out = self
.inner
Expand Down
45 changes: 43 additions & 2 deletions py-polars/tests/unit/operations/namespaces/test_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest

import polars as pl
import polars.selectors as cs

if TYPE_CHECKING:
from pathlib import Path
Expand Down Expand Up @@ -65,7 +66,7 @@ def test_undo_aliases() -> None:


def test_meta_has_multiple_outputs() -> None:
e = pl.col(["a", "b"]).alias("bar")
e = pl.col(["a", "b"]).name.suffix("_foo")
assert e.meta.has_multiple_outputs()


Expand All @@ -80,8 +81,48 @@ def test_is_column() -> None:
assert not e.meta.is_column()


@pytest.mark.parametrize(
    ("expr", "is_column_selection"),
    [
        # plain column references
        (pl.col("foo"), True),
        (pl.col("foo", "bar"), True),
        (pl.col(pl.NUMERIC_DTYPES), True),
        # expressions that transform columns
        (pl.col("foo") + 100, False),
        (pl.col("foo").floordiv(10), False),
        (pl.col("foo") * pl.col("bar"), False),
        # selectors, alone and combined with other operations
        (cs.numeric() * 100, False),
        (cs.temporal() - cs.time(), True),
        (cs.numeric().exclude("value"), True),
        ((cs.temporal() - cs.time()).exclude("dt"), True),
        # top-level positional selection helpers
        (pl.nth(2), True),
        (pl.first(), True),
        (pl.last(), True),
    ],
)
def test_is_column_selection(
    expr: pl.Expr,
    is_column_selection: bool,
) -> None:
    """Check `meta.is_column_selection` with and without aliasing allowed."""
    if not is_column_selection:
        assert not expr.meta.is_column_selection()
        return

    # A pure selection passes regardless of the aliasing flag.
    assert expr.meta.is_column_selection()
    assert expr.meta.is_column_selection(allow_aliasing=True)

    # Multi-output expressions are renamed with a suffix; single-output
    # expressions take a plain alias.
    if expr.meta.has_multiple_outputs():
        renamed = expr.name.suffix("!")
    else:
        renamed = expr.alias("!")

    # Once renamed, the expression only passes when aliasing is allowed.
    assert not renamed.meta.is_column_selection()
    assert renamed.meta.is_column_selection(allow_aliasing=True)


def test_meta_is_regex_projection() -> None:
e = pl.col("^.*$").alias("bar")
e = pl.col("^.*$").name.suffix("_foo")
assert e.meta.is_regex_projection()
assert e.meta.has_multiple_outputs()

Expand Down
24 changes: 24 additions & 0 deletions py-polars/tests/unit/test_selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,30 @@ def test_selector_ends_with(df: pl.DataFrame) -> None:
df.select(cs.ends_with(999)) # type: ignore[arg-type]


def test_selector_expand() -> None:
    """Check `expand_selector` against a plain schema, strict and non-strict."""
    schema = {
        "id": pl.Int64,
        "desc": pl.String,
        "count": pl.UInt32,
        "value": pl.Float64,
    }

    # A compound selector expands under the default (strict) behaviour.
    signed_numeric = cs.numeric() - cs.unsigned_integer()
    assert cs.expand_selector(schema, signed_numeric) == ("id", "value")

    # Strict mode raises for expressions that are not dedicated selectors.
    with pytest.raises(TypeError, match="expected a selector"):
        cs.expand_selector(schema, pl.exclude("id", "count"))

    with pytest.raises(TypeError, match="expected a selector"):
        cs.expand_selector(schema, pl.col("value") // 100)

    # Non-strict mode accepts these broader column-selection expressions.
    non_strict_cases = (
        (pl.exclude("id", "count"), ("desc", "value")),
        (cs.numeric().exclude("id"), ("count", "value")),
    )
    for selection, expected in non_strict_cases:
        result = cs.expand_selector(schema, selection, strict=False)
        assert result == expected


def test_selector_first_last(df: pl.DataFrame) -> None:
assert df.select(cs.first()).columns == ["abc"]
assert df.select(cs.last()).columns == ["qqR"]
Expand Down