From ea6e8b567f10dd7e199252f44dc82038dfdad9b6 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Fri, 24 May 2024 01:29:25 +0400 Subject: [PATCH 1/6] feat: Add `is_column_selection()` to expression meta methods --- crates/polars-plan/src/dsl/meta.rs | 19 ++++++++++ crates/polars-plan/src/logical_plan/format.rs | 2 +- py-polars/polars/expr/meta.py | 38 ++++++++++++++++++- py-polars/polars/selectors.py | 37 +++++++++++++----- py-polars/src/expr/meta.rs | 7 ++++ .../unit/operations/namespaces/test_meta.py | 27 ++++++++++++- 6 files changed, 115 insertions(+), 15 deletions(-) diff --git a/crates/polars-plan/src/dsl/meta.rs b/crates/polars-plan/src/dsl/meta.rs index 19ae650f1e52..5df88ffd7a14 100644 --- a/crates/polars-plan/src/dsl/meta.rs +++ b/crates/polars-plan/src/dsl/meta.rs @@ -68,6 +68,25 @@ impl MetaNameSpace { } } + /// Indicate if this expression only selects columns; the presence of any + /// transform operations will cause the check to return `false`, though + /// aliasing of the selected columns is optionally allowed. + pub fn is_column_selection(&self, allow_aliasing: bool) -> bool { + self.0.into_iter().all(|e| match e { + Expr::Column(_) + | Expr::Columns(_) + | Expr::DtypeColumn(_) + | Expr::Exclude(_, _) + | Expr::IndexColumn(_) + | Expr::Selector(_) + | Expr::Wildcard => true, + Expr::Alias(_, _) | Expr::KeepName(_) | Expr::RenameAlias { .. } if allow_aliasing => { + true + }, + _ => false, + }) + } + /// Indicate if this expression expands to multiple expressions with regex expansion. pub fn is_regex_projection(&self) -> bool { self.0.into_iter().any(|e| match e { diff --git a/crates/polars-plan/src/logical_plan/format.rs b/crates/polars-plan/src/logical_plan/format.rs index 2118140b2d26..4c7b46b01969 100644 --- a/crates/polars-plan/src/logical_plan/format.rs +++ b/crates/polars-plan/src/logical_plan/format.rs @@ -164,7 +164,7 @@ impl fmt::Debug for Expr { Columns(names) => write!(f, "cols({names:?})"), DtypeColumn(dt) => write!(f, "dtype_columns({dt:?})"), IndexColumn(idxs) => write!(f, "index_columns({idxs:?})"), - Selector(_) => write!(f, "SELECTOR"), + Selector(_) => write!(f, "selector"), #[cfg(feature = "dtype-struct")] Field(names) => write!(f, ".field({names:?})"), } diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index 775405e2594c..ba9c35357fc0 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -70,7 +70,7 @@ def has_multiple_outputs(self) -> bool: Examples -------- - >>> e = pl.col(["a", "b"]).alias("bar") + >>> e = pl.col(["a", "b"]).name.suffix("_foo") >>> e.meta.has_multiple_outputs() True """ @@ -100,12 +100,46 @@ def is_regex_projection(self) -> bool: Examples -------- - >>> e = pl.col("^.*$").alias("bar") + >>> e = pl.col("^.*$").name.prefix("foo_") >>> e.meta.is_regex_projection() True """ return self._pyexpr.meta_is_regex_projection() + def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: + """ + Indicate if this expression only selects columns (optionally with aliasing). + + This can include bare columns, column matches by regex or dtype, selectors + and exclude ops, and (optionally) column/expression aliasing. + + Parameters + ---------- + allow_aliasing + If False (default), any aliasing is not considered pure column selection. + Set True to allow for column selection that also includes aliasing. + + Examples + -------- + >>> import polars.selectors as cs + >>> e = pl.col("foo") + >>> e.meta.is_column_selection() + True + >>> e = pl.col("foo") * pl.col("bar") + >>> e.meta.is_column_selection() + False + >>> e = cs.starts_with("foo").exclude("foo!") + >>> e.meta.is_column_selection() + True + >>> e = cs.starts_with("foo").exclude("foo!").name.suffix("_bar") + >>> e.meta.is_column_selection() + False + >>> e = cs.starts_with("foo").exclude("foo!").name.suffix("_bar") + >>> e.meta.is_column_selection(allow_aliasing=True) + True + """ + return self._pyexpr.meta_is_column_selection(allow_aliasing) + @overload def output_name(self, *, raise_if_undetermined: Literal[True] = True) -> str: ... diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index ddc94fe4aa45..55084a3bb6bf 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -81,9 +81,11 @@ def is_selector(obj: Any) -> bool: def expand_selector( target: DataFrame | LazyFrame | Mapping[str, PolarsDataType], selector: SelectorType | Expr, + *, + strict: bool = True, ) -> tuple[str, ...]: """ - Expand a selector to column names with respect to a specific frame or schema target. + Expand selector to column names, with respect to a specific frame or target schema. Parameters ---------- @@ -91,6 +93,10 @@ def expand_selector( A polars DataFrame, LazyFrame or schema. selector An arbitrary polars selector (or compound selector). + strict + Setting False will additionally allow for a broader range of column selection + expressions (such as bare columns or use of `.exclude()`) to be expanded, not + just the dedicated selectors. Examples -------- @@ -118,22 +124,33 @@ def expand_selector( Expand selector with respect to a standalone schema: >>> schema = { - ... "colx": pl.Float32, - ... "coly": pl.Float64, - ... "colz": pl.Date, + ... "id": pl.Int64, + ... "desc": pl.String, + ... "count": pl.UInt32, + ... "value": pl.Float64, ... } - >>> cs.expand_selector(schema, cs.float()) - ('colx', 'coly') - """ - if not is_selector(selector): - msg = f"expected a selector; found {selector!r} instead." - raise TypeError(msg) + >>> cs.expand_selector(schema, cs.string() | cs.float()) + ('desc', 'value') + + Allow for non-strict selection expressions (such as those + including use of an `.exclude()` constraint) to be expanded: + >>> cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False) + ('count', 'value') + """ if isinstance(target, Mapping): from polars.dataframe import DataFrame target = DataFrame(schema=target) + if not ( + is_selector(selector) + if strict + else selector.meta.is_column_selection(allow_aliasing=False) + ): + msg = f"expected a selector; found {selector!r} instead." + raise TypeError(msg) + return tuple(target.select(selector).columns) diff --git a/py-polars/src/expr/meta.rs b/py-polars/src/expr/meta.rs index 225d8573d281..4e2a41b09769 100644 --- a/py-polars/src/expr/meta.rs +++ b/py-polars/src/expr/meta.rs @@ -54,6 +54,13 @@ impl PyExpr { self.inner.clone().meta().is_regex_projection() } + fn meta_is_column_selection(&self, allow_aliasing: bool) -> bool { + self.inner + .clone() + .meta() + .is_column_selection(allow_aliasing) + } + fn _meta_selector_add(&self, other: PyExpr) -> PyResult { let out = self .inner diff --git a/py-polars/tests/unit/operations/namespaces/test_meta.py b/py-polars/tests/unit/operations/namespaces/test_meta.py index fe554c694491..2cfbfa78d4fd 100644 --- a/py-polars/tests/unit/operations/namespaces/test_meta.py +++ b/py-polars/tests/unit/operations/namespaces/test_meta.py @@ -5,6 +5,7 @@ import pytest import polars as pl +import polars.selectors as cs if TYPE_CHECKING: from pathlib import Path @@ -65,7 +66,7 @@ def test_undo_aliases() -> None: def test_meta_has_multiple_outputs() -> None: - e = pl.col(["a", "b"]).alias("bar") + e = pl.col(["a", "b"]).name.suffix("_foo") assert e.meta.has_multiple_outputs() @@ -80,8 +81,30 @@ def test_is_column() -> None: assert not e.meta.is_column() +def test_is_column_selection() -> None: + e = pl.col("foo") + assert e.meta.is_column_selection() + + e = pl.col("foo").alias("bar") + assert not e.meta.is_column_selection() + assert e.meta.is_column_selection(allow_aliasing=True) + + e = pl.col(pl.String) + assert e.meta.is_column_selection() + + e = pl.col(pl.String).name.prefix("str_") + assert e.meta.is_column_selection(allow_aliasing=True) + + e = cs.numeric().exclude("value") + assert e.meta.is_column_selection() + assert not (e + 100).meta.is_column_selection() + + e = cs.numeric() - cs.integer() + assert e.meta.is_column_selection() + + def test_meta_is_regex_projection() -> None: - e = pl.col("^.*$").alias("bar") + e = pl.col("^.*$").name.suffix("_foo") assert e.meta.is_regex_projection() assert e.meta.has_multiple_outputs() From e7e433dded5f6fe9de6ea16d0f67b397728afd37 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sat, 25 May 2024 01:39:39 +0400 Subject: [PATCH 2/6] don't forget about `nth` --- crates/polars-plan/src/dsl/meta.rs | 1 + .../unit/operations/namespaces/test_meta.py | 3 +++ py-polars/tests/unit/test_selectors.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/crates/polars-plan/src/dsl/meta.rs b/crates/polars-plan/src/dsl/meta.rs index 5df88ffd7a14..f5b67fcf63bf 100644 --- a/crates/polars-plan/src/dsl/meta.rs +++ b/crates/polars-plan/src/dsl/meta.rs @@ -77,6 +77,7 @@ impl MetaNameSpace { | Expr::Columns(_) | Expr::DtypeColumn(_) | Expr::Exclude(_, _) + | Expr::Nth(_) | Expr::IndexColumn(_) | Expr::Selector(_) | Expr::Wildcard => true, diff --git a/py-polars/tests/unit/operations/namespaces/test_meta.py b/py-polars/tests/unit/operations/namespaces/test_meta.py index 2cfbfa78d4fd..5e108ce82964 100644 --- a/py-polars/tests/unit/operations/namespaces/test_meta.py +++ b/py-polars/tests/unit/operations/namespaces/test_meta.py @@ -102,6 +102,9 @@ def test_is_column_selection() -> None: e = cs.numeric() - cs.integer() assert e.meta.is_column_selection() + e = pl.nth(2) + assert e.meta.is_column_selection() + def test_meta_is_regex_projection() -> None: e = pl.col("^.*$").name.suffix("_foo") diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index d694d91dc8e6..d2d0ded45075 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -376,6 +376,24 @@ def test_selector_ends_with(df: pl.DataFrame) -> None: df.select(cs.ends_with(999)) # type: ignore[arg-type] +def test_selector_expand() -> None: + schema = { + "id": pl.Int64, + "desc": pl.String, + "count": pl.UInt32, + "value": pl.Float64, + } + + expanded = cs.expand_selector(schema, cs.numeric() - cs.unsigned_integer()) + assert expanded == ("id", "value") + + with pytest.raises(TypeError, match="expected a selector"): + cs.expand_selector(schema, pl.exclude("id", "count")) + + expanded = cs.expand_selector(schema, pl.exclude("id", "count"), strict=False) + assert expanded == ("desc", "value") + + def test_selector_first_last(df: pl.DataFrame) -> None: assert df.select(cs.first()).columns == ["abc"] assert df.select(cs.last()).columns == ["qqR"] From 7821f375c81167108bcf485fbbd746ca476032ae Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sat, 25 May 2024 11:08:27 +0400 Subject: [PATCH 3/6] parametrize `is_column_selection` tests --- .../unit/operations/namespaces/test_meta.py | 62 ++++++++++++------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/py-polars/tests/unit/operations/namespaces/test_meta.py b/py-polars/tests/unit/operations/namespaces/test_meta.py index 5e108ce82964..6a5f8a4c0951 100644 --- a/py-polars/tests/unit/operations/namespaces/test_meta.py +++ b/py-polars/tests/unit/operations/namespaces/test_meta.py @@ -81,29 +81,45 @@ def test_is_column() -> None: assert not e.meta.is_column() -def test_is_column_selection() -> None: - e = pl.col("foo") - assert e.meta.is_column_selection() - - e = pl.col("foo").alias("bar") - assert not e.meta.is_column_selection() - assert e.meta.is_column_selection(allow_aliasing=True) - - e = pl.col(pl.String) - assert e.meta.is_column_selection() - - e = pl.col(pl.String).name.prefix("str_") - assert e.meta.is_column_selection(allow_aliasing=True) - - e = cs.numeric().exclude("value") - assert e.meta.is_column_selection() - assert not (e + 100).meta.is_column_selection() - - e = cs.numeric() - cs.integer() - assert e.meta.is_column_selection() - - e = pl.nth(2) - assert e.meta.is_column_selection() +@pytest.mark.parametrize( + ("expr", "expected"), + [ + # columns + (pl.col("foo"), True), + (pl.col("foo", "bar"), True), + (pl.col(pl.NUMERIC_DTYPES), True), + # column expressions + (pl.col("foo") + 100, False), + (pl.col("foo").floordiv(10), False), + (pl.col("foo") * pl.col("bar"), False), + # selectors / expressions + (cs.numeric() * 100, False), + (cs.temporal() - cs.time(), True), + (cs.numeric().exclude("value"), True), + ((cs.temporal() - cs.time()).exclude("dt"), True), + # top-level selection funcs + (pl.nth(2), True), + (pl.first(), True), + (pl.last(), True), + ], +) +def test_is_column_selection( + expr: pl.Expr, + expected: bool, +) -> None: + if expected: + assert expr.meta.is_column_selection() + assert expr.meta.is_column_selection(allow_aliasing=True) + + expr = ( + expr.name.suffix("!") + if expr.meta.has_multiple_outputs() + else expr.alias("!") + ) + assert not expr.meta.is_column_selection() + assert expr.meta.is_column_selection(allow_aliasing=True) + else: + assert not expr.meta.is_column_selection() def test_meta_is_regex_projection() -> None: From f7dc24e66b6d41aad30ad68c6eef1e07f0435c72 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sat, 25 May 2024 11:34:45 +0400 Subject: [PATCH 4/6] `versionadded` / `versionchanged` tags --- py-polars/polars/dataframe/frame.py | 2 +- py-polars/polars/expr/meta.py | 14 +++++++++----- py-polars/polars/io/spreadsheet/functions.py | 6 +++--- py-polars/polars/selectors.py | 3 +++ 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 30650d1b6882..439d34b74339 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3485,7 +3485,7 @@ def write_database( """ Write the data in a Polars DataFrame to a database. - .. versionadded:: 0.20.26 + .. versionchanged:: 0.20.26 Support for instantiated connection objects in addition to URI strings, and a new `engine_options` parameter. diff --git a/py-polars/polars/expr/meta.py b/py-polars/polars/expr/meta.py index ba9c35357fc0..ec531dbd8ca2 100644 --- a/py-polars/polars/expr/meta.py +++ b/py-polars/polars/expr/meta.py @@ -113,6 +113,8 @@ def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: This can include bare columns, column matches by regex or dtype, selectors and exclude ops, and (optionally) column/expression aliasing. + .. versionadded:: 0.20.30 + Parameters ---------- allow_aliasing @@ -125,17 +127,19 @@ def is_column_selection(self, *, allow_aliasing: bool = False) -> bool: >>> e = pl.col("foo") >>> e.meta.is_column_selection() True + >>> e = pl.col("foo").alias("bar") + >>> e.meta.is_column_selection() + False + >>> e.meta.is_column_selection(allow_aliasing=True) + True >>> e = pl.col("foo") * pl.col("bar") >>> e.meta.is_column_selection() False - >>> e = cs.starts_with("foo").exclude("foo!") + >>> e = cs.starts_with("foo") >>> e.meta.is_column_selection() True - >>> e = cs.starts_with("foo").exclude("foo!").name.suffix("_bar") + >>> e = cs.starts_with("foo").exclude("foo!") >>> e.meta.is_column_selection() - False - >>> e = cs.starts_with("foo").exclude("foo!").name.suffix("_bar") - >>> e.meta.is_column_selection(allow_aliasing=True) True """ return self._pyexpr.meta_is_column_selection(allow_aliasing) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 4b3a143f865b..37823d52977b 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -150,11 +150,11 @@ def read_excel( """ Read Excel spreadsheet data into a DataFrame. - .. versionadded:: 0.20.6 + .. versionchanged:: 0.20.6 Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls). - .. versionadded:: 0.19.4 + .. versionchanged:: 0.19.4 Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb). - .. versionadded:: 0.19.3 + .. versionchanged:: 0.19.3 Added "openpyxl" engine, and added `schema_overrides` parameter. Parameters diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 55084a3bb6bf..6e2e327f3b55 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -87,6 +87,9 @@ def expand_selector( """ Expand selector to column names, with respect to a specific frame or target schema. + .. versionchanged:: 0.20.30 + The `strict` parameter was added. + Parameters ---------- target From 8027a4c993dbfda38a3d40a8d26ae5e32a27ddc2 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sat, 25 May 2024 11:38:30 +0400 Subject: [PATCH 5/6] bonus test coverage --- py-polars/tests/unit/test_selectors.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index d2d0ded45075..d8cf3900c39d 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -390,9 +390,15 @@ def test_selector_expand() -> None: with pytest.raises(TypeError, match="expected a selector"): cs.expand_selector(schema, pl.exclude("id", "count")) + with pytest.raises(TypeError, match="expected a selector"): + cs.expand_selector(schema, pl.col("value") // 100) + expanded = cs.expand_selector(schema, pl.exclude("id", "count"), strict=False) assert expanded == ("desc", "value") + expanded = cs.expand_selector(schema, cs.numeric().exclude("id"), strict=False) + assert expanded == ("count", "value") + def test_selector_first_last(df: pl.DataFrame) -> None: assert df.select(cs.first()).columns == ["abc"] From 57325f291811553789e082e39924578cfecc0d14 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sat, 25 May 2024 12:33:51 +0400 Subject: [PATCH 6/6] improve test param name --- py-polars/tests/unit/operations/namespaces/test_meta.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/py-polars/tests/unit/operations/namespaces/test_meta.py b/py-polars/tests/unit/operations/namespaces/test_meta.py index 6a5f8a4c0951..386c34ac03a9 100644 --- a/py-polars/tests/unit/operations/namespaces/test_meta.py +++ b/py-polars/tests/unit/operations/namespaces/test_meta.py @@ -82,7 +82,7 @@ def test_is_column() -> None: @pytest.mark.parametrize( - ("expr", "expected"), + ("expr", "is_column_selection"), [ # columns (pl.col("foo"), True), @@ -105,12 +105,11 @@ def test_is_column() -> None: ) def test_is_column_selection( expr: pl.Expr, - expected: bool, + is_column_selection: bool, ) -> None: - if expected: + if is_column_selection: assert expr.meta.is_column_selection() assert expr.meta.is_column_selection(allow_aliasing=True) - expr = ( expr.name.suffix("!") if expr.meta.has_multiple_outputs()