From 9af88f1fdee6f3af5b4e5ac646ecd13fce80bca0 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Fri, 14 Jun 2024 17:45:51 +0200 Subject: [PATCH] feat(python): Add `PerformanceWarning` to LazyFrame properties (#16964) --- py-polars/polars/_utils/deprecation.py | 5 +- py-polars/polars/_utils/unstable.py | 5 +- py-polars/polars/_utils/various.py | 19 +++++++ py-polars/polars/exceptions.py | 34 ++++++++---- py-polars/polars/lazyframe/frame.py | 55 +++++++++++++++---- py-polars/src/error.rs | 3 +- py-polars/src/lib.rs | 9 ++- py-polars/tests/unit/io/cloud/test_aws.py | 6 +- .../tests/unit/lazyframe/test_lazyframe.py | 14 +++-- .../operations/arithmetic/test_arithmetic.py | 4 +- py-polars/tests/unit/test_exceptions.py | 34 ++++++++++-- py-polars/tests/unit/utils/test_various.py | 10 ++++ 12 files changed, 151 insertions(+), 47 deletions(-) create mode 100644 py-polars/tests/unit/utils/test_various.py diff --git a/py-polars/polars/_utils/deprecation.py b/py-polars/polars/_utils/deprecation.py index 00c253764804..b0d6f7677c6a 100644 --- a/py-polars/polars/_utils/deprecation.py +++ b/py-polars/polars/_utils/deprecation.py @@ -1,11 +1,10 @@ from __future__ import annotations import inspect -import warnings from functools import wraps from typing import TYPE_CHECKING, Callable, Sequence, TypeVar -from polars._utils.various import find_stacklevel +from polars._utils.various import issue_warning if TYPE_CHECKING: import sys @@ -41,7 +40,7 @@ def issue_deprecation_warning(message: str, *, version: str) -> None: This argument is used to help developers determine when to remove the deprecated functionality. """ - warnings.warn(message, DeprecationWarning, stacklevel=find_stacklevel()) + issue_warning(message, DeprecationWarning) def deprecate_function( diff --git a/py-polars/polars/_utils/unstable.py b/py-polars/polars/_utils/unstable.py index 3ad2e4fde306..d60ecc47dde5 100644 --- a/py-polars/polars/_utils/unstable.py +++ b/py-polars/polars/_utils/unstable.py @@ -2,11 +2,10 @@ import inspect import os -import warnings from functools import wraps from typing import TYPE_CHECKING, Callable, TypeVar -from polars._utils.various import find_stacklevel +from polars._utils.various import issue_warning from polars.exceptions import UnstableWarning if TYPE_CHECKING: @@ -46,7 +45,7 @@ def issue_unstable_warning(message: str | None = None) -> None: " It may be changed at any point without it being considered a breaking change." ) - warnings.warn(message, UnstableWarning, stacklevel=find_stacklevel()) + issue_warning(message, UnstableWarning) def unstable() -> Callable[[Callable[P, T]], Callable[P, T]]: diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 1db371fb8000..e1e1c0b69fb4 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -431,6 +431,25 @@ def find_stacklevel() -> int: return n +def issue_warning(message: str, category: type[Warning], **kwargs: Any) -> None: + """ + Issue a warning. + + Parameters + ---------- + message + The message associated with the warning. + category + The warning category. + **kwargs + Additional arguments for `warnings.warn`. Note that the `stacklevel` is + determined automatically. + """ + warnings.warn( + message=message, category=category, stacklevel=find_stacklevel(), **kwargs + ) + + def _get_stack_locals( of_type: type | Collection[type] | Callable[[Any], bool] | None = None, *, diff --git a/py-polars/polars/exceptions.py b/py-polars/polars/exceptions.py index f839c828cdd0..aa3065384cfc 100644 --- a/py-polars/polars/exceptions.py +++ b/py-polars/polars/exceptions.py @@ -8,6 +8,7 @@ MapWithoutReturnDtypeWarning, NoDataError, OutOfBoundsError, + PerformanceWarning, PolarsError, PolarsPanicError, PolarsWarning, @@ -94,11 +95,14 @@ class StructFieldNotFoundError(PolarsError): # type: ignore[no-redef, misc] class PolarsWarning(Exception): # type: ignore[no-redef] """Base class for all Polars warnings.""" - class CategoricalRemappingWarning(PolarsWarning): # type: ignore[no-redef, misc] - """Warning raised when a categorical needs to be remapped to be compatible with another categorical.""" # noqa: W505 + class PerformanceWarning(PolarsWarning): # type: ignore[no-redef, misc] + """Warning issued to indicate potential performance pitfalls.""" + + class CategoricalRemappingWarning(PerformanceWarning): # type: ignore[no-redef, misc] + """Warning issued when a categorical needs to be remapped to be compatible with another categorical.""" # noqa: W505 class MapWithoutReturnDtypeWarning(PolarsWarning): # type: ignore[no-redef, misc] - """Warning raised when `map_elements` is performed without specifying the return dtype.""" # noqa: W505 + """Warning issued when `map_elements` is performed without specifying the return dtype.""" # noqa: W505 class InvalidAssert(PolarsError): # type: ignore[misc] @@ -141,7 +145,11 @@ class ChronoFormatWarning(PolarsWarning): # type: ignore[misc] """ -class PolarsInefficientMapWarning(PolarsWarning): # type: ignore[misc] +class CustomUFuncWarning(PolarsWarning): # type: ignore[misc] + """Warning issued when a custom ufunc is handled differently than numpy ufunc would.""" # noqa: W505 + + +class PolarsInefficientMapWarning(PerformanceWarning): # type: ignore[misc] """Warning issued when a potentially slow `map_*` operation is performed.""" @@ -149,26 +157,19 @@ class UnstableWarning(PolarsWarning): # type: ignore[misc] """Warning issued when unstable functionality is used.""" -class CustomUFuncWarning(PolarsWarning): # type: ignore[misc] - """Warning issued when a custom ufunc is handled differently than numpy ufunc would.""" # noqa: W505 - - __all__ = [ - "CategoricalRemappingWarning", + # Errors "ChronoFormatWarning", "ColumnNotFoundError", "ComputeError", "DuplicateError", "InvalidOperationError", - "MapWithoutReturnDtypeWarning", "ModuleUpgradeRequired", "NoDataError", "NoRowsReturnedError", "OutOfBoundsError", "PolarsError", - "PolarsInefficientMapWarning", "PolarsPanicError", - "PolarsWarning", "RowsError", "SQLInterfaceError", "SQLSyntaxError", @@ -178,4 +179,13 @@ class CustomUFuncWarning(PolarsWarning): # type: ignore[misc] "StringCacheMismatchError", "StructFieldNotFoundError", "TooManyRowsReturnedError", + # Warnings + "PolarsWarning", + "CategoricalRemappingWarning", + "ChronoFormatWarning", + "CustomUFuncWarning", + "MapWithoutReturnDtypeWarning", + "PerformanceWarning", + "PolarsInefficientMapWarning", + "UnstableWarning", ] diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 2d24433a9ef9..6fb3fb41e199 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -41,6 +41,7 @@ extend_bool, is_bool_sequence, is_sequence, + issue_warning, normalize_filepath, parse_percentiles, ) @@ -74,6 +75,7 @@ py_type_to_dtype, ) from polars.dependencies import import_optional, subprocess +from polars.exceptions import PerformanceWarning from polars.lazyframe.group_by import LazyGroupBy from polars.lazyframe.in_process import InProcessQuery from polars.schema import Schema @@ -399,13 +401,15 @@ def columns(self) -> list[str]: Warnings -------- - Determining the column names of a LazyFrame requires resolving its schema. - Resolving the schema of a LazyFrame can be an expensive operation. - Avoid accessing this property repeatedly if possible. + Determining the column names of a LazyFrame requires resolving its schema, + which is a potentially expensive operation. + Using :meth:`collect_schema` is the idiomatic way of resolving the schema. + This property exists only for symmetry with the DataFrame class. See Also -------- collect_schema + Schema.names Examples -------- @@ -419,6 +423,12 @@ def columns(self) -> list[str]: >>> lf.columns ['foo', 'bar'] """ + issue_warning( + "Determining the column names of a LazyFrame requires resolving its schema," + " which is a potentially expensive operation. Use `LazyFrame.collect_schema().names()`" + " to get the column names without this warning.", + category=PerformanceWarning, + ) return self.collect_schema().names() @property @@ -433,13 +443,15 @@ def dtypes(self) -> list[DataType]: Warnings -------- - Determining the data types of a LazyFrame requires resolving its schema. - Resolving the schema of a LazyFrame can be an expensive operation. - Avoid accessing this property repeatedly if possible. + Determining the data types of a LazyFrame requires resolving its schema, + which is a potentially expensive operation. + Using :meth:`collect_schema` is the idiomatic way to resolve the schema. + This property exists only for symmetry with the DataFrame class. See Also -------- collect_schema + Schema.dtypes Examples -------- @@ -453,6 +465,12 @@ def dtypes(self) -> list[DataType]: >>> lf.dtypes [Int64, Float64, String] """ + issue_warning( + "Determining the data types of a LazyFrame requires resolving its schema," + " which is a potentially expensive operation. Use `LazyFrame.collect_schema().dtypes()`" + " to get the data types without this warning.", + category=PerformanceWarning, + ) return self.collect_schema().dtypes() @property @@ -462,12 +480,14 @@ def schema(self) -> Schema: Warnings -------- - Resolving the schema of a LazyFrame can be an expensive operation. - Avoid accessing this property repeatedly if possible. + Resolving the schema of a LazyFrame is a potentially expensive operation. + Using :meth:`collect_schema` is the idiomatic way to resolve the schema. + This property exists only for symmetry with the DataFrame class. See Also -------- collect_schema + Schema Examples -------- @@ -481,6 +501,11 @@ def schema(self) -> Schema: >>> lf.schema Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ + issue_warning( + "Resolving the schema of a LazyFrame is a potentially expensive operation." + " Use `LazyFrame.collect_schema()` to get the schema without this warning.", + category=PerformanceWarning, + ) return self.collect_schema() @property @@ -494,13 +519,15 @@ def width(self) -> int: Warnings -------- - Determining the width of a LazyFrame requires resolving its schema. - Resolving the schema of a LazyFrame can be an expensive operation. - Avoid accessing this property repeatedly if possible. + Determining the width of a LazyFrame requires resolving its schema, + which is a potentially expensive operation. + Using :meth:`collect_schema` is the idiomatic way to resolve the schema. + This property exists only for symmetry with the DataFrame class. See Also -------- collect_schema + Schema.len Examples -------- @@ -513,6 +540,12 @@ def width(self) -> int: >>> lf.width 2 """ + issue_warning( + "Determining the width of a LazyFrame requires resolving its schema," + " which is a potentially expensive operation. Use `LazyFrame.collect_schema().len()`" + " to get the width without this warning.", + category=PerformanceWarning, + ) return self.collect_schema().len() def __bool__(self) -> NoReturn: diff --git a/py-polars/src/error.rs b/py-polars/src/error.rs index f73ab2798fc7..483fe1129904 100644 --- a/py-polars/src/error.rs +++ b/py-polars/src/error.rs @@ -104,10 +104,11 @@ create_exception!(polars.exceptions, StringCacheMismatchError, PolarsBaseError); create_exception!(polars.exceptions, StructFieldNotFoundError, PolarsBaseError); create_exception!(polars.exceptions, PolarsBaseWarning, PyWarning); +create_exception!(polars.exceptions, PerformanceWarning, PolarsBaseWarning); create_exception!( polars.exceptions, CategoricalRemappingWarning, - PolarsBaseWarning + PerformanceWarning ); create_exception!( polars.exceptions, diff --git a/py-polars/src/lib.rs b/py-polars/src/lib.rs index 97c376ec56fe..0cf485f93be8 100644 --- a/py-polars/src/lib.rs +++ b/py-polars/src/lib.rs @@ -49,8 +49,8 @@ use crate::dataframe::PyDataFrame; use crate::error::{ CategoricalRemappingWarning, ColumnNotFoundError, ComputeError, DuplicateError, InvalidOperationError, MapWithoutReturnDtypeWarning, NoDataError, OutOfBoundsError, - PolarsBaseError, PolarsBaseWarning, PyPolarsErr, SQLInterfaceError, SQLSyntaxError, - SchemaError, SchemaFieldNotFoundError, StructFieldNotFoundError, + PerformanceWarning, PolarsBaseError, PolarsBaseWarning, PyPolarsErr, SQLInterfaceError, + SQLSyntaxError, SchemaError, SchemaFieldNotFoundError, StructFieldNotFoundError, }; use crate::expr::PyExpr; use crate::functions::PyStringCacheHolder; @@ -366,6 +366,11 @@ fn polars(py: Python, m: &Bound) -> PyResult<()> { // Exceptions - Warnings m.add("PolarsWarning", py.get_type_bound::()) .unwrap(); + m.add( + "PerformanceWarning", + py.get_type_bound::(), + ) + .unwrap(); m.add( "CategoricalRemappingWarning", py.get_type_bound::(), diff --git a/py-polars/tests/unit/io/cloud/test_aws.py b/py-polars/tests/unit/io/cloud/test_aws.py index e7fd9573af58..198fd3f8d76d 100644 --- a/py-polars/tests/unit/io/cloud/test_aws.py +++ b/py-polars/tests/unit/io/cloud/test_aws.py @@ -89,12 +89,12 @@ def test_read_s3(s3: str, function: Callable[..., Any], extension: str) -> None: [(pl.scan_ipc, "ipc"), (pl.scan_parquet, "parquet")], ) def test_scan_s3(s3: str, function: Callable[..., Any], extension: str) -> None: - df = function( + lf = function( f"s3://bucket/foods1.{extension}", storage_options={"endpoint_url": s3}, ) - assert df.columns == ["category", "calories", "fats_g", "sugars_g"] - assert df.collect().shape == (27, 4) + assert lf.collect_schema().names() == ["category", "calories", "fats_g", "sugars_g"] + assert lf.collect().shape == (27, 4) def test_lazy_count_s3(s3: str) -> None: diff --git a/py-polars/tests/unit/lazyframe/test_lazyframe.py b/py-polars/tests/unit/lazyframe/test_lazyframe.py index b446d66027cd..4bddf0bac8f2 100644 --- a/py-polars/tests/unit/lazyframe/test_lazyframe.py +++ b/py-polars/tests/unit/lazyframe/test_lazyframe.py @@ -14,7 +14,7 @@ import polars.selectors as cs from polars import lit, when from polars.datatypes import FLOAT_DTYPES -from polars.exceptions import PolarsInefficientMapWarning +from polars.exceptions import PerformanceWarning, PolarsInefficientMapWarning from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: @@ -1382,7 +1382,11 @@ def test_lf_properties() -> None: "ham": ["a", "b", "c"], } ) - assert lf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String} - assert lf.columns == ["foo", "bar", "ham"] - assert lf.dtypes == [pl.Int64, pl.Float64, pl.String] - assert lf.width == 3 + with pytest.warns(PerformanceWarning): + assert lf.schema == {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String} + with pytest.warns(PerformanceWarning): + assert lf.columns == ["foo", "bar", "ham"] + with pytest.warns(PerformanceWarning): + assert lf.dtypes == [pl.Int64, pl.Float64, pl.String] + with pytest.warns(PerformanceWarning): + assert lf.width == 3 diff --git a/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py b/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py index dfb492c4cacc..ad0920bc6226 100644 --- a/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py +++ b/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py @@ -670,7 +670,7 @@ def test_arithmetic_duration_div_multiply() -> None: e=pl.col("a") * 2.5, f=pl.col("a") / pl.col("a"), # a constant float ) - assert q.schema == pl.Schema( + assert q.collect_schema() == pl.Schema( [ ("a", pl.Duration(time_unit="us")), ("b", pl.Duration(time_unit="us")), @@ -715,7 +715,7 @@ def test_arithmetic_duration_div_multiply() -> None: b=2 * pl.col("a"), c=2.5 * pl.col("a"), ) - assert q.schema == pl.Schema( + assert q.collect_schema() == pl.Schema( [ ("a", pl.Duration(time_unit="us")), ("b", pl.Duration(time_unit="us")), diff --git a/py-polars/tests/unit/test_exceptions.py b/py-polars/tests/unit/test_exceptions.py index 41228a3bf7de..d94a81d92aa7 100644 --- a/py-polars/tests/unit/test_exceptions.py +++ b/py-polars/tests/unit/test_exceptions.py @@ -1,10 +1,34 @@ import pytest -import polars as pl +from polars.exceptions import ( + CategoricalRemappingWarning, + ComputeError, + CustomUFuncWarning, + MapWithoutReturnDtypeWarning, + OutOfBoundsError, + PerformanceWarning, + PolarsError, + PolarsInefficientMapWarning, + PolarsWarning, +) -def test_base_class() -> None: - assert isinstance(pl.ComputeError("msg"), pl.PolarsError) +def test_polars_error_base_class() -> None: msg = "msg" - with pytest.raises(pl.PolarsError, match=msg): - raise pl.OutOfBoundsError(msg) + assert isinstance(ComputeError(msg), PolarsError) + with pytest.raises(PolarsError, match=msg): + raise OutOfBoundsError(msg) + + +def test_polars_warning_base_class() -> None: + msg = "msg" + assert isinstance(MapWithoutReturnDtypeWarning(msg), PolarsWarning) + with pytest.raises(PolarsWarning, match=msg): + raise CustomUFuncWarning(msg) + + +def test_performance_warning_base_class() -> None: + msg = "msg" + assert isinstance(PolarsInefficientMapWarning(msg), PerformanceWarning) + with pytest.raises(PerformanceWarning, match=msg): + raise CategoricalRemappingWarning(msg) diff --git a/py-polars/tests/unit/utils/test_various.py b/py-polars/tests/unit/utils/test_various.py new file mode 100644 index 000000000000..a0bab09c11c5 --- /dev/null +++ b/py-polars/tests/unit/utils/test_various.py @@ -0,0 +1,10 @@ +import pytest + +from polars._utils.various import issue_warning +from polars.exceptions import PerformanceWarning + + +def test_issue_warning() -> None: + msg = "hello" + with pytest.warns(PerformanceWarning, match=msg): + issue_warning(msg, PerformanceWarning)