From 834867d427bba1e53579fbab7dce940118d3dffb Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 11 Jun 2024 09:21:28 +0200 Subject: [PATCH 1/3] Move existing tests --- .../tests/unit/{ => datatypes}/test_schema.py | 23 ------------------- .../unit/functions/as_datatype/test_struct.py | 19 +++++++++++++++ .../operations/namespaces/list/test_list.py | 5 ++++ 3 files changed, 24 insertions(+), 23 deletions(-) rename py-polars/tests/unit/{ => datatypes}/test_schema.py (97%) create mode 100644 py-polars/tests/unit/functions/as_datatype/test_struct.py diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/datatypes/test_schema.py similarity index 97% rename from py-polars/tests/unit/test_schema.py rename to py-polars/tests/unit/datatypes/test_schema.py index fd6b154f06cf..d85beeba346d 100644 --- a/py-polars/tests/unit/test_schema.py +++ b/py-polars/tests/unit/datatypes/test_schema.py @@ -667,29 +667,6 @@ def test_alias_prune_in_fold_15438() -> None: assert_frame_equal(df, expected) -def test_resolved_names_15442() -> None: - df = pl.DataFrame( - { - "x": [206.0], - "y": [225.0], - } - ) - center = pl.struct( - x=pl.col("x"), - y=pl.col("y"), - ) - - left = 0 - right = 1000 - in_x = (left < center.struct.field("x")) & (center.struct.field("x") <= right) - assert df.lazy().filter(in_x).collect().shape == (1, 2) - - -def test_list_sum_bool_schema() -> None: - q = pl.LazyFrame({"x": [[True, True, False]]}) - assert q.select(pl.col("x").list.sum()).schema["x"] == pl.UInt32 - - @pytest.mark.parametrize("op", ["and_", "or_"]) def test_bitwise_integral_schema(op: str) -> None: df = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) diff --git a/py-polars/tests/unit/functions/as_datatype/test_struct.py b/py-polars/tests/unit/functions/as_datatype/test_struct.py new file mode 100644 index 000000000000..beeff23fb24c --- /dev/null +++ b/py-polars/tests/unit/functions/as_datatype/test_struct.py @@ -0,0 +1,19 @@ +import polars as pl + + +def test_resolved_names_15442() -> None: + df = pl.DataFrame( + { + "x": [206.0], + "y": [225.0], + } + ) + center = pl.struct( + x=pl.col("x"), + y=pl.col("y"), + ) + + left = 0 + right = 1000 + in_x = (left < center.struct.field("x")) & (center.struct.field("x") <= right) + assert df.lazy().filter(in_x).collect().shape == (1, 2) diff --git a/py-polars/tests/unit/operations/namespaces/list/test_list.py b/py-polars/tests/unit/operations/namespaces/list/test_list.py index 0b86f7eaca7f..86ed6b719f13 100644 --- a/py-polars/tests/unit/operations/namespaces/list/test_list.py +++ b/py-polars/tests/unit/operations/namespaces/list/test_list.py @@ -896,3 +896,8 @@ def test_list_eval_err_raise_15653() -> None: df = pl.DataFrame({"foo": [[]]}) with pytest.raises(pl.StructFieldNotFoundError): df.with_columns(bar=pl.col("foo").list.eval(pl.element().struct.field("baz"))) + + +def test_list_sum_bool_schema() -> None: + q = pl.LazyFrame({"x": [[True, True, False]]}) + assert q.select(pl.col("x").list.sum()).schema["x"] == pl.UInt32 From 44d2ec3ffb9b565a0c1e5e8213aa11652200baba Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 11 Jun 2024 13:59:31 +0200 Subject: [PATCH 2/3] Move more tests --- .../tests/unit/constructors/test_dataframe.py | 34 +- .../tests/unit/constructors/test_series.py | 6 + py-polars/tests/unit/datatypes/test_schema.py | 447 +----------------- py-polars/tests/unit/functions/test_concat.py | 39 ++ py-polars/tests/unit/lazyframe/test_rename.py | 8 + .../operations/aggregation/test_horizontal.py | 33 ++ .../operations/arithmetic/test_arithmetic.py | 27 ++ .../unit/operations/arithmetic/test_pow.py | 59 +++ .../unit/operations/namespaces/test_struct.py | 12 + py-polars/tests/unit/operations/test_cast.py | 17 + .../tests/unit/operations/test_comparison.py | 6 + py-polars/tests/unit/operations/test_diff.py | 12 + .../tests/unit/operations/test_fill_null.py | 37 ++ .../tests/unit/operations/test_group_by.py | 69 +++ .../unit/operations/test_shrink_dtype.py | 46 ++ py-polars/tests/unit/test_convert.py | 40 ++ 16 files changed, 445 insertions(+), 447 deletions(-) create mode 100644 py-polars/tests/unit/lazyframe/test_rename.py create mode 100644 py-polars/tests/unit/operations/arithmetic/test_pow.py create mode 100644 py-polars/tests/unit/operations/test_diff.py create mode 100644 py-polars/tests/unit/operations/test_fill_null.py create mode 100644 py-polars/tests/unit/operations/test_shrink_dtype.py create mode 100644 py-polars/tests/unit/test_convert.py diff --git a/py-polars/tests/unit/constructors/test_dataframe.py b/py-polars/tests/unit/constructors/test_dataframe.py index 9a03d3a2f2eb..9de21d42bbbe 100644 --- a/py-polars/tests/unit/constructors/test_dataframe.py +++ b/py-polars/tests/unit/constructors/test_dataframe.py @@ -2,7 +2,7 @@ import sys from collections import OrderedDict -from typing import Any +from typing import Any, Iterator, Mapping import pytest @@ -159,3 +159,35 @@ def test_unit_and_empty_construction_15896() -> None: A=pl.int_range("A"), # creates empty series ) ) + + +class CustomSchema(Mapping[str, Any]): + """Dummy schema object for testing compatibility with Mapping.""" + + _entries: dict[str, Any] + + def __init__(self, **named_entries: Any) -> None: + self._items = OrderedDict(named_entries.items()) + + def __getitem__(self, key: str) -> Any: + return self._items[key] + + def __len__(self) -> int: + return len(self._items) + + def __iter__(self) -> Iterator[str]: + yield from self._items + + +def test_custom_schema() -> None: + df = pl.DataFrame(schema=CustomSchema(bool=pl.Boolean, misc=pl.UInt8)) + assert df.schema == OrderedDict([("bool", pl.Boolean), ("misc", pl.UInt8)]) + + with pytest.raises(ValueError): + pl.DataFrame(schema=CustomSchema(bool="boolean", misc="unsigned int")) + + +def test_list_null_constructor_schema() -> None: + expected = pl.List(pl.Null) + assert pl.DataFrame({"a": [[]]}).dtypes[0] == expected + assert pl.DataFrame(schema={"a": pl.List}).dtypes[0] == expected diff --git a/py-polars/tests/unit/constructors/test_series.py b/py-polars/tests/unit/constructors/test_series.py index 464f6b494712..fd6dc683bda8 100644 --- a/py-polars/tests/unit/constructors/test_series.py +++ b/py-polars/tests/unit/constructors/test_series.py @@ -148,3 +148,9 @@ def test_series_init_np_2d_zero_zero_shape() -> None: match=re.escape("cannot reshape empty array into shape (0, 0)"), ): pl.Series(arr) + + +def test_list_null_constructor_schema() -> None: + expected = pl.List(pl.Null) + assert pl.Series([[]]).dtype == expected + assert pl.Series([[]], dtype=pl.List).dtype == expected diff --git a/py-polars/tests/unit/datatypes/test_schema.py b/py-polars/tests/unit/datatypes/test_schema.py index d85beeba346d..ab2260d71dbe 100644 --- a/py-polars/tests/unit/datatypes/test_schema.py +++ b/py-polars/tests/unit/datatypes/test_schema.py @@ -1,186 +1,11 @@ from __future__ import annotations -from collections import OrderedDict from datetime import date, timedelta -from typing import TYPE_CHECKING, Any, Iterator, Mapping import pytest import polars as pl -from polars.testing import assert_frame_equal, assert_series_equal - -if TYPE_CHECKING: - from polars.type_aliases import PolarsDataType - - -class CustomSchema(Mapping[str, Any]): - """Dummy schema object for testing compatibility with Mapping.""" - - _entries: dict[str, Any] - - def __init__(self, **named_entries: Any) -> None: - self._items = OrderedDict(named_entries.items()) - - def __getitem__(self, key: str) -> Any: - return self._items[key] - - def __len__(self) -> int: - return len(self._items) - - def __iter__(self) -> Iterator[str]: - yield from self._items - - -def test_custom_schema() -> None: - df = pl.DataFrame(schema=CustomSchema(bool=pl.Boolean, misc=pl.UInt8)) - assert df.schema == OrderedDict([("bool", pl.Boolean), ("misc", pl.UInt8)]) - - with pytest.raises(ValueError): - pl.DataFrame(schema=CustomSchema(bool="boolean", misc="unsigned int")) - - -def test_schema_on_agg() -> None: - df = pl.DataFrame({"a": ["x", "x", "y", "n"], "b": [1, 2, 3, 4]}) - - assert ( - df.lazy() - .group_by("a") - .agg( - [ - pl.col("b").min().alias("min"), - pl.col("b").max().alias("max"), - pl.col("b").sum().alias("sum"), - pl.col("b").first().alias("first"), - pl.col("b").last().alias("last"), - ] - ) - ).schema == { - "a": pl.String, - "min": pl.Int64, - "max": pl.Int64, - "sum": pl.Int64, - "first": pl.Int64, - "last": pl.Int64, - } - - -def test_fill_null_minimal_upcast_4056() -> None: - df = pl.DataFrame({"a": [-1, 2, None]}) - df = df.with_columns(pl.col("a").cast(pl.Int8)) - assert df.with_columns(pl.col(pl.Int8).fill_null(-1)).dtypes[0] == pl.Int8 - assert df.with_columns(pl.col(pl.Int8).fill_null(-1000)).dtypes[0] == pl.Int16 - - -def test_fill_enum_upcast() -> None: - dtype = pl.Enum(["a", "b"]) - s = pl.Series(["a", "b", None], dtype=dtype) - s_filled = s.fill_null("b") - expected = pl.Series(["a", "b", "b"], dtype=dtype) - assert s_filled.dtype == dtype - assert_series_equal(s_filled, expected) - - -def test_pow_dtype() -> None: - df = pl.DataFrame( - { - "foo": [1, 2, 3, 4, 5], - "a": [1, 2, 3, 4, 5], - "b": [1, 2, 3, 4, 5], - "c": [1, 2, 3, 4, 5], - "d": [1, 2, 3, 4, 5], - "e": [1, 2, 3, 4, 5], - "f": [1, 2, 3, 4, 5], - "g": [1, 2, 1, 2, 1], - "h": [1, 2, 1, 2, 1], - }, - schema_overrides={ - "a": pl.Int64, - "b": pl.UInt64, - "c": pl.Int32, - "d": pl.UInt32, - "e": pl.Int16, - "f": pl.UInt16, - "g": pl.Int8, - "h": pl.UInt8, - }, - ).lazy() - - df = ( - df.with_columns([pl.col("foo").cast(pl.UInt32)]) - .with_columns( - (pl.col("foo") * 2**2).alias("scaled_foo"), - (pl.col("foo") * 2**2.1).alias("scaled_foo2"), - (pl.col("a") ** pl.col("h")).alias("a_pow_h"), - (pl.col("b") ** pl.col("h")).alias("b_pow_h"), - (pl.col("c") ** pl.col("h")).alias("c_pow_h"), - (pl.col("d") ** pl.col("h")).alias("d_pow_h"), - (pl.col("e") ** pl.col("h")).alias("e_pow_h"), - (pl.col("f") ** pl.col("h")).alias("f_pow_h"), - (pl.col("g") ** pl.col("h")).alias("g_pow_h"), - (pl.col("h") ** pl.col("h")).alias("h_pow_h"), - ) - .drop(["a", "b", "c", "d", "e", "f", "g", "h"]) - ) - expected = [ - pl.UInt32, - pl.UInt32, - pl.Float64, - pl.Int64, - pl.UInt64, - pl.Int32, - pl.UInt32, - pl.Int16, - pl.UInt16, - pl.Int8, - pl.UInt8, - ] - assert df.collect().dtypes == expected - assert df.dtypes == expected - - -def test_bool_numeric_supertype() -> None: - df = pl.DataFrame({"v": [1, 2, 3, 4, 5, 6]}) - for dt in [ - pl.UInt8, - pl.UInt16, - pl.UInt32, - pl.UInt64, - pl.Int8, - pl.Int16, - pl.Int32, - pl.Int64, - ]: - assert ( - df.select([(pl.col("v") < 3).sum().cast(dt) / pl.len()]).item() - 0.3333333 - <= 0.00001 - ) - - -def test_from_dicts_nested_nulls() -> None: - assert pl.from_dicts([{"a": [None, None]}, {"a": [1, 2]}]).to_dict( - as_series=False - ) == {"a": [[None, None], [1, 2]]} - - -def test_group_schema_err() -> None: - df = pl.DataFrame({"foo": [None, 1, 2], "bar": [1, 2, 3]}).lazy() - with pytest.raises(pl.ColumnNotFoundError): - df.group_by("not-existent").agg(pl.col("bar").max().alias("max_bar")).schema - - -def test_schema_inference_from_rows() -> None: - # these have to upcast to float - result = pl.from_records([[1, 2.1, 3], [4, 5, 6.4]]) - assert result.to_dict(as_series=False) == { - "column_0": [1.0, 2.1, 3.0], - "column_1": [4.0, 5.0, 6.4], - } - - result = pl.from_dicts([{"a": 1, "b": 2}, {"a": 3.1, "b": 4.5}]) - assert result.to_dict(as_series=False) == { - "a": [1.0, 3.1], - "b": [2.0, 4.5], - } +from polars.testing import assert_frame_equal def test_lazy_map_schema() -> None: @@ -264,114 +89,6 @@ def test_fold_all_schema() -> None: assert result.dtypes == [pl.UInt64] -def test_fill_null_static_schema_4843() -> None: - df1 = pl.DataFrame( - { - "a": [1, 2, None], - "b": [1, None, 4], - } - ).lazy() - - df2 = df1.select([pl.col(pl.Int64).fill_null(0)]) - df3 = df2.select(pl.col(pl.Int64)) - assert df3.schema == {"a": pl.Int64, "b": pl.Int64} - - -def test_shrink_dtype() -> None: - out = pl.DataFrame( - { - "a": [1, 2, 3], - "b": [1, 2, 2 << 32], - "c": [-1, 2, 1 << 30], - "d": [-112, 2, 112], - "e": [-112, 2, 129], - "f": ["a", "b", "c"], - "g": [0.1, 1.32, 0.12], - "h": [True, None, False], - "i": pl.Series([None, None, None], dtype=pl.UInt64), - "j": pl.Series([None, None, None], dtype=pl.Int64), - "k": pl.Series([None, None, None], dtype=pl.Float64), - } - ).select(pl.all().shrink_dtype()) - assert out.dtypes == [ - pl.Int8, - pl.Int64, - pl.Int32, - pl.Int8, - pl.Int16, - pl.String, - pl.Float32, - pl.Boolean, - pl.UInt8, - pl.Int8, - pl.Float32, - ] - - assert out.to_dict(as_series=False) == { - "a": [1, 2, 3], - "b": [1, 2, 8589934592], - "c": [-1, 2, 1073741824], - "d": [-112, 2, 112], - "e": [-112, 2, 129], - "f": ["a", "b", "c"], - "g": [0.10000000149011612, 1.3200000524520874, 0.11999999731779099], - "h": [True, None, False], - "i": [None, None, None], - "j": [None, None, None], - "k": [None, None, None], - } - - -def test_diff_duration_dtype() -> None: - data = ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-03"] - df = pl.Series("date", data).str.to_date("%Y-%m-%d").to_frame() - - result = df.select(pl.col("date").diff() < pl.duration(days=1)) - - expected = pl.Series("date", [None, False, False, True]).to_frame() - assert_frame_equal(result, expected) - - -def test_schema_owned_arithmetic_5669() -> None: - df = ( - pl.DataFrame({"A": [1, 2, 3]}) - .lazy() - .filter(pl.col("A") >= 3) - .with_columns(-pl.col("A").alias("B")) - .collect() - ) - assert df.columns == ["A", "B"] - assert df.rows() == [(3, -3)] - - -def test_fill_null_f32_with_lit() -> None: - # ensure the literal integer does not upcast the f32 to an f64 - df = pl.DataFrame({"a": [1.1, 1.2]}, schema=[("a", pl.Float32)]) - assert df.fill_null(value=0).dtypes == [pl.Float32] - - -def test_lazy_rename() -> None: - df = pl.DataFrame({"x": [1], "y": [2]}) - - assert ( - df.lazy().rename({"y": "x", "x": "y"}).select(["x", "y"]).collect() - ).to_dict(as_series=False) == {"x": [2], "y": [1]} - - -def test_all_null_cast_5826() -> None: - df = pl.DataFrame(data=[pl.Series("a", [None], dtype=pl.String)]) - out = df.with_columns(pl.col("a").cast(pl.Boolean)) - assert out.dtypes == [pl.Boolean] - assert out.item() is None - - -def test_empty_list_eval_schema_5734() -> None: - df = pl.DataFrame({"a": [[{"b": 1, "c": 2}]]}) - assert df.filter(False).select( - pl.col("a").list.eval(pl.element().struct.field("b")) - ).schema == {"a": pl.List(pl.Int64)} - - def test_list_eval_type_cast_11188() -> None: df = pl.DataFrame( [ @@ -384,27 +101,6 @@ def test_list_eval_type_cast_11188() -> None: ).schema == {"a_str": pl.List(pl.String)} -def test_schema_true_divide_6643() -> None: - df = pl.DataFrame({"a": [1]}) - a = pl.col("a") - assert df.lazy().select(a / 2).select(pl.col(pl.Int64)).collect().shape == (0, 0) - - -def test_from_dicts_all_cols_6716() -> None: - dicts = [{"a": None} for _ in range(20)] + [{"a": "crash"}] - - with pytest.raises( - pl.ComputeError, match="make sure that all rows have the same schema" - ): - pl.from_dicts(dicts, infer_schema_length=20) - assert pl.from_dicts(dicts, infer_schema_length=None).dtypes == [pl.String] - - -def test_from_dicts_empty() -> None: - with pytest.raises(pl.NoDataError, match="no data, cannot infer schema"): - pl.from_dicts([]) - - def test_duration_division_schema() -> None: df = pl.DataFrame({"a": [1]}) q = ( @@ -472,101 +168,6 @@ def sub_col_min(column: str, min_column: str) -> pl.Expr: ] -@pytest.mark.parametrize( - ("data", "expr", "expected_select", "expected_gb"), - [ - ( - {"x": ["x"], "y": ["y"]}, - pl.coalesce(pl.col("x"), pl.col("y")), - {"x": pl.String}, - {"x": pl.List(pl.String)}, - ), - ( - {"x": [True]}, - pl.col("x").sum(), - {"x": pl.UInt32}, - {"x": pl.UInt32}, - ), - ( - {"a": [[1, 2]]}, - pl.col("a").list.sum(), - {"a": pl.Int64}, - {"a": pl.List(pl.Int64)}, - ), - ], -) -def test_schemas( - data: dict[str, list[Any]], - expr: pl.Expr, - expected_select: dict[str, pl.PolarsDataType], - expected_gb: dict[str, pl.PolarsDataType], -) -> None: - df = pl.DataFrame(data) - - # test selection schema - schema = df.select(expr).schema - for key, dtype in expected_select.items(): - assert schema[key] == dtype - - # test group_by schema - schema = df.group_by(pl.lit(1)).agg(expr).schema - for key, dtype in expected_gb.items(): - assert schema[key] == dtype - - -def test_list_null_constructor_schema() -> None: - expected = pl.List(pl.Null) - assert pl.Series([[]]).dtype == expected - assert pl.Series([[]], dtype=pl.List).dtype == expected - assert pl.DataFrame({"a": [[]]}).dtypes[0] == expected - assert pl.DataFrame(schema={"a": pl.List}).dtypes[0] == expected - - -def test_schema_ne_missing_9256() -> None: - df = pl.DataFrame({"a": [0, 1, None], "b": [True, False, True]}) - - assert df.select(pl.col("a").ne_missing(0).or_(pl.col("b")))["a"].all() - - -def test_concat_vertically_relaxed() -> None: - a = pl.DataFrame( - data={"a": [1, 2, 3], "b": [True, False, None]}, - schema={"a": pl.Int8, "b": pl.Boolean}, - ) - b = pl.DataFrame( - data={"a": [43, 2, 3], "b": [32, 1, None]}, - schema={"a": pl.Int16, "b": pl.Int64}, - ) - out = pl.concat([a, b], how="vertical_relaxed") - assert out.schema == {"a": pl.Int16, "b": pl.Int64} - assert out.to_dict(as_series=False) == { - "a": [1, 2, 3, 43, 2, 3], - "b": [1, 0, None, 32, 1, None], - } - out = pl.concat([b, a], how="vertical_relaxed") - assert out.schema == {"a": pl.Int16, "b": pl.Int64} - assert out.to_dict(as_series=False) == { - "a": [43, 2, 3, 1, 2, 3], - "b": [32, 1, None, 1, 0, None], - } - - c = pl.DataFrame({"a": [1, 2], "b": [2, 1]}) - d = pl.DataFrame({"a": [1.0, 0.2], "b": [None, 0.1]}) - - out = pl.concat([c, d], how="vertical_relaxed") - assert out.schema == {"a": pl.Float64, "b": pl.Float64} - assert out.to_dict(as_series=False) == { - "a": [1.0, 2.0, 1.0, 0.2], - "b": [2.0, 1.0, None, 0.1], - } - out = pl.concat([d, c], how="vertical_relaxed") - assert out.schema == {"a": pl.Float64, "b": pl.Float64} - assert out.to_dict(as_series=False) == { - "a": [1.0, 0.2, 1.0, 2.0], - "b": [None, 0.1, 2.0, 1.0], - } - - def test_lit_iter_schema() -> None: df = pl.DataFrame( { @@ -602,52 +203,6 @@ def test_nested_binary_literal_super_type_12227() -> None: ) -def test_literal_subtract_schema_13284() -> None: - assert ( - pl.LazyFrame({"a": [23, 30]}, schema={"a": pl.UInt8}) - .with_columns(pl.col("a") - pl.lit(1)) - .group_by("a") - .len() - ).schema == OrderedDict([("a", pl.UInt8), ("len", pl.UInt32)]) - - -def test_schema_boolean_sum_horizontal() -> None: - lf = pl.LazyFrame({"a": [True, False]}).select(pl.sum_horizontal("a")) - assert lf.schema == OrderedDict([("a", pl.UInt32)]) - - -@pytest.mark.parametrize( - ("in_dtype", "out_dtype"), - [ - (pl.Boolean, pl.Float64), - (pl.UInt8, pl.Float64), - (pl.UInt16, pl.Float64), - (pl.UInt32, pl.Float64), - (pl.UInt64, pl.Float64), - (pl.Int8, pl.Float64), - (pl.Int16, pl.Float64), - (pl.Int32, pl.Float64), - (pl.Int64, pl.Float64), - (pl.Float32, pl.Float32), - (pl.Float64, pl.Float64), - ], -) -def test_schema_mean_horizontal_single_column( - in_dtype: PolarsDataType, - out_dtype: PolarsDataType, -) -> None: - lf = pl.LazyFrame({"a": pl.Series([1, 0], dtype=in_dtype)}).select( - pl.mean_horizontal(pl.all()) - ) - - assert lf.schema == OrderedDict([("a", out_dtype)]) - - -def test_struct_alias_prune_15401() -> None: - df = pl.DataFrame({"a": []}, schema={"a": pl.Struct({"b": pl.Int8})}) - assert df.select(pl.col("a").alias("c").struct.field("b")).columns == ["b"] - - def test_alias_prune_in_fold_15438() -> None: df = pl.DataFrame({"x": [1, 2], "expected_result": ["first", "second"]}).select( actual_result=pl.fold( diff --git a/py-polars/tests/unit/functions/test_concat.py b/py-polars/tests/unit/functions/test_concat.py index dacd997d49f7..8e7c4c9f31e3 100644 --- a/py-polars/tests/unit/functions/test_concat.py +++ b/py-polars/tests/unit/functions/test_concat.py @@ -20,3 +20,42 @@ def test_concat_lf_stack_overflow() -> None: for i in range(n): bar = pl.concat([bar, pl.DataFrame({"a": i}).lazy()]) assert bar.collect().shape == (1001, 1) + + +def test_concat_vertically_relaxed() -> None: + a = pl.DataFrame( + data={"a": [1, 2, 3], "b": [True, False, None]}, + schema={"a": pl.Int8, "b": pl.Boolean}, + ) + b = pl.DataFrame( + data={"a": [43, 2, 3], "b": [32, 1, None]}, + schema={"a": pl.Int16, "b": pl.Int64}, + ) + out = pl.concat([a, b], how="vertical_relaxed") + assert out.schema == {"a": pl.Int16, "b": pl.Int64} + assert out.to_dict(as_series=False) == { + "a": [1, 2, 3, 43, 2, 3], + "b": [1, 0, None, 32, 1, None], + } + out = pl.concat([b, a], how="vertical_relaxed") + assert out.schema == {"a": pl.Int16, "b": pl.Int64} + assert out.to_dict(as_series=False) == { + "a": [43, 2, 3, 1, 2, 3], + "b": [32, 1, None, 1, 0, None], + } + + c = pl.DataFrame({"a": [1, 2], "b": [2, 1]}) + d = pl.DataFrame({"a": [1.0, 0.2], "b": [None, 0.1]}) + + out = pl.concat([c, d], how="vertical_relaxed") + assert out.schema == {"a": pl.Float64, "b": pl.Float64} + assert out.to_dict(as_series=False) == { + "a": [1.0, 2.0, 1.0, 0.2], + "b": [2.0, 1.0, None, 0.1], + } + out = pl.concat([d, c], how="vertical_relaxed") + assert out.schema == {"a": pl.Float64, "b": pl.Float64} + assert out.to_dict(as_series=False) == { + "a": [1.0, 0.2, 1.0, 2.0], + "b": [None, 0.1, 2.0, 1.0], + } diff --git a/py-polars/tests/unit/lazyframe/test_rename.py b/py-polars/tests/unit/lazyframe/test_rename.py new file mode 100644 index 000000000000..2a7462d1edb2 --- /dev/null +++ b/py-polars/tests/unit/lazyframe/test_rename.py @@ -0,0 +1,8 @@ +import polars as pl + + +def test_lazy_rename() -> None: + df = pl.DataFrame({"x": [1], "y": [2]}) + + result = df.lazy().rename({"y": "x", "x": "y"}).select(["x", "y"]) + assert result.collect().to_dict(as_series=False) == {"x": [2], "y": [1]} diff --git a/py-polars/tests/unit/operations/aggregation/test_horizontal.py b/py-polars/tests/unit/operations/aggregation/test_horizontal.py index 0596b73417d2..4d474367af02 100644 --- a/py-polars/tests/unit/operations/aggregation/test_horizontal.py +++ b/py-polars/tests/unit/operations/aggregation/test_horizontal.py @@ -1,6 +1,7 @@ from __future__ import annotations import datetime +from collections import OrderedDict from typing import Any import pytest @@ -407,3 +408,35 @@ def test_mean_horizontal_all_null() -> None: expected = pl.LazyFrame({"a": [1.5, None]}, schema={"a": pl.Float64}) assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("in_dtype", "out_dtype"), + [ + (pl.Boolean, pl.Float64), + (pl.UInt8, pl.Float64), + (pl.UInt16, pl.Float64), + (pl.UInt32, pl.Float64), + (pl.UInt64, pl.Float64), + (pl.Int8, pl.Float64), + (pl.Int16, pl.Float64), + (pl.Int32, pl.Float64), + (pl.Int64, pl.Float64), + (pl.Float32, pl.Float32), + (pl.Float64, pl.Float64), + ], +) +def test_schema_mean_horizontal_single_column( + in_dtype: pl.PolarsDataType, + out_dtype: pl.PolarsDataType, +) -> None: + lf = pl.LazyFrame({"a": pl.Series([1, 0], dtype=in_dtype)}).select( + pl.mean_horizontal(pl.all()) + ) + + assert lf.schema == OrderedDict([("a", out_dtype)]) + + +def test_schema_boolean_sum_horizontal() -> None: + lf = pl.LazyFrame({"a": [True, False]}).select(pl.sum_horizontal("a")) + assert lf.schema == OrderedDict([("a", pl.UInt32)]) diff --git a/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py b/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py index 4097f304c082..1973f0483eae 100644 --- a/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py +++ b/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py @@ -1,4 +1,5 @@ import operator +from collections import OrderedDict from datetime import date, datetime, timedelta from typing import Any @@ -583,3 +584,29 @@ def test_array_arithmetic_same_size(expected: Any, expr: pl.Expr) -> None: df.select(expr), pl.Series("a", expected).to_frame(), ) + + +def test_schema_owned_arithmetic_5669() -> None: + df = ( + pl.LazyFrame({"A": [1, 2, 3]}) + .filter(pl.col("A") >= 3) + .with_columns(-pl.col("A").alias("B")) + .collect() + ) + assert df.columns == ["A", "B"] + assert df.rows() == [(3, -3)] + + +def test_schema_true_divide_6643() -> None: + df = pl.DataFrame({"a": [1]}) + a = pl.col("a") + assert df.lazy().select(a / 2).select(pl.col(pl.Int64)).collect().shape == (0, 0) + + +def test_literal_subtract_schema_13284() -> None: + assert ( + pl.LazyFrame({"a": [23, 30]}, schema={"a": pl.UInt8}) + .with_columns(pl.col("a") - pl.lit(1)) + .group_by("a") + .len() + ).schema == OrderedDict([("a", pl.UInt8), ("len", pl.UInt32)]) diff --git a/py-polars/tests/unit/operations/arithmetic/test_pow.py b/py-polars/tests/unit/operations/arithmetic/test_pow.py new file mode 100644 index 000000000000..cd606dce8afa --- /dev/null +++ b/py-polars/tests/unit/operations/arithmetic/test_pow.py @@ -0,0 +1,59 @@ +import polars as pl + + +def test_pow_dtype() -> None: + df = pl.DataFrame( + { + "foo": [1, 2, 3, 4, 5], + "a": [1, 2, 3, 4, 5], + "b": [1, 2, 3, 4, 5], + "c": [1, 2, 3, 4, 5], + "d": [1, 2, 3, 4, 5], + "e": [1, 2, 3, 4, 5], + "f": [1, 2, 3, 4, 5], + "g": [1, 2, 1, 2, 1], + "h": [1, 2, 1, 2, 1], + }, + schema_overrides={ + "a": pl.Int64, + "b": pl.UInt64, + "c": pl.Int32, + "d": pl.UInt32, + "e": pl.Int16, + "f": pl.UInt16, + "g": pl.Int8, + "h": pl.UInt8, + }, + ).lazy() + + df = ( + df.with_columns([pl.col("foo").cast(pl.UInt32)]) + .with_columns( + (pl.col("foo") * 2**2).alias("scaled_foo"), + (pl.col("foo") * 2**2.1).alias("scaled_foo2"), + (pl.col("a") ** pl.col("h")).alias("a_pow_h"), + (pl.col("b") ** pl.col("h")).alias("b_pow_h"), + (pl.col("c") ** pl.col("h")).alias("c_pow_h"), + (pl.col("d") ** pl.col("h")).alias("d_pow_h"), + (pl.col("e") ** pl.col("h")).alias("e_pow_h"), + (pl.col("f") ** pl.col("h")).alias("f_pow_h"), + (pl.col("g") ** pl.col("h")).alias("g_pow_h"), + (pl.col("h") ** pl.col("h")).alias("h_pow_h"), + ) + .drop(["a", "b", "c", "d", "e", "f", "g", "h"]) + ) + expected = [ + pl.UInt32, + pl.UInt32, + pl.Float64, + pl.Int64, + pl.UInt64, + pl.Int32, + pl.UInt32, + pl.Int16, + pl.UInt16, + pl.Int8, + pl.UInt8, + ] + assert df.collect().dtypes == expected + assert df.dtypes == expected diff --git a/py-polars/tests/unit/operations/namespaces/test_struct.py b/py-polars/tests/unit/operations/namespaces/test_struct.py index ee4806c00188..a365b1cef1d1 100644 --- a/py-polars/tests/unit/operations/namespaces/test_struct.py +++ b/py-polars/tests/unit/operations/namespaces/test_struct.py @@ -83,3 +83,15 @@ def test_prefix_suffix_fields() -> None: assert suffix_df.schema == OrderedDict( [("x", pl.Struct({"a_f": pl.Int64, "b_f": pl.Int64}))] ) + + +def test_struct_alias_prune_15401() -> None: + df = pl.DataFrame({"a": []}, schema={"a": pl.Struct({"b": pl.Int8})}) + assert df.select(pl.col("a").alias("c").struct.field("b")).columns == ["b"] + + +def test_empty_list_eval_schema_5734() -> None: + df = pl.DataFrame({"a": [[{"b": 1, "c": 2}]]}) + assert df.filter(False).select( + pl.col("a").list.eval(pl.element().struct.field("b")) + ).schema == {"a": pl.List(pl.Int64)} diff --git a/py-polars/tests/unit/operations/test_cast.py b/py-polars/tests/unit/operations/test_cast.py index 84336573c2e6..1f9bae881bca 100644 --- a/py-polars/tests/unit/operations/test_cast.py +++ b/py-polars/tests/unit/operations/test_cast.py @@ -672,3 +672,20 @@ def test_invalid_inner_type_cast_list() -> None: match=r"cannot cast List inner type: 'Int64' to Categorical", ): s.cast(pl.List(pl.Categorical)) + + +def test_all_null_cast_5826() -> None: + df = pl.DataFrame(data=[pl.Series("a", [None], dtype=pl.String)]) + out = df.with_columns(pl.col("a").cast(pl.Boolean)) + assert out.dtypes == [pl.Boolean] + assert out.item() is None + + +@pytest.mark.parametrize( + "dtype", + [pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Int8, pl.Int16, pl.Int32, pl.Int64], +) +def test_bool_numeric_supertype(dtype: pl.PolarsDataType) -> None: + df = pl.DataFrame({"v": [1, 2, 3, 4, 5, 6]}) + result = df.select((pl.col("v") < 3).sum().cast(dtype) / pl.len()) + assert result.item() - 0.3333333 <= 0.00001 diff --git a/py-polars/tests/unit/operations/test_comparison.py b/py-polars/tests/unit/operations/test_comparison.py index 6316e39aafe2..ebbb4b4c46d5 100644 --- a/py-polars/tests/unit/operations/test_comparison.py +++ b/py-polars/tests/unit/operations/test_comparison.py @@ -373,3 +373,9 @@ def test_cat_compare_with_bool() -> None: with pytest.raises(pl.ComputeError, match="cannot compare categorical with bool"): data.filter(pl.col("col1") == True) # noqa: E712 + + +def test_schema_ne_missing_9256() -> None: + df = pl.DataFrame({"a": [0, 1, None], "b": [True, False, True]}) + + assert df.select(pl.col("a").ne_missing(0).or_(pl.col("b")))["a"].all() diff --git a/py-polars/tests/unit/operations/test_diff.py b/py-polars/tests/unit/operations/test_diff.py new file mode 100644 index 000000000000..463716c69893 --- /dev/null +++ b/py-polars/tests/unit/operations/test_diff.py @@ -0,0 +1,12 @@ +import polars as pl +from polars.testing import assert_frame_equal + + +def test_diff_duration_dtype() -> None: + data = ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-03"] + df = pl.Series("date", data).str.to_date("%Y-%m-%d").to_frame() + + result = df.select(pl.col("date").diff() < pl.duration(days=1)) + + expected = pl.Series("date", [None, False, False, True]).to_frame() + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_fill_null.py b/py-polars/tests/unit/operations/test_fill_null.py new file mode 100644 index 000000000000..ad2411e02d52 --- /dev/null +++ b/py-polars/tests/unit/operations/test_fill_null.py @@ -0,0 +1,37 @@ +import polars as pl +from polars.testing import assert_series_equal + + +def test_fill_null_minimal_upcast_4056() -> None: + df = pl.DataFrame({"a": [-1, 2, None]}) + df = df.with_columns(pl.col("a").cast(pl.Int8)) + assert df.with_columns(pl.col(pl.Int8).fill_null(-1)).dtypes[0] == pl.Int8 + assert df.with_columns(pl.col(pl.Int8).fill_null(-1000)).dtypes[0] == pl.Int16 + + +def test_fill_enum_upcast() -> None: + dtype = pl.Enum(["a", "b"]) + s = pl.Series(["a", "b", None], dtype=dtype) + s_filled = s.fill_null("b") + expected = pl.Series(["a", "b", "b"], dtype=dtype) + assert s_filled.dtype == dtype + assert_series_equal(s_filled, expected) + + +def test_fill_null_static_schema_4843() -> None: + df1 = pl.DataFrame( + { + "a": [1, 2, None], + "b": [1, None, 4], + } + ).lazy() + + df2 = df1.select([pl.col(pl.Int64).fill_null(0)]) + df3 = df2.select(pl.col(pl.Int64)) + assert df3.schema == {"a": pl.Int64, "b": pl.Int64} + + +def test_fill_null_f32_with_lit() -> None: + # ensure the literal integer does not upcast the f32 to an f64 + df = pl.DataFrame({"a": [1.1, 1.2]}, schema=[("a", pl.Float32)]) + assert df.fill_null(value=0).dtypes == [pl.Float32] diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index e95ea2703970..8f6cce5a4b95 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -1002,3 +1002,72 @@ def test_partitioned_group_by_chunked(partition_limit: int) -> None: df.group_by(gps).sum().sort("oo"), df.rechunk().group_by(gps, maintain_order=True).sum(), ) + + +def test_schema_on_agg() -> None: + lf = pl.LazyFrame({"a": ["x", "x", "y", "n"], "b": [1, 2, 3, 4]}) + + result = lf.group_by("a").agg( + pl.col("b").min().alias("min"), + pl.col("b").max().alias("max"), + pl.col("b").sum().alias("sum"), + pl.col("b").first().alias("first"), + pl.col("b").last().alias("last"), + ) + expected_schema = { + "a": pl.String, + "min": pl.Int64, + "max": pl.Int64, + "sum": pl.Int64, + "first": pl.Int64, + "last": pl.Int64, + } + assert result.schema == expected_schema + + +def test_group_by_schema_err() -> None: + lf = pl.LazyFrame({"foo": [None, 1, 2], "bar": [1, 2, 3]}) + with pytest.raises(pl.ColumnNotFoundError): + lf.group_by("not-existent").agg(pl.col("bar").max().alias("max_bar")).schema + + +@pytest.mark.parametrize( + ("data", "expr", "expected_select", "expected_gb"), + [ + ( + {"x": ["x"], "y": ["y"]}, + pl.coalesce(pl.col("x"), pl.col("y")), + {"x": pl.String}, + {"x": pl.List(pl.String)}, + ), + ( + {"x": [True]}, + pl.col("x").sum(), + {"x": pl.UInt32}, + {"x": pl.UInt32}, + ), + ( + {"a": [[1, 2]]}, + pl.col("a").list.sum(), + {"a": pl.Int64}, + {"a": pl.List(pl.Int64)}, + ), + ], +) +def test_schemas( + data: dict[str, list[Any]], + expr: pl.Expr, + expected_select: dict[str, pl.PolarsDataType], + expected_gb: dict[str, pl.PolarsDataType], +) -> None: + df = pl.DataFrame(data) + + # test selection schema + schema = df.select(expr).schema + for key, dtype in expected_select.items(): + assert schema[key] == dtype + + # test group_by schema + schema = df.group_by(pl.lit(1)).agg(expr).schema + for key, dtype in expected_gb.items(): + assert schema[key] == dtype diff --git a/py-polars/tests/unit/operations/test_shrink_dtype.py b/py-polars/tests/unit/operations/test_shrink_dtype.py new file mode 100644 index 000000000000..443f55814ec1 --- /dev/null +++ b/py-polars/tests/unit/operations/test_shrink_dtype.py @@ -0,0 +1,46 @@ +import polars as pl + + +def test_shrink_dtype() -> None: + out = pl.DataFrame( + { + "a": [1, 2, 3], + "b": [1, 2, 2 << 32], + "c": [-1, 2, 1 << 30], + "d": [-112, 2, 112], + "e": [-112, 2, 129], + "f": ["a", "b", "c"], + "g": [0.1, 1.32, 0.12], + "h": [True, None, False], + "i": pl.Series([None, None, None], dtype=pl.UInt64), + "j": pl.Series([None, None, None], dtype=pl.Int64), + "k": pl.Series([None, None, None], dtype=pl.Float64), + } + ).select(pl.all().shrink_dtype()) + assert out.dtypes == [ + pl.Int8, + pl.Int64, + pl.Int32, + pl.Int8, + pl.Int16, + pl.String, + pl.Float32, + pl.Boolean, + pl.UInt8, + pl.Int8, + pl.Float32, + ] + + assert out.to_dict(as_series=False) == { + "a": [1, 2, 3], + "b": [1, 2, 8589934592], + "c": [-1, 2, 1073741824], + "d": [-112, 2, 112], + "e": [-112, 2, 129], + "f": ["a", "b", "c"], + "g": [0.10000000149011612, 1.3200000524520874, 0.11999999731779099], + "h": [True, None, False], + "i": [None, None, None], + "j": [None, None, None], + "k": [None, None, None], + } diff --git a/py-polars/tests/unit/test_convert.py b/py-polars/tests/unit/test_convert.py new file mode 100644 index 000000000000..e74bd6f13024 --- /dev/null +++ b/py-polars/tests/unit/test_convert.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import pytest + +import polars as pl + + +def test_schema_inference_from_rows() -> None: + # these have to upcast to float + result = pl.from_records([[1, 2.1, 3], [4, 5, 6.4]]) + assert result.to_dict(as_series=False) == { + "column_0": [1.0, 2.1, 3.0], + "column_1": [4.0, 5.0, 6.4], + } + + result = pl.from_dicts([{"a": 1, "b": 2}, {"a": 3.1, "b": 4.5}]) + assert result.to_dict(as_series=False) == { + "a": [1.0, 3.1], + "b": [2.0, 4.5], + } + + +def test_from_dicts_nested_nulls() -> None: + result = pl.from_dicts([{"a": [None, None]}, {"a": [1, 2]}]) + assert result.to_dict(as_series=False) == {"a": [[None, None], [1, 2]]} + + +def test_from_dicts_empty() -> None: + with pytest.raises(pl.NoDataError, match="no data, cannot infer schema"): + pl.from_dicts([]) + + +def test_from_dicts_all_cols_6716() -> None: + dicts = [{"a": None} for _ in range(20)] + [{"a": "crash"}] + + with pytest.raises( + pl.ComputeError, match="make sure that all rows have the same schema" + ): + pl.from_dicts(dicts, infer_schema_length=20) + assert pl.from_dicts(dicts, infer_schema_length=None).dtypes == [pl.String] From 3532e92fcef32e68bd3a172ca9805aa1cd6ee979 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 11 Jun 2024 14:11:34 +0200 Subject: [PATCH 3/3] Move more tests --- py-polars/tests/unit/datatypes/test_schema.py | 229 ------------------ py-polars/tests/unit/lazyframe/test_rename.py | 7 + .../{ => aggregation}/test_folds.py | 19 ++ .../operations/aggregation/test_horizontal.py | 15 ++ .../operations/aggregation/test_vertical.py | 8 + .../operations/arithmetic/test_arithmetic.py | 22 ++ .../unit/operations/map/test_map_batches.py | 32 +++ .../unit/operations/map/test_map_elements.py | 20 ++ .../operations/namespaces/list/test_list.py | 12 + .../tests/unit/operations/test_bitwise.py | 10 + .../tests/unit/operations/test_comparison.py | 9 + .../tests/unit/operations/test_group_by.py | 56 +++++ .../tests/unit/operations/test_join_asof.py | 7 + 13 files changed, 217 insertions(+), 229 deletions(-) delete mode 100644 py-polars/tests/unit/datatypes/test_schema.py rename py-polars/tests/unit/operations/{ => aggregation}/test_folds.py (71%) create mode 100644 py-polars/tests/unit/operations/test_bitwise.py diff --git a/py-polars/tests/unit/datatypes/test_schema.py b/py-polars/tests/unit/datatypes/test_schema.py deleted file mode 100644 index ab2260d71dbe..000000000000 --- a/py-polars/tests/unit/datatypes/test_schema.py +++ /dev/null @@ -1,229 +0,0 @@ -from __future__ import annotations - -from datetime import date, timedelta - -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - - -def test_lazy_map_schema() -> None: - df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - - # identity - assert_frame_equal(df.lazy().map_batches(lambda x: x).collect(), df) - - def custom(df: pl.DataFrame) -> pl.Series: - return df["a"] - - with pytest.raises( - pl.ComputeError, - match="Expected 'LazyFrame.map' to return a 'DataFrame', got a", - ): - df.lazy().map_batches(custom).collect() # type: ignore[arg-type] - - def custom2( - df: pl.DataFrame, - ) -> pl.DataFrame: - # changes schema - return df.select(pl.all().cast(pl.String)) - - with pytest.raises( - pl.ComputeError, - match="The output schema of 'LazyFrame.map' is incorrect. Expected", - ): - df.lazy().map_batches(custom2).collect() - - assert df.lazy().map_batches( - custom2, validate_output_schema=False - ).collect().to_dict(as_series=False) == {"a": ["1", "2", "3"], "b": ["a", "b", "c"]} - - -def test_join_as_of_by_schema() -> None: - a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy() - b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy() - q = a.join_asof(b, on=pl.col("a").set_sorted(), by="b") - assert q.collect().columns == q.columns - - -def test_unknown_map_elements() -> None: - df = pl.DataFrame( - { - "Amount": [10, 1, 1, 5], - "Flour": ["1000g", "100g", "50g", "75g"], - } - ) - - q = df.lazy().select( - pl.col("Amount"), - pl.col("Flour").map_elements(lambda x: 100.0) / pl.col("Amount"), - ) - - assert q.collect().to_dict(as_series=False) == { - "Amount": [10, 1, 1, 5], - "Flour": [10.0, 100.0, 100.0, 20.0], - } - assert q.dtypes == [pl.Int64, pl.Unknown] - - -def test_remove_redundant_mapping_4668() -> None: - df = pl.DataFrame([["a"]] * 2, ["A", "B "]).lazy() - clean_name_dict = {x: " ".join(x.split()) for x in df.columns} - df = df.rename(clean_name_dict) - assert df.columns == ["A", "B"] - - -def test_fold_all_schema() -> None: - df = pl.DataFrame( - { - "A": [1, 2, 3, 4, 5], - "fruits": ["banana", "banana", "apple", "apple", "banana"], - "B": [5, 4, 3, 2, 1], - "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], - "optional": [28, 300, None, 2, -30], - } - ) - # divide because of overflow - result = df.select(pl.sum_horizontal(pl.all().hash(seed=1) // int(1e8))) - assert result.dtypes == [pl.UInt64] - - -def test_list_eval_type_cast_11188() -> None: - df = pl.DataFrame( - [ - {"a": None}, - ], - schema={"a": pl.List(pl.Int64)}, - ) - assert df.select( - pl.col("a").list.eval(pl.element().cast(pl.String)).alias("a_str") - ).schema == {"a_str": pl.List(pl.String)} - - -def test_duration_division_schema() -> None: - df = pl.DataFrame({"a": [1]}) - q = ( - df.lazy() - .with_columns(pl.col("a").cast(pl.Duration)) - .select(pl.col("a") / pl.col("a")) - ) - - assert q.schema == {"a": pl.Float64} - assert q.collect().to_dict(as_series=False) == {"a": [1.0]} - - -def test_int_operator_stability() -> None: - for dt in pl.datatypes.INTEGER_DTYPES: - s = pl.Series(values=[10], dtype=dt) - assert pl.select(pl.lit(s) // 2).dtypes == [dt] - assert pl.select(pl.lit(s) + 2).dtypes == [dt] - assert pl.select(pl.lit(s) - 2).dtypes == [dt] - assert pl.select(pl.lit(s) * 2).dtypes == [dt] - assert pl.select(pl.lit(s) / 2).dtypes == [pl.Float64] - - -def test_deep_subexpression_f32_schema_7129() -> None: - df = pl.DataFrame({"a": [1.1, 2.3, 3.4, 4.5]}, schema={"a": pl.Float32()}) - assert df.with_columns(pl.col("a") - pl.col("a").median()).dtypes == [pl.Float32] - assert df.with_columns( - (pl.col("a") - pl.col("a").mean()) / (pl.col("a").std() + 0.001) - ).dtypes == [pl.Float32] - - -def test_absence_off_null_prop_8224() -> None: - # a reminder to self to not do null propagation - # it is inconsistent and makes output dtype - # dependent of the data, big no! - - def sub_col_min(column: str, min_column: str) -> pl.Expr: - return pl.col(column).sub(pl.col(min_column).min()) - - df = pl.DataFrame( - { - "group": [1, 1, 2, 2], - "vals_num": [10.0, 11.0, 12.0, 13.0], - "vals_partial": [None, None, 12.0, 13.0], - "vals_null": [None, None, None, None], - } - ) - - q = ( - df.lazy() - .group_by("group") - .agg( - [ - sub_col_min("vals_num", "vals_num").alias("sub_num"), - sub_col_min("vals_num", "vals_partial").alias("sub_partial"), - sub_col_min("vals_num", "vals_null").alias("sub_null"), - ] - ) - ) - - assert q.collect().dtypes == [ - pl.Int64, - pl.List(pl.Float64), - pl.List(pl.Float64), - pl.List(pl.Float64), - ] - - -def test_lit_iter_schema() -> None: - df = pl.DataFrame( - { - "key": ["A", "A", "A", "A"], - "dates": [ - date(1970, 1, 1), - date(1970, 1, 1), - date(1970, 1, 2), - date(1970, 1, 3), - ], - } - ) - - result = df.group_by("key").agg(pl.col("dates").unique() + timedelta(days=1)) - expected = { - "key": ["A"], - "dates": [[date(1970, 1, 2), date(1970, 1, 3), date(1970, 1, 4)]], - } - assert result.to_dict(as_series=False) == expected - - -def test_nested_binary_literal_super_type_12227() -> None: - # The `.alias` is important here to trigger the bug. - assert ( - pl.select(x=1).select((pl.lit(0) + ((pl.col("x") > 0) * 0.1)).alias("x")).item() - == 0.1 - ) - assert ( - pl.select( - (pl.lit(0) + (pl.lit(0) == pl.lit(0)) * pl.lit(0.1)) + pl.lit(0) - ).item() - == 0.1 - ) - - -def test_alias_prune_in_fold_15438() -> None: - df = pl.DataFrame({"x": [1, 2], "expected_result": ["first", "second"]}).select( - actual_result=pl.fold( - acc=pl.lit("other", dtype=pl.Utf8), - function=lambda acc, x: pl.when(x).then(pl.lit(x.name)).otherwise(acc), # type: ignore[arg-type, return-value] - exprs=[ - (pl.col("x") == 1).alias("first"), - (pl.col("x") == 2).alias("second"), - ], - ) - ) - expected = pl.DataFrame( - { - "actual_result": ["first", "second"], - } - ) - assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("op", ["and_", "or_"]) -def test_bitwise_integral_schema(op: str) -> None: - df = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) - q = df.select(getattr(pl.col("a"), op)(pl.col("b"))) - assert q.schema["a"] == df.schema["a"] diff --git a/py-polars/tests/unit/lazyframe/test_rename.py b/py-polars/tests/unit/lazyframe/test_rename.py index 2a7462d1edb2..1638e32ce32c 100644 --- a/py-polars/tests/unit/lazyframe/test_rename.py +++ b/py-polars/tests/unit/lazyframe/test_rename.py @@ -6,3 +6,10 @@ def test_lazy_rename() -> None: result = df.lazy().rename({"y": "x", "x": "y"}).select(["x", "y"]) assert result.collect().to_dict(as_series=False) == {"x": [2], "y": [1]} + + +def test_remove_redundant_mapping_4668() -> None: + lf = pl.LazyFrame([["a"]] * 2, ["A", "B "]).lazy() + clean_name_dict = {x: " ".join(x.split()) for x in lf.columns} + lf = lf.rename(clean_name_dict) + assert lf.columns == ["A", "B"] diff --git a/py-polars/tests/unit/operations/test_folds.py b/py-polars/tests/unit/operations/aggregation/test_folds.py similarity index 71% rename from py-polars/tests/unit/operations/test_folds.py rename to py-polars/tests/unit/operations/aggregation/test_folds.py index f1204cf92e15..f14bd7852347 100644 --- a/py-polars/tests/unit/operations/test_folds.py +++ b/py-polars/tests/unit/operations/aggregation/test_folds.py @@ -59,3 +59,22 @@ def test_cum_reduce() -> None: } ) assert_frame_equal(result, expected) + + +def test_alias_prune_in_fold_15438() -> None: + df = pl.DataFrame({"x": [1, 2], "expected_result": ["first", "second"]}).select( + actual_result=pl.fold( + acc=pl.lit("other", dtype=pl.Utf8), + function=lambda acc, x: pl.when(x).then(pl.lit(x.name)).otherwise(acc), # type: ignore[arg-type, return-value] + exprs=[ + (pl.col("x") == 1).alias("first"), + (pl.col("x") == 2).alias("second"), + ], + ) + ) + expected = pl.DataFrame( + { + "actual_result": ["first", "second"], + } + ) + assert_frame_equal(df, expected) diff --git a/py-polars/tests/unit/operations/aggregation/test_horizontal.py b/py-polars/tests/unit/operations/aggregation/test_horizontal.py index 4d474367af02..5211ff2969c9 100644 --- a/py-polars/tests/unit/operations/aggregation/test_horizontal.py +++ b/py-polars/tests/unit/operations/aggregation/test_horizontal.py @@ -440,3 +440,18 @@ def test_schema_mean_horizontal_single_column( def test_schema_boolean_sum_horizontal() -> None: lf = pl.LazyFrame({"a": [True, False]}).select(pl.sum_horizontal("a")) assert lf.schema == OrderedDict([("a", pl.UInt32)]) + + +def test_fold_all_schema() -> None: + df = pl.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "fruits": ["banana", "banana", "apple", "apple", "banana"], + "B": [5, 4, 3, 2, 1], + "cars": ["beetle", "audi", "beetle", "beetle", "beetle"], + "optional": [28, 300, None, 2, -30], + } + ) + # divide because of overflow + result = df.select(pl.sum_horizontal(pl.all().hash(seed=1) // int(1e8))) + assert result.dtypes == [pl.UInt64] diff --git a/py-polars/tests/unit/operations/aggregation/test_vertical.py b/py-polars/tests/unit/operations/aggregation/test_vertical.py index 26f01dacc3d2..3f2dbe080c07 100644 --- a/py-polars/tests/unit/operations/aggregation/test_vertical.py +++ b/py-polars/tests/unit/operations/aggregation/test_vertical.py @@ -74,3 +74,11 @@ def test_mean_overflow() -> None: result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean() assert np.isclose(result, expected) + + +def test_deep_subexpression_f32_schema_7129() -> None: + df = pl.DataFrame({"a": [1.1, 2.3, 3.4, 4.5]}, schema={"a": pl.Float32()}) + assert df.with_columns(pl.col("a") - pl.col("a").median()).dtypes == [pl.Float32] + assert df.with_columns( + (pl.col("a") - pl.col("a").mean()) / (pl.col("a").std() + 0.001) + ).dtypes == [pl.Float32] diff --git a/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py b/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py index 1973f0483eae..e505881c6542 100644 --- a/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py +++ b/py-polars/tests/unit/operations/arithmetic/test_arithmetic.py @@ -610,3 +610,25 @@ def test_literal_subtract_schema_13284() -> None: .group_by("a") .len() ).schema == OrderedDict([("a", pl.UInt8), ("len", pl.UInt32)]) + + +def test_int_operator_stability() -> None: + for dt in pl.datatypes.INTEGER_DTYPES: + s = pl.Series(values=[10], dtype=dt) + assert pl.select(pl.lit(s) // 2).dtypes == [dt] + assert pl.select(pl.lit(s) + 2).dtypes == [dt] + assert pl.select(pl.lit(s) - 2).dtypes == [dt] + assert pl.select(pl.lit(s) * 2).dtypes == [dt] + assert pl.select(pl.lit(s) / 2).dtypes == [pl.Float64] + + +def test_duration_division_schema() -> None: + df = pl.DataFrame({"a": [1]}) + q = ( + df.lazy() + .with_columns(pl.col("a").cast(pl.Duration)) + .select(pl.col("a") / pl.col("a")) + ) + + assert q.schema == {"a": pl.Float64} + assert q.collect().to_dict(as_series=False) == {"a": [1.0]} diff --git a/py-polars/tests/unit/operations/map/test_map_batches.py b/py-polars/tests/unit/operations/map/test_map_batches.py index 127f1079b060..c3240ba4fb7b 100644 --- a/py-polars/tests/unit/operations/map/test_map_batches.py +++ b/py-polars/tests/unit/operations/map/test_map_batches.py @@ -84,3 +84,35 @@ def test_ufunc_args() -> None: result = df.select(z=np.add(2, pl.col("a"))) # type: ignore[call-overload] expected = pl.DataFrame({"z": [3, 4, 5]}) assert_frame_equal(result, expected) + + +def test_lazy_map_schema() -> None: + df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + # identity + assert_frame_equal(df.lazy().map_batches(lambda x: x).collect(), df) + + def custom(df: pl.DataFrame) -> pl.Series: + return df["a"] + + with pytest.raises( + pl.ComputeError, + match="Expected 'LazyFrame.map' to return a 'DataFrame', got a", + ): + df.lazy().map_batches(custom).collect() # type: ignore[arg-type] + + def custom2( + df: pl.DataFrame, + ) -> pl.DataFrame: + # changes schema + return df.select(pl.all().cast(pl.String)) + + with pytest.raises( + pl.ComputeError, + match="The output schema of 'LazyFrame.map' is incorrect. Expected", + ): + df.lazy().map_batches(custom2).collect() + + assert df.lazy().map_batches( + custom2, validate_output_schema=False + ).collect().to_dict(as_series=False) == {"a": ["1", "2", "3"], "b": ["a", "b", "c"]} diff --git a/py-polars/tests/unit/operations/map/test_map_elements.py b/py-polars/tests/unit/operations/map/test_map_elements.py index 98bf9cf996e1..59bdfa39ea1f 100644 --- a/py-polars/tests/unit/operations/map/test_map_elements.py +++ b/py-polars/tests/unit/operations/map/test_map_elements.py @@ -344,3 +344,23 @@ def test_cabbage_strategy_14396() -> None: ValueError, match="strategy 'cabbage' is not supported" ), pytest.warns(PolarsInefficientMapWarning): df.select(pl.col("x").map_elements(lambda x: 2 * x, strategy="cabbage")) # type: ignore[arg-type] + + +def test_unknown_map_elements() -> None: + df = pl.DataFrame( + { + "Amount": [10, 1, 1, 5], + "Flour": ["1000g", "100g", "50g", "75g"], + } + ) + + q = df.lazy().select( + pl.col("Amount"), + pl.col("Flour").map_elements(lambda x: 100.0) / pl.col("Amount"), + ) + + assert q.collect().to_dict(as_series=False) == { + "Amount": [10, 1, 1, 5], + "Flour": [10.0, 100.0, 100.0, 20.0], + } + assert q.dtypes == [pl.Int64, pl.Unknown] diff --git a/py-polars/tests/unit/operations/namespaces/list/test_list.py b/py-polars/tests/unit/operations/namespaces/list/test_list.py index 86ed6b719f13..e3d238259f89 100644 --- a/py-polars/tests/unit/operations/namespaces/list/test_list.py +++ b/py-polars/tests/unit/operations/namespaces/list/test_list.py @@ -901,3 +901,15 @@ def test_list_eval_err_raise_15653() -> None: def test_list_sum_bool_schema() -> None: q = pl.LazyFrame({"x": [[True, True, False]]}) assert q.select(pl.col("x").list.sum()).schema["x"] == pl.UInt32 + + +def test_list_eval_type_cast_11188() -> None: + df = pl.DataFrame( + [ + {"a": None}, + ], + schema={"a": pl.List(pl.Int64)}, + ) + assert df.select( + pl.col("a").list.eval(pl.element().cast(pl.String)).alias("a_str") + ).schema == {"a_str": pl.List(pl.String)} diff --git a/py-polars/tests/unit/operations/test_bitwise.py b/py-polars/tests/unit/operations/test_bitwise.py new file mode 100644 index 000000000000..674de1aef418 --- /dev/null +++ b/py-polars/tests/unit/operations/test_bitwise.py @@ -0,0 +1,10 @@ +import pytest + +import polars as pl + + +@pytest.mark.parametrize("op", ["and_", "or_"]) +def test_bitwise_integral_schema(op: str) -> None: + df = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) + q = df.select(getattr(pl.col("a"), op)(pl.col("b"))) + assert q.schema["a"] == df.schema["a"] diff --git a/py-polars/tests/unit/operations/test_comparison.py b/py-polars/tests/unit/operations/test_comparison.py index ebbb4b4c46d5..ef1aed2ffcb9 100644 --- a/py-polars/tests/unit/operations/test_comparison.py +++ b/py-polars/tests/unit/operations/test_comparison.py @@ -379,3 +379,12 @@ def test_schema_ne_missing_9256() -> None: df = pl.DataFrame({"a": [0, 1, None], "b": [True, False, True]}) assert df.select(pl.col("a").ne_missing(0).or_(pl.col("b")))["a"].all() + + +def test_nested_binary_literal_super_type_12227() -> None: + # The `.alias` is important here to trigger the bug. + result = pl.select(x=1).select((pl.lit(0) + ((pl.col("x") > 0) * 0.1)).alias("x")) + assert result.item() == 0.1 + + result = pl.select((pl.lit(0) + (pl.lit(0) == pl.lit(0)) * pl.lit(0.1)) + pl.lit(0)) + assert result.item() == 0.1 diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index 8f6cce5a4b95..973e023fabb7 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -1071,3 +1071,59 @@ def test_schemas( schema = df.group_by(pl.lit(1)).agg(expr).schema for key, dtype in expected_gb.items(): assert schema[key] == dtype + + +def test_lit_iter_schema() -> None: + df = pl.DataFrame( + { + "key": ["A", "A", "A", "A"], + "dates": [ + date(1970, 1, 1), + date(1970, 1, 1), + date(1970, 1, 2), + date(1970, 1, 3), + ], + } + ) + + result = df.group_by("key").agg(pl.col("dates").unique() + timedelta(days=1)) + expected = { + "key": ["A"], + "dates": [[date(1970, 1, 2), date(1970, 1, 3), date(1970, 1, 4)]], + } + assert result.to_dict(as_series=False) == expected + + +def test_absence_off_null_prop_8224() -> None: + # a reminder to self to not do null propagation + # it is inconsistent and makes output dtype + # dependent of the data, big no! + + def sub_col_min(column: str, min_column: str) -> pl.Expr: + return pl.col(column).sub(pl.col(min_column).min()) + + df = pl.DataFrame( + { + "group": [1, 1, 2, 2], + "vals_num": [10.0, 11.0, 12.0, 13.0], + "vals_partial": [None, None, 12.0, 13.0], + "vals_null": [None, None, None, None], + } + ) + + q = ( + df.lazy() + .group_by("group") + .agg( + sub_col_min("vals_num", "vals_num").alias("sub_num"), + sub_col_min("vals_num", "vals_partial").alias("sub_partial"), + sub_col_min("vals_num", "vals_null").alias("sub_null"), + ) + ) + + assert q.collect().dtypes == [ + pl.Int64, + pl.List(pl.Float64), + pl.List(pl.Float64), + pl.List(pl.Float64), + ] diff --git a/py-polars/tests/unit/operations/test_join_asof.py b/py-polars/tests/unit/operations/test_join_asof.py index 3f3f9ad7745c..b10d857c3041 100644 --- a/py-polars/tests/unit/operations/test_join_asof.py +++ b/py-polars/tests/unit/operations/test_join_asof.py @@ -1165,3 +1165,10 @@ def test_join_asof_invalid_args() -> None: TypeError, match="expected `right_on` to be str or Expr, got 'list'" ): df1.join_asof(df2, left_on="a", right_on=["a"]) # type: ignore[arg-type] + + +def test_join_as_of_by_schema() -> None: + a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy() + b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy() + q = a.join_asof(b, on=pl.col("a").set_sorted(), by="b") + assert q.collect().columns == q.columns