pola-rs · stinodego · Jun 11, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 11, 2024
@@ -2,7 +2,7 @@
 
 import sys
 from collections import OrderedDict
-from typing import Any
+from typing import Any, Iterator, Mapping
 
 import pytest
 
@@ -159,3 +159,35 @@ def test_unit_and_empty_construction_15896() -> None:
             A=pl.int_range("A"),  # creates empty series
         )
     )
+
+
+class CustomSchema(Mapping[str, Any]):
+    """
+    Dummy schema object for testing compatibility with Mapping.
+
+    Keyword arguments given to the constructor become the entries of the
+    mapping, kept in the order they were passed. Only the three abstract
+    ``Mapping`` methods are implemented here; everything else (``keys``,
+    ``items``, ``__contains__``, ...) comes from the ``Mapping`` mixins.
+    """
+
+    # Backing store for the entries; the annotation names the attribute that
+    # ``__init__`` actually assigns (an OrderedDict, which is a dict subclass).
+    _items: dict[str, Any]
+
+    def __init__(self, **named_entries: Any) -> None:
+        """Store the keyword arguments as the mapping's entries."""
+        self._items = OrderedDict(named_entries)
+
+    def __getitem__(self, key: str) -> Any:
+        """Return the entry stored under ``key``; raises KeyError if absent."""
+        return self._items[key]
+
+    def __len__(self) -> int:
+        """Return the number of stored entries."""
+        return len(self._items)
+
+    def __iter__(self) -> Iterator[str]:
+        """Iterate over the entry names in insertion order."""
+        yield from self._items
+
+
+def test_custom_schema() -> None:
+    """Check that a DataFrame accepts a custom ``Mapping`` object as its schema."""
+    # Real dtypes supplied through the custom mapping.
+    valid_schema = CustomSchema(bool=pl.Boolean, misc=pl.UInt8)
+    frame = pl.DataFrame(schema=valid_schema)
+    expected_schema = OrderedDict([("bool", pl.Boolean), ("misc", pl.UInt8)])
+    assert frame.schema == expected_schema
+
+    # Plain strings in place of dtypes: construction is expected to raise.
+    invalid_schema = CustomSchema(bool="boolean", misc="unsigned int")
+    with pytest.raises(ValueError):
+        pl.DataFrame(schema=invalid_schema)
+
+
+def test_list_null_constructor_schema() -> None:
+    """An empty nested list and a bare ``pl.List`` schema both give ``List(Null)``."""
+    list_of_null = pl.List(pl.Null)
+
+    # dtype inferred from data
+    inferred = pl.DataFrame({"a": [[]]})
+    assert inferred.dtypes[0] == list_of_null
+
+    # dtype declared through the schema, without an inner type
+    declared = pl.DataFrame(schema={"a": pl.List})
+    assert declared.dtypes[0] == list_of_null
@@ -148,3 +148,9 @@ def test_series_init_np_2d_zero_zero_shape() -> None:
         match=re.escape("cannot reshape empty array into shape (0, 0)"),
     ):
         pl.Series(arr)
+
+
+def test_list_null_constructor_schema() -> None:
+    """A Series of one empty list has dtype ``List(Null)``, with or without ``dtype=pl.List``."""
+    list_of_null = pl.List(pl.Null)
+
+    # dtype inferred from data
+    inferred = pl.Series([[]])
+    assert inferred.dtype == list_of_null
+
+    # dtype requested explicitly, without an inner type
+    declared = pl.Series([[]], dtype=pl.List)
+    assert declared.dtype == list_of_null
@@ -0,0 +1,19 @@
+import polars as pl
+
+
+def test_resolved_names_15442() -> None:
+    """Filter a lazy frame on a field of a struct built from the frame's own columns."""
+    df = pl.DataFrame(
+        {
+            "x": [206.0],
+            "y": [225.0],
+        }
+    )
+    # Struct expression whose fields "x" and "y" are taken from the columns
+    # of the same name.
+    center = pl.struct(
+        x=pl.col("x"),
+        y=pl.col("y"),
+    )
+
+    left = 0
+    right = 1000
+    # Range check ``left < x <= right`` written against the struct field "x";
+    # the field is accessed twice and the first comparison has the int on the left.
+    in_x = (left < center.struct.field("x")) & (center.struct.field("x") <= right)
+    # The single row is expected to survive the filter with both columns intact.
+    assert df.lazy().filter(in_x).collect().shape == (1, 2)
@@ -20,3 +20,42 @@ def test_concat_lf_stack_overflow() -> None:
     for i in range(n):
         bar = pl.concat([bar, pl.DataFrame({"a": i}).lazy()])
     assert bar.collect().shape == (1001, 1)
+
+
+def test_concat_vertically_relaxed() -> None:
+    """Check ``how="vertical_relaxed"`` concatenation in both input orders."""
+    # Case 1: Int8/Boolean frame combined with an Int16/Int64 frame.
+    narrow = pl.DataFrame(
+        data={"a": [1, 2, 3], "b": [True, False, None]},
+        schema={"a": pl.Int8, "b": pl.Boolean},
+    )
+    wide = pl.DataFrame(
+        data={"a": [43, 2, 3], "b": [32, 1, None]},
+        schema={"a": pl.Int16, "b": pl.Int64},
+    )
+
+    stacked = pl.concat([narrow, wide], how="vertical_relaxed")
+    assert stacked.schema == {"a": pl.Int16, "b": pl.Int64}
+    assert stacked.to_dict(as_series=False) == {
+        "a": [1, 2, 3, 43, 2, 3],
+        "b": [1, 0, None, 32, 1, None],
+    }
+
+    # Same inputs, reversed order.
+    stacked = pl.concat([wide, narrow], how="vertical_relaxed")
+    assert stacked.schema == {"a": pl.Int16, "b": pl.Int64}
+    assert stacked.to_dict(as_series=False) == {
+        "a": [43, 2, 3, 1, 2, 3],
+        "b": [32, 1, None, 1, 0, None],
+    }
+
+    # Case 2: integer frame combined with a float frame (dtypes inferred).
+    ints = pl.DataFrame({"a": [1, 2], "b": [2, 1]})
+    floats = pl.DataFrame({"a": [1.0, 0.2], "b": [None, 0.1]})
+
+    stacked = pl.concat([ints, floats], how="vertical_relaxed")
+    assert stacked.schema == {"a": pl.Float64, "b": pl.Float64}
+    assert stacked.to_dict(as_series=False) == {
+        "a": [1.0, 2.0, 1.0, 0.2],
+        "b": [2.0, 1.0, None, 0.1],
+    }
+
+    # Same inputs, reversed order.
+    stacked = pl.concat([floats, ints], how="vertical_relaxed")
+    assert stacked.schema == {"a": pl.Float64, "b": pl.Float64}
+    assert stacked.to_dict(as_series=False) == {
+        "a": [1.0, 0.2, 1.0, 2.0],
+        "b": [None, 0.1, 2.0, 1.0],
+    }
@@ -0,0 +1,15 @@
+import polars as pl
+
+
+def test_lazy_rename() -> None:
+    """Swap two column names in a lazy ``rename`` and check the collected values."""
+    lf = pl.DataFrame({"x": [1], "y": [2]}).lazy()
+
+    # "x" and "y" trade names, so each name ends up on the other's data.
+    swapped = lf.rename({"y": "x", "x": "y"}).select(["x", "y"]).collect()
+    assert swapped.to_dict(as_series=False) == {"x": [2], "y": [1]}
+
+
+def test_remove_redundant_mapping_4668() -> None:
+    """Rename with whitespace-normalised names, where "A" maps onto itself."""
+    frame = pl.LazyFrame([["a"]] * 2, ["A", "B "]).lazy()
+
+    # Collapse runs of whitespace and strip the ends of every column name.
+    normalised = {}
+    for name in frame.columns:
+        normalised[name] = " ".join(name.split())
+
+    renamed = frame.rename(normalised)
+    assert renamed.columns == ["A", "B"]
@@ -59,3 +59,22 @@ def test_cum_reduce() -> None:
         }
     )
     assert_frame_equal(result, expected)
+
+
+def test_alias_prune_in_fold_15438() -> None:
+    """Fold over two aliased boolean expressions and emit the alias of the one that holds."""
+    df = pl.DataFrame({"x": [1, 2], "expected_result": ["first", "second"]}).select(
+        actual_result=pl.fold(
+            # Starting accumulator: the literal string "other".
+            acc=pl.lit("other", dtype=pl.Utf8),
+            # Where the current input is true, take its name (``x.name``);
+            # otherwise keep the accumulator.
+            function=lambda acc, x: pl.when(x).then(pl.lit(x.name)).otherwise(acc),  # type: ignore[arg-type, return-value]
+            # The aliases given here are the names read by the fold function.
+            exprs=[
+                (pl.col("x") == 1).alias("first"),
+                (pl.col("x") == 2).alias("second"),
+            ],
+        )
+    )
+    expected = pl.DataFrame(
+        {
+            "actual_result": ["first", "second"],
+        }
+    )
+    assert_frame_equal(df, expected)
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import datetime
+from collections import OrderedDict
 from typing import Any
 
 import pytest
@@ -407,3 +408,50 @@ def test_mean_horizontal_all_null() -> None:
 
     expected = pl.LazyFrame({"a": [1.5, None]}, schema={"a": pl.Float64})
     assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("in_dtype", "out_dtype"),
+    [
+        # Boolean and every integer dtype are expected to resolve to Float64.
+        *(
+            (dtype, pl.Float64)
+            for dtype in (
+                pl.Boolean,
+                pl.UInt8,
+                pl.UInt16,
+                pl.UInt32,
+                pl.UInt64,
+                pl.Int8,
+                pl.Int16,
+                pl.Int32,
+                pl.Int64,
+            )
+        ),
+        # Float inputs are expected to keep their own dtype.
+        (pl.Float32, pl.Float32),
+        (pl.Float64, pl.Float64),
+    ],
+)
+def test_schema_mean_horizontal_single_column(
+    in_dtype: pl.PolarsDataType,
+    out_dtype: pl.PolarsDataType,
+) -> None:
+    """Check the lazy schema of ``mean_horizontal`` over one column of ``in_dtype``."""
+    values = pl.Series([1, 0], dtype=in_dtype)
+    mean_of_all = pl.mean_horizontal(pl.all())
+    lf = pl.LazyFrame({"a": values}).select(mean_of_all)
+
+    expected_schema = OrderedDict([("a", out_dtype)])
+    assert lf.schema == expected_schema
+
+
+def test_schema_boolean_sum_horizontal() -> None:
+    """The lazy schema of ``sum_horizontal`` over a boolean column reports UInt32."""
+    bools = pl.LazyFrame({"a": [True, False]})
+    summed = bools.select(pl.sum_horizontal("a"))
+
+    expected_schema = OrderedDict([("a", pl.UInt32)])
+    assert summed.schema == expected_schema
+
+
+def test_fold_all_schema() -> None:
+    """Horizontally summing the scaled hashes of all columns yields one UInt64 column."""
+    columns = {
+        "A": [1, 2, 3, 4, 5],
+        "fruits": ["banana", "banana", "apple", "apple", "banana"],
+        "B": [5, 4, 3, 2, 1],
+        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
+        "optional": [28, 300, None, 2, -30],
+    }
+    df = pl.DataFrame(columns)
+
+    # divide because of overflow
+    scaled_hashes = pl.all().hash(seed=1) // int(1e8)
+    result = df.select(pl.sum_horizontal(scaled_hashes))
+    assert result.dtypes == [pl.UInt64]
@@ -74,3 +74,11 @@ def test_mean_overflow() -> None:
 
     result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean()
     assert np.isclose(result, expected)
+
+
+def test_deep_subexpression_f32_schema_7129() -> None:
+    """A Float32 column is expected to stay Float32 after arithmetic with its own aggregates."""
+    df = pl.DataFrame({"a": [1.1, 2.3, 3.4, 4.5]}, schema={"a": pl.Float32()})
+    # Column minus its own median.
+    assert df.with_columns(pl.col("a") - pl.col("a").median()).dtypes == [pl.Float32]
+    # Nested form: (column - mean) / (std + float literal).
+    assert df.with_columns(
+        (pl.col("a") - pl.col("a").mean()) / (pl.col("a").std() + 0.001)
+    ).dtypes == [pl.Float32]
@@ -1,4 +1,5 @@
 import operator
+from collections import OrderedDict
 from datetime import date, datetime, timedelta
 from typing import Any
 
@@ -583,3 +584,51 @@ def test_array_arithmetic_same_size(expected: Any, expr: pl.Expr) -> None:
         df.select(expr),
         pl.Series("a", expected).to_frame(),
     )
+
+
+def test_schema_owned_arithmetic_5669() -> None:
+    """Add a negated, aliased column after a filter and check the collected columns and rows."""
+    df = (
+        pl.LazyFrame({"A": [1, 2, 3]})
+        .filter(pl.col("A") >= 3)
+        # The method call binds tighter than unary minus, so the negation is
+        # applied to the already-aliased expression.
+        .with_columns(-pl.col("A").alias("B"))
+        .collect()
+    )
+    # The new column is expected to carry the alias "B" next to the original "A".
+    assert df.columns == ["A", "B"]
+    assert df.rows() == [(3, -3)]
+
+
+def test_schema_true_divide_6643() -> None:
+    """Selecting Int64 columns after a true division by 2 is expected to give an empty frame."""
+    df = pl.DataFrame({"a": [1]})
+    halved = df.lazy().select(pl.col("a") / 2)
+
+    # Select by dtype: no Int64 column is expected to be left after the division.
+    int64_only = halved.select(pl.col(pl.Int64)).collect()
+    assert int64_only.shape == (0, 0)
+
+
+def test_literal_subtract_schema_13284() -> None:
+    """Subtracting a literal from a UInt8 column keeps UInt8 in the group-by schema."""
+    source = pl.LazyFrame({"a": [23, 30]}, schema={"a": pl.UInt8})
+    decremented = source.with_columns(pl.col("a") - pl.lit(1))
+    counted = decremented.group_by("a").len()
+
+    expected_schema = OrderedDict([("a", pl.UInt8), ("len", pl.UInt32)])
+    assert counted.schema == expected_schema
+
+
+def test_int_operator_stability() -> None:
+    """Integer literals keep their dtype under ``//``, ``+``, ``-``, ``*``; ``/`` gives Float64."""
+    for int_dtype in pl.datatypes.INTEGER_DTYPES:
+        lit = pl.lit(pl.Series(values=[10], dtype=int_dtype))
+
+        # Operators expected to preserve the integer dtype.
+        assert pl.select(lit // 2).dtypes == [int_dtype]
+        assert pl.select(lit + 2).dtypes == [int_dtype]
+        assert pl.select(lit - 2).dtypes == [int_dtype]
+        assert pl.select(lit * 2).dtypes == [int_dtype]
+
+        # True division is expected to produce Float64.
+        assert pl.select(lit / 2).dtypes == [pl.Float64]
+
+
+def test_duration_division_schema() -> None:
+    """Dividing a Duration column by itself reports Float64 lazily and yields 1.0."""
+    as_duration = pl.col("a").cast(pl.Duration)
+    ratio = pl.col("a") / pl.col("a")
+
+    lf = pl.DataFrame({"a": [1]}).lazy()
+    q = lf.with_columns(as_duration).select(ratio)
+
+    # The schema is checked before collecting, then the collected values.
+    assert q.schema == {"a": pl.Float64}
+    collected = q.collect()
+    assert collected.to_dict(as_series=False) == {"a": [1.0]}
@@ -0,0 +1,59 @@
+import polars as pl
+
+
+def test_pow_dtype() -> None:
+    """Check result dtypes of ``*`` and ``**`` on integer columns, lazily and after collect."""
+    df = pl.DataFrame(
+        {
+            "foo": [1, 2, 3, 4, 5],
+            "a": [1, 2, 3, 4, 5],
+            "b": [1, 2, 3, 4, 5],
+            "c": [1, 2, 3, 4, 5],
+            "d": [1, 2, 3, 4, 5],
+            "e": [1, 2, 3, 4, 5],
+            "f": [1, 2, 3, 4, 5],
+            "g": [1, 2, 1, 2, 1],
+            "h": [1, 2, 1, 2, 1],
+        },
+        # One column per signed/unsigned integer width; "foo" is not overridden
+        # here and is cast to UInt32 in the next step.
+        schema_overrides={
+            "a": pl.Int64,
+            "b": pl.UInt64,
+            "c": pl.Int32,
+            "d": pl.UInt32,
+            "e": pl.Int16,
+            "f": pl.UInt16,
+            "g": pl.Int8,
+            "h": pl.UInt8,
+        },
+    ).lazy()
+
+    df = (
+        df.with_columns([pl.col("foo").cast(pl.UInt32)])
+        .with_columns(
+            # Multiplication by an int and by a float Python constant.
+            (pl.col("foo") * 2**2).alias("scaled_foo"),
+            (pl.col("foo") * 2**2.1).alias("scaled_foo2"),
+            # Each integer column raised to the UInt8 column "h".
+            (pl.col("a") ** pl.col("h")).alias("a_pow_h"),
+            (pl.col("b") ** pl.col("h")).alias("b_pow_h"),
+            (pl.col("c") ** pl.col("h")).alias("c_pow_h"),
+            (pl.col("d") ** pl.col("h")).alias("d_pow_h"),
+            (pl.col("e") ** pl.col("h")).alias("e_pow_h"),
+            (pl.col("f") ** pl.col("h")).alias("f_pow_h"),
+            (pl.col("g") ** pl.col("h")).alias("g_pow_h"),
+            (pl.col("h") ** pl.col("h")).alias("h_pow_h"),
+        )
+        # Drop the base columns so only "foo" and the derived columns remain.
+        .drop(["a", "b", "c", "d", "e", "f", "g", "h"])
+    )
+    # Eleven expected dtypes, compared positionally with the frame's dtypes.
+    # NOTE(review): presumably in the order foo, scaled_foo, scaled_foo2,
+    # a_pow_h .. h_pow_h - confirm against the resulting column order.
+    expected = [
+        pl.UInt32,
+        pl.UInt32,
+        pl.Float64,
+        pl.Int64,
+        pl.UInt64,
+        pl.Int32,
+        pl.UInt32,
+        pl.Int16,
+        pl.UInt16,
+        pl.Int8,
+        pl.UInt8,
+    ]
+    # The collected dtypes and the lazily reported dtypes must both match.
+    assert df.collect().dtypes == expected
+    assert df.dtypes == expected
@@ -84,3 +84,35 @@ def test_ufunc_args() -> None:
     result = df.select(z=np.add(2, pl.col("a")))  # type: ignore[call-overload]
     expected = pl.DataFrame({"z": [3, 4, 5]})
     assert_frame_equal(result, expected)
+
+
+def test_lazy_map_schema() -> None:
+    """Exercise output validation of ``LazyFrame.map_batches``."""
+    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    # identity
+    unchanged = df.lazy().map_batches(lambda x: x).collect()
+    assert_frame_equal(unchanged, df)
+
+    def returns_series(frame: pl.DataFrame) -> pl.Series:
+        # Wrong return type: a Series instead of a DataFrame.
+        return frame["a"]
+
+    with pytest.raises(
+        pl.ComputeError,
+        match="Expected 'LazyFrame.map' to return a 'DataFrame', got a",
+    ):
+        df.lazy().map_batches(returns_series).collect()  # type: ignore[arg-type]
+
+    def cast_to_string(frame: pl.DataFrame) -> pl.DataFrame:
+        # changes schema
+        return frame.select(pl.all().cast(pl.String))
+
+    with pytest.raises(
+        pl.ComputeError,
+        match="The output schema of 'LazyFrame.map' is incorrect. Expected",
+    ):
+        df.lazy().map_batches(cast_to_string).collect()
+
+    # With validation switched off, the schema-changing callback is accepted.
+    unvalidated = df.lazy().map_batches(cast_to_string, validate_output_schema=False)
+    assert unvalidated.collect().to_dict(as_series=False) == {
+        "a": ["1", "2", "3"],
+        "b": ["a", "b", "c"],
+    }