Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test(python): Move around some existing tests #16877

Merged
merged 3 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion py-polars/tests/unit/constructors/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import sys
from collections import OrderedDict
from typing import Any
from typing import Any, Iterator, Mapping

import pytest

Expand Down Expand Up @@ -159,3 +159,35 @@ def test_unit_and_empty_construction_15896() -> None:
A=pl.int_range("A"), # creates empty series
)
)


class CustomSchema(Mapping[str, Any]):
    """
    Dummy schema object for testing compatibility with Mapping.

    Entries are supplied as keyword arguments and stored in insertion order.
    Only the three abstract ``Mapping`` methods are implemented; the remaining
    mapping API (``keys``, ``items``, ``get``, ...) comes from the base class.
    """

    # Backing store; the annotation names the attribute the methods actually use.
    _items: dict[str, Any]

    def __init__(self, **named_entries: Any) -> None:
        """Store each ``name=value`` keyword argument as one schema entry."""
        self._items = OrderedDict(named_entries)

    def __getitem__(self, key: str) -> Any:
        """Return the entry stored under ``key``; raise ``KeyError`` if absent."""
        return self._items[key]

    def __len__(self) -> int:
        """Return the number of stored entries."""
        return len(self._items)

    def __iter__(self) -> Iterator[str]:
        """Iterate over the entry names in insertion order."""
        return iter(self._items)


def test_custom_schema() -> None:
    """Build a DataFrame from a custom ``Mapping`` schema; string values must raise."""
    valid_schema = CustomSchema(bool=pl.Boolean, misc=pl.UInt8)
    df = pl.DataFrame(schema=valid_schema)
    assert df.schema == OrderedDict([("bool", pl.Boolean), ("misc", pl.UInt8)])

    # Plain strings in place of dtypes are expected to be rejected.
    invalid_schema = CustomSchema(bool="boolean", misc="unsigned int")
    with pytest.raises(ValueError):
        pl.DataFrame(schema=invalid_schema)


def test_list_null_constructor_schema() -> None:
    """Empty nested list data and a bare ``pl.List`` schema both give ``List(Null)``."""
    expected = pl.List(pl.Null)

    inferred = pl.DataFrame({"a": [[]]})
    assert inferred.dtypes[0] == expected

    declared = pl.DataFrame(schema={"a": pl.List})
    assert declared.dtypes[0] == expected
6 changes: 6 additions & 0 deletions py-polars/tests/unit/constructors/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,9 @@ def test_series_init_np_2d_zero_zero_shape() -> None:
match=re.escape("cannot reshape empty array into shape (0, 0)"),
):
pl.Series(arr)


def test_list_null_constructor_schema() -> None:
    """A Series of one empty list has dtype ``List(Null)``, with or without ``pl.List``."""
    expected = pl.List(pl.Null)

    inferred = pl.Series([[]])
    assert inferred.dtype == expected

    declared = pl.Series([[]], dtype=pl.List)
    assert declared.dtype == expected
19 changes: 19 additions & 0 deletions py-polars/tests/unit/functions/as_datatype/test_struct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import polars as pl


def test_resolved_names_15442() -> None:
    """Filter lazily on a range check over a field of a struct built from two columns."""
    df = pl.DataFrame({"x": [206.0], "y": [225.0]})
    center = pl.struct(x=pl.col("x"), y=pl.col("y"))

    left = 0
    right = 1000
    above_left = left < center.struct.field("x")
    within_right = center.struct.field("x") <= right
    in_x = above_left & within_right

    filtered = df.lazy().filter(in_x).collect()
    assert filtered.shape == (1, 2)
39 changes: 39 additions & 0 deletions py-polars/tests/unit/functions/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,42 @@ def test_concat_lf_stack_overflow() -> None:
for i in range(n):
bar = pl.concat([bar, pl.DataFrame({"a": i}).lazy()])
assert bar.collect().shape == (1001, 1)


def test_concat_vertically_relaxed() -> None:
    """Check schema and values of ``vertical_relaxed`` concat in both input orders."""
    relaxed = "vertical_relaxed"

    # Int8/Boolean frame against Int16/Int64 frame.
    a = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [True, False, None]},
        schema={"a": pl.Int8, "b": pl.Boolean},
    )
    b = pl.DataFrame(
        data={"a": [43, 2, 3], "b": [32, 1, None]},
        schema={"a": pl.Int16, "b": pl.Int64},
    )

    out = pl.concat([a, b], how=relaxed)
    assert out.schema == {"a": pl.Int16, "b": pl.Int64}
    assert out.to_dict(as_series=False) == {
        "a": [1, 2, 3, 43, 2, 3],
        "b": [1, 0, None, 32, 1, None],
    }

    out = pl.concat([b, a], how=relaxed)
    assert out.schema == {"a": pl.Int16, "b": pl.Int64}
    assert out.to_dict(as_series=False) == {
        "a": [43, 2, 3, 1, 2, 3],
        "b": [32, 1, None, 1, 0, None],
    }

    # Integer frame against float frame.
    c = pl.DataFrame({"a": [1, 2], "b": [2, 1]})
    d = pl.DataFrame({"a": [1.0, 0.2], "b": [None, 0.1]})

    out = pl.concat([c, d], how=relaxed)
    assert out.schema == {"a": pl.Float64, "b": pl.Float64}
    assert out.to_dict(as_series=False) == {
        "a": [1.0, 2.0, 1.0, 0.2],
        "b": [2.0, 1.0, None, 0.1],
    }

    out = pl.concat([d, c], how=relaxed)
    assert out.schema == {"a": pl.Float64, "b": pl.Float64}
    assert out.to_dict(as_series=False) == {
        "a": [1.0, 0.2, 1.0, 2.0],
        "b": [None, 0.1, 2.0, 1.0],
    }
15 changes: 15 additions & 0 deletions py-polars/tests/unit/lazyframe/test_rename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import polars as pl


def test_lazy_rename() -> None:
    """Swapping two column names in a single lazy ``rename`` exchanges their data."""
    df = pl.DataFrame({"x": [1], "y": [2]})

    swapped = df.lazy().rename({"y": "x", "x": "y"})
    result = swapped.select(["x", "y"])

    collected = result.collect()
    assert collected.to_dict(as_series=False) == {"x": [2], "y": [1]}


def test_remove_redundant_mapping_4668() -> None:
    """Rename with a mapping that includes an unchanged name (``"A" -> "A"``)."""
    lf = pl.LazyFrame([["a"]] * 2, ["A", "B "]).lazy()

    # Collapse whitespace in every column name; "A" maps to itself.
    clean_name_dict = {}
    for name in lf.columns:
        clean_name_dict[name] = " ".join(name.split())

    lf = lf.rename(clean_name_dict)
    assert lf.columns == ["A", "B"]
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,22 @@ def test_cum_reduce() -> None:
}
)
assert_frame_equal(result, expected)


def test_alias_prune_in_fold_15438() -> None:
    """Fold over aliased boolean expressions, emitting the alias of the matching one."""
    source = pl.DataFrame({"x": [1, 2], "expected_result": ["first", "second"]})

    # The aliases are read back through ``x.name`` inside the fold function.
    named_conditions = [
        (pl.col("x") == 1).alias("first"),
        (pl.col("x") == 2).alias("second"),
    ]
    folded = pl.fold(
        acc=pl.lit("other", dtype=pl.Utf8),
        function=lambda acc, x: pl.when(x).then(pl.lit(x.name)).otherwise(acc),  # type: ignore[arg-type, return-value]
        exprs=named_conditions,
    )
    df = source.select(actual_result=folded)

    expected = pl.DataFrame({"actual_result": ["first", "second"]})
    assert_frame_equal(df, expected)
48 changes: 48 additions & 0 deletions py-polars/tests/unit/operations/aggregation/test_horizontal.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import datetime
from collections import OrderedDict
from typing import Any

import pytest
Expand Down Expand Up @@ -407,3 +408,50 @@ def test_mean_horizontal_all_null() -> None:

expected = pl.LazyFrame({"a": [1.5, None]}, schema={"a": pl.Float64})
assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("in_dtype", "out_dtype"),
    [
        # Boolean and every integer dtype are expected to resolve to Float64.
        *(
            (dtype, pl.Float64)
            for dtype in (
                pl.Boolean,
                pl.UInt8,
                pl.UInt16,
                pl.UInt32,
                pl.UInt64,
                pl.Int8,
                pl.Int16,
                pl.Int32,
                pl.Int64,
            )
        ),
        # Float dtypes are expected to be preserved.
        (pl.Float32, pl.Float32),
        (pl.Float64, pl.Float64),
    ],
)
def test_schema_mean_horizontal_single_column(
    in_dtype: pl.PolarsDataType,
    out_dtype: pl.PolarsDataType,
) -> None:
    """Check the lazy schema of ``mean_horizontal`` over one column of ``in_dtype``."""
    column = pl.Series([1, 0], dtype=in_dtype)
    lf = pl.LazyFrame({"a": column}).select(pl.mean_horizontal(pl.all()))

    expected_schema = OrderedDict([("a", out_dtype)])
    assert lf.schema == expected_schema


def test_schema_boolean_sum_horizontal() -> None:
    """``sum_horizontal`` over a boolean column reports ``UInt32`` in the lazy schema."""
    source = pl.LazyFrame({"a": [True, False]})
    lf = source.select(pl.sum_horizontal("a"))

    expected_schema = OrderedDict([("a", pl.UInt32)])
    assert lf.schema == expected_schema


def test_fold_all_schema() -> None:
    """Horizontally summing the hashes of all columns yields one ``UInt64`` column."""
    data = {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        "optional": [28, 300, None, 2, -30],
    }
    df = pl.DataFrame(data)

    # divide because of overflow
    scaled_hashes = pl.all().hash(seed=1) // int(1e8)
    result = df.select(pl.sum_horizontal(scaled_hashes))
    assert result.dtypes == [pl.UInt64]
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,11 @@ def test_mean_overflow() -> None:

result = df.with_columns(pl.col("value").cast(pl.Int32)).get_column("value").mean()
assert np.isclose(result, expected)


def test_deep_subexpression_f32_schema_7129() -> None:
    """Nested arithmetic on a Float32 column and its aggregates stays Float32."""
    df = pl.DataFrame({"a": [1.1, 2.3, 3.4, 4.5]}, schema={"a": pl.Float32()})

    centered = pl.col("a") - pl.col("a").median()
    assert df.with_columns(centered).dtypes == [pl.Float32]

    standardized = (pl.col("a") - pl.col("a").mean()) / (pl.col("a").std() + 0.001)
    assert df.with_columns(standardized).dtypes == [pl.Float32]
49 changes: 49 additions & 0 deletions py-polars/tests/unit/operations/arithmetic/test_arithmetic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import operator
from collections import OrderedDict
from datetime import date, datetime, timedelta
from typing import Any

Expand Down Expand Up @@ -583,3 +584,51 @@ def test_array_arithmetic_same_size(expected: Any, expr: pl.Expr) -> None:
df.select(expr),
pl.Series("a", expected).to_frame(),
)


def test_schema_owned_arithmetic_5669() -> None:
    """A negated, aliased column added after a filter appears as a new column ``B``."""
    lf = pl.LazyFrame({"A": [1, 2, 3]})

    # Unary minus is applied to the already-aliased expression.
    negated = -pl.col("A").alias("B")
    df = lf.filter(pl.col("A") >= 3).with_columns(negated).collect()

    assert df.columns == ["A", "B"]
    assert df.rows() == [(3, -3)]


def test_schema_true_divide_6643() -> None:
    """After true division, selecting Int64 columns yields an empty frame."""
    df = pl.DataFrame({"a": [1]})
    a = pl.col("a")

    divided = df.lazy().select(a / 2)
    int64_only = divided.select(pl.col(pl.Int64))

    assert int64_only.collect().shape == (0, 0)


def test_literal_subtract_schema_13284() -> None:
    """Subtracting a literal from a UInt8 column keeps UInt8 in the grouped schema."""
    lf = pl.LazyFrame({"a": [23, 30]}, schema={"a": pl.UInt8})
    grouped = lf.with_columns(pl.col("a") - pl.lit(1)).group_by("a").len()

    expected_schema = OrderedDict([("a", pl.UInt8), ("len", pl.UInt32)])
    assert grouped.schema == expected_schema


def test_int_operator_stability() -> None:
    """Integer ``//``, ``+``, ``-``, ``*`` with a Python int keep the dtype; ``/`` gives Float64."""
    # Checked in the order: floor division, addition, subtraction, multiplication.
    dtype_preserving_ops = (
        operator.floordiv,
        operator.add,
        operator.sub,
        operator.mul,
    )
    for dt in pl.datatypes.INTEGER_DTYPES:
        s = pl.Series(values=[10], dtype=dt)
        for op in dtype_preserving_ops:
            assert pl.select(op(pl.lit(s), 2)).dtypes == [dt]
        assert pl.select(pl.lit(s) / 2).dtypes == [pl.Float64]


def test_duration_division_schema() -> None:
    """Dividing a Duration column by itself is Float64 in both schema and result."""
    df = pl.DataFrame({"a": [1]})

    as_duration = df.lazy().with_columns(pl.col("a").cast(pl.Duration))
    q = as_duration.select(pl.col("a") / pl.col("a"))

    assert q.schema == {"a": pl.Float64}
    collected = q.collect()
    assert collected.to_dict(as_series=False) == {"a": [1.0]}
59 changes: 59 additions & 0 deletions py-polars/tests/unit/operations/arithmetic/test_pow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import polars as pl


def test_pow_dtype() -> None:
    """Check output dtypes of literal scaling and column ``**`` for each integer dtype."""
    typed_columns = ["a", "b", "c", "d", "e", "f", "g", "h"]
    ramp = [1, 2, 3, 4, 5]
    alternating = [1, 2, 1, 2, 1]

    data = {"foo": ramp}
    data.update({name: ramp for name in typed_columns[:6]})
    data.update({name: alternating for name in typed_columns[6:]})

    lf = pl.DataFrame(
        data,
        schema_overrides={
            "a": pl.Int64,
            "b": pl.UInt64,
            "c": pl.Int32,
            "d": pl.UInt32,
            "e": pl.Int16,
            "f": pl.UInt16,
            "g": pl.Int8,
            "h": pl.UInt8,
        },
    ).lazy()

    # Every typed column is raised to the power of the UInt8 column "h".
    pow_exprs = [
        (pl.col(name) ** pl.col("h")).alias(f"{name}_pow_h") for name in typed_columns
    ]

    lf = lf.with_columns([pl.col("foo").cast(pl.UInt32)])
    lf = lf.with_columns(
        (pl.col("foo") * 2**2).alias("scaled_foo"),
        (pl.col("foo") * 2**2.1).alias("scaled_foo2"),
        *pow_exprs,
    )
    lf = lf.drop(typed_columns)

    # Order: foo, scaled_foo, scaled_foo2, then a_pow_h .. h_pow_h.
    expected = [
        pl.UInt32,
        pl.UInt32,
        pl.Float64,
        pl.Int64,
        pl.UInt64,
        pl.Int32,
        pl.UInt32,
        pl.Int16,
        pl.UInt16,
        pl.Int8,
        pl.UInt8,
    ]
    # The collected result and the lazily resolved schema must agree.
    assert lf.collect().dtypes == expected
    assert lf.dtypes == expected
32 changes: 32 additions & 0 deletions py-polars/tests/unit/operations/map/test_map_batches.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,35 @@ def test_ufunc_args() -> None:
result = df.select(z=np.add(2, pl.col("a"))) # type: ignore[call-overload]
expected = pl.DataFrame({"z": [3, 4, 5]})
assert_frame_equal(result, expected)


def test_lazy_map_schema() -> None:
    """Exercise ``LazyFrame.map_batches`` output type and schema validation."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    # identity
    unchanged = frame.lazy().map_batches(lambda batch: batch).collect()
    assert_frame_equal(unchanged, frame)

    def returns_series(df: pl.DataFrame) -> pl.Series:
        return df["a"]

    wrong_type_msg = "Expected 'LazyFrame.map' to return a 'DataFrame', got a"
    with pytest.raises(pl.ComputeError, match=wrong_type_msg):
        frame.lazy().map_batches(returns_series).collect()  # type: ignore[arg-type]

    def cast_all_to_string(df: pl.DataFrame) -> pl.DataFrame:
        # changes schema
        return df.select(pl.all().cast(pl.String))

    wrong_schema_msg = "The output schema of 'LazyFrame.map' is incorrect. Expected"
    with pytest.raises(pl.ComputeError, match=wrong_schema_msg):
        frame.lazy().map_batches(cast_all_to_string).collect()

    # With validation switched off, the changed schema is accepted.
    unchecked = frame.lazy().map_batches(
        cast_all_to_string, validate_output_schema=False
    )
    assert unchecked.collect().to_dict(as_series=False) == {
        "a": ["1", "2", "3"],
        "b": ["a", "b", "c"],
    }
Loading