diff --git a/daft/daft.pyi b/daft/daft.pyi index 835cf86695..24d729b244 100644 --- a/daft/daft.pyi +++ b/daft/daft.pyi @@ -1042,6 +1042,7 @@ class PyExpr: def __hash__(self) -> int: ... def __reduce__(self) -> tuple: ... def is_nan(self) -> PyExpr: ... + def is_inf(self) -> PyExpr: ... def dt_date(self) -> PyExpr: ... def dt_day(self) -> PyExpr: ... def dt_hour(self) -> PyExpr: ... @@ -1207,6 +1208,7 @@ class PySeries: def utf8_ilike(self, pattern: PySeries) -> PySeries: ... def utf8_substr(self, start: PySeries, length: PySeries | None = None) -> PySeries: ... def is_nan(self) -> PySeries: ... + def is_inf(self) -> PySeries: ... def dt_date(self) -> PySeries: ... def dt_day(self) -> PySeries: ... def dt_hour(self) -> PySeries: ... diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py index d6417c2f1b..392767f290 100644 --- a/daft/expressions/expressions.py +++ b/daft/expressions/expressions.py @@ -803,6 +803,21 @@ def is_nan(self) -> Expression: """ return Expression._from_pyexpr(self._expr.is_nan()) + def is_inf(self) -> Expression: + """Checks if values in the Expression are Infinity. + + .. NOTE:: + Nulls will be propagated! I.e. this operation will return a null for null values. + + Example: + >>> # [-float("inf"), 0., float("inf"), None] -> [True, False, True, None] + >>> col("x").float.is_inf() + + Returns: + Expression: Boolean Expression indicating whether values are Infinity. 
+ """ + return Expression._from_pyexpr(self._expr.is_inf()) + class ExpressionDatetimeNamespace(ExpressionNamespace): def date(self) -> Expression: diff --git a/daft/series.py b/daft/series.py index ebf1262254..e36d64edfe 100644 --- a/daft/series.py +++ b/daft/series.py @@ -635,6 +635,9 @@ class SeriesFloatNamespace(SeriesNamespace): def is_nan(self) -> Series: return Series._from_pyseries(self._series.is_nan()) + def is_inf(self) -> Series: + return Series._from_pyseries(self._series.is_inf()) + class SeriesStringNamespace(SeriesNamespace): def endswith(self, suffix: Series) -> Series: diff --git a/docs/source/api_docs/expressions.rst b/docs/source/api_docs/expressions.rst index f70a7556d1..1e25d87faf 100644 --- a/docs/source/api_docs/expressions.rst +++ b/docs/source/api_docs/expressions.rst @@ -146,6 +146,20 @@ The following methods are available under the ``expr.str`` attribute. Expression.str.ilike Expression.str.substr +.. _api-float-expression-operations: + +Floats +####### + +The following methods are available under the ``expr.float`` attribute. + +.. autosummary:: + :nosignatures: + :toctree: doc_gen/expression_methods + :template: autosummary/accessor_method.rst + + Expression.float.is_inf + .. 
_api-expressions-temporal: Temporal diff --git a/src/daft-core/src/array/ops/float.rs b/src/daft-core/src/array/ops/float.rs index 7474886ccc..7fddedd555 100644 --- a/src/daft-core/src/array/ops/float.rs +++ b/src/daft-core/src/array/ops/float.rs @@ -5,6 +5,7 @@ use crate::{ use common_error::DaftResult; use num_traits::Float; +use super::DaftIsInf; use super::DaftIsNan; use super::as_arrow::AsArrow; @@ -38,3 +39,32 @@ impl DaftIsNan for DataArray { ))) } } + +impl DaftIsInf for DataArray +where + T: DaftFloatType, + ::Native: Float, +{ + type Output = DaftResult>; + + fn is_inf(&self) -> Self::Output { + let arrow_array = self.as_arrow(); + let result_arrow_array = arrow2::array::BooleanArray::from_trusted_len_values_iter( + arrow_array.values_iter().map(|v| v.is_infinite()), + ) + .with_validity(arrow_array.validity().cloned()); + Ok(BooleanArray::from((self.name(), result_arrow_array))) + } +} + +impl DaftIsInf for DataArray { + type Output = DaftResult>; + + fn is_inf(&self) -> Self::Output { + Ok(BooleanArray::from(( + self.name(), + arrow2::array::BooleanArray::from_slice(vec![false; self.len()]) + .with_validity(Some(arrow2::bitmap::Bitmap::from(vec![false; self.len()]))), + ))) + } +} diff --git a/src/daft-core/src/array/ops/mod.rs b/src/daft-core/src/array/ops/mod.rs index 56637a25f5..49bfdec6cb 100644 --- a/src/daft-core/src/array/ops/mod.rs +++ b/src/daft-core/src/array/ops/mod.rs @@ -120,6 +120,11 @@ pub trait DaftIsNan { fn is_nan(&self) -> Self::Output; } +pub trait DaftIsInf { + type Output; + fn is_inf(&self) -> Self::Output; +} + pub type VecIndices = Vec; pub type GroupIndices = Vec; pub type GroupIndicesPair = (VecIndices, GroupIndices); diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs index 1da71cd150..446e9822cd 100644 --- a/src/daft-core/src/python/series.rs +++ b/src/daft-core/src/python/series.rs @@ -461,6 +461,10 @@ impl PySeries { Ok(self.series.is_nan()?.into()) } + pub fn is_inf(&self) -> PyResult { + 
Ok(self.series.is_inf()?.into()) + } + pub fn dt_date(&self) -> PyResult { Ok(self.series.dt_date()?.into()) } diff --git a/src/daft-core/src/series/ops/float.rs b/src/daft-core/src/series/ops/float.rs index 8a8795c277..890b024fd1 100644 --- a/src/daft-core/src/series/ops/float.rs +++ b/src/daft-core/src/series/ops/float.rs @@ -11,4 +11,11 @@ impl Series { Ok(DaftIsNan::is_nan(self.downcast::<<$T as DaftDataType>::ArrayType>()?)?.into_series()) }) } + + pub fn is_inf(&self) -> DaftResult { + use crate::array::ops::DaftIsInf; + with_match_float_and_null_daft_types!(self.data_type(), |$T| { + Ok(DaftIsInf::is_inf(self.downcast::<<$T as DaftDataType>::ArrayType>()?)?.into_series()) + }) + } } diff --git a/src/daft-dsl/src/functions/float/is_inf.rs b/src/daft-dsl/src/functions/float/is_inf.rs new file mode 100644 index 0000000000..e76796a471 --- /dev/null +++ b/src/daft-dsl/src/functions/float/is_inf.rs @@ -0,0 +1,51 @@ +use daft_core::{ + datatypes::{DataType, Field}, + schema::Schema, + series::Series, +}; + +use crate::ExprRef; + +use crate::functions::FunctionExpr; +use common_error::{DaftError, DaftResult}; + +use super::super::FunctionEvaluator; + +pub(super) struct IsInfEvaluator {} + +impl FunctionEvaluator for IsInfEvaluator { + fn fn_name(&self) -> &'static str { + "is_inf" + } + + fn to_field(&self, inputs: &[ExprRef], schema: &Schema, _: &FunctionExpr) -> DaftResult { + match inputs { + [data] => match data.to_field(schema) { + Ok(data_field) => match &data_field.dtype { + // DataType::Float16 | + DataType::Float32 | DataType::Float64 => { + Ok(Field::new(data_field.name, DataType::Boolean)) + } + _ => Err(DaftError::TypeError(format!( + "Expects input to is_inf to be float, but received {data_field}", + ))), + }, + Err(e) => Err(e), + }, + _ => Err(DaftError::SchemaMismatch(format!( + "Expected 1 input args, got {}", + inputs.len() + ))), + } + } + + fn evaluate(&self, inputs: &[Series], _: &FunctionExpr) -> DaftResult { + match inputs { + [data] => 
data.is_inf(), + _ => Err(DaftError::ValueError(format!( + "Expected 1 input args, got {}", + inputs.len() + ))), + } + } +} diff --git a/src/daft-dsl/src/functions/float/mod.rs b/src/daft-dsl/src/functions/float/mod.rs index 65018f3a9a..d5c9218b94 100644 --- a/src/daft-dsl/src/functions/float/mod.rs +++ b/src/daft-dsl/src/functions/float/mod.rs @@ -1,5 +1,7 @@ +mod is_inf; mod is_nan; +use is_inf::IsInfEvaluator; use is_nan::IsNanEvaluator; use serde::{Deserialize, Serialize}; @@ -10,6 +12,7 @@ use super::FunctionEvaluator; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] pub enum FloatExpr { IsNan, + IsInf, } impl FloatExpr { @@ -18,6 +21,7 @@ impl FloatExpr { use FloatExpr::*; match self { IsNan => &IsNanEvaluator {}, + IsInf => &IsInfEvaluator {}, } } } @@ -29,3 +33,11 @@ pub fn is_nan(data: ExprRef) -> ExprRef { } .into() } + +pub fn is_inf(data: ExprRef) -> ExprRef { + Expr::Function { + func: super::FunctionExpr::Float(FloatExpr::IsInf), + inputs: vec![data], + } + .into() +} diff --git a/src/daft-dsl/src/python.rs b/src/daft-dsl/src/python.rs index 7615601d6f..335168a253 100644 --- a/src/daft-dsl/src/python.rs +++ b/src/daft-dsl/src/python.rs @@ -475,6 +475,11 @@ impl PyExpr { Ok(is_nan(self.into()).into()) } + pub fn is_inf(&self) -> PyResult { + use functions::float::is_inf; + Ok(is_inf(self.into()).into()) + } + pub fn dt_date(&self) -> PyResult { use functions::temporal::date; Ok(date(self.into()).into()) diff --git a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py index af27e272e3..8b3bba34b8 100644 --- a/tests/expressions/test_expressions.py +++ b/tests/expressions/test_expressions.py @@ -264,6 +264,13 @@ def test_float_is_nan() -> None: assert output == "is_nan(col(a))" +def test_float_is_inf() -> None: + a = col("a") + c = a.float.is_inf() + output = repr(c) + assert output == "is_inf(col(a))" + + def test_date_lit_post_epoch() -> None: d = lit(date(2022, 1, 1)) output = repr(d) diff --git 
a/tests/expressions/typing/test_float.py b/tests/expressions/typing/test_float.py index 22ad3522ac..8b641ae097 100644 --- a/tests/expressions/typing/test_float.py +++ b/tests/expressions/typing/test_float.py @@ -12,3 +12,12 @@ def test_float_is_nan(unary_data_fixture): run_kernel=unary_data_fixture.float.is_nan, resolvable=unary_data_fixture.datatype() in (DataType.float32(), DataType.float64()), ) + + +def test_float_is_inf(unary_data_fixture): + assert_typing_resolve_vs_runtime_behavior( + data=[unary_data_fixture], + expr=col(unary_data_fixture.name()).float.is_inf(), + run_kernel=unary_data_fixture.float.is_inf, + resolvable=unary_data_fixture.datatype() in (DataType.float32(), DataType.float64()), + ) diff --git a/tests/series/test_float.py b/tests/series/test_float.py index 416369a881..32296f0905 100644 --- a/tests/series/test_float.py +++ b/tests/series/test_float.py @@ -28,3 +28,27 @@ def test_float_is_nan_all_null() -> None: s = Series.from_arrow(pa.array([None, None, None])) result = s.float.is_nan() assert result.to_pylist() == [None, None, None] + + +def test_float_is_inf() -> None: + s = Series.from_arrow(pa.array([-float("inf"), 0.0, np.inf])) + result = s.float.is_inf() + assert result.to_pylist() == [True, False, True] + + +def test_float_is_inf_with_nulls() -> None: + s = Series.from_arrow(pa.array([-np.inf, None, 1.0, None, float("inf")])) + result = s.float.is_inf() + assert result.to_pylist() == [True, None, False, None, True] + + +def test_float_is_inf_empty() -> None: + s = Series.from_arrow(pa.array([], type=pa.float64())) + result = s.float.is_inf() + assert result.to_pylist() == [] + + +def test_float_is_inf_all_null() -> None: + s = Series.from_arrow(pa.array([None, None, None])) + result = s.float.is_inf() + assert result.to_pylist() == [None, None, None] diff --git a/tests/table/test_filter.py b/tests/table/test_filter.py index afaf1afb60..37dc6b2ef5 100644 --- a/tests/table/test_filter.py +++ b/tests/table/test_filter.py @@ -212,6 
+212,13 @@ def test_table_float_is_nan() -> None: assert result_table.to_pydict() == {"a": [False, True, False, None, True]} +def test_table_float_is_inf() -> None: + table = MicroPartition.from_pydict({"a": [-np.inf, 0.0, None, float("inf")]}) + result_table = table.eval_expression_list([col("a").float.is_inf()]) + # Note that null entries are _not_ treated as float infinities. + assert result_table.to_pydict() == {"a": [True, False, None, True]} + + def test_table_if_else() -> None: table = MicroPartition.from_arrow( pa.Table.from_pydict({"ones": [1, 1, 1], "zeros": [0, 0, 0], "pred": [True, False, None]})