diff --git a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs index 0f59c80d4651..7c08d4de622a 100644 --- a/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs +++ b/crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs @@ -1,8 +1,5 @@ -use arrow::bitmap::MutableBitmap; -use arrow::compute::cast::utf8view_to_utf8; use arrow::compute::take::take_unchecked; use arrow::offset::OffsetsBuffer; -use polars_utils::vec::PushUnchecked; use super::*; @@ -233,135 +230,3 @@ impl ChunkExplode for ArrayChunked { )) } } - -impl ChunkExplode for StringChunked { - fn offsets(&self) -> PolarsResult> { - let mut offsets = Vec::with_capacity(self.len() + 1); - let mut length_so_far = 0; - offsets.push(length_so_far); - - for arr in self.downcast_iter() { - for len in arr.len_iter() { - // SAFETY: - // pre-allocated - unsafe { offsets.push_unchecked(length_so_far) }; - length_so_far += len as i64; - } - } - - // SAFETY: - // Monotonically increasing. - unsafe { Ok(OffsetsBuffer::new_unchecked(offsets.into())) } - } - - fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer)> { - // A list array's memory layout is actually already 'exploded', so we can just take the values array - // of the list. And we also return a slice of the offsets. This slice can be used to find the old - // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation - let ca = self.rechunk(); - let array = ca.downcast_iter().next().unwrap(); - // TODO! maybe optimize for new utf8view? - let array = utf8view_to_utf8(array); - - let values = array.values(); - let old_offsets = array.offsets().clone(); - - let (new_offsets, validity) = if let Some(validity) = array.validity() { - // capacity estimate - let capacity = self.get_values_size() + validity.unset_bits(); - - let old_offsets = old_offsets.as_slice(); - let mut old_offset = old_offsets[0]; - let mut new_offsets = Vec::with_capacity(capacity + 1); - new_offsets.push(old_offset); - - let mut bitmap = MutableBitmap::with_capacity(capacity); - let values = values.as_slice(); - for (&offset, valid) in old_offsets[1..].iter().zip(validity) { - // SAFETY: - // new_offsets already has a single value, so -1 is always in bounds - let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) }; - - if valid { - debug_assert!(old_offset as usize <= values.len()); - debug_assert!(offset as usize <= values.len()); - let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) }; - - // take the string value and find the char offsets - // create a new offset value for each char boundary - // SAFETY: - // we know we have string data. - let str_val = unsafe { std::str::from_utf8_unchecked(val) }; - - let char_offsets = str_val - .char_indices() - .skip(1) - .map(|t| t.0 as i64 + latest_offset); - - // extend the chars - // also keep track of the amount of offsets added - // as we must update the validity bitmap - let len_before = new_offsets.len(); - new_offsets.extend(char_offsets); - new_offsets.push(latest_offset + str_val.len() as i64); - bitmap.extend_constant(new_offsets.len() - len_before, true); - } else { - // no data, just add old offset and set null bit - new_offsets.push(latest_offset); - bitmap.push(false) - } - old_offset = offset; - } - - (new_offsets.into(), bitmap.into()) - } else { - // fast(er) explode - - // we cannot naively explode, because there might be empty strings. - - // capacity estimate - let capacity = self.get_values_size(); - let old_offsets = old_offsets.as_slice(); - let mut old_offset = old_offsets[0]; - let mut new_offsets = Vec::with_capacity(capacity + 1); - new_offsets.push(old_offset); - - let values = values.as_slice(); - for &offset in &old_offsets[1..] { - // SAFETY: - // new_offsets already has a single value, so -1 is always in bounds - let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) }; - debug_assert!(old_offset as usize <= values.len()); - debug_assert!(offset as usize <= values.len()); - let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) }; - - // take the string value and find the char offsets - // create a new offset value for each char boundary - // SAFETY: - // we know we have string data. - let str_val = unsafe { std::str::from_utf8_unchecked(val) }; - - let char_offsets = str_val - .char_indices() - .skip(1) - .map(|t| t.0 as i64 + latest_offset); - - // extend the chars - new_offsets.extend(char_offsets); - new_offsets.push(latest_offset + str_val.len() as i64); - old_offset = offset; - } - - (new_offsets.into(), None) - }; - - let array = unsafe { - Utf8Array::::from_data_unchecked_default(new_offsets, values.clone(), validity) - }; - - let new_arr = Box::new(array) as ArrayRef; - - let s = Series::try_from((self.name(), new_arr)).unwrap(); - Ok((s, old_offsets)) - } -} diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 4c932f8a131f..2f2f80e2e6d5 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -39,7 +39,6 @@ pub enum StringFunction { }, CountMatches(bool), EndsWith, - Explode, Extract(usize), ExtractAll, #[cfg(feature = "extract_groups")] @@ -137,7 +136,6 @@ impl StringFunction { Contains { .. } => mapper.with_dtype(DataType::Boolean), CountMatches(_) => mapper.with_dtype(DataType::UInt32), EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean), - Explode => mapper.with_same_dtype(), Extract(_) => mapper.with_same_dtype(), ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))), #[cfg(feature = "extract_groups")] @@ -208,7 +206,6 @@ impl Display for StringFunction { ConcatHorizontal { .. } => "concat_horizontal", #[cfg(feature = "concat_str")] ConcatVertical { .. } => "concat_vertical", - Explode => "explode", ExtractAll => "extract_all", #[cfg(feature = "extract_groups")] ExtractGroups { .. } => "extract_groups", @@ -365,7 +362,6 @@ impl From for SpecialEq> { Base64Encode => map!(strings::base64_encode), #[cfg(feature = "binary_encoding")] Base64Decode(strict) => map!(strings::base64_decode, strict), - Explode => map!(strings::explode), #[cfg(feature = "dtype-decimal")] ToDecimal(infer_len) => map!(strings::to_decimal, infer_len), #[cfg(feature = "extract_jsonpath")] @@ -972,11 +968,6 @@ pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult { s.str()?.base64_decode(strict).map(|ca| ca.into_series()) } -pub(super) fn explode(s: &Series) -> PolarsResult { - let ca = s.str()?; - ca.explode() -} - #[cfg(feature = "dtype-decimal")] pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult { let ca = s.str()?; diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index e5aa3fc58119..29f278a52a5d 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -547,11 +547,6 @@ impl StringNameSpace { ) } - pub fn explode(self) -> Expr { - self.0 - .apply_private(FunctionExpr::StringExpr(StringFunction::Explode)) - } - #[cfg(feature = "extract_jsonpath")] pub fn json_decode(self, dtype: Option, infer_schema_len: Option) -> Expr { self.0 diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index debc5e3ac886..e0f7b75c5ad7 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -4952,7 +4952,6 @@ def explode(self) -> Self: See Also -------- Expr.list.explode : Explode a list column. - Expr.str.explode : Explode a string column. Examples -------- diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 77044dcf4bc6..088071bd8987 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -6,6 +6,7 @@ import polars._reexport as pl from polars import functions as F from polars._utils.deprecation import ( + deprecate_function, deprecate_renamed_function, deprecate_renamed_parameter, issue_deprecation_warning, @@ -2352,10 +2353,23 @@ def tail(self, n: int | IntoExprColumn) -> Expr: n = parse_as_expression(n) return wrap_expr(self._pyexpr.str_tail(n)) + @deprecate_function( + 'Use `.str.split("").explode()` instead.' + " Note that empty strings will result in null instead of being preserved." + " To get the exact same behavior, split first and then use when/then/otherwise" + " to handle the empty list before exploding.", + version="0.20.31", + ) def explode(self) -> Expr: """ Returns a column with a separate row for every string character. + .. deprecated:: 0.20.31 + Use `.str.split("").explode()` instead. + Note that empty strings will result in null instead of being preserved. + To get the exact same behavior, split first and then use when/then/otherwise + to handle the empty list before exploding. + Returns ------- Expr @@ -2364,7 +2378,7 @@ def explode(self) -> Expr: Examples -------- >>> df = pl.DataFrame({"a": ["foo", "bar"]}) - >>> df.select(pl.col("a").str.explode()) + >>> df.select(pl.col("a").str.explode()) # doctest: +SKIP shape: (6, 1) ┌─────┐ │ a │ @@ -2379,7 +2393,8 @@ def explode(self) -> Expr: │ r │ └─────┘ """ - return wrap_expr(self._pyexpr.str_explode()) + split = self.split("") + return F.when(split.ne_missing([])).then(split).otherwise([""]).explode() def to_integer( self, *, base: int | IntoExprColumn = 10, strict: bool = True diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index a515dc6ae983..bb51e859fbec 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4080,7 +4080,6 @@ def explode(self) -> Series: See Also -------- Series.list.explode : Explode a list column. - Series.str.explode : Explode a string column. Examples -------- diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index da6304e7331e..a64649c615e3 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from polars._utils.deprecation import ( + deprecate_function, deprecate_renamed_function, deprecate_renamed_parameter, ) @@ -1776,10 +1777,23 @@ def tail(self, n: int | IntoExprColumn) -> Series: ] """ + @deprecate_function( + 'Use `.str.split("").explode()` instead.' + " Note that empty strings will result in null instead of being preserved." + " To get the exact same behavior, split first and then use when/then/otherwise" + " to handle the empty list before exploding.", + version="0.20.31", + ) def explode(self) -> Series: """ Returns a column with a separate row for every string character. + .. deprecated:: 0.20.31 + Use `.str.split("").explode()` instead. + Note that empty strings will result in null instead of being preserved. + To get the exact same behavior, split first and then use when/then/otherwise + to handle the empty list before exploding. + Returns ------- Series @@ -1788,7 +1802,7 @@ def explode(self) -> Series: Examples -------- >>> s = pl.Series("a", ["foo", "bar"]) - >>> s.str.explode() + >>> s.str.explode() # doctest: +SKIP shape: (6,) Series: 'a' [str] [ diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index eb623c0dce8e..4903413d604c 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -110,10 +110,6 @@ impl PyExpr { self.inner.clone().str().tail(n.inner).into() } - fn str_explode(&self) -> Self { - self.inner.clone().str().explode().into() - } - fn str_to_uppercase(&self) -> Self { self.inner.clone().str().to_uppercase().into() } diff --git a/py-polars/src/lazyframe/visitor/expr_nodes.rs b/py-polars/src/lazyframe/visitor/expr_nodes.rs index 2b2d9e087c55..4b911720efb3 100644 --- a/py-polars/src/lazyframe/visitor/expr_nodes.rs +++ b/py-polars/src/lazyframe/visitor/expr_nodes.rs @@ -116,7 +116,6 @@ pub enum PyStringFunction { Contains, CountMatches, EndsWith, - Explode, Extract, ExtractAll, ExtractGroups, @@ -675,9 +674,6 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { StringFunction::EndsWith => { (PyStringFunction::EndsWith.into_py(py),).to_object(py) }, - StringFunction::Explode => { - (PyStringFunction::Explode.into_py(py),).to_object(py) - }, StringFunction::Extract(_) => { (PyStringFunction::Extract.into_py(py),).to_object(py) }, diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 5936c729873a..309e43b5a3f0 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -8,14 +8,6 @@ from polars.testing import assert_frame_equal, assert_series_equal -def test_explode_string() -> None: - df = pl.Series("a", ["Hello", "World"]) - result = df.to_frame().select(pl.col("a").str.explode()).to_series() - - expected = pl.Series("a", ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"]) - assert_series_equal(result, expected) - - def test_explode_multiple() -> None: df = pl.DataFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]}) @@ -33,21 +25,6 @@ def test_group_by_flatten_list() -> None: assert_frame_equal(result, expected) -def test_group_by_flatten_string() -> None: - df = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]}) - result = df.group_by("group", maintain_order=True).agg( - pl.col("values").str.explode() - ) - - expected = pl.DataFrame( - { - "group": ["a", "b"], - "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]], - } - ) - assert_frame_equal(result, expected) - - def test_explode_empty_df_3402() -> None: df = pl.DataFrame({"a": pa.array([], type=pa.large_list(pa.int32()))}) assert df.explode("a").dtypes == [pl.Int32] @@ -145,70 +122,6 @@ def test_sliced_null_explode() -> None: assert s.slice(2, 4).list.explode().to_list() == [True, False, None, True] -def test_string_explode() -> None: - assert pl.Series(["foobar", None]).str.explode().to_list() == [ - "f", - "o", - "o", - "b", - "a", - "r", - None, - ] - assert pl.Series([None, "foo", "bar"]).str.explode().to_list() == [ - None, - "f", - "o", - "o", - "b", - "a", - "r", - ] - assert pl.Series([None, "foo", "bar", None, "ham"]).str.explode().to_list() == [ - None, - "f", - "o", - "o", - "b", - "a", - "r", - None, - "h", - "a", - "m", - ] - assert pl.Series(["foo", "bar", "ham"]).str.explode().to_list() == [ - "f", - "o", - "o", - "b", - "a", - "r", - "h", - "a", - "m", - ] - assert pl.Series(["", None, "foo", "bar"]).str.explode().to_list() == [ - "", - None, - "f", - "o", - "o", - "b", - "a", - "r", - ] - assert pl.Series(["", "foo", "bar"]).str.explode().to_list() == [ - "", - "f", - "o", - "o", - "b", - "a", - "r", - ] - - def test_explode_in_agg_context() -> None: df = pl.DataFrame( {"idxs": [[0], [1], [0, 2]], "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0]]} @@ -444,3 +357,50 @@ def test_explode_nullable_list() -> None: } ) assert_frame_equal(explode_expr, expected_df) + + +def test_group_by_flatten_string() -> None: + df = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]}) + + result = df.group_by("group", maintain_order=True).agg( + pl.col("values").str.split("").explode() + ) + + expected = pl.DataFrame( + { + "group": ["a", "b"], + "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]], + } + ) + assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("values", "exploded"), + [ + (["foobar", None], ["f", "o", "o", "b", "a", "r", None]), + ([None, "foo", "bar"], [None, "f", "o", "o", "b", "a", "r"]), + ( + [None, "foo", "bar", None, "ham"], + [None, "f", "o", "o", "b", "a", "r", None, "h", "a", "m"], + ), + (["foo", "bar", "ham"], ["f", "o", "o", "b", "a", "r", "h", "a", "m"]), + (["", None, "foo", "bar"], ["", None, "f", "o", "o", "b", "a", "r"]), + (["", "foo", "bar"], ["", "f", "o", "o", "b", "a", "r"]), + ], +) +def test_series_str_explode_deprecated( + values: list[str | None], exploded: list[str | None] +) -> None: + with pytest.deprecated_call(): + result = pl.Series(values).str.explode() + assert result.to_list() == exploded + + +def test_expr_str_explode_deprecated() -> None: + df = pl.Series("a", ["Hello", "World"]) + with pytest.deprecated_call(): + result = df.to_frame().select(pl.col("a").str.explode()).to_series() + + expected = pl.Series("a", ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"]) + assert_series_equal(result, expected)