Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

depr(python,rust!): Deprecate str.explode in favor of str.split("").explode() #16508

Merged
merged 4 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 0 additions & 135 deletions crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
use arrow::bitmap::MutableBitmap;
use arrow::compute::cast::utf8view_to_utf8;
use arrow::compute::take::take_unchecked;
use arrow::offset::OffsetsBuffer;
use polars_utils::vec::PushUnchecked;

use super::*;

Expand Down Expand Up @@ -233,135 +230,3 @@ impl ChunkExplode for ArrayChunked {
))
}
}

/// `ChunkExplode` for string columns: each string value is split on its `char`
/// boundaries so that every character ends up as its own value in the output.
impl ChunkExplode for StringChunked {
    /// Builds the cumulative offsets of the values in this array.
    ///
    /// The buffer starts at `0`; every following entry is the running sum of the lengths
    /// yielded by `len_iter()` over all chunks, so entry `i + 1` marks the end of value `i`.
    /// This mirrors the layout of the `old_offsets` returned by `explode_and_offsets`.
    fn offsets(&self) -> PolarsResult<OffsetsBuffer<i64>> {
        let mut offsets = Vec::with_capacity(self.len() + 1);
        let mut length_so_far = 0;
        offsets.push(length_so_far);

        for arr in self.downcast_iter() {
            for len in arr.len_iter() {
                // Accumulate before pushing: the entry for a value is its END offset.
                // Pushing first would duplicate the leading zero and drop the total length.
                length_so_far += len as i64;
                // SAFETY:
                // pre-allocated: one leading entry plus one entry per value, which is the
                // `self.len() + 1` capacity reserved above.
                // NOTE(review): relies on `len_iter()` yielding exactly one length per slot.
                unsafe { offsets.push_unchecked(length_so_far) };
            }
        }

        // SAFETY:
        // Monotonically increasing.
        unsafe { Ok(OffsetsBuffer::new_unchecked(offsets.into())) }
    }

    /// Explodes every string into its characters and also returns the original offsets.
    ///
    /// Returns a `Series` named after `self` whose values are the single characters of the
    /// input strings, together with the offsets of the (rechunked, utf8-converted) input
    /// array. An empty string still contributes one (empty) value and a null contributes one
    /// null value, so no input row disappears from the output.
    fn explode_and_offsets(&self) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
        // A list array's memory layout is actually already 'exploded', so we can just take the values array
        // of the list. And we also return a slice of the offsets. This slice can be used to find the old
        // list layout or indexes to expand the DataFrame in the same manner as the 'explode' operation
        let ca = self.rechunk();
        let array = ca.downcast_iter().next().unwrap();
        // TODO! maybe optimize for new utf8view?
        let array = utf8view_to_utf8(array);

        let values = array.values();
        let old_offsets = array.offsets().clone();

        // The exploded array reuses the original values buffer; only the offsets (one per
        // char boundary) and the validity bitmap have to be rebuilt.
        let (new_offsets, validity) = if let Some(validity) = array.validity() {
            // capacity estimate
            let capacity = self.get_values_size() + validity.unset_bits();

            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let mut bitmap = MutableBitmap::with_capacity(capacity);
            let values = values.as_slice();
            for (&offset, valid) in old_offsets[1..].iter().zip(validity) {
                // SAFETY:
                // new_offsets already has a single value, so -1 is always in bounds
                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };

                if valid {
                    debug_assert!(old_offset as usize <= values.len());
                    debug_assert!(offset as usize <= values.len());
                    // SAFETY:
                    // both bounds come from the array's own offsets (checked by the
                    // debug assertions above in debug builds).
                    let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                    // take the string value and find the char offsets
                    // create a new offset value for each char boundary
                    // SAFETY:
                    // we know we have string data.
                    let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                    // `skip(1)`: the start of the first char is already the latest offset.
                    let char_offsets = str_val
                        .char_indices()
                        .skip(1)
                        .map(|t| t.0 as i64 + latest_offset);

                    // extend the chars
                    // also keep track of the amount of offsets added
                    // as we must update the validity bitmap
                    let len_before = new_offsets.len();
                    new_offsets.extend(char_offsets);
                    // closing offset of the last char (or of the empty string)
                    new_offsets.push(latest_offset + str_val.len() as i64);
                    bitmap.extend_constant(new_offsets.len() - len_before, true);
                } else {
                    // no data, just add old offset and set null bit
                    new_offsets.push(latest_offset);
                    bitmap.push(false)
                }
                old_offset = offset;
            }

            (new_offsets.into(), bitmap.into())
        } else {
            // fast(er) explode

            // we cannot naively explode, because there might be empty strings.

            // capacity estimate
            let capacity = self.get_values_size();
            let old_offsets = old_offsets.as_slice();
            let mut old_offset = old_offsets[0];
            let mut new_offsets = Vec::with_capacity(capacity + 1);
            new_offsets.push(old_offset);

            let values = values.as_slice();
            for &offset in &old_offsets[1..] {
                // SAFETY:
                // new_offsets already has a single value, so -1 is always in bounds
                let latest_offset = unsafe { *new_offsets.get_unchecked(new_offsets.len() - 1) };
                debug_assert!(old_offset as usize <= values.len());
                debug_assert!(offset as usize <= values.len());
                // SAFETY:
                // both bounds come from the array's own offsets (checked by the
                // debug assertions above in debug builds).
                let val = unsafe { values.get_unchecked(old_offset as usize..offset as usize) };

                // take the string value and find the char offsets
                // create a new offset value for each char boundary
                // SAFETY:
                // we know we have string data.
                let str_val = unsafe { std::str::from_utf8_unchecked(val) };

                // `skip(1)`: the start of the first char is already the latest offset.
                let char_offsets = str_val
                    .char_indices()
                    .skip(1)
                    .map(|t| t.0 as i64 + latest_offset);

                // extend the chars
                new_offsets.extend(char_offsets);
                // closing offset of the last char (or of the empty string)
                new_offsets.push(latest_offset + str_val.len() as i64);
                old_offset = offset;
            }

            (new_offsets.into(), None)
        };

        // SAFETY:
        // the offsets were built from char boundaries of the existing string data, and the
        // values buffer is the unchanged original.
        let array = unsafe {
            Utf8Array::<i64>::from_data_unchecked_default(new_offsets, values.clone(), validity)
        };

        let new_arr = Box::new(array) as ArrayRef;

        let s = Series::try_from((self.name(), new_arr)).unwrap();
        Ok((s, old_offsets))
    }
}
9 changes: 0 additions & 9 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ pub enum StringFunction {
},
CountMatches(bool),
EndsWith,
Explode,
Extract(usize),
ExtractAll,
#[cfg(feature = "extract_groups")]
Expand Down Expand Up @@ -137,7 +136,6 @@ impl StringFunction {
Contains { .. } => mapper.with_dtype(DataType::Boolean),
CountMatches(_) => mapper.with_dtype(DataType::UInt32),
EndsWith | StartsWith => mapper.with_dtype(DataType::Boolean),
Explode => mapper.with_same_dtype(),
Extract(_) => mapper.with_same_dtype(),
ExtractAll => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
#[cfg(feature = "extract_groups")]
Expand Down Expand Up @@ -208,7 +206,6 @@ impl Display for StringFunction {
ConcatHorizontal { .. } => "concat_horizontal",
#[cfg(feature = "concat_str")]
ConcatVertical { .. } => "concat_vertical",
Explode => "explode",
ExtractAll => "extract_all",
#[cfg(feature = "extract_groups")]
ExtractGroups { .. } => "extract_groups",
Expand Down Expand Up @@ -365,7 +362,6 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Base64Encode => map!(strings::base64_encode),
#[cfg(feature = "binary_encoding")]
Base64Decode(strict) => map!(strings::base64_decode, strict),
Explode => map!(strings::explode),
#[cfg(feature = "dtype-decimal")]
ToDecimal(infer_len) => map!(strings::to_decimal, infer_len),
#[cfg(feature = "extract_jsonpath")]
Expand Down Expand Up @@ -972,11 +968,6 @@ pub(super) fn base64_decode(s: &Series, strict: bool) -> PolarsResult<Series> {
s.str()?.base64_decode(strict).map(|ca| ca.into_series())
}

/// Explodes a string column.
///
/// Views `s` as a string chunked array (propagating the error if `str()` fails) and
/// returns the result of calling `explode()` on it.
pub(super) fn explode(s: &Series) -> PolarsResult<Series> {
    s.str()?.explode()
}

#[cfg(feature = "dtype-decimal")]
pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult<Series> {
let ca = s.str()?;
Expand Down
5 changes: 0 additions & 5 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,6 @@ impl StringNameSpace {
)
}

/// Builds an expression that applies `StringFunction::Explode` to the wrapped expression.
pub fn explode(self) -> Expr {
    let function = FunctionExpr::StringExpr(StringFunction::Explode);
    self.0.apply_private(function)
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_decode(self, dtype: Option<DataType>, infer_schema_len: Option<usize>) -> Expr {
self.0
Expand Down
1 change: 0 additions & 1 deletion py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4952,7 +4952,6 @@ def explode(self) -> Self:
See Also
--------
Expr.list.explode : Explode a list column.
Expr.str.explode : Explode a string column.

Examples
--------
Expand Down
19 changes: 17 additions & 2 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import polars._reexport as pl
from polars import functions as F
from polars._utils.deprecation import (
deprecate_function,
deprecate_renamed_function,
deprecate_renamed_parameter,
issue_deprecation_warning,
Expand Down Expand Up @@ -2352,10 +2353,23 @@ def tail(self, n: int | IntoExprColumn) -> Expr:
n = parse_as_expression(n)
return wrap_expr(self._pyexpr.str_tail(n))

@deprecate_function(
'Use `.str.split("").explode()` instead.'
" Note that empty strings will result in null instead of being preserved."
" To get the exact same behavior, split first and then use when/then/otherwise"
" to handle the empty list before exploding.",
version="0.20.31",
)
def explode(self) -> Expr:
"""
Returns a column with a separate row for every string character.

.. deprecated:: 0.20.31
Use `.str.split("").explode()` instead.
Note that empty strings will result in null instead of being preserved.
To get the exact same behavior, split first and then use when/then/otherwise
to handle the empty list before exploding.

Returns
-------
Expr
Expand All @@ -2364,7 +2378,7 @@ def explode(self) -> Expr:
Examples
--------
>>> df = pl.DataFrame({"a": ["foo", "bar"]})
>>> df.select(pl.col("a").str.explode())
>>> df.select(pl.col("a").str.explode()) # doctest: +SKIP
shape: (6, 1)
┌─────┐
│ a │
Expand All @@ -2379,7 +2393,8 @@ def explode(self) -> Expr:
│ r │
└─────┘
"""
return wrap_expr(self._pyexpr.str_explode())
split = self.split("")
return F.when(split.ne_missing([])).then(split).otherwise([""]).explode()
Comment on lines +2396 to +2397
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This replaces the existing functionality until it is removed completely.


def to_integer(
self, *, base: int | IntoExprColumn = 10, strict: bool = True
Expand Down
1 change: 0 additions & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4080,7 +4080,6 @@ def explode(self) -> Series:
See Also
--------
Series.list.explode : Explode a list column.
Series.str.explode : Explode a string column.

Examples
--------
Expand Down
16 changes: 15 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import TYPE_CHECKING

from polars._utils.deprecation import (
deprecate_function,
deprecate_renamed_function,
deprecate_renamed_parameter,
)
Expand Down Expand Up @@ -1776,10 +1777,23 @@ def tail(self, n: int | IntoExprColumn) -> Series:
]
"""

@deprecate_function(
'Use `.str.split("").explode()` instead.'
" Note that empty strings will result in null instead of being preserved."
" To get the exact same behavior, split first and then use when/then/otherwise"
" to handle the empty list before exploding.",
version="0.20.31",
)
def explode(self) -> Series:
"""
Returns a column with a separate row for every string character.

.. deprecated:: 0.20.31
Use `.str.split("").explode()` instead.
Note that empty strings will result in null instead of being preserved.
To get the exact same behavior, split first and then use when/then/otherwise
to handle the empty list before exploding.

Returns
-------
Series
Expand All @@ -1788,7 +1802,7 @@ def explode(self) -> Series:
Examples
--------
>>> s = pl.Series("a", ["foo", "bar"])
>>> s.str.explode()
>>> s.str.explode() # doctest: +SKIP
shape: (6,)
Series: 'a' [str]
[
Expand Down
4 changes: 0 additions & 4 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ impl PyExpr {
self.inner.clone().str().tail(n.inner).into()
}

/// Returns a new wrapper around the string-namespace `explode()` of a clone of the
/// inner expression.
fn str_explode(&self) -> Self {
    let expr = self.inner.clone();
    expr.str().explode().into()
}

fn str_to_uppercase(&self) -> Self {
self.inner.clone().str().to_uppercase().into()
}
Expand Down
4 changes: 0 additions & 4 deletions py-polars/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ pub enum PyStringFunction {
Contains,
CountMatches,
EndsWith,
Explode,
Extract,
ExtractAll,
ExtractGroups,
Expand Down Expand Up @@ -675,9 +674,6 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
StringFunction::EndsWith => {
(PyStringFunction::EndsWith.into_py(py),).to_object(py)
},
StringFunction::Explode => {
(PyStringFunction::Explode.into_py(py),).to_object(py)
},
StringFunction::Extract(_) => {
(PyStringFunction::Extract.into_py(py),).to_object(py)
},
Expand Down
Loading