Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python, rust): new n_chars functionality for utf8 strings #5252

Merged
merged 1 commit into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion polars/polars-arrow/src/kernels/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@ use crate::trusted_len::PushUnchecked;

/// Byte length of every string in `array`, returned as a `UInt32` array.
///
/// Each length is the difference between two consecutive offsets. The
/// validity bitmap of the input (if any) is cloned onto the result.
pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
    // NOTE: the `as u32` cast truncates for a string longer than `u32::MAX` bytes.
    let byte_lens = array
        .offsets()
        .windows(2)
        .map(|pair| (pair[1] - pair[0]) as u32);

    let buffer: Buffer<u32> = Vec::from_trusted_len_iter(byte_lens).into();
    let validity = array.validity().cloned();
    Box::new(UInt32Array::from_data(DataType::UInt32, buffer, validity))
}

/// Number of `char`s (Unicode scalar values) in every string of `array`,
/// returned as a `UInt32` array.
///
/// Unlike [`string_lengths`], this walks the contents of each string instead
/// of only looking at the offsets. The validity bitmap of the input (if any)
/// is cloned onto the result.
pub fn string_nchars(array: &Utf8Array<i64>) -> ArrayRef {
    let char_counts = array.values_iter().map(|s| s.chars().count() as u32);

    let buffer: Buffer<u32> = Vec::from_trusted_len_iter(char_counts).into();
    let validity = array.validity().cloned();
    Box::new(UInt32Array::from_data(DataType::UInt32, buffer, validity))
}
8 changes: 7 additions & 1 deletion polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Optio
}

pub trait Utf8NameSpaceImpl: AsUtf8 {
/// Get the length of the string values as number of chars.
///
/// Applies the `string_nchars` kernel through `apply_kernel_cast`.
fn str_n_chars(&self) -> UInt32Chunked {
    self.as_utf8().apply_kernel_cast(&string_nchars)
}

/// Get the length of the string values as number of bytes.
fn str_lengths(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_lengths)
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.lengths
ExprStringNameSpace.ljust
ExprStringNameSpace.lstrip
ExprStringNameSpace.n_chars
ExprStringNameSpace.replace
ExprStringNameSpace.replace_all
ExprStringNameSpace.rjust
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.lengths
StringNameSpace.ljust
StringNameSpace.lstrip
StringNameSpace.n_chars
StringNameSpace.replace
StringNameSpace.replace_all
StringNameSpace.rjust
Expand Down
78 changes: 63 additions & 15 deletions py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,28 +111,76 @@ def strptime(

def lengths(self) -> pli.Expr:
    """
    Get the length of the strings as UInt32 (as number of bytes).

    Notes
    -----
    The returned lengths are equal to the number of bytes in the UTF8 string. If you
    need the length in terms of the number of characters, use ``n_chars`` instead.

    Examples
    --------
    >>> df = pl.DataFrame({"s": [None, "bears", "110"]})
    >>> df.select(["s", pl.col("s").str.lengths().alias("len")])
    shape: (3, 2)
    ┌───────┬──────┐
    │ s     ┆ len  │
    │ ---   ┆ ---  │
    │ str   ┆ u32  │
    ╞═══════╪══════╡
    │ null  ┆ null │
    ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ bears ┆ 5    │
    ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ 110   ┆ 3    │
    └───────┴──────┘
    >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns(
    ...     [
    ...         pl.col("s").str.lengths().alias("length"),
    ...         pl.col("s").str.n_chars().alias("nchars"),
    ...     ]
    ... )
    >>> df
    shape: (4, 3)
    ┌──────┬────────┬────────┐
    │ s    ┆ length ┆ nchars │
    │ ---  ┆ ---    ┆ ---    │
    │ str  ┆ u32    ┆ u32    │
    ╞══════╪════════╪════════╡
    │ Café ┆ 5      ┆ 4      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ null ┆ null   ┆ null   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 345  ┆ 3      ┆ 3      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 東京 ┆ 6      ┆ 2      │
    └──────┴────────┴────────┘

    """
    byte_lengths = self._pyexpr.str_lengths()
    return pli.wrap_expr(byte_lengths)

def n_chars(self) -> pli.Expr:
    """
    Get the length of the strings as UInt32 (as number of chars).

    Notes
    -----
    If you know that you are working with ASCII text, ``lengths`` will be
    equivalent, and faster (returns length in terms of the number of bytes).

    Examples
    --------
    >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns(
    ...     [
    ...         pl.col("s").str.n_chars().alias("nchars"),
    ...         pl.col("s").str.lengths().alias("length"),
    ...     ]
    ... )
    >>> df
    shape: (4, 3)
    ┌──────┬────────┬────────┐
    │ s    ┆ nchars ┆ length │
    │ ---  ┆ ---    ┆ ---    │
    │ str  ┆ u32    ┆ u32    │
    ╞══════╪════════╪════════╡
    │ Café ┆ 4      ┆ 5      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ null ┆ null   ┆ null   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 345  ┆ 3      ┆ 3      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 東京 ┆ 2      ┆ 6      │
    └──────┴────────┴────────┘

    """
    char_counts = self._pyexpr.str_n_chars()
    return pli.wrap_expr(char_counts)

def concat(self, delimiter: str = "-") -> pli.Expr:
"""
Vertically concat the values in the Series to a single string value.
Expand Down
41 changes: 37 additions & 4 deletions py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,23 +93,56 @@ def strptime(

def lengths(self) -> pli.Series:
    """
    Get length of the string values in the Series (as number of bytes).

    Returns
    -------
    Series[u32]

    Notes
    -----
    The returned lengths are equal to the number of bytes in the UTF8 string. If you
    need the length in terms of the number of characters, use ``n_chars`` instead.

    Examples
    --------
    >>> s = pl.Series(["Café", None, "345", "東京"])
    >>> s.str.lengths()
    shape: (4,)
    Series: '' [u32]
    [
        5
        null
        3
        6
    ]

    """

def n_chars(self) -> pli.Series:
    """
    Get length of the string values in the Series (as number of chars).

    Returns
    -------
    Series[u32]

    Notes
    -----
    If you know that you are working with ASCII text, ``lengths`` will be
    equivalent, and faster (returns length in terms of the number of bytes).

    Examples
    --------
    >>> s = pl.Series(["Café", None, "345", "東京"])
    >>> s.str.n_chars()
    shape: (4,)
    Series: '' [u32]
    [
        4
        null
        3
        2
    ]

    """
Expand Down
14 changes: 13 additions & 1 deletion py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,19 @@ impl PyExpr {
self.clone()
.inner
.map(function, GetOutput::from_type(DataType::UInt32))
.with_fmt("str.len")
.with_fmt("str.lengths")
.into()
}

/// Build an expression that counts the characters of each string value.
///
/// The output type is declared as `UInt32` and the expression is labelled
/// `str.n_chars`.
pub fn str_n_chars(&self) -> PyExpr {
    // Any error returned by `utf8()` is passed through unchanged.
    let count_chars = |s: Series| s.utf8().map(|ca| ca.str_n_chars().into_series());

    let expr = self
        .clone()
        .inner
        .map(count_chars, GetOutput::from_type(DataType::UInt32))
        .with_fmt("str.n_chars");
    expr.into()
}

Expand Down
10 changes: 8 additions & 2 deletions py-polars/tests/unit/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1617,11 +1617,17 @@ def test_str_concat() -> None:


def test_str_lengths() -> None:
    """``str.lengths`` reports UTF-8 byte lengths and keeps nulls as nulls."""
    # Non-ASCII fixtures so that byte length and character count differ.
    s = pl.Series(["Café", None, "345", "東京"])
    expected = pl.Series([5, None, 3, 6], dtype=UInt32)
    verify_series_and_expr_api(s, expected, "str.lengths")


def test_str_n_chars() -> None:
    """``str.n_chars`` counts characters (not bytes) and keeps nulls as nulls."""
    values = ["Café", None, "345", "東京"]
    char_counts = [4, None, 3, 2]
    verify_series_and_expr_api(
        pl.Series(values),
        pl.Series(char_counts, dtype=UInt32),
        "str.n_chars",
    )


def test_str_contains() -> None:
s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
expected = pl.Series([True, False, False])
Expand Down
10 changes: 10 additions & 0 deletions py-polars/tests/unit/test_utf8.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,13 @@ def test_json_path_match_type_4905() -> None:
assert df.filter(
pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
).to_dict(False) == {"json_val": ['{"a":"hello"}']}


def test_length_vs_nchars() -> None:
    """Byte length and character count differ for non-ASCII strings."""
    frame = pl.DataFrame({"s": ["café", "東京"]})
    byte_len = pl.col("s").str.lengths().alias("length")
    char_len = pl.col("s").str.n_chars().alias("nchars")

    df = frame.with_columns([byte_len, char_len])

    assert df.rows() == [("café", 5, 4), ("東京", 6, 2)]