Skip to content

Commit

Permalink
feat(python, rust): new n_chars functionality for utf8 strings (#5252)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Oct 18, 2022
1 parent df1a83c commit f8d60ee
Show file tree
Hide file tree
Showing 9 changed files with 146 additions and 24 deletions.
7 changes: 6 additions & 1 deletion polars/polars-arrow/src/kernels/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@ use crate::trusted_len::PushUnchecked;

/// Compute the length of every string in `array`, measured in bytes.
///
/// Each length is the difference between two consecutive entries of the
/// array's offsets, cast to `u32`. The validity bitmap of the input is cloned
/// onto the returned `UInt32Array`.
pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
    let byte_lengths = array
        .offsets()
        .windows(2)
        .map(|pair| (pair[1] - pair[0]) as u32);
    let buffer: Buffer<_> = Vec::from_trusted_len_iter(byte_lengths).into();

    Box::new(UInt32Array::from_data(
        DataType::UInt32,
        buffer,
        array.validity().cloned(),
    ))
}

/// Compute the length of every string in `array`, measured in `char`s.
///
/// Each value is counted with `str::chars`, i.e. as a number of Unicode scalar
/// values rather than bytes, and cast to `u32`. The validity bitmap of the
/// input is cloned onto the returned `UInt32Array`.
pub fn string_nchars(array: &Utf8Array<i64>) -> ArrayRef {
    let char_counts = array
        .values_iter()
        .map(|value| value.chars().count() as u32);
    let buffer: Buffer<_> = Vec::from_trusted_len_iter(char_counts).into();

    Box::new(UInt32Array::from_data(
        DataType::UInt32,
        buffer,
        array.validity().cloned(),
    ))
}
8 changes: 7 additions & 1 deletion polars/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Optio
}

pub trait Utf8NameSpaceImpl: AsUtf8 {
/// Get the length of the string values as number of chars.
///
/// Applies the `string_nchars` kernel to the utf8 data obtained from
/// `as_utf8` and returns the result as a `UInt32Chunked`.
fn str_n_chars(&self) -> UInt32Chunked {
    self.as_utf8().apply_kernel_cast(&string_nchars)
}

/// Get the length of the string values as number of bytes.
fn str_lengths(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_lengths)
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ The following methods are available under the `Expr.str` attribute.
ExprStringNameSpace.lengths
ExprStringNameSpace.ljust
ExprStringNameSpace.lstrip
ExprStringNameSpace.n_chars
ExprStringNameSpace.replace
ExprStringNameSpace.replace_all
ExprStringNameSpace.rjust
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ The following methods are available under the `Series.str` attribute.
StringNameSpace.lengths
StringNameSpace.ljust
StringNameSpace.lstrip
StringNameSpace.n_chars
StringNameSpace.replace
StringNameSpace.replace_all
StringNameSpace.rjust
Expand Down
78 changes: 63 additions & 15 deletions py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,28 +111,76 @@ def strptime(

def lengths(self) -> pli.Expr:
    """
    Get length of the strings as UInt32 (as number of bytes).

    Notes
    -----
    The returned lengths are equal to the number of bytes in the UTF8 string. If you
    need the length in terms of the number of characters, use ``n_chars`` instead.

    Examples
    --------
    >>> df = pl.DataFrame({"s": [None, "bears", "110"]})
    >>> df.select(["s", pl.col("s").str.lengths().alias("len")])
    shape: (3, 2)
    ┌───────┬──────┐
    │ s     ┆ len  │
    │ ---   ┆ ---  │
    │ str   ┆ u32  │
    ╞═══════╪══════╡
    │ null  ┆ null │
    ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ bears ┆ 5    │
    ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
    │ 110   ┆ 3    │
    └───────┴──────┘

    >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns(
    ...     [
    ...         pl.col("s").str.lengths().alias("length"),
    ...         pl.col("s").str.n_chars().alias("nchars"),
    ...     ]
    ... )
    >>> df
    shape: (4, 3)
    ┌──────┬────────┬────────┐
    │ s    ┆ length ┆ nchars │
    │ ---  ┆ ---    ┆ ---    │
    │ str  ┆ u32    ┆ u32    │
    ╞══════╪════════╪════════╡
    │ Café ┆ 5      ┆ 4      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ null ┆ null   ┆ null   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 345  ┆ 3      ┆ 3      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 東京 ┆ 6      ┆ 2      │
    └──────┴────────┴────────┘

    """
    # Build the byte-length expression first, then wrap it in the Python Expr.
    byte_lengths = self._pyexpr.str_lengths()
    return pli.wrap_expr(byte_lengths)

def n_chars(self) -> pli.Expr:
    """
    Get length of the strings as UInt32 (as number of chars).

    Notes
    -----
    If you know that you are working with ASCII text, ``lengths`` will be
    equivalent, and faster (returns length in terms of the number of bytes).

    Examples
    --------
    >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns(
    ...     [
    ...         pl.col("s").str.n_chars().alias("nchars"),
    ...         pl.col("s").str.lengths().alias("length"),
    ...     ]
    ... )
    >>> df
    shape: (4, 3)
    ┌──────┬────────┬────────┐
    │ s    ┆ nchars ┆ length │
    │ ---  ┆ ---    ┆ ---    │
    │ str  ┆ u32    ┆ u32    │
    ╞══════╪════════╪════════╡
    │ Café ┆ 4      ┆ 5      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ null ┆ null   ┆ null   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 345  ┆ 3      ┆ 3      │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
    │ 東京 ┆ 2      ┆ 6      │
    └──────┴────────┴────────┘

    """
    # Build the char-count expression first, then wrap it in the Python Expr.
    char_counts = self._pyexpr.str_n_chars()
    return pli.wrap_expr(char_counts)

def concat(self, delimiter: str = "-") -> pli.Expr:
"""
Vertically concat the values in the Series to a single string value.
Expand Down
41 changes: 37 additions & 4 deletions py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,23 +93,56 @@ def strptime(

def lengths(self) -> pli.Series:
    """
    Get length of the string values in the Series (as number of bytes).

    Notes
    -----
    The returned lengths are equal to the number of bytes in the UTF8 string. If you
    need the length in terms of the number of characters, use ``n_chars`` instead.

    Returns
    -------
    Series[u32]

    Examples
    --------
    >>> s = pl.Series(["Café", None, "345", "東京"])
    >>> s.str.lengths()
    shape: (4,)
    Series: '' [u32]
    [
        5
        null
        3
        6
    ]

    """

def n_chars(self) -> pli.Series:
    """
    Get length of the string values in the Series (as number of chars).

    Returns
    -------
    Series[u32]

    Notes
    -----
    If you know that you are working with ASCII text, ``lengths`` will be
    equivalent, and faster (returns length in terms of the number of bytes).

    Examples
    --------
    >>> s = pl.Series(["Café", None, "345", "東京"])
    >>> s.str.n_chars()
    shape: (4,)
    Series: '' [u32]
    [
        4
        null
        3
        2
    ]

    """
Expand Down
14 changes: 13 additions & 1 deletion py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,19 @@ impl PyExpr {
self.clone()
.inner
.map(function, GetOutput::from_type(DataType::UInt32))
.with_fmt("str.len")
.with_fmt("str.lengths")
.into()
}

/// Build an expression that yields the number of chars of each string value.
///
/// The mapped closure views the incoming `Series` as utf8 (propagating the
/// error if that fails) and calls `str_n_chars` on it. The output type is
/// declared as `UInt32` and the expression is formatted as `str.n_chars`.
pub fn str_n_chars(&self) -> PyExpr {
    let count_chars = |series: Series| {
        let utf8 = series.utf8()?;
        Ok(utf8.str_n_chars().into_series())
    };
    let output_type = GetOutput::from_type(DataType::UInt32);

    self.clone()
        .inner
        .map(count_chars, output_type)
        .with_fmt("str.n_chars")
        .into()
}

Expand Down
10 changes: 8 additions & 2 deletions py-polars/tests/unit/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1617,11 +1617,17 @@ def test_str_concat() -> None:


def test_str_lengths() -> None:
    """Check that ``str.lengths`` reports lengths in bytes.

    Two inputs are verified: a pure-ASCII series, where the byte length equals
    the number of characters, and a series with multi-byte UTF-8 values, where
    it does not. Both are checked through the Series and the Expr API.
    """
    # Previously the first input/expected pair was overwritten before it was
    # ever passed to the verifier, so the ASCII case was silently not tested.
    cases = [
        (["messi", "ronaldo", None], [5, 7, None]),
        (["Café", None, "345", "東京"], [5, None, 3, 6]),
    ]
    for values, byte_lengths in cases:
        s = pl.Series(values)
        expected = pl.Series(byte_lengths, dtype=UInt32)
        verify_series_and_expr_api(s, expected, "str.lengths")


def test_str_n_chars() -> None:
    """Check that ``str.n_chars`` counts characters, not bytes, and keeps nulls."""
    values = ["Café", None, "345", "東京"]
    char_counts = [4, None, 3, 2]
    verify_series_and_expr_api(
        pl.Series(values),
        pl.Series(char_counts, dtype=UInt32),
        "str.n_chars",
    )


def test_str_contains() -> None:
s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
expected = pl.Series([True, False, False])
Expand Down
10 changes: 10 additions & 0 deletions py-polars/tests/unit/test_utf8.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,13 @@ def test_json_path_match_type_4905() -> None:
assert df.filter(
pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
).to_dict(False) == {"json_val": ['{"a":"hello"}']}


def test_length_vs_nchars() -> None:
    """Compare byte length and char count side by side on non-ASCII strings."""
    byte_length = pl.col("s").str.lengths().alias("length")
    char_count = pl.col("s").str.n_chars().alias("nchars")
    frame = pl.DataFrame({"s": ["café", "東京"]}).with_columns(
        [byte_length, char_count]
    )
    assert frame.rows() == [("café", 5, 4), ("東京", 6, 2)]

0 comments on commit f8d60ee

Please sign in to comment.