pola-rs · ritchie46 · Oct 18, 2022 · Oct 18, 2022
diff --git a/polars/polars-arrow/src/kernels/string.rs b/polars/polars-arrow/src/kernels/string.rs
@@ -7,9 +7,14 @@ use crate::trusted_len::PushUnchecked;
 
 pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
     let values = array.offsets().windows(2).map(|x| (x[1] - x[0]) as u32);
-
     let values: Buffer<_> = Vec::from_trusted_len_iter(values).into();
+    let array = UInt32Array::from_data(DataType::UInt32, values, array.validity().cloned());
+    Box::new(array)
+}
 
+pub fn string_nchars(array: &Utf8Array<i64>) -> ArrayRef {
+    let values = array.values_iter().map(|x| x.chars().count() as u32);
+    let values: Buffer<_> = Vec::from_trusted_len_iter(values).into();
     let array = UInt32Array::from_data(DataType::UInt32, values, array.validity().cloned());
     Box::new(array)
 }
diff --git a/polars/polars-ops/src/chunked_array/strings/namespace.rs b/polars/polars-ops/src/chunked_array/strings/namespace.rs
@@ -13,7 +13,13 @@ fn f_regex_extract<'a>(reg: &Regex, input: &'a str, group_index: usize) -> Optio
 }
 
 pub trait Utf8NameSpaceImpl: AsUtf8 {
-    /// Get the length of the string values.
+    /// Get the length of the string values as number of chars (Unicode scalar values).
+    fn str_n_chars(&self) -> UInt32Chunked {
+        let ca = self.as_utf8();
+        ca.apply_kernel_cast(&string_nchars)
+    }
+
+    /// Get the length of the string values as number of bytes.
     fn str_lengths(&self) -> UInt32Chunked {
         let ca = self.as_utf8();
         ca.apply_kernel_cast(&string_lengths)

diff --git a/py-polars/docs/source/reference/expression.rst b/py-polars/docs/source/reference/expression.rst
@@ -331,6 +331,7 @@ The following methods are available under the `Expr.str` attribute.
     ExprStringNameSpace.lengths
     ExprStringNameSpace.ljust
     ExprStringNameSpace.lstrip
+    ExprStringNameSpace.n_chars
     ExprStringNameSpace.replace
     ExprStringNameSpace.replace_all
     ExprStringNameSpace.rjust

diff --git a/py-polars/docs/source/reference/series.rst b/py-polars/docs/source/reference/series.rst
@@ -283,6 +283,7 @@ The following methods are available under the `Series.str` attribute.
     StringNameSpace.lengths
     StringNameSpace.ljust
     StringNameSpace.lstrip
+    StringNameSpace.n_chars
     StringNameSpace.replace
     StringNameSpace.replace_all
     StringNameSpace.rjust

diff --git a/py-polars/polars/internals/expr/string.py b/py-polars/polars/internals/expr/string.py
@@ -111,28 +111,76 @@ def strptime(
 
     def lengths(self) -> pli.Expr:
         """
-        Get the length of the Strings as UInt32.
+        Get length of the strings as UInt32 (as number of bytes).
+
+        Notes
+        -----
+        The returned lengths equal the number of bytes in the UTF-8 string. If you
+        need the length in terms of the number of characters, use ``n_chars`` instead.
 
         Examples
         --------
-        >>> df = pl.DataFrame({"s": [None, "bears", "110"]})
-        >>> df.select(["s", pl.col("s").str.lengths().alias("len")])
-        shape: (3, 2)
-        ┌───────┬──────┐
-        │ s     ┆ len  │
-        │ ---   ┆ ---  │
-        │ str   ┆ u32  │
-        ╞═══════╪══════╡
-        │ null  ┆ null │
-        ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
-        │ bears ┆ 5    │
-        ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
-        │ 110   ┆ 3    │
-        └───────┴──────┘
+        >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns(
+        ...     [
+        ...         pl.col("s").str.lengths().alias("length"),
+        ...         pl.col("s").str.n_chars().alias("nchars"),
+        ...     ]
+        ... )
+        >>> df
+        shape: (4, 3)
+        ┌──────┬────────┬────────┐
+        │ s    ┆ length ┆ nchars │
+        │ ---  ┆ ---    ┆ ---    │
+        │ str  ┆ u32    ┆ u32    │
+        ╞══════╪════════╪════════╡
+        │ Café ┆ 5      ┆ 4      │
+        ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+        │ null ┆ null   ┆ null   │
+        ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+        │ 345  ┆ 3      ┆ 3      │
+        ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+        │ 東京  ┆ 6      ┆ 2      │
+        └──────┴────────┴────────┘
 
         """
         return pli.wrap_expr(self._pyexpr.str_lengths())
 
+    def n_chars(self) -> pli.Expr:
+        """
+        Get length of the strings as UInt32 (as number of chars).
+
+        Notes
+        -----
+        If you know that you are working with ASCII text, ``lengths`` gives the same
+        result and is faster (it returns the length in terms of the number of bytes).
+
+        Examples
+        --------
+        >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns(
+        ...     [
+        ...         pl.col("s").str.n_chars().alias("nchars"),
+        ...         pl.col("s").str.lengths().alias("length"),
+        ...     ]
+        ... )
+        >>> df
+        shape: (4, 3)
+        ┌──────┬────────┬────────┐
+        │ s    ┆ nchars ┆ length │
+        │ ---  ┆ ---    ┆ ---    │
+        │ str  ┆ u32    ┆ u32    │
+        ╞══════╪════════╪════════╡
+        │ Café ┆ 4      ┆ 5      │
+        ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+        │ null ┆ null   ┆ null   │
+        ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+        │ 345  ┆ 3      ┆ 3      │
+        ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
+        │ 東京  ┆ 2      ┆ 6      │
+        └──────┴────────┴────────┘
+
+        """
+        return pli.wrap_expr(self._pyexpr.str_n_chars())
+
     def concat(self, delimiter: str = "-") -> pli.Expr:
         """
         Vertically concat the values in the Series to a single string value.

diff --git a/py-polars/polars/internals/series/string.py b/py-polars/polars/internals/series/string.py
@@ -93,23 +93,56 @@ def strptime(
 
     def lengths(self) -> pli.Series:
         """
-        Get length of the string values in the Series.
+        Get length of the string values in the Series (as number of bytes).
+
+        Notes
+        -----
+        The returned lengths equal the number of bytes in the UTF-8 string. If you
+        need the length in terms of the number of characters, use ``n_chars`` instead.
 
         Returns
         -------
         Series[u32]
 
         Examples
         --------
-        >>> s = pl.Series(["foo", None, "hello", "world"])
+        >>> s = pl.Series(["Café", None, "345", "東京"])
         >>> s.str.lengths()
         shape: (4,)
         Series: '' [u32]
         [
+            5
+            null
             3
+            6
+        ]
+
+        """
+
+    def n_chars(self) -> pli.Series:
+        """
+        Get length of the string values in the Series (as number of chars).
+
+        Returns
+        -------
+        Series[u32]
+
+        Notes
+        -----
+        If you know that you are working with ASCII text, ``lengths`` gives the same
+        result and is faster (it returns the length in terms of the number of bytes).
+
+        Examples
+        --------
+        >>> s = pl.Series(["Café", None, "345", "東京"])
+        >>> s.str.n_chars()
+        shape: (4,)
+        Series: '' [u32]
+        [
+            4
             null
-            5
-            5
+            3
+            2
         ]
 
         """

diff --git a/py-polars/src/lazy/dsl.rs b/py-polars/src/lazy/dsl.rs
@@ -601,7 +601,19 @@ impl PyExpr {
         self.clone()
             .inner
             .map(function, GetOutput::from_type(DataType::UInt32))
-            .with_fmt("str.len")
+            .with_fmt("str.lengths")
+            .into()
+    }
+
+    pub fn str_n_chars(&self) -> PyExpr {
+        let function = |s: Series| {
+            let ca = s.utf8()?;
+            Ok(ca.str_n_chars().into_series())
+        };
+        self.clone()
+            .inner
+            .map(function, GetOutput::from_type(DataType::UInt32))
+            .with_fmt("str.n_chars")
             .into()
     }
 

diff --git a/py-polars/tests/unit/test_series.py b/py-polars/tests/unit/test_series.py
@@ -1617,11 +1617,17 @@ def test_str_concat() -> None:
 
 
 def test_str_lengths() -> None:
-    s = pl.Series(["messi", "ronaldo", None])
-    expected = pl.Series([5, 7, None], dtype=UInt32)
+    s = pl.Series(["Café", None, "345", "東京"])
+    expected = pl.Series([5, None, 3, 6], dtype=UInt32)
     verify_series_and_expr_api(s, expected, "str.lengths")
 
 
+def test_str_n_chars() -> None:
+    s = pl.Series(["Café", None, "345", "東京"])
+    expected = pl.Series([4, None, 3, 2], dtype=UInt32)
+    verify_series_and_expr_api(s, expected, "str.n_chars")
+
+
 def test_str_contains() -> None:
     s = pl.Series(["messi", "ronaldo", "ibrahimovic"])
     expected = pl.Series([True, False, False])

diff --git a/py-polars/tests/unit/test_utf8.py b/py-polars/tests/unit/test_utf8.py
@@ -12,3 +12,13 @@ def test_json_path_match_type_4905() -> None:
     assert df.filter(
         pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])
     ).to_dict(False) == {"json_val": ['{"a":"hello"}']}
+
+
+def test_length_vs_nchars() -> None:
+    df = pl.DataFrame({"s": ["café", "東京"]}).with_columns(
+        [
+            pl.col("s").str.lengths().alias("length"),
+            pl.col("s").str.n_chars().alias("nchars"),
+        ]
+    )
+    assert df.rows() == [("café", 5, 4), ("東京", 6, 2)]