[stdlib] polish: Change string __len__() methods to return length in

bytes MODULAR_ORIG_COMMIT_REV_ID: 7f87f1d40ee48279b20abac457d7bf3d2b326e5d
modular · Jan 17, 2025 · e9ce5dc · martinvuyk · Feb 1, 2025 · e9ce5dc
1 parent 1d41443
commit e9ce5dc
Show file tree

Hide file tree

Showing 5 changed files with 162 additions and 29 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -236,6 +236,19 @@ what we publish.
       # ...
   ```
 
+- The `String.__len__()` and `StringSlice.__len__()` methods now return the
+  length of the string in bytes.
+
+  Previously, these methods were documented to note that they would eventually
+  return a length in Unicode codepoints. They have been changed to guarantee
+  a length in bytes, since the length in bytes is how they are most often used
+  today (for example, as bounds to low-level memory manipulation logic).
+  Additionally, length in codepoints is a more specialized notion of string
+  length that is rarely the correct metric.
+
+  Users that know they need the length in codepoints can use the
+  `str.char_length()` method, or `len(str.chars())`.
+
 - Various functionality has moved from `String` and `StringRef` to the more
   general `StringSlice` type.
 

diff --git a/stdlib/src/collections/string/string.mojo b/stdlib/src/collections/string/string.mojo
@@ -1063,22 +1063,46 @@ struct String(
         """
         return self.byte_length() > 0
 
+    @always_inline
     fn __len__(self) -> Int:
-        """Gets the string length, in bytes (for now) PREFER:
-        String.byte_length(), a future version will make this method return
-        Unicode codepoints.
+        """Get the string length of in bytes.
+
+        This function returns the number of bytes in the underlying UTF-8
+        representation of the string.
+
+        To get the number of Unicode codepoints in a string, use
+        `len(str.chars())`.
 
         Returns:
-            The string length, in bytes (for now).
-        """
-        var unicode_length = self.byte_length()
+            The string length in bytes.
+
+        # Examples
 
-        # TODO: everything uses this method assuming it's byte length
-        # for i in range(unicode_length):
-        #     if _utf8_byte_type(self._buffer[i]) == 1:
-        #         unicode_length -= 1
+        Query the length of a string, in bytes and Unicode codepoints:
 
-        return unicode_length
+        ```mojo
+        from testing import assert_equal
+
+        var s = String("ನಮಸ್ಕಾರ")
+
+        assert_equal(len(s), 21)
+        assert_equal(len(s.chars()), 7)
+        ```
+
+        Strings containing only ASCII characters have the same byte and
+        Unicode codepoint length:
+
+        ```mojo
+        from testing import assert_equal
+
+        var s = String("abc")
+
+        assert_equal(len(s), 3)
+        assert_equal(len(s.chars()), 3)
+        ```
+        .
+        """
+        return self.byte_length()
 
     @always_inline
     fn __str__(self) -> String:

diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo
@@ -547,13 +547,48 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
         else:
             return "'" + result + "'"
 
+    @always_inline
     fn __len__(self) -> Int:
-        """Nominally returns the _length in Unicode codepoints_ (not bytes!).
+        """Get the string length in bytes.
+
+        This function returns the number of bytes in the underlying UTF-8
+        representation of the string.
+
+        To get the number of Unicode codepoints in a string, use
+        `len(str.chars())`.
 
         Returns:
-            The length in Unicode codepoints.
+            The string length in bytes.
+
+        # Examples
+
+        Query the length of a string, in bytes and Unicode codepoints:
+
+        ```mojo
+        from collections.string import StringSlice
+        from testing import assert_equal
+
+        var s = StringSlice("ನಮಸ್ಕಾರ")
+
+        assert_equal(len(s), 21)
+        assert_equal(len(s.chars()), 7)
+        ```
+
+        Strings containing only ASCII characters have the same byte and
+        Unicode codepoint length:
+
+        ```mojo
+        from collections.string import StringSlice
+        from testing import assert_equal
+
+        var s = StringSlice("abc")
+
+        assert_equal(len(s), 3)
+        assert_equal(len(s.chars()), 3)
+        ```
+        .
         """
-        return self.char_length()
+        return self.byte_length()
 
     fn write_to[W: Writer](self, mut writer: W):
         """Formats this string slice to the provided `Writer`.
@@ -1012,6 +1047,46 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
 
         Returns:
             The length in Unicode codepoints.
+
+        # Examples
+
+        Query the length of a string, in bytes and Unicode codepoints:
+
+        ```mojo
+        from collections.string import StringSlice
+        from testing import assert_equal
+
+        var s = StringSlice("ನಮಸ್ಕಾರ")
+
+        assert_equal(s.char_length(), 7)
+        assert_equal(len(s), 21)
+        ```
+
+        Strings containing only ASCII characters have the same byte and
+        Unicode codepoint length:
+
+        ```mojo
+        from collections.string import StringSlice
+        from testing import assert_equal
+
+        var s = StringSlice("abc")
+
+        assert_equal(s.char_length(), 3)
+        assert_equal(len(s), 3)
+        ```
+
+        The character length of a string with visual combining characters is
+        the length in Unicode codepoints, not grapheme clusters:
+
+        ```mojo
+        from collections.string import StringSlice
+        from testing import assert_equal
+
+        var s = StringSlice("á")
+        assert_equal(s.char_length(), 2)
+        assert_equal(s.byte_length(), 3)
+        ```
+        .
         """
         # Every codepoint is encoded as one leading byte + 0 to 3 continuation
         # bytes.

diff --git a/stdlib/test/collections/string/test_string.mojo b/stdlib/test/collections/string/test_string.mojo
@@ -89,6 +89,20 @@ def test_copy():
     assert_equal("fine", s1)
 
 
+def test_len():
+    # String length is in bytes, not codepoints.
+    var s0 = String("ನಮಸ್ಕಾರ")
+
+    assert_equal(len(s0), 21)
+    assert_equal(len(s0.chars()), 7)
+
+    # For ASCII string, the byte and codepoint length are the same:
+    var s1 = String("abc")
+
+    assert_equal(len(s1), 3)
+    assert_equal(len(s1.chars()), 3)
+
+
 def test_equality_operators():
     var s0 = String("abc")
     var s1 = String("def")
@@ -1475,6 +1489,7 @@ def test_reserve():
 def main():
     test_constructors()
     test_copy()
+    test_len()
     test_equality_operators()
     test_comparison_operators()
     test_add()

diff --git a/stdlib/test/collections/string/test_string_slice.mojo b/stdlib/test/collections/string/test_string_slice.mojo
@@ -151,23 +151,22 @@ fn test_heap_string_from_string_slice() raises:
 
 
 fn test_slice_len() raises:
-    alias str1: StringLiteral = "12345"
-    alias str2: StringLiteral = "1234"
-    alias str3: StringLiteral = "123"
-    alias str4: StringLiteral = "12"
-    alias str5: StringLiteral = "1"
+    assert_equal(5, len(StringSlice("12345")))
+    assert_equal(4, len(StringSlice("1234")))
+    assert_equal(3, len(StringSlice("123")))
+    assert_equal(2, len(StringSlice("12")))
+    assert_equal(1, len(StringSlice("1")))
+    assert_equal(0, len(StringSlice("")))
 
-    alias slice1 = str1.as_string_slice()
-    alias slice2 = str2.as_string_slice()
-    alias slice3 = str3.as_string_slice()
-    alias slice4 = str4.as_string_slice()
-    alias slice5 = str5.as_string_slice()
+    # String length is in bytes, not codepoints.
+    var s0 = String("ನಮಸ್ಕಾರ")
+    assert_equal(len(s0), 21)
+    assert_equal(len(s0.chars()), 7)
 
-    assert_equal(5, len(slice1))
-    assert_equal(4, len(slice2))
-    assert_equal(3, len(slice3))
-    assert_equal(2, len(slice4))
-    assert_equal(1, len(slice5))
+    # For ASCII string, the byte and codepoint length are the same:
+    var s1 = String("abc")
+    assert_equal(len(s1), 3)
+    assert_equal(len(s1.chars()), 3)
 
 
 fn test_slice_char_length() raises:
@@ -189,6 +188,13 @@ fn test_slice_char_length() raises:
     assert_equal(s3.byte_length(), 37)
     assert_equal(s3.char_length(), 19)
 
+    # Character length is codepoints, not graphemes
+    # This is thumbs up + a skin tone modifier codepoint.
+    var s4 = StringSlice("👍🏻")
+    assert_equal(s4.byte_length(), 8)
+    assert_equal(s4.char_length(), 2)
+    # TODO: assert_equal(s4.grapheme_count(), 1)
+
 
 fn test_slice_eq() raises:
     var str1: String = "12345"