feat: add isAscii

dk1a · Dec 14, 2022 · dfb9916 · dfb9916
1 parent 93cd020
commit dfb9916
Show file tree

Hide file tree

Showing 6 changed files with 120 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -89,6 +89,8 @@ Internally `StrSlice` uses `Slice` and extends it with logic for multibyte UTF-8
 |                  | *replacen requires 0 < pattern.len() <= to.len()*|
 **iterate**
 | `chars`          | character iterator over the slice                |
+**ascii**
+| `isAscii`        | true if all chars are ASCII                      |
 **dangerous**
 | `asSlice`        | get underlying Slice                             |
 | `ptr`            | get memory pointer                               |
@@ -159,6 +161,7 @@ It's returned by some methods of `StrSlice` and `StrCharsIter`.
 | `lt`,`lte`       | <, <=                                            |
 | `gt`,`gte`       | >, >=                                            |
 | `isValidUtf8`    | usually true                                     |
+| `isAscii`        | true if the char is ASCII                        |
 
 Import `StrChar__` (static function lib) to use `StrChar__.fromCodePoint` for code point to `StrChar` conversion.
 

diff --git a/src/StrChar.sol b/src/StrChar.sol
@@ -66,7 +66,8 @@ using {
     len,
     toBytes32, toString, toCodePoint,
     cmp, eq, ne, lt, lte, gt, gte,
-    isValidUtf8
+    isValidUtf8,
+    isAscii
 } for StrChar global;
 
 /**
@@ -159,4 +160,11 @@ function gte(StrChar self, StrChar other) pure returns (bool) {
  */
 function isValidUtf8(StrChar self) pure returns (bool) {
     return _isValidUtf8(StrChar.unwrap(self)) != 0;
+}
+
+/**
+ * @dev Tells whether this `StrChar` is an ASCII character,
+ * i.e. whether its first byte is below 0x80 (high bit clear).
+ */
+function isAscii(StrChar self) pure returns (bool) {
+    // only the leading byte is inspected
+    bytes1 leadByte = StrChar.unwrap(self)[0];
+    return leadByte < 0x80;
 }
diff --git a/src/StrSlice.sol b/src/StrSlice.sol
@@ -6,6 +6,7 @@ import { Slice, Slice__, Slice__OutOfBounds } from "./Slice.sol";
 import { StrChar, StrChar__ } from "./StrChar.sol";
 import { StrCharsIter, StrCharsIter__ } from "./StrCharsIter.sol";
 import { isValidUtf8 } from "./utils/utf8.sol";
+import { memIsAscii } from "./utils/memascii.sol";
 import { PackPtrLen } from "./utils/PackPtrLen.sol";
 
 /**
@@ -97,7 +98,9 @@ using {
     splitOnce, rsplitOnce,
     replacen,
     // iteration
-    chars
+    chars,
+    // ascii
+    isAscii
 } for StrSlice global;
 
 /**
@@ -447,6 +450,17 @@ function chars(StrSlice self) pure returns (StrCharsIter memory) {
     return StrCharsIter(self.ptr(), self.len());
 }
 
+/**
+ * @dev Returns true when every byte of the slice lies within the ASCII range.
+ *
+ * This is a raw byte check and does NOT validate UTF-8:
+ * an all-ASCII slice is always valid UTF-8, but a `false` result says nothing
+ * about whether the non-ASCII bytes form valid UTF-8.
+ * Use `StrCharsIter` when explicit validation is needed.
+ */
+function isAscii(StrSlice self) pure returns (bool) {
+    uint256 start = self.ptr();
+    uint256 length = self.len();
+    return memIsAscii(start, length);
+}
+
 /*//////////////////////////////////////////////////////////////////////////
                               FILE FUNCTIONS
 //////////////////////////////////////////////////////////////////////////*/

diff --git a/src/utils/memascii.sol b/src/utils/memascii.sol
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+import { leftMask } from "./mem.sol";
+
+/*
+ * These functions are VERY DANGEROUS!
+ * They operate directly on memory pointers, use with caution.
+ *
+ * Assembly here is marked as memory-safe for optimization.
+ * The caller MUST use pointers in a memory-safe way!
+ * https://docs.soliditylang.org/en/latest/assembly.html#memory-safety
+ */
+
+/// @dev A 32-byte word where every byte is 0x80 (0x80 = 0b1000_0000, i.e. only the high bit set).
+uint256 constant ASCII_MASK = (type(uint256).max / type(uint8).max) * 0x80;
+
+/**
+ * @dev Checks whether every byte in the memory range starting at `textPtr`
+ * and spanning `textLen` bytes has its high bit clear (is within the ASCII range).
+ *
+ * The range is scanned one 32-byte word at a time; the final partial word
+ * is masked before being tested against `ASCII_MASK`.
+ *
+ * @param textPtr Memory pointer to the first byte to check.
+ * @param textLen Number of bytes to check.
+ * @return True if no checked byte has the 0x80 bit set, false otherwise.
+ */
+function memIsAscii(uint256 textPtr, uint256 textLen) pure returns (bool) {
+    // split the range into whole 32-byte words followed by a shorter remainder
+    uint256 remainder;
+    uint256 wordsEnd;
+    // cannot underflow: remainder <= textLen (ptr+len is taken to be in range)
+    unchecked {
+        remainder = textLen % 32;
+        wordsEnd = textPtr + (textLen - remainder);
+    }
+
+    uint256 word;
+    // whole words: a set high bit anywhere means a non-ascii byte
+    for (uint256 cursor = textPtr; cursor < wordsEnd; ) {
+        /// @solidity memory-safe-assembly
+        assembly {
+            word := mload(cursor)
+        }
+        // stop early on the first word that contains a non-ascii byte
+        if ((word & ASCII_MASK) != 0) {
+            return false;
+        }
+        // cannot overflow: cursor < wordsEnd, and wordsEnd is a multiple of 32 bytes past the start
+        unchecked {
+            cursor += 32;
+        }
+    }
+
+    // remainder: load the word at wordsEnd and mask it with leftMask(remainder)
+    // (presumably keeps only the leading `remainder` bytes - confirm against mem.sol)
+    uint256 keepMask = leftMask(remainder);
+    /// @solidity memory-safe-assembly
+    assembly {
+        word := and(mload(wordsEnd), keepMask)
+    }
+    // the masked remainder must not contain any high bits either
+    return (word & ASCII_MASK) == 0;
+}
diff --git a/test/StrChar.t.sol b/test/StrChar.t.sol
@@ -299,6 +299,25 @@ contract StrCharTest is PRBTest {
         assertEq(StrChar__.from(bytes32(hex"F09080801111")).toCodePoint(), 0x10000);
         assertEq(StrChar__.from(bytes32(hex"F090808000FF")).toCodePoint(), 0x10000);
     }
+
+    /*//////////////////////////////////////////////////////////////////////////
+                                    ASCII
+    //////////////////////////////////////////////////////////////////////////*/
+
+    function testIsAscii() public {
+        // every code point below 0x80 must be reported as ASCII
+        for (uint256 codePoint = 0; codePoint < 0x80; codePoint++) {
+            assertTrue(StrChar__.fromCodePoint(codePoint).isAscii());
+        }
+
+        // every code point from 0x80 up to (excluding) 0x20000 must NOT be ASCII
+        for (uint256 codePoint = 0x80; codePoint < 0x20000; codePoint++) {
+            // surrogate halves are skipped
+            bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
+            if (!isSurrogate) {
+                assertFalse(StrChar__.fromCodePoint(codePoint).isAscii());
+            }
+        }
+
+        // spot-check the highest code point used by this test
+        assertFalse(StrChar__.fromCodePoint(0x10FFFF).isAscii());
+    }
 }
 
 contract StrCharRevertHelper {

diff --git a/test/StrSlice.t.sol b/test/StrSlice.t.sol
@@ -218,4 +218,22 @@ contract StrSliceTest is PRBTest, StrSliceAssertions {
     }
 
     // TODO more tests
+
+    /*//////////////////////////////////////////////////////////////////////////
+                                    ASCII
+    //////////////////////////////////////////////////////////////////////////*/
+
+    function testIsAscii() public {
+        // all 128 byte values from 0x00 through 0x7f
+        string memory allAscii = hex"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f";
+
+        // inputs made only of ASCII bytes, including the empty slice
+        assertTrue(toSlice("").isAscii());
+        assertTrue(toSlice("a").isAscii());
+        assertTrue(toSlice(allAscii).isAscii());
+        string memory repeatedAscii = string(abi.encodePacked(allAscii, allAscii, allAscii, allAscii));
+        assertTrue(toSlice(repeatedAscii).isAscii());
+
+        // short inputs containing at least one non-ASCII byte
+        assertFalse(toSlice(unicode"📎").isAscii());
+        assertFalse(toSlice(unicode"012こ").isAscii());
+        assertFalse(toSlice(string(bytes(hex"FF"))).isAscii());
+
+        // a non-ASCII byte at the very start or the very end of a long input
+        assertFalse(toSlice(string(abi.encodePacked(hex"80", allAscii))).isAscii());
+        assertFalse(toSlice(string(abi.encodePacked(allAscii, hex"80"))).isAscii());
+        assertFalse(toSlice(string(abi.encodePacked(allAscii, unicode"📎"))).isAscii());
+    }
 }