diff --git a/README.md b/README.md index 267945c..09986cc 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,8 @@ Internally `StrSlice` uses `Slice` and extends it with logic for multibyte UTF-8 | | *replacen requires 0 < pattern.len() <= to.len()*| **iterate** | `chars` | character iterator over the slice | +**ascii** +| `isAscii` | true if all chars are ASCII | **dangerous** | `asSlice` | get underlying Slice | | `ptr` | get memory pointer | @@ -159,6 +161,7 @@ It's returned by some methods of `StrSlice` and `StrCharsIter`. | `lt`,`lte` | <, <= | | `gt`,`gte` | >, >= | | `isValidUtf8` | usually true | +| `isAscii` | true if the char is ASCII | Import `StrChar__` (static function lib) to use `StrChar__.fromCodePoint` for code point to `StrChar` conversion. diff --git a/src/StrChar.sol b/src/StrChar.sol index ff9875b..af53da1 100644 --- a/src/StrChar.sol +++ b/src/StrChar.sol @@ -66,7 +66,8 @@ using { len, toBytes32, toString, toCodePoint, cmp, eq, ne, lt, lte, gt, gte, - isValidUtf8 + isValidUtf8, + isAscii } for StrChar global; /** @@ -159,4 +160,11 @@ function gte(StrChar self, StrChar other) pure returns (bool) { */ function isValidUtf8(StrChar self) pure returns (bool) { return _isValidUtf8(StrChar.unwrap(self)) != 0; +} + +/** + * @dev Returns true if `StrChar` is within the ASCII range. 
+ */ +function isAscii(StrChar self) pure returns (bool) { + return StrChar.unwrap(self)[0] < 0x80; } \ No newline at end of file diff --git a/src/StrSlice.sol b/src/StrSlice.sol index 28f0530..4cec3c0 100644 --- a/src/StrSlice.sol +++ b/src/StrSlice.sol @@ -6,6 +6,7 @@ import { Slice, Slice__, Slice__OutOfBounds } from "./Slice.sol"; import { StrChar, StrChar__ } from "./StrChar.sol"; import { StrCharsIter, StrCharsIter__ } from "./StrCharsIter.sol"; import { isValidUtf8 } from "./utils/utf8.sol"; +import { memIsAscii } from "./utils/memascii.sol"; import { PackPtrLen } from "./utils/PackPtrLen.sol"; /** @@ -97,7 +98,9 @@ using { splitOnce, rsplitOnce, replacen, // iteration - chars + chars, + // ascii + isAscii } for StrSlice global; /** @@ -447,6 +450,17 @@ function chars(StrSlice self) pure returns (StrCharsIter memory) { return StrCharsIter(self.ptr(), self.len()); } +/** + * @dev Checks if all characters are within the ASCII range. + * + * Note this does NOT explicitly validate UTF-8. + * Whereas ASCII certainly is valid UTF-8, non-ASCII *could* be invalid UTF-8. + * Use `StrCharsIter` for explicit validation. + */ +function isAscii(StrSlice self) pure returns (bool) { + return memIsAscii(self.ptr(), self.len()); +} + /*////////////////////////////////////////////////////////////////////////// FILE FUNCTIONS //////////////////////////////////////////////////////////////////////////*/ diff --git a/src/utils/memascii.sol b/src/utils/memascii.sol new file mode 100644 index 0000000..ff8f6bd --- /dev/null +++ b/src/utils/memascii.sol @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT + +pragma solidity ^0.8.17; + +import { leftMask } from "./mem.sol"; + +/* + * These functions are VERY DANGEROUS! + * They operate directly on memory pointers, use with caution. + * + * Assembly here is marked as memory-safe for optimization. + * The caller MUST use pointers in a memory-safe way! 
+ * https://docs.soliditylang.org/en/latest/assembly.html#memory-safety + */ + +/// @dev 32 bytes, each equal to 0x80 (only the high bit set). 0x80 = 0b1000_0000 +uint256 constant ASCII_MASK = 0x80 * (type(uint256).max / type(uint8).max); + +/** + * @dev Efficiently checks if all bytes are within the ASCII range. + */ +function memIsAscii(uint256 textPtr, uint256 textLen) pure returns (bool) { + uint256 tailLen; + uint256 endPtr; + // safe because tailLen <= textLen (ptr+len is implicitly safe) + unchecked { + tailLen = textLen % 32; + endPtr = textPtr + (textLen - tailLen); + } + + // check 32 byte chunks with the ascii mask + uint256 b; + while (textPtr < endPtr) { + /// @solidity memory-safe-assembly + assembly { + b := mload(textPtr) + } + // return early if any non-ASCII byte is found + if (b & ASCII_MASK != 0) { + return false; + } + // safe because textPtr < endPtr, and endPtr = textPtr + n*32 (see tailLen) + unchecked { + textPtr += 32; + } + } + + // this mask removes any trailing bytes + uint256 trailingMask = leftMask(tailLen); + /// @solidity memory-safe-assembly + assembly { + b := and(mload(endPtr), trailingMask) + } + // check tail with the ascii mask + return b & ASCII_MASK == 0; +} \ No newline at end of file diff --git a/test/StrChar.t.sol b/test/StrChar.t.sol index 53260d9..c079f06 100644 --- a/test/StrChar.t.sol +++ b/test/StrChar.t.sol @@ -299,6 +299,25 @@ contract StrCharTest is PRBTest { assertEq(StrChar__.from(bytes32(hex"F09080801111")).toCodePoint(), 0x10000); assertEq(StrChar__.from(bytes32(hex"F090808000FF")).toCodePoint(), 0x10000); } + + /*////////////////////////////////////////////////////////////////////////// + ASCII + //////////////////////////////////////////////////////////////////////////*/ + + function testIsAscii() public { + for (uint256 i; i < 0x80; i++) { + assertTrue(StrChar__.fromCodePoint(i).isAscii()); + } + + for (uint256 i = 0x80; i < 0x20000; i++) { + if (0xD800 <= i && i <= 0xDFFF) { + // skip surrogate halves + continue; + } + 
assertFalse(StrChar__.fromCodePoint(i).isAscii()); + } + assertFalse(StrChar__.fromCodePoint(0x10FFFF).isAscii()); + } } contract StrCharRevertHelper { diff --git a/test/StrSlice.t.sol b/test/StrSlice.t.sol index ae81c5c..b91b52e 100644 --- a/test/StrSlice.t.sol +++ b/test/StrSlice.t.sol @@ -218,4 +218,22 @@ contract StrSliceTest is PRBTest, StrSliceAssertions { } // TODO more tests + + /*////////////////////////////////////////////////////////////////////////// + ASCII + //////////////////////////////////////////////////////////////////////////*/ + + function testIsAscii() public { + string memory ascii = hex"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f"; + assertTrue(toSlice("").isAscii()); + assertTrue(toSlice("a").isAscii()); + assertTrue(toSlice(ascii).isAscii()); + assertTrue(toSlice(string(abi.encodePacked(ascii, ascii, ascii, ascii))).isAscii()); + assertFalse(toSlice(unicode"📎").isAscii()); + assertFalse(toSlice(unicode"012こ").isAscii()); + assertFalse(toSlice(string(bytes(hex"FF"))).isAscii()); + assertFalse(toSlice(string(abi.encodePacked(hex"80", ascii))).isAscii()); + assertFalse(toSlice(string(abi.encodePacked(ascii, hex"80"))).isAscii()); + assertFalse(toSlice(string(abi.encodePacked(ascii, unicode"📎"))).isAscii()); + } } \ No newline at end of file