Skip to content

Commit

Permalink
feat: add isAscii
Browse files Browse the repository at this point in the history
  • Loading branch information
dk1a committed Dec 14, 2022
1 parent 93cd020 commit dfb9916
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 2 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ Internally `StrSlice` uses `Slice` and extends it with logic for multibyte UTF-8
| | *replacen requires 0 < pattern.len() <= to.len()*|
**iterate**
| `chars` | character iterator over the slice |
**ascii**
| `isAscii` | true if all chars are ASCII |
**dangerous**
| `asSlice` | get underlying Slice |
| `ptr` | get memory pointer |
Expand Down Expand Up @@ -159,6 +161,7 @@ It's returned by some methods of `StrSlice` and `StrCharsIter`.
| `lt`,`lte` | <, <= |
| `gt`,`gte` | >, >= |
| `isValidUtf8` | usually true |
| `isAscii` | true if the char is ASCII |

Import `StrChar__` (static function lib) to use `StrChar__.fromCodePoint` for code point to `StrChar` conversion.

Expand Down
10 changes: 9 additions & 1 deletion src/StrChar.sol
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ using {
len,
toBytes32, toString, toCodePoint,
cmp, eq, ne, lt, lte, gt, gte,
isValidUtf8
isValidUtf8,
isAscii
} for StrChar global;

/**
Expand Down Expand Up @@ -159,4 +160,11 @@ function gte(StrChar self, StrChar other) pure returns (bool) {
*/
function isValidUtf8(StrChar self) pure returns (bool) {
    // `_isValidUtf8` yields 0 for an invalid char; any non-zero result means valid.
    if (_isValidUtf8(StrChar.unwrap(self)) == 0) {
        return false;
    }
    return true;
}

/**
 * @dev Returns true if `StrChar` is within the ASCII range,
 * i.e. the first byte of its underlying value is below 0x80.
 */
function isAscii(StrChar self) pure returns (bool) {
    // Only the leading byte is inspected.
    bytes1 leadByte = StrChar.unwrap(self)[0];
    return leadByte < 0x80;
}
16 changes: 15 additions & 1 deletion src/StrSlice.sol
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { Slice, Slice__, Slice__OutOfBounds } from "./Slice.sol";
import { StrChar, StrChar__ } from "./StrChar.sol";
import { StrCharsIter, StrCharsIter__ } from "./StrCharsIter.sol";
import { isValidUtf8 } from "./utils/utf8.sol";
import { memIsAscii } from "./utils/memascii.sol";
import { PackPtrLen } from "./utils/PackPtrLen.sol";

/**
Expand Down Expand Up @@ -97,7 +98,9 @@ using {
splitOnce, rsplitOnce,
replacen,
// iteration
chars
chars,
// ascii
isAscii
} for StrSlice global;

/**
Expand Down Expand Up @@ -447,6 +450,17 @@ function chars(StrSlice self) pure returns (StrCharsIter memory) {
return StrCharsIter(self.ptr(), self.len());
}

/**
 * @dev Returns true if every character of the slice is within the ASCII range.
 *
 * Note: no explicit UTF-8 validation is performed here.
 * An all-ASCII slice is necessarily valid UTF-8, but a slice that is not
 * all-ASCII *could* still be invalid UTF-8.
 * Use `StrCharsIter` when explicit validation is needed.
 */
function isAscii(StrSlice self) pure returns (bool) {
    // Delegate to the raw-memory check over the slice's pointer and length.
    uint256 start = self.ptr();
    uint256 length = self.len();
    return memIsAscii(start, length);
}

/*//////////////////////////////////////////////////////////////////////////
FILE FUNCTIONS
//////////////////////////////////////////////////////////////////////////*/
Expand Down
56 changes: 56 additions & 0 deletions src/utils/memascii.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// SPDX-License-Identifier: MIT

pragma solidity ^0.8.17;

import { leftMask } from "./mem.sol";

/*
* These functions are VERY DANGEROUS!
* They operate directly on memory pointers, use with caution.
*
* Assembly here is marked as memory-safe for optimization.
* The caller MUST use pointers in a memory-safe way!
* https://docs.soliditylang.org/en/latest/assembly.html#memory-safety
*/

/// @dev 32 0x80 bytes. 0x80 = 1000_0000
/// `type(uint256).max / type(uint8).max` equals 0x0101...01 (each of the 32 bytes is 0x01);
/// multiplying it by 0x80 therefore sets only the highest bit of every byte.
/// A value ANDed with this mask is non-zero iff at least one of its bytes is >= 0x80.
uint256 constant ASCII_MASK = 0x80 * (type(uint256).max / type(uint8).max);

/**
 * @dev Efficiently checks if all bytes are within the ASCII range.
 *
 * Reads `textLen` bytes starting at memory pointer `textPtr`: first in whole
 * 32-byte words, then the remaining (< 32) bytes as one masked word.
 * Returns false as soon as a byte with its highest bit set is found.
 */
function memIsAscii(uint256 textPtr, uint256 textLen) pure returns (bool) {
    // Number of bytes left over after the last whole 32-byte word,
    // and the pointer at which those leftover bytes begin.
    uint256 remainder;
    uint256 wordsEnd;
    // cannot underflow: remainder <= textLen (ptr+len is implicitly safe)
    unchecked {
        remainder = textLen % 32;
        wordsEnd = textPtr + (textLen - remainder);
    }

    // scan the whole words, testing 32 bytes at a time against the ascii mask
    uint256 word;
    for (uint256 cursor = textPtr; cursor < wordsEnd; ) {
        /// @solidity memory-safe-assembly
        assembly {
            word := mload(cursor)
        }
        // a set high bit in any byte means a non-ascii byte
        if ((word & ASCII_MASK) != 0) {
            return false;
        }
        // cannot overflow: cursor < wordsEnd, and wordsEnd = textPtr + n*32
        unchecked {
            cursor += 32;
        }
    }

    // the last word is masked so that bytes past the end of the text are ignored
    uint256 tailMask = leftMask(remainder);
    /// @solidity memory-safe-assembly
    assembly {
        word := and(mload(wordsEnd), tailMask)
    }
    // the tail is ascii iff no remaining byte has its high bit set
    return (word & ASCII_MASK) == 0;
}
19 changes: 19 additions & 0 deletions test/StrChar.t.sol
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,25 @@ contract StrCharTest is PRBTest {
assertEq(StrChar__.from(bytes32(hex"F09080801111")).toCodePoint(), 0x10000);
assertEq(StrChar__.from(bytes32(hex"F090808000FF")).toCodePoint(), 0x10000);
}

/*//////////////////////////////////////////////////////////////////////////
ASCII
//////////////////////////////////////////////////////////////////////////*/

function testIsAscii() public {
    // Every code point below 0x80 must be reported as ASCII.
    for (uint256 codePoint; codePoint < 0x80; codePoint++) {
        assertTrue(StrChar__.fromCodePoint(codePoint).isAscii());
    }

    // Every code point from 0x80 up to (excluding) 0x20000 must be reported as non-ASCII.
    for (uint256 codePoint = 0x80; codePoint < 0x20000; codePoint++) {
        // surrogate halves are skipped
        bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (!isSurrogate) {
            assertFalse(StrChar__.fromCodePoint(codePoint).isAscii());
        }
    }

    // Spot-check the code point 0x10FFFF as well.
    assertFalse(StrChar__.fromCodePoint(0x10FFFF).isAscii());
}
}

contract StrCharRevertHelper {
Expand Down
18 changes: 18 additions & 0 deletions test/StrSlice.t.sol
Original file line number Diff line number Diff line change
Expand Up @@ -218,4 +218,22 @@ contract StrSliceTest is PRBTest, StrSliceAssertions {
}

// TODO more tests

/*//////////////////////////////////////////////////////////////////////////
ASCII
//////////////////////////////////////////////////////////////////////////*/

function testIsAscii() public {
    // All 128 byte values 0x00..0x7f, in order.
    string memory allAscii = hex"000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f";

    // ASCII-only inputs: empty, single char, 128 bytes, 512 bytes.
    assertTrue(toSlice("").isAscii());
    assertTrue(toSlice("a").isAscii());
    assertTrue(toSlice(allAscii).isAscii());
    string memory repeated = string(abi.encodePacked(allAscii, allAscii, allAscii, allAscii));
    assertTrue(toSlice(repeated).isAscii());

    // Short non-ASCII inputs.
    assertFalse(toSlice(unicode"📎").isAscii());
    assertFalse(toSlice(unicode"012こ").isAscii());
    assertFalse(toSlice(string(bytes(hex"FF"))).isAscii());

    // A non-ASCII byte at the very start, and non-ASCII bytes appended at the very end.
    string memory highByteFirst = string(abi.encodePacked(hex"80", allAscii));
    assertFalse(toSlice(highByteFirst).isAscii());
    string memory highByteLast = string(abi.encodePacked(allAscii, hex"80"));
    assertFalse(toSlice(highByteLast).isAscii());
    string memory emojiLast = string(abi.encodePacked(allAscii, unicode"📎"));
    assertFalse(toSlice(emojiLast).isAscii());
}
}

0 comments on commit dfb9916

Please sign in to comment.