diff --git a/src/Slice.sol b/src/Slice.sol index 235844c..210eca0 100644 --- a/src/Slice.sol +++ b/src/Slice.sol @@ -75,7 +75,7 @@ using { cmp, eq, ne, lt, lte, gt, gte, // index get, first, last, - splitAt, getSubslice, + splitAt, getSubslice, getBefore, getAfter, // search find, rfind, contains, startsWith, endsWith, @@ -199,7 +199,8 @@ function copyToBytes32(Slice self) pure returns (bytes32 b) { uint256 selfPtr = self.ptr(); // mask removes any trailing bytes - uint256 mask = leftMask(self.len()); + uint256 selfLen = self.len(); + uint256 mask = leftMask(selfLen > 32 ? 32 : selfLen); /// @solidity memory-safe-assembly assembly { @@ -322,7 +323,7 @@ function splitAt(Slice self, uint256 mid) pure returns (Slice, Slice) { } /** - * @dev Returns a subslice [start..end) of `self`. + * @dev Returns a subslice [start:end] of `self`. * Reverts if start/end are out of bounds. */ function getSubslice(Slice self, uint256 start, uint256 end) pure returns (Slice) { @@ -334,6 +335,29 @@ function getSubslice(Slice self, uint256 start, uint256 end) pure returns (Slice } } +/** + * @dev Returns a subslice [:index] of `self`. + * Reverts if `index` > length. + */ +function getBefore(Slice self, uint256 index) pure returns (Slice) { + uint256 selfLen = self.len(); + if (index > selfLen) revert Slice__OutOfBounds(); + return Slice__.fromRawParts(self.ptr(), index); +} + +/** + * @dev Returns a subslice [index:] of `self`. + * Reverts if `index` >= length. + */ +function getAfter(Slice self, uint256 index) pure returns (Slice) { + uint256 selfLen = self.len(); + if (index >= selfLen) revert Slice__OutOfBounds(); + // safe because index < selfLen was checked above (ptr+len is implicitly safe) + unchecked { + return Slice__.fromRawParts(self.ptr() + index, selfLen - index); + } +} + /** * @dev Returns the byte index of the first slice of `self` that matches `pattern`. * Returns type(uint256).max if the `pattern` does not match. 
diff --git a/src/StrChar.sol b/src/StrChar.sol
new file mode 100644
index 0000000..7fa576a
--- /dev/null
+++ b/src/StrChar.sol
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+import { isValidUtf8 as _isValidUtf8, utf8CharWidth } from "./utf8.sol";
+import { leftMask } from "./mem.sol";
+
+/**
+ * @title A single UTF-8 encoded character.
+ * @dev Internally it is stored as UTF-8 encoded bytes starting from left/MSB.
+ */
+type StrChar is bytes32;
+
+/*//////////////////////////////////////////////////////////////////////////
+                              CUSTOM ERRORS
+//////////////////////////////////////////////////////////////////////////*/
+
+error StrChar__InvalidUTF8();
+
+/*//////////////////////////////////////////////////////////////////////////
+                              STATIC FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+library StrChar__ {
+    /**
+     * @dev Converts the first 1-4 bytes of `bytes32` to a `StrChar`.
+     * Starts from left/MSB, reverts with `StrChar__InvalidUTF8` if not valid UTF-8.
+     * @param b UTF-8 encoded character in the most significant bytes.
+     */
+    function from(bytes32 b) internal pure returns (StrChar char) {
+        if (!_isValidUtf8(b)) revert StrChar__InvalidUTF8();
+        return fromValidUtf8(b);
+    }
+
+    /**
+     * @dev Like `from`, but does NOT check UTF-8 validity.
+     * If the leading byte of `bytes32` isn't a valid UTF-8 leading byte, the width is 0 and this will return the \0 character!
+     * Primarily for internal use.
+     */
+    function fromValidUtf8(bytes32 b) internal pure returns (StrChar char) {
+        uint256 _len = len(StrChar.wrap(b));
+        return StrChar.wrap(bytes32(
+            // zero-pad after the character
+            uint256(b) & leftMask(_len)
+        ));
+    }
+
+    /**
+     * @dev Like `from`, but does NO validity checks.
+     * MSB of `bytes32` MUST be valid UTF-8!
+     * And `bytes32` MUST be zero-padded after the first UTF-8 character!
+     * Primarily for internal use.
+     */
+    function fromUnchecked(bytes32 b) internal pure returns (StrChar char) {
+        return StrChar.wrap(b);
+    }
+
+    // TODO codepoint to UTF-8, and the reverse
+    /**
+     * @dev Converts a `uint32` to a `StrChar`.
+     * Note that not all code points are valid.
+     * @param i a code point. E.g. for '€' code point = 0x20AC; whereas UTF-8 = 0xE282AC.
+     *
+    function from(uint32 i) internal pure returns (StrChar char) {
+        // U+D800–U+DFFF are invalid UTF-16 surrogate halves
+        if (i > MAX || (i >= 0xD800 && i < 0xE000)) {
+            revert StrChar__InvalidUSV();
+        }
+
+    }*/
+}
+
+/*//////////////////////////////////////////////////////////////////////////
+                              GLOBAL FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+using {
+    len,
+    toBytes32, toString,
+    cmp, eq, ne, lt, lte, gt, gte,
+    isValidUtf8
+} for StrChar global;
+
+/**
+ * @dev Returns the character's length in bytes (1-4), derived from the leading byte only.
+ * Returns 0 for some (not all!) invalid characters (e.g. due to unsafe use of fromValidUtf8).
+ */
+function len(StrChar self) pure returns (uint256) {
+    return utf8CharWidth(
+        // extract the leading byte
+        uint8(StrChar.unwrap(self)[0])
+    );
+}
+
+/**
+ * @dev Converts a `StrChar` to its underlying bytes32 value.
+ */
+function toBytes32(StrChar self) pure returns (bytes32) {
+    return StrChar.unwrap(self);
+}
+
+/**
+ * @dev Converts a `StrChar` to a newly allocated `string` of `len()` bytes.
+ */
+function toString(StrChar self) pure returns (string memory str) {
+    uint256 _len = self.len();
+    str = new string(_len);
+    // TODO bitmask is probably better
+    for (uint256 i; i < _len; i++) {
+        bytes(str)[i] = StrChar.unwrap(self)[i];
+    }
+    return str;
+}
+
+/**
+ * @dev Compare characters lexicographically.
+ * @return result 0 for equal, < 0 for less than and > 0 for greater than.
+ */
+function cmp(StrChar self, StrChar other) pure returns (int result) {
+    uint256 selfUint = uint256(StrChar.unwrap(self));
+    uint256 otherUint = uint256(StrChar.unwrap(other));
+    if (selfUint > otherUint) {
+        return 1;
+    } else if (selfUint < otherUint) {
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+/// @dev `self` == `other` (compares the underlying bytes32 values as integers)
+function eq(StrChar self, StrChar other) pure returns (bool) {
+    return uint256(StrChar.unwrap(self)) == uint256(StrChar.unwrap(other));
+}
+
+/// @dev `self` != `other`
+function ne(StrChar self, StrChar other) pure returns (bool) {
+    return uint256(StrChar.unwrap(self)) != uint256(StrChar.unwrap(other));
+}
+
+/// @dev `self` < `other`
+function lt(StrChar self, StrChar other) pure returns (bool) {
+    return uint256(StrChar.unwrap(self)) < uint256(StrChar.unwrap(other));
+}
+
+/// @dev `self` <= `other`
+function lte(StrChar self, StrChar other) pure returns (bool) {
+    return uint256(StrChar.unwrap(self)) <= uint256(StrChar.unwrap(other));
+}
+
+/// @dev `self` > `other`
+function gt(StrChar self, StrChar other) pure returns (bool) {
+    return uint256(StrChar.unwrap(self)) > uint256(StrChar.unwrap(other));
+}
+
+/// @dev `self` >= `other`
+function gte(StrChar self, StrChar other) pure returns (bool) {
+    return uint256(StrChar.unwrap(self)) >= uint256(StrChar.unwrap(other));
+}
+
+/**
+ * @dev Returns true if `StrChar` is valid UTF-8.
+ * Can be false if it was formed with an unsafe method (fromValidUtf8, fromUnchecked, wrap).
+ */
+function isValidUtf8(StrChar self) pure returns (bool) {
+    return _isValidUtf8(StrChar.unwrap(self));
+}
\ No newline at end of file
diff --git a/src/StrCharsIter.sol b/src/StrCharsIter.sol
new file mode 100644
index 0000000..2889dcb
--- /dev/null
+++ b/src/StrCharsIter.sol
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+import { Slice } from "./Slice.sol";
+import { StrSlice } from "./StrSlice.sol";
+import { SliceIter, SliceIter__, SliceIter__StopIteration } from "./SliceIter.sol";
+import { StrChar, StrChar__, StrChar__InvalidUTF8 } from "./StrChar.sol";
+
+/**
+ * @title String chars iterator.
+ * @dev This struct is created by the `chars` method on `StrSlice`.
+ * Iterates 1 UTF-8 encoded character at a time (which may have 1-4 bytes).
+ *
+ * Note StrCharsIter iterates over UTF-8 encoded codepoints, not unicode scalar values.
+ * This is mostly done for simplicity, since solidity doesn't care about unicode anyways.
+ *
+ * TODO think about actually adding char and unicode awareness?
+ * https://github.com/devstein/unicode-eth attempts something like that
+ */
+struct StrCharsIter {
+    uint256 _ptr;
+    uint256 _len;
+}
+
+/*//////////////////////////////////////////////////////////////////////////
+                              STATIC FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+library StrCharsIter__ {
+    /**
+     * @dev Creates a new `StrCharsIter` from `StrSlice`.
+     * Note the `StrSlice` is assumed to be memory-safe.
+     */
+    function from(StrSlice slice) internal pure returns (StrCharsIter memory) {
+        return StrCharsIter(slice.ptr(), slice.len());
+
+        // TODO I'm curious about gas differences
+        // return StrCharsIter(SliceIter__.from(str.asSlice()));
+    }
+}
+
+/*//////////////////////////////////////////////////////////////////////////
+                              GLOBAL FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+using {
+    asStr,
+    ptr, len, isEmpty,
+    next, nextBack,
+    count
+} for StrCharsIter global;
+
+/**
+ * @dev Views the remaining (not yet iterated) data as a subslice of the original data.
+ */
+function asStr(StrCharsIter memory self) pure returns (StrSlice slice) {
+    return StrSlice.wrap(Slice.unwrap(
+        self._sliceIter().asSlice()
+    ));
+}
+
+/**
+ * @dev Returns the pointer to the start of an in-memory string slice.
+ * This method is primarily for internal use.
+ */
+function ptr(StrCharsIter memory self) pure returns (uint256) {
+    return self._ptr;
+}
+
+/**
+ * @dev Returns the length in bytes, not codepoints.
+ */
+function len(StrCharsIter memory self) pure returns (uint256) {
+    return self._len;
+}
+
+/**
+ * @dev Returns true if the iterator is empty.
+ */
+function isEmpty(StrCharsIter memory self) pure returns (bool) {
+    return self._sliceIter().isEmpty();
+}
+
+/**
+ * @dev Advances the iterator and returns the next character.
+ * Reverts with `SliceIter__StopIteration` if len == 0, and with `StrChar__InvalidUTF8` if the next bytes are not valid UTF-8.
+ */
+function next(StrCharsIter memory self) pure returns (StrChar char) {
+    if (self.len() == 0) revert SliceIter__StopIteration();
+
+    bytes32 b = self._sliceIter().asSlice().copyToBytes32();
+    // Reverts if can't make valid UTF-8
+    char = StrChar__.from(b);
+
+    // advance the iterator
+    // TODO this can probably be unchecked (copyToBytes32 zeros overflow, and selfLen != 0 so \0 can be a char too)
+    self._ptr += char.len();
+    self._len -= char.len();
+
+    return char;
+}
+
+/**
+ * @dev Advances the iterator from the back and returns the next character.
+ * Reverts if len == 0.
+ */
+function nextBack(StrCharsIter memory self) pure returns (StrChar char) {
+    if (self.len() == 0) revert SliceIter__StopIteration();
+
+    // _self shares memory with self! (so `_self.nextBack()` below also advances `self`)
+    SliceIter memory _self = self._sliceIter();
+
+    bool isValid;
+    uint256 b;
+    for (uint256 i; i < 4; i++) {
+        // an example of what's going on in the loop (for '€' = 0xE282AC):
+        // b = 0x000000..0000
+        // nextBack = 0xAC
+        // b = 0xAC0000..0000 (not valid UTF-8)
+        // nextBack = 0x82
+        // b = 0x82AC00..0000 (not valid UTF-8)
+        // nextBack = 0xE2; b = 0xE282AC..0000 (valid UTF-8, break)
+
+        b = (b >> 8) | (
+            // get 1 byte in LSB
+            uint256(_self.nextBack())
+            // flip it to MSB (earlier bytes were shifted right, which keeps them in string order)
+            << (31 * 8)
+        );
+        // break if the char is valid
+        char = StrChar__.fromUnchecked(bytes32(b));
+        isValid = char.isValidUtf8();
+        if (isValid) break;
+    }
+    if (!isValid) revert StrChar__InvalidUTF8();
+
+    // the iterator is already advanced: every `_self.nextBack()` consumed 1 byte of the shared length,
+    // so `self._len` must NOT be decremented again (NOTE(review): relies on SliceIter.nextBack shrinking `_len`)
+    // fromUnchecked was safe, because UTF-8 was validated,
+    // and all the remaining bytes are 0 (since the loop went byte-by-byte)
+    return char;
+}
+
+/**
+ * @dev Consumes the iterator, counting the number of UTF-8 characters.
+ * Note O(n) time!
+ * Reverts on invalid UTF-8.
+ */
+function count(StrCharsIter memory self) pure returns (uint256 result) {
+    while (!self.isEmpty()) {
+        self.next();
+        result += 1;
+    }
+    return result;
+}
+
+/*//////////////////////////////////////////////////////////////////////////
+                              FILE-LEVEL FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+using { _sliceIter } for StrCharsIter;
+
+/**
+ * @dev Returns the underlying `SliceIter`.
+ * AVOID USING THIS EXTERNALLY!
+ * Advancing the underlying slice could lead to invalid UTF-8 for StrCharsIter.
+ */
+function _sliceIter(StrCharsIter memory self) pure returns (SliceIter memory result) {
+    assembly {
+        result := self
+    }
+}
\ No newline at end of file
diff --git a/src/StrSlice.sol b/src/StrSlice.sol
new file mode 100644
index 0000000..dccd0b1
--- /dev/null
+++ b/src/StrSlice.sol
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+import { Slice, Slice__ } from "./Slice.sol";
+import { StrChar, StrChar__ } from "./StrChar.sol";
+import { StrCharsIter, StrCharsIter__ } from "./StrCharsIter.sol";
+import { isValidUtf8 } from "./utf8.sol";
+
+/**
+ * @title A string slice.
+ * @dev String slices must always be valid UTF-8 (note that `from` and `fromRawParts` do NOT validate it).
+ * Internally `StrSlice` uses `Slice`, adding only UTF-8 related logic on top.
+ */
+type StrSlice is uint256;
+
+/*//////////////////////////////////////////////////////////////////////////
+                              CUSTOM ERRORS
+//////////////////////////////////////////////////////////////////////////*/
+
+error StrSlice__InvalidCharBoundary();
+
+/*//////////////////////////////////////////////////////////////////////////
+                              STATIC FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+library StrSlice__ {
+    /**
+     * @dev Converts a `string` to a `StrSlice`.
+     * The string is not copied.
+     * `StrSlice` points to the memory of `string`, right after the length word.
+     */
+    function from(string memory str) internal pure returns (StrSlice slice) {
+        uint256 _ptr;
+        assembly {
+            _ptr := add(str, 0x20)
+        }
+        return fromRawParts(_ptr, bytes(str).length);
+    }
+
+    /**
+     * @dev Creates a new `StrSlice` directly from length and memory pointer.
+     * Note that the caller MUST guarantee memory-safety.
+     * This method is primarily for internal use.
+     */
+    function fromRawParts(uint256 _ptr, uint256 _len) internal pure returns (StrSlice slice) {
+        return StrSlice.wrap(Slice.unwrap(
+            Slice__.fromRawParts(_ptr, _len)
+        ));
+    }
+
+    /**
+     * @dev Returns true if the byte slice starts with a valid UTF-8 character.
+     * Note this does not validate the whole slice, only its first character.
+     */
+    function isBoundaryStart(Slice slice) internal pure returns (bool) {
+        bytes32 b = slice.copyToBytes32();
+        return StrChar__.fromUnchecked(b).isValidUtf8();
+    }
+}
+
+/**
+ * @dev Alternative to StrSlice__.from()
+ * Put this in your file (using for global is only for user-defined types):
+ * ```
+ * using { toSlice } for string;
+ * ```
+ */
+function toSlice(string memory str) pure returns (StrSlice slice) {
+    return StrSlice__.from(str);
+}
+
+/*//////////////////////////////////////////////////////////////////////////
+                              GLOBAL FUNCTIONS
+//////////////////////////////////////////////////////////////////////////*/
+
+using {
+    asSlice,
+    ptr, len, isEmpty,
+    // concatenation
+    add, join,
+    // copy
+    toString,
+    keccak,
+    // compare
+    cmp, eq, ne, lt, lte, gt, gte,
+    // index
+    isCharBoundary,
+    get,
+    splitAt, getSubslice,
+    // search
+    find, rfind, contains,
+    startsWith, endsWith,
+    // modify
+    stripPrefix, stripSuffix,
+    replacen,
+    // iteration
+    chars
+} for StrSlice global;
+
+/**
+ * @dev Returns the underlying `Slice`.
+ * WARNING: manipulating `Slice`s can break UTF-8 for related `StrSlice`s!
+ */
+function asSlice(StrSlice self) pure returns (Slice) {
+    return Slice.wrap(StrSlice.unwrap(self));
+}
+
+/**
+ * @dev Returns the pointer to the start of an in-memory string slice.
+ * This method is primarily for internal use.
+ */
+function ptr(StrSlice self) pure returns (uint256) {
+    return self.asSlice().ptr();
+}
+
+/**
+ * @dev Returns the length in bytes, not codepoints.
+ */
+function len(StrSlice self) pure returns (uint256) {
+    return self.asSlice().len();
+}
+
+/**
+ * @dev Returns true if the slice has a length of 0.
+ */
+function isEmpty(StrSlice self) pure returns (bool) {
+    return self.asSlice().isEmpty();
+}
+
+/**
+ * @dev Concatenates two `StrSlice`s into a newly allocated string.
+ */
+function add(StrSlice self, StrSlice other) view returns (string memory) {
+    return string(self.asSlice().add(other.asSlice()));
+}
+
+/**
+ * @dev Flattens an array of `StrSlice`s into a single newly allocated string,
+ * placing `self` as the separator between each.
+ */
+function join(StrSlice self, StrSlice[] memory strs) view returns (string memory) {
+    Slice[] memory slices;
+    // reinterpret `StrSlice[]` as `Slice[]` in place (no copy); TODO is there another way to unwrap arrays of user-defined types?
+    assembly {
+        slices := strs
+    }
+    return string(self.asSlice().join(slices));
+}
+
+/**
+ * @dev Copies `StrSlice` to a newly allocated string.
+ * The `StrSlice` will NOT point to the new string.
+ */
+function toString(StrSlice self) view returns (string memory) {
+    return string(self.asSlice().copyToBytes());
+}
+
+/**
+ * @dev Returns keccak256 of all the bytes of `StrSlice`.
+ * Note `StrSlice` hashes will never equal `string` hashes,
+ * because `string` always stores length in-memory, but `StrSlice` never includes it.
+ */
+function keccak(StrSlice self) pure returns (bytes32 result) {
+    return self.asSlice().keccak();
+}
+
+/**
+ * @dev Compare string slices lexicographically.
+ * @return result 0 for equal, < 0 for less than and > 0 for greater than.
+ */
+function cmp(StrSlice self, StrSlice other) pure returns (int result) {
+    return self.asSlice().cmp(other.asSlice());
+}
+
+/// @dev `self` == `other`
+/// Note more efficient than cmp for big slices
+function eq(StrSlice self, StrSlice other) pure returns (bool) {
+    return self.asSlice().eq(other.asSlice());
+}
+
+/// @dev `self` != `other`
+/// Note more efficient than cmp for big slices
+function ne(StrSlice self, StrSlice other) pure returns (bool) {
+    return self.asSlice().ne(other.asSlice());
+}
+
+/// @dev `self` < `other`
+function lt(StrSlice self, StrSlice other) pure returns (bool) {
+    return self.cmp(other) < 0;
+}
+
+/// @dev `self` <= `other`
+function lte(StrSlice self, StrSlice other) pure returns (bool) {
+    return self.cmp(other) <= 0;
+}
+
+/// @dev `self` > `other`
+function gt(StrSlice self, StrSlice other) pure returns (bool) {
+    return self.cmp(other) > 0;
+}
+
+/// @dev `self` >= `other`
+function gte(StrSlice self, StrSlice other) pure returns (bool) {
+    return self.cmp(other) >= 0;
+}
+
+/**
+ * @dev Checks that `index`-th byte is safe to split on.
+ * The start and end of the string (when index == self.len()) are considered to be boundaries.
+ * Returns false if index is greater than self.len().
+ */
+function isCharBoundary(StrSlice self, uint256 index) pure returns (bool) {
+    if (index < self.len()) {
+        return isValidUtf8(self.asSlice().getAfter(index).copyToBytes32());
+    } else if (index == self.len()) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
+/**
+ * @dev Returns the character at `index` (in bytes).
+ * Reverts if index is out of bounds, and with `StrSlice__InvalidCharBoundary` if no valid character starts at `index`.
+ */
+function get(StrSlice self, uint256 index) pure returns (StrChar char) {
+    bytes32 b = self.asSlice().getAfter(index).copyToBytes32();
+    if (!isValidUtf8(b)) revert StrSlice__InvalidCharBoundary();
+    return StrChar__.fromValidUtf8(b);
+}
+
+/**
+ * @dev Divides one string slice into two at an index.
+ * Reverts when splitting on a non-boundary (use isCharBoundary).
+ */
+function splitAt(StrSlice self, uint256 mid) pure returns (StrSlice, StrSlice) {
+    (Slice lSlice, Slice rSlice) = self.asSlice().splitAt(mid);
+    if (!StrSlice__.isBoundaryStart(lSlice) || !StrSlice__.isBoundaryStart(rSlice)) {
+        revert StrSlice__InvalidCharBoundary();
+    }
+    return (
+        StrSlice.wrap(Slice.unwrap(lSlice)),
+        StrSlice.wrap(Slice.unwrap(rSlice))
+    );
+}
+
+/**
+ * @dev Returns a subslice [start..end) of `self`.
+ * Reverts with `StrSlice__InvalidCharBoundary` when `start` or `end` is not a char boundary (use isCharBoundary).
+ */
+function getSubslice(StrSlice self, uint256 start, uint256 end) pure returns (StrSlice) {
+    Slice subslice = self.asSlice().getSubslice(start, end);
+    if (!StrSlice__.isBoundaryStart(subslice)) revert StrSlice__InvalidCharBoundary();
+    if (end != self.len()) {
+        (, Slice nextSubslice) = self.asSlice().splitAt(end);
+        if (!StrSlice__.isBoundaryStart(nextSubslice)) revert StrSlice__InvalidCharBoundary();
+    }
+    return StrSlice.wrap(Slice.unwrap(subslice));
+}
+
+/**
+ * @dev Returns the byte index of the first slice of `self` that matches `pattern`.
+ * Returns type(uint256).max if the `pattern` does not match.
+ */
+function find(StrSlice self, StrSlice pattern) pure returns (uint256) {
+    return self.asSlice().find(pattern.asSlice());
+}
+
+/**
+ * @dev Returns the byte index of the last slice of `self` that matches `pattern`.
+ * Returns type(uint256).max if the `pattern` does not match.
+ */
+function rfind(StrSlice self, StrSlice pattern) pure returns (uint256) {
+    return self.asSlice().rfind(pattern.asSlice());
+}
+
+/**
+ * @dev Returns true if the given pattern matches a sub-slice of this string slice.
+ */
+function contains(StrSlice self, StrSlice pattern) pure returns (bool) {
+    return self.asSlice().contains(pattern.asSlice());
+}
+
+/**
+ * @dev Returns true if the given pattern matches a prefix of this string slice.
+ */
+function startsWith(StrSlice self, StrSlice pattern) pure returns (bool) {
+    return self.asSlice().startsWith(pattern.asSlice());
+}
+
+/**
+ * @dev Returns true if the given pattern matches a suffix of this string slice.
+ */
+function endsWith(StrSlice self, StrSlice pattern) pure returns (bool) {
+    return self.asSlice().endsWith(pattern.asSlice());
+}
+
+/**
+ * @dev Returns a subslice with the prefix removed.
+ * If it does not start with `pattern`, returns `self` unmodified.
+ */
+function stripPrefix(StrSlice self, StrSlice pattern) pure returns (StrSlice result) {
+    return StrSlice.wrap(Slice.unwrap(
+        self.asSlice().stripPrefix(pattern.asSlice())
+    ));
+}
+
+/**
+ * @dev Returns a subslice with the suffix removed.
+ * If it does not end with `pattern`, returns `self` unmodified.
+ */
+function stripSuffix(StrSlice self, StrSlice pattern) pure returns (StrSlice result) {
+    return StrSlice.wrap(Slice.unwrap(
+        self.asSlice().stripSuffix(pattern.asSlice())
+    ));
+}
+
+/**
+ * @dev Replaces first `n` matches of a pattern with another string slice.
+ * Returns the result in a newly allocated string.
+ * Note this does not modify the string `self` is a slice of.
+ * WARNING: Requires 0 < pattern.len() and to.len() <= pattern.len() (the result can only shrink); panics otherwise
+ */
+function replacen(
+    StrSlice self,
+    StrSlice pattern,
+    StrSlice to,
+    uint256 n
+) view returns (string memory str) {
+    // TODO dynamic string; atm length can be reduced but not increased
+    assert(pattern.len() >= to.len());
+    assert(pattern.len() > 0);
+
+    str = new string(self.len());
+    Slice iterSlice = self.asSlice();
+    Slice resultSlice = StrSlice__.from(str).asSlice();
+
+    uint256 matchNum;
+    while (matchNum < n) {
+        uint256 index = iterSlice.find(pattern.asSlice());
+        // break if no more matches
+        if (index == type(uint256).max) break;
+        // copy prefix
+        if (index > 0) {
+            resultSlice
+                .getSubslice(0, index)
+                .copyFromSlice(
+                    iterSlice.getSubslice(0, index)
+                );
+        }
+
+        // copy replacement
+        // TODO this is fine atm only because to.len() <= pattern.len()
+        resultSlice
+            .getSubslice(index, index + to.len())
+            .copyFromSlice(to.asSlice());
+
+        // advance both slices past the match (NOTE(review): assumes Slice.getSubslice accepts start == end == len)
+        iterSlice = iterSlice.getSubslice(index + pattern.len(), iterSlice.len());
+        resultSlice = resultSlice.getSubslice(index + to.len(), resultSlice.len());
+        matchNum++; // count the replacement, so that at most `n` matches are replaced
+        // break if iterSlice is done
+        if (iterSlice.len() == 0) {
+            break;
+        }
+    }
+
+    uint256 realLen = resultSlice.ptr() - StrSlice__.from(str).ptr();
+    // copy suffix
+    if (iterSlice.len() > 0) {
+        resultSlice
+            .getSubslice(0, iterSlice.len())
+            .copyFromSlice(iterSlice);
+        realLen += iterSlice.len();
+    }
+    // remove extra length
+    if (bytes(str).length != realLen) {
+        assert(realLen <= bytes(str).length);
+        /// @solidity memory-safe-assembly
+        assembly {
+            mstore(str, realLen)
+        }
+    }
+    return str;
+}
+
+/**
+ * @dev Returns a character iterator over the slice.
+ * The iterator yields items from either side.
+ */
+function chars(StrSlice self) pure returns (StrCharsIter memory) {
+    return StrCharsIter__.from(self);
+}
\ No newline at end of file
diff --git a/src/utf8.sol b/src/utf8.sol
new file mode 100644
index 0000000..8560d10
--- /dev/null
+++ b/src/utf8.sol
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+/**
+ * @dev Returns the byte length for a UTF-8 character with the leading byte.
+ * Returns 0 for invalid leading bytes (0x80-0xC1 and 0xF5-0xFF).
+ */
+function utf8CharWidth(uint8 leadingByte) pure returns (uint256) {
+    if (leadingByte < 0x80) {
+        return 1;
+    } else if (leadingByte < 0xC2) {
+        return 0;
+    } else if (leadingByte < 0xE0) {
+        return 2;
+    } else if (leadingByte < 0xF0) {
+        return 3;
+    } else if (leadingByte < 0xF5) {
+        return 4;
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * @dev Returns true if `b` is a valid UTF-8 leading byte.
+ */
+function isLeadingByte(uint8 b) pure returns (bool) {
+    return utf8CharWidth(b) > 0;
+}
+
+/**
+ * @dev Returns true if the 1-4 bytes at MSB are a valid UTF-8 encoded character.
+ * Note if MSB is 0x00, this will return true, since 0x00 is valid UTF-8.
+ * Works faster for smaller code points.
+ *
+ * https://www.rfc-editor.org/rfc/rfc3629#section-4
+ * UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+ * UTF8-1      = %x00-7F
+ * UTF8-2      = %xC2-DF UTF8-tail
+ * UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
+ *               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+ * UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
+ *               %xF4 %x80-8F 2( UTF8-tail )
+ * UTF8-tail   = %x80-BF
+ */
+function isValidUtf8(bytes32 b) pure returns (bool) {
+    // TODO you can significantly optimize comparisons with bitmasks,
+    // some stuff to look at:
+    // https://github.com/zwegner/faster-utf8-validator/blob/master/z_validate.c
+    // https://github.com/websockets/utf-8-validate/blob/master/src/validation.c
+    // https://github.com/simdutf/simdutf/blob/master/src/scalar/utf8.h
+
+    uint8 first = uint8(b[0]);
+    // UTF8-1 = %x00-7F
+    if (first <= 0x7F) {
+        // fast path for ascii
+        return true;
+    }
+
+    uint256 w = utf8CharWidth(first);
+    if (w == 2) {
+        // UTF8-2 (the range check on `first` is already implied by w == 2; it is kept to mirror the grammar)
+        return
+            // %xC2-DF UTF8-tail
+            _bRange(first, 0xC2, 0xDF)
+            && _utf8Tail(uint8(b[1]));
+    } else if (w == 3) {
+        uint8 second = uint8(b[1]);
+        // UTF8-3 (`&&` binds tighter than `||`, so each pair below is one alternative of the grammar)
+        bool valid12 =
+            // = %xE0 %xA0-BF UTF8-tail
+            first == 0xE0
+            && _bRange(second, 0xA0, 0xBF)
+            // / %xE1-EC 2( UTF8-tail )
+            || _bRange(first, 0xE1, 0xEC)
+            && _utf8Tail(second)
+            // / %xED %x80-9F UTF8-tail
+            || first == 0xED
+            && _bRange(second, 0x80, 0x9F)
+            // / %xEE-EF 2( UTF8-tail )
+            || _bRange(first, 0xEE, 0xEF)
+            && _utf8Tail(second);
+
+        return valid12 && _utf8Tail(uint8(b[2]));
+    } else if (w == 4) {
+        uint8 second = uint8(b[1]);
+        // UTF8-4
+        bool valid12 =
+            // = %xF0 %x90-BF 2( UTF8-tail )
+            first == 0xF0
+            && _bRange(second, 0x90, 0xBF)
+            // / %xF1-F3 3( UTF8-tail )
+            || _bRange(first, 0xF1, 0xF3)
+            && _utf8Tail(second)
+            // / %xF4 %x80-8F 2( UTF8-tail )
+            || first == 0xF4
+            && _bRange(second, 0x80, 0x8F);
+
+        return valid12 && _utf8Tail(uint8(b[2])) && _utf8Tail(uint8(b[3]));
+    } else {
+        return false;
+    }
+}
+
+/// @dev left <= b <= right
+function _bRange(uint8 b, uint8 left, uint8 right) pure returns (bool) {
+    return left <= b && b <= right;
+}
+
+/// @dev UTF8-tail = %x80-BF
+function _utf8Tail(uint8 b) pure returns (bool) {
+    // and,cmp should be faster than cmp,cmp,and
+    // 0xC0 = 0b1100_0000, 0x80 = 0b1000_0000
+    return b & 0xC0 == 0x80;
+}
\ No newline at end of file
diff --git a/test/StrCharsIter.t.sol b/test/StrCharsIter.t.sol
new file mode 100644
index 0000000..f60feb9
--- /dev/null
+++ b/test/StrCharsIter.t.sol
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+import { PRBTest } from "@prb/test/src/PRBTest.sol";
+
+import { StrSlice, toSlice, StrCharsIter } from "../src/StrSlice.sol";
+import { SliceIter__StopIteration } from "../src/SliceIter.sol";
+
+using { toSlice } for string;
+
+contract StrCharsIterTest is PRBTest {
+    function testCount() public {
+        assertEq(toSlice("").chars().count(), 0);
+        assertEq(toSlice("Hello, world!").chars().count(), 13);
+        assertEq(toSlice(unicode"naïve").chars().count(), 5);
+        assertEq(toSlice(unicode"こんにちは").chars().count(), 5);
+        assertEq(toSlice(unicode"Z̤͔ͧ̑̓ä͖̭̈̇lͮ̒ͫǧ̗͚̚o̙̔ͮ̇͐̇Z̤͔ͧ̑̓ä͖̭̈̇lͮ̒ͫǧ̗͚̚o̙̔ͮ̇͐̇").chars().count(), 56);
+        assertEq(toSlice(unicode"🗮🐵🌝👤👿🗉💀🉄🍨🉔🈥🔥🏅🔪🉣📷🉳🍠🈃🉌🖷👍🌐💎🋀🌙💼💮🗹🗘💬🖜🐥🖸🈰🍦💈📆🋬🏇🖒🐜👮🊊🗒🈆🗻🏁🈰🎎🊶🉠🍖🉪🌖📎🌄💵🕷🔧🍸🋗🍁🋸")
+            .chars().count(), 64);
+    }
+
+    function testNext() public {
+        StrSlice s = string(unicode"a¡ࠀ𐀡").toSlice();
+        StrCharsIter memory iter = s.chars();
+
+        assertEq(iter.next().toString(), unicode"a");
+        assertEq(iter.asStr().toString(), unicode"¡ࠀ𐀡");
+        assertEq(iter.next().toString(), unicode"¡");
+        assertEq(iter.asStr().toString(), unicode"ࠀ𐀡");
+        assertEq(iter.next().toString(), unicode"ࠀ");
+        assertEq(iter.asStr().toString(), unicode"𐀡");
+        assertEq(iter.next().toString(), unicode"𐀡");
+        assertEq(iter.asStr().toString(), unicode"");
+
+        vm.expectRevert(SliceIter__StopIteration.selector);
+        iter.next();
+    }
+
+    function testNextBack() public {
+        StrSlice s = string(unicode"a¡ࠀ𐀡").toSlice();
+        StrCharsIter memory iter = s.chars();
+
+        assertEq(iter.nextBack().toString(), unicode"𐀡");
+        assertEq(iter.asStr().toString(), unicode"a¡ࠀ");
+        assertEq(iter.nextBack().toString(), unicode"ࠀ");
+        assertEq(iter.asStr().toString(), unicode"a¡");
+        assertEq(iter.nextBack().toString(), unicode"¡");
+        assertEq(iter.asStr().toString(), unicode"a");
+        assertEq(iter.nextBack().toString(), unicode"a");
+        assertEq(iter.asStr().toString(), unicode"");
+
+        vm.expectRevert(SliceIter__StopIteration.selector);
+        iter.nextBack();
+    }
+}
\ No newline at end of file
diff --git a/test/StrSlice.t.sol b/test/StrSlice.t.sol
new file mode 100644
index 0000000..a1ac716
--- /dev/null
+++ b/test/StrSlice.t.sol
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: MIT
+
+pragma solidity ^0.8.17;
+
+import { PRBTest } from "@prb/test/src/PRBTest.sol";
+
+import { StrSlice, toSlice, StrSlice__InvalidCharBoundary } from "../src/StrSlice.sol";
+
+using { toSlice } for string;
+
+contract StrSliceTest is PRBTest {
+    function testToString() public {
+        string memory _s = unicode"Hello, world!";
+        assertEq(_s, _s.toSlice().toString());
+    }
+
+    function testLen() public {
+        string memory _s = unicode"こんにちは";
+        assertEq(bytes(_s).length, _s.toSlice().len());
+    }
+
+    function testIsEmpty() public {
+        assertTrue(string("").toSlice().isEmpty());
+        assertFalse(new string(1).toSlice().isEmpty());
+    }
+
+    /*//////////////////////////////////////////////////////////////////////////
+                                CONCATENATION
+    //////////////////////////////////////////////////////////////////////////*/
+
+    function testAdd() public {
+        assertEq(unicode"こんにちは", toSlice(unicode"こん").add(toSlice(unicode"にちは")));
+    }
+
+    function testJoin() public {
+        StrSlice[] memory sliceArr = new StrSlice[](3);
+        sliceArr[0] = toSlice("Hello");
+        sliceArr[1] = toSlice(unicode"こんにちは");
+        sliceArr[2] = toSlice("");
+        assertEq(
+            toSlice(unicode"📎!").join(sliceArr),
+            unicode"Hello📎!こんにちは📎!"
+        );
+    }
+
+    /*//////////////////////////////////////////////////////////////////////////
+                                    INDEX
+    //////////////////////////////////////////////////////////////////////////*/
+
+    function testIsCharBoundary() public {
+        string memory _s = unicode"こ";
+        // start
+        assertTrue(toSlice(_s).isCharBoundary(0));
+        // mid
+        assertFalse(toSlice(_s).isCharBoundary(1));
+        assertFalse(toSlice(_s).isCharBoundary(2));
+        // end (isn't a valid index, but is a valid boundary)
+        assertTrue(toSlice(_s).isCharBoundary(3));
+        // out of bounds
+        assertFalse(toSlice(_s).isCharBoundary(4));
+    }
+
+    function testGet() public {
+        string memory _s = unicode"こんにちは";
+        assertEq(_s.toSlice().get(3).toString(), unicode"ん");
+    }
+
+    function testGet__InvalidCharBoundary() public {
+        string memory _s = unicode"こんにちは";
+        vm.expectRevert(StrSlice__InvalidCharBoundary.selector);
+        _s.toSlice().get(1);
+    }
+
+    function testSplitAt() public {
+        string memory _s = unicode"こんにちは";
+        (StrSlice s1, StrSlice s2) = _s.toSlice().splitAt(3);
+        assertEq(s1.toString(), unicode"こ");
+        assertEq(s2.toString(), unicode"んにちは");
+    }
+
+    function testSplitAt__InvalidCharBoundary() public {
+        string memory _s = unicode"こんにちは";
+        vm.expectRevert(StrSlice__InvalidCharBoundary.selector);
+        _s.toSlice().splitAt(1);
+    }
+
+    function testGetSubslice() public {
+        string memory _s = unicode"こんにちは";
+        assertEq(_s.toSlice().getSubslice(3, 9).toString(), unicode"んに");
+    }
+
+    function testGetSubslice__InvalidCharBoundary() public {
+        string memory _s = unicode"こんにちは";
+        vm.expectRevert(StrSlice__InvalidCharBoundary.selector);
+        _s.toSlice().getSubslice(3, 8);
+    }
+
+    /*//////////////////////////////////////////////////////////////////////////
+                                    SEARCH
+    //////////////////////////////////////////////////////////////////////////*/
+
+    function testFind() public {
+        string memory s1 = unicode"012こんにちはこんにちは34";
+        string memory s2 = unicode"んに";
+        uint256 index = s1.toSlice().find(s2.toSlice());
+        assertEq(index, 6);
+        (, StrSlice rSlice) = s1.toSlice().splitAt(index);
+        assertEq(rSlice.toString(), unicode"んにちはこんにちは34");
+    }
+
+    function testRfind() public {
+        string memory s1 = unicode"012こんにちはこんにちは34";
+        string memory s2 = unicode"んに";
+        uint256 index = s1.toSlice().rfind(s2.toSlice());
+        assertEq(index, 21);
+        (, StrSlice rSlice) = s1.toSlice().splitAt(index);
+        assertEq(rSlice.toString(), unicode"んにちは34");
+    }
+
+    function testContains() public {
+        string memory s1 = unicode"「lorem ipsum」の典型的なテキストのほかにも、原典からの距離の様々なバリエーションが存在する。他のバージョンでは、ラテン語にはあまり登場しないか存在しない";
+        string memory s2 = unicode"登場";
+        assertTrue(s1.toSlice().contains(s2.toSlice()));
+    }
+
+    function testNotContains() public {
+        string memory s1 = unicode"「lorem ipsum」の典型的なテキストのほかにも、原典からの距離の様々なバリエーションが存在する。他のバージョンでは、ラテン語にはあまり登場しないか存在しない";
+        string memory s2 = unicode"0";
+        assertFalse(s1.toSlice().contains(s2.toSlice()));
+    }
+
+    // TODO more tests
+}
\ No newline at end of file