Skip to content

Commit

Permalink
feat: add string slice, char, char iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
dk1a committed Dec 6, 2022
1 parent d4b2ed0 commit fe0a65e
Show file tree
Hide file tree
Showing 7 changed files with 1,066 additions and 3 deletions.
30 changes: 27 additions & 3 deletions src/Slice.sol
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ using {
cmp, eq, ne, lt, lte, gt, gte,
// index
get, first, last,
splitAt, getSubslice,
splitAt, getSubslice, getBefore, getAfter,
// search
find, rfind, contains,
startsWith, endsWith,
Expand Down Expand Up @@ -199,7 +199,8 @@ function copyToBytes32(Slice self) pure returns (bytes32 b) {
uint256 selfPtr = self.ptr();

// mask removes any trailing bytes
uint256 mask = leftMask(self.len());
uint256 selfLen = self.len();
uint256 mask = leftMask(selfLen > 32 ? 32 : selfLen);

/// @solidity memory-safe-assembly
assembly {
Expand Down Expand Up @@ -322,7 +323,7 @@ function splitAt(Slice self, uint256 mid) pure returns (Slice, Slice) {
}

/**
* @dev Returns a subslice [start..end) of `self`.
* @dev Returns a subslice [start:end] of `self`.
* Reverts if start/end are out of bounds.
*/
function getSubslice(Slice self, uint256 start, uint256 end) pure returns (Slice) {
Expand All @@ -334,6 +335,29 @@ function getSubslice(Slice self, uint256 start, uint256 end) pure returns (Slice
}
}

/**
 * @dev Returns the leading subslice [:index] of `self`, i.e. its first `index` bytes.
 * `index` equal to the length is accepted and yields the whole slice.
 * Reverts with `Slice__OutOfBounds` if `index` > length.
 */
function getBefore(Slice self, uint256 index) pure returns (Slice) {
    if (index <= self.len()) {
        // same start pointer, shortened length
        return Slice__.fromRawParts(self.ptr(), index);
    }
    revert Slice__OutOfBounds();
}

/**
 * @dev Returns the trailing subslice [index:] of `self`.
 * Reverts with `Slice__OutOfBounds` if `index` >= length,
 * so the returned slice always holds at least one byte.
 */
function getAfter(Slice self, uint256 index) pure returns (Slice) {
    uint256 total = self.len();
    if (index < total) {
        // index < total here, so `total - index` cannot underflow
        // (ptr+len is implicitly safe)
        unchecked {
            return Slice__.fromRawParts(self.ptr() + index, total - index);
        }
    }
    revert Slice__OutOfBounds();
}

/**
* @dev Returns the byte index of the first slice of `self` that matches `pattern`.
* Returns type(uint256).max if the `pattern` does not match.
Expand Down
167 changes: 167 additions & 0 deletions src/StrChar.sol
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
// SPDX-License-Identifier: MIT

pragma solidity ^0.8.17;

import { isValidUtf8 as _isValidUtf8, utf8CharWidth } from "./utf8.sol";
import { leftMask } from "./mem.sol";

/**
 * @title A single UTF-8 encoded character.
 * @dev Internally it is stored as UTF-8 encoded bytes starting from left/MSB.
 * A character takes the first 1-4 bytes of the word; `StrChar__.fromValidUtf8`
 * zero-pads everything after it. The comparison helpers in this file compare
 * the whole 32-byte word, so they rely on that zero padding.
 */
type StrChar is bytes32;

/*//////////////////////////////////////////////////////////////////////////
CUSTOM ERRORS
//////////////////////////////////////////////////////////////////////////*/

/// @dev Thrown by `StrChar__.from` when the given bytes fail the UTF-8 validity check.
error StrChar__InvalidUTF8();

/*//////////////////////////////////////////////////////////////////////////
STATIC FUNCTIONS
//////////////////////////////////////////////////////////////////////////*/

library StrChar__ {
    /**
     * @dev Builds a `StrChar` from the first 1-4 bytes of `b`, reading from left/MSB.
     * Reverts with `StrChar__InvalidUTF8` if `b` fails the UTF-8 validity check.
     * @param b UTF-8 encoded character in the most significant bytes.
     */
    function from(bytes32 b) internal pure returns (StrChar char) {
        if (_isValidUtf8(b)) {
            return fromValidUtf8(b);
        }
        revert StrChar__InvalidUTF8();
    }

    /**
     * @dev Same as `from`, but skips the UTF-8 validity check.
     * The character width is derived from the leading byte and every byte after
     * the character is zeroed out.
     * If MSB of `bytes32` isn't valid UTF-8, this will return the \0 character!
     * Primarily for internal use.
     */
    function fromValidUtf8(bytes32 b) internal pure returns (StrChar char) {
        uint256 width = len(StrChar.wrap(b));
        // keep only the character's bytes, zero-pad the rest
        uint256 kept = uint256(b) & leftMask(width);
        char = StrChar.wrap(bytes32(kept));
    }

    /**
     * @dev Same as `from`, but performs NO checks and NO padding: `b` is wrapped as-is.
     * MSB of `bytes32` MUST be valid UTF-8!
     * And `bytes32` MUST be zero-padded after the first UTF-8 character!
     * Primarily for internal use.
     */
    function fromUnchecked(bytes32 b) internal pure returns (StrChar char) {
        char = StrChar.wrap(b);
    }

    // TODO codepoint to UTF-8, and the reverse
    /**
     * @dev Converts a `uint32` to a `StrChar`.
     * Note that not all code points are valid.
     * @param i a code point. E.g. for '€' code point = 0x20AC; whereas UTF-8 = 0xE282AC.
     *
    function from(uint32 i) internal pure returns (StrChar char) {
        // U+D800–U+DFFF are invalid UTF-16 surrogate halves
        if (i > MAX || (i >= 0xD800 && i < 0xE000)) {
            revert StrChar__InvalidUSV();
        }
    }*/
}

/*//////////////////////////////////////////////////////////////////////////
GLOBAL FUNCTIONS
//////////////////////////////////////////////////////////////////////////*/

// Attaches the free functions defined below to `StrChar` as member functions
// (e.g. `char.len()`). `global` makes the attachment apply wherever the type is used,
// not only inside this file.
using {
// size
len,
// conversion
toBytes32, toString,
// comparison
cmp, eq, ne, lt, lte, gt, gte,
// validation
isValidUtf8
} for StrChar global;

/**
 * @dev Returns the character's length in bytes (1-4), derived from its leading byte only.
 * Returns 0 for some (not all!) invalid characters (e.g. due to unsafe use of fromValidUtf8).
 */
function len(StrChar self) pure returns (uint256) {
    // the leading byte alone determines the encoded width
    bytes1 leadingByte = StrChar.unwrap(self)[0];
    return utf8CharWidth(uint8(leadingByte));
}

/**
 * @dev Unwraps a `StrChar` into the raw `bytes32` word it is stored as.
 */
function toBytes32(StrChar self) pure returns (bytes32) {
    bytes32 raw = StrChar.unwrap(self);
    return raw;
}

/**
 * @dev Copies a `StrChar` into a newly allocated `string`
 * whose length is the character's byte length (`self.len()`).
 */
function toString(StrChar self) pure returns (string memory str) {
    bytes32 raw = StrChar.unwrap(self);
    uint256 width = self.len();
    bytes memory buffer = new bytes(width);
    // TODO bitmask is probably better
    for (uint256 pos = 0; pos < width; ++pos) {
        buffer[pos] = raw[pos];
    }
    // reinterpret the filled buffer as a string
    str = string(buffer);
}

/**
 * @dev Three-way lexicographic comparison of two characters,
 * performed on their full 32-byte representations.
 * @return result 0 for equal, -1 if `self` is less than `other`, 1 if it is greater.
 */
function cmp(StrChar self, StrChar other) pure returns (int result) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    if (lhs == rhs) {
        return 0;
    }
    // bytes32 ordering is the same as ordering of the words as unsigned integers
    return lhs > rhs ? int256(1) : int256(-1);
}

/// @dev `self` == `other` (compares the underlying 32-byte words)
function eq(StrChar self, StrChar other) pure returns (bool) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    return lhs == rhs;
}

/// @dev `self` != `other` (compares the underlying 32-byte words)
function ne(StrChar self, StrChar other) pure returns (bool) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    return lhs != rhs;
}

/// @dev `self` < `other` (compares the underlying 32-byte words)
function lt(StrChar self, StrChar other) pure returns (bool) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    return lhs < rhs;
}

/// @dev `self` <= `other` (compares the underlying 32-byte words)
function lte(StrChar self, StrChar other) pure returns (bool) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    return lhs <= rhs;
}

/// @dev `self` > `other` (compares the underlying 32-byte words)
function gt(StrChar self, StrChar other) pure returns (bool) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    return lhs > rhs;
}

/// @dev `self` >= `other` (compares the underlying 32-byte words)
function gte(StrChar self, StrChar other) pure returns (bool) {
    bytes32 lhs = StrChar.unwrap(self);
    bytes32 rhs = StrChar.unwrap(other);
    return lhs >= rhs;
}

/**
 * @dev Runs the UTF-8 validity check on the character's underlying bytes.
 * Can be false if it was formed with an unsafe method (fromValidUtf8, fromUnchecked, wrap).
 */
function isValidUtf8(StrChar self) pure returns (bool) {
    bytes32 raw = StrChar.unwrap(self);
    return _isValidUtf8(raw);
}
Loading

0 comments on commit fe0a65e

Please sign in to comment.