Skip to content

Commit

Permalink
Fix bug for LIKE pattern with multiple-byte characters
Browse files Browse the repository at this point in the history
  • Loading branch information
hantangwangd authored and tdcmeehan committed Dec 20, 2023
1 parent 0bf0549 commit f7dc538
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -934,28 +934,29 @@ private RowExpression generateLikePrefixOrSuffixMatch(RowExpression value, RowEx
if (constObject instanceof Slice) {
Slice slice = (Slice) constObject;
String patternString = slice.toStringUtf8();
int matchLength = patternString.length();
if (matchLength > 1 && !patternString.contains("_")) {
int matchCharacterLength = patternString.length();
int matchBytesLength = slice.length();
if (matchCharacterLength > 1 && !patternString.contains("_")) {
if (LIKE_PREFIX_MATCH_PATTERN.matcher(patternString).matches()) {
// prefix match
// x LIKE 'some string%' is same as SUBSTR(x, 1, length('some string')) = 'some string', trailing .* won't matter
return buildEquals(
call(functionAndTypeManager, "SUBSTR", VARCHAR, value, constant(1L, BIGINT), constant((long) matchLength - 1, BIGINT)),
constant(slice.slice(0, matchLength - 1), VARCHAR));
call(functionAndTypeManager, "SUBSTR", VARCHAR, value, constant(1L, BIGINT), constant((long) matchCharacterLength - 1, BIGINT)),
constant(slice.slice(0, matchBytesLength - 1), VARCHAR));
}
else if (LIKE_SUFFIX_MATCH_PATTERN.matcher(patternString).matches()) {
// suffix match
// x LIKE '%some string' is same as SUBSTR(x, -length('some string')) = 'some string'
return buildEquals(
call(functionAndTypeManager, "SUBSTR", VARCHAR, value, constant(-(long) (matchLength - 1), BIGINT)),
constant(slice.slice(1, matchLength - 1), VARCHAR));
call(functionAndTypeManager, "SUBSTR", VARCHAR, value, constant(-(long) (matchCharacterLength - 1), BIGINT)),
constant(slice.slice(1, matchBytesLength - 1), VARCHAR));
}
else if (LIKE_SIMPLE_EXISTS_PATTERN.matcher(patternString).matches()) {
// pattern should just exist in the string ignoring leading and trailing stuff
// x LIKE '%some string%' is same as CARDINALITY(SPLIT(x, 'some string', 2)) = 2
// Split is most efficient as it uses string.indexOf java builtin so little memory/cpu overhead
return buildEquals(
call(functionAndTypeManager, "CARDINALITY", BIGINT, call(functionAndTypeManager, "SPLIT", new ArrayType(VARCHAR), value, constant(slice.slice(1, matchLength - 2), VARCHAR), constant(2L, BIGINT))),
call(functionAndTypeManager, "CARDINALITY", BIGINT, call(functionAndTypeManager, "SPLIT", new ArrayType(VARCHAR), value, constant(slice.slice(1, matchBytesLength - 2), VARCHAR), constant(2L, BIGINT))),
constant(2L, BIGINT));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@ public void testLike()
assertFunction("'monkey' not like 'monkey' escape null", BOOLEAN, null);

assertInvalidFunction("'monkey' like 'monkey' escape 'foo'", "Escape string must be a single character");

assertFunction("'你好a世界' like '你好%'", BOOLEAN, true);
assertFunction("'你好a世界' like '你好a%'", BOOLEAN, true);
assertFunction("'你好a世界' like '%世界'", BOOLEAN, true);
assertFunction("'你好a世界' like '%好a世%'", BOOLEAN, true);
assertFunction("'你好a世界' not like '你好b%'", BOOLEAN, true);
assertFunction("'你好a世界' not like null", BOOLEAN, null);
}

@Test
Expand Down

0 comments on commit f7dc538

Please sign in to comment.