diff --git a/pkg/expression/builtin_ilike.go b/pkg/expression/builtin_ilike.go index 1251bb5376c60..d664f459ac6d3 100644 --- a/pkg/expression/builtin_ilike.go +++ b/pkg/expression/builtin_ilike.go @@ -96,7 +96,7 @@ func (b *builtinIlikeSig) evalInt(ctx EvalContext, row chunk.Row) (int64, bool, var pattern collate.WildcardPattern if b.args[1].ConstLevel() >= ConstOnlyInContext && b.args[2].ConstLevel() >= ConstOnlyInContext { pattern, err = b.patternCache.getOrInitCache(ctx, func() (collate.WildcardPattern, error) { - ret := collate.ConvertAndGetBinCollation(b.collation).Pattern() + ret := collate.ConvertAndGetBinCollator(b.collation).Pattern() ret.Compile(patternStr, byte(escape)) return ret, nil }) @@ -106,7 +106,7 @@ func (b *builtinIlikeSig) evalInt(ctx EvalContext, row chunk.Row) (int64, bool, return 0, true, err } } else { - pattern = collate.ConvertAndGetBinCollation(b.collation).Pattern() + pattern = collate.ConvertAndGetBinCollator(b.collation).Pattern() pattern.Compile(patternStr, byte(escape)) } return boolToInt64(pattern.DoMatch(valStr)), false, nil diff --git a/pkg/expression/builtin_ilike_vec.go b/pkg/expression/builtin_ilike_vec.go index 23ebeaa7e22bb..ae61897e86cb2 100644 --- a/pkg/expression/builtin_ilike_vec.go +++ b/pkg/expression/builtin_ilike_vec.go @@ -59,7 +59,7 @@ func (b *builtinIlikeSig) tryToVecMemorize(ctx EvalContext, param *funcParam, es } pattern, err := b.patternCache.getOrInitCache(ctx, func() (collate.WildcardPattern, error) { - pattern := collate.ConvertAndGetBinCollation(b.collation).Pattern() + pattern := collate.ConvertAndGetBinCollator(b.collation).Pattern() pattern.Compile(param.getStringVal(0), byte(escape)) return pattern, nil }) @@ -201,7 +201,7 @@ func (b *builtinIlikeSig) vecEvalInt(ctx EvalContext, input *chunk.Chunk, result pattern, ok := b.tryToVecMemorize(ctx, params[1], escape) if !ok { - pattern = collate.ConvertAndGetBinCollation(b.collation).Pattern() + pattern = collate.ConvertAndGetBinCollator(b.collation).Pattern() return b.ilikeWithoutMemorization(pattern, params, rowNum, escape, result) } diff --git a/pkg/expression/collation.go b/pkg/expression/collation.go index b9a02e7223adb..bd96f34bd41d6 100644 --- a/pkg/expression/collation.go +++ b/pkg/expression/collation.go @@ -321,7 +321,45 @@ func CheckAndDeriveCollationFromExprs(ctx BuildContext, funcName string, evalTyp return nil, illegalMixCollationErr(funcName, args) } - return ec, nil + return fixStringTypeForMaxLength(funcName, args, ec), nil +} + +// fixStringTypeForMaxLength changes the type of string from `VARCHAR` to `MEDIUM BLOB` or `LONG BLOB` according to the max length of +// the argument. However, as TiDB doesn't have `MaxLength` for `FieldType`, this function handles the logic manually for different types. Now it only +// handles the `JSON` type, because in MySQL, `JSON` type has a big max length and will lead to `LONG BLOB` in many situations. +// To learn more about this case, read the discussion under https://github.com/pingcap/tidb/issues/52833 +// +// TODO: also consider types other than `JSON`. And also think about when it'll become `MEDIUM BLOB`. This function only handles the collation, but +// not change the type and binary flag. +// TODO: some function will generate big values, like `repeat` and `space`. They should be handled according to the argument if it's a constant. +func fixStringTypeForMaxLength(funcName string, args []Expression, ec *ExprCollation) *ExprCollation { + // Be careful that the `args` is not all arguments of the `funcName`. You should check `deriveCollation` function to see which arguments are passed + // to the `CheckAndDeriveCollationFromExprs` function, and then passed here. + shouldChangeToBin := false + + switch funcName { + case ast.Reverse, ast.Lower, ast.Upper, ast.SubstringIndex, ast.Trim, ast.Quote, ast.InsertFunc, ast.Substr, ast.Repeat, ast.Replace: + shouldChangeToBin = args[0].GetType().EvalType() == types.ETJson + case ast.Concat, ast.ConcatWS, ast.Elt, ast.MakeSet: + for _, arg := range args { + if arg.GetType().EvalType() == types.ETJson { + shouldChangeToBin = true + break + } + } + case ast.ExportSet: + if len(args) >= 2 { + shouldChangeToBin = args[0].GetType().EvalType() == types.ETJson || args[1].GetType().EvalType() == types.ETJson + } + if len(args) >= 3 { + shouldChangeToBin = shouldChangeToBin || args[2].GetType().EvalType() == types.ETJson + } + } + + if shouldChangeToBin { + ec.Collation = collate.ConvertAndGetBinCollation(ec.Collation) + } + return ec } func safeConvert(ctx BuildContext, ec *ExprCollation, args ...Expression) bool { diff --git a/pkg/util/collate/collate.go b/pkg/util/collate/collate.go index 44dd9a0b4307b..4e19a79b7ed00 100644 --- a/pkg/util/collate/collate.go +++ b/pkg/util/collate/collate.go @@ -334,23 +334,29 @@ func IsCICollation(collate string) bool { collate == "utf8mb4_0900_ai_ci" } -// ConvertAndGetBinCollation converts collator to binary collator -func ConvertAndGetBinCollation(collate string) Collator { +// ConvertAndGetBinCollation converts collation to binary collation +func ConvertAndGetBinCollation(collate string) string { switch collate { case "utf8_general_ci": - return GetCollator("utf8_bin") + return "utf8_bin" case "utf8_unicode_ci": - return GetCollator("utf8_bin") + return "utf8_bin" case "utf8mb4_general_ci": - return GetCollator("utf8mb4_bin") + return "utf8mb4_bin" case "utf8mb4_unicode_ci": - return GetCollator("utf8mb4_bin") + return "utf8mb4_bin" case "utf8mb4_0900_ai_ci": - return GetCollator("utf8mb4_bin") + return "utf8mb4_bin" case "gbk_chinese_ci": - return GetCollator("gbk_bin") + return "gbk_bin" } - return GetCollator(collate) + + return collate +} + +// ConvertAndGetBinCollator converts collation to binary collator +func ConvertAndGetBinCollator(collate string) Collator { + return GetCollator(ConvertAndGetBinCollation(collate)) } // IsBinCollation returns if the collation is 'xx_bin' or 'bin'. diff --git a/tests/integrationtest/r/expression/charset_and_collation.result b/tests/integrationtest/r/expression/charset_and_collation.result index 4e792011683ba..1600b8d99b3a6 100644 --- a/tests/integrationtest/r/expression/charset_and_collation.result +++ b/tests/integrationtest/r/expression/charset_and_collation.result @@ -1991,3 +1991,68 @@ LOCATE('bar' collate utf8mb4_0900_ai_ci, 'FOOBAR' collate utf8mb4_0900_ai_ci) select 'FOOBAR' collate utf8mb4_0900_ai_ci REGEXP 'foo.*' collate utf8mb4_0900_ai_ci; 'FOOBAR' collate utf8mb4_0900_ai_ci REGEXP 'foo.*' collate utf8mb4_0900_ai_ci 1 +set names utf8mb4 collate utf8mb4_0900_ai_ci; +select reverse(cast('[]' as json)) between 'W' and 'm'; +reverse(cast('[]' as json)) between 'W' and 'm' +1 +select lower(cast('[]' as json)) between 'W' and 'm'; +lower(cast('[]' as json)) between 'W' and 'm' +1 +select upper(cast('[]' as json)) between 'W' and 'm'; +upper(cast('[]' as json)) between 'W' and 'm' +1 +select substring_index(cast('[]' as json), '.', 1) between 'W' and 'm'; +substring_index(cast('[]' as json), '.', 1) between 'W' and 'm' +1 +select trim(cast('[]' as json)) between 'W' and 'm'; +trim(cast('[]' as json)) between 'W' and 'm' +1 +select quote(cast('[]' as json)) between "'W'" and "'m'"; +quote(cast('[]' as json)) between "'W'" and "'m'" +1 +select concat(cast('[]' as json), '1') between 'W' and 'm'; +concat(cast('[]' as json), '1') between 'W' and 'm' +1 +select concat('1', cast('[]' as json)) between '1W' and '1m'; +concat('1', cast('[]' as json)) between '1W' and '1m' +1 +select concat_ws(cast('[]' as json), '1', '1') between '1W' and '1m'; +concat_ws(cast('[]' as json), '1', '1') between '1W' and '1m' +1 +select concat_ws('1', cast('[]' as json)) between 'W' and 'm'; +concat_ws('1', cast('[]' as json)) between 'W' and 'm' +1 +select elt(1, cast('[]' as json), '[]') between 'W' and 'm'; +elt(1, cast('[]' as json), '[]') between 'W' and 'm' +1 +select elt(2, cast('[]' as json), '[]') between 'W' and 'm'; +elt(2, cast('[]' as json), '[]') between 'W' and 'm' +1 +select make_set(1, cast('[]' as json), '[]') between 'W' and 'm'; +make_set(1, cast('[]' as json), '[]') between 'W' and 'm' +1 +select make_set(2, cast('[]' as json), '[]') between 'W' and 'm'; +make_set(2, cast('[]' as json), '[]') between 'W' and 'm' +1 +select replace(cast('[]' as json), '[]', '[]') between 'W' and 'm'; +replace(cast('[]' as json), '[]', '[]') between 'W' and 'm' +1 +select replace('[]', '[]', cast('[]' as json)) between 'W' and 'm'; +replace('[]', '[]', cast('[]' as json)) between 'W' and 'm' +0 +select insert(cast('[]' as json), 0, 100, '[]') between 'W' and 'm'; +insert(cast('[]' as json), 0, 100, '[]') between 'W' and 'm' +1 +select insert('[]', 0, 100, cast('[]' as json)) between 'W' and 'm'; +insert('[]', 0, 100, cast('[]' as json)) between 'W' and 'm' +0 +select substr(cast('[]' as json), 1) between 'W' and 'm'; +substr(cast('[]' as json), 1) between 'W' and 'm' +1 +select repeat(cast('[]' as json), 10) between 'W' and 'm'; +repeat(cast('[]' as json), 10) between 'W' and 'm' +1 +select export_set(3,cast('[]' as json),'2','-',8) between 'W' and 'm'; +export_set(3,cast('[]' as json),'2','-',8) between 'W' and 'm' +1 +set names default; diff --git a/tests/integrationtest/t/expression/charset_and_collation.test b/tests/integrationtest/t/expression/charset_and_collation.test index ef5ecffab1b88..a711edcd8d737 100644 --- a/tests/integrationtest/t/expression/charset_and_collation.test +++ b/tests/integrationtest/t/expression/charset_and_collation.test @@ -814,3 +814,28 @@ select min(id) from t group by str order by str; # TestUTF8MB40900AICIStrFunc select LOCATE('bar' collate utf8mb4_0900_ai_ci, 'FOOBAR' collate utf8mb4_0900_ai_ci); select 'FOOBAR' collate utf8mb4_0900_ai_ci REGEXP 'foo.*' collate utf8mb4_0900_ai_ci; + +# TestCollationWithJSONArg +set names utf8mb4 collate utf8mb4_0900_ai_ci; +select reverse(cast('[]' as json)) between 'W' and 'm'; +select lower(cast('[]' as json)) between 'W' and 'm'; +select upper(cast('[]' as json)) between 'W' and 'm'; +select substring_index(cast('[]' as json), '.', 1) between 'W' and 'm'; +select trim(cast('[]' as json)) between 'W' and 'm'; +select quote(cast('[]' as json)) between "'W'" and "'m'"; +select concat(cast('[]' as json), '1') between 'W' and 'm'; +select concat('1', cast('[]' as json)) between '1W' and '1m'; +select concat_ws(cast('[]' as json), '1', '1') between '1W' and '1m'; +select concat_ws('1', cast('[]' as json)) between 'W' and 'm'; +select elt(1, cast('[]' as json), '[]') between 'W' and 'm'; +select elt(2, cast('[]' as json), '[]') between 'W' and 'm'; +select make_set(1, cast('[]' as json), '[]') between 'W' and 'm'; +select make_set(2, cast('[]' as json), '[]') between 'W' and 'm'; +select replace(cast('[]' as json), '[]', '[]') between 'W' and 'm'; +select replace('[]', '[]', cast('[]' as json)) between 'W' and 'm'; +select insert(cast('[]' as json), 0, 100, '[]') between 'W' and 'm'; +select insert('[]', 0, 100, cast('[]' as json)) between 'W' and 'm'; +select substr(cast('[]' as json), 1) between 'W' and 'm'; +select repeat(cast('[]' as json), 10) between 'W' and 'm'; +select export_set(3,cast('[]' as json),'2','-',8) between 'W' and 'm'; +set names default; \ No newline at end of file