diff --git a/executor/importer/BUILD.bazel b/executor/importer/BUILD.bazel index 190398c7c5d82..140eb0fba02e5 100644 --- a/executor/importer/BUILD.bazel +++ b/executor/importer/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//parser/terror", "//planner/core", "//sessionctx", + "//sessionctx/variable", "//table", "//util/chunk", "//util/dbterror", diff --git a/executor/importer/import.go b/executor/importer/import.go index 893f87744bf7f..ec77b30e1d6aa 100644 --- a/executor/importer/import.go +++ b/executor/importer/import.go @@ -22,6 +22,7 @@ import ( "path/filepath" "runtime" "strings" + "unicode/utf8" "github.com/pingcap/errors" "github.com/pingcap/log" @@ -35,6 +36,7 @@ import ( "github.com/pingcap/tidb/parser/terror" plannercore "github.com/pingcap/tidb/planner/core" "github.com/pingcap/tidb/sessionctx" + "github.com/pingcap/tidb/sessionctx/variable" "github.com/pingcap/tidb/table" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/dbterror" @@ -168,6 +170,7 @@ type LoadDataController struct { logger *zap.Logger sqlMode mysql.SQLMode + charset *string importantSysVars map[string]string dataStore storage.ExternalStorage dataFiles []*mydump.SourceFileMeta @@ -193,7 +196,9 @@ func getImportantSysVars(sctx sessionctx.Context) map[string]string { } // NewLoadDataController create new controller. -func NewLoadDataController(sctx sessionctx.Context, plan *plannercore.LoadData, tbl table.Table) (*LoadDataController, error) { +func NewLoadDataController(userSctx sessionctx.Context, plan *plannercore.LoadData, tbl table.Table) (*LoadDataController, error) { + fullTableName := common.UniqueTable(plan.Table.Schema.L, plan.Table.Name.L) + logger := log.L().With(zap.String("table", fullTableName)) var format string if plan.Format != nil { format = strings.ToLower(*plan.Format) @@ -201,9 +206,19 @@ func NewLoadDataController(sctx sessionctx.Context, plan *plannercore.LoadData, // without FORMAT 'xxx' clause, default to DELIMITED DATA format = LoadDataFormatDelimitedData } - restrictive := sctx.GetSessionVars().SQLMode.HasStrictMode() && + charset := plan.Charset + if charset == nil { + // https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-character-set + d, err2 := userSctx.GetSessionVars().GetSessionOrGlobalSystemVar( + context.Background(), variable.CharsetDatabase) + if err2 != nil { + logger.Error("LOAD DATA get charset failed", zap.Error(err2)) + } else { + charset = &d + } + } + restrictive := userSctx.GetSessionVars().SQLMode.HasStrictMode() && plan.OnDuplicate != ast.OnDuplicateKeyHandlingIgnore - fullTableName := common.UniqueTable(plan.Table.Schema.L, plan.Table.Name.L) c := &LoadDataController{ FileLocRef: plan.FileLocRef, Path: plan.Path, @@ -217,14 +232,15 @@ func NewLoadDataController(sctx sessionctx.Context, plan *plannercore.LoadData, LineFieldsInfo: plannercore.NewLineFieldsInfo(plan.FieldsInfo, plan.LinesInfo), Restrictive: restrictive, - logger: log.L().With(zap.String("table", fullTableName)), - sqlMode: sctx.GetSessionVars().SQLMode, - importantSysVars: getImportantSysVars(sctx), + logger: logger, + sqlMode: userSctx.GetSessionVars().SQLMode, + charset: charset, + importantSysVars: getImportantSysVars(userSctx), } if err := c.initFieldParams(plan); err != nil { return nil, err } - if err := c.initOptions(sctx, plan.Options); err != nil { + if err := c.initOptions(userSctx, plan.Options); err != nil { return nil, err } @@ -707,7 +723,16 @@ func (e *LoadDataController) GetParser(ctx context.Context, dataFileInfo LoadDat }() switch e.Format { case LoadDataFormatDelimitedData: - // CSV-like + var charsetConvertor *mydump.CharsetConvertor + if e.charset != nil { + charsetConvertor, err = mydump.NewCharsetConvertor(*e.charset, string(utf8.RuneError)) + if err != nil { + return nil, err + } + } + if err != nil { + return nil, err + } parser, err = mydump.NewCSVParser( ctx, e.GenerateCSVConfig(), @@ -715,8 +740,7 @@ func (e *LoadDataController) GetParser(ctx context.Context, dataFileInfo LoadDat LoadDataReadBlockSize, nil, false, - // TODO: support charset conversion - nil) + charsetConvertor) case LoadDataFormatSQLDump: parser = mydump.NewChunkParser( ctx, diff --git a/executor/loadremotetest/one_csv_test.go b/executor/loadremotetest/one_csv_test.go index a5ab0415ea622..f5e77d4ca3e56 100644 --- a/executor/loadremotetest/one_csv_test.go +++ b/executor/loadremotetest/one_csv_test.go @@ -19,6 +19,7 @@ import ( "github.com/fsouza/fake-gcs-server/fakestorage" "github.com/pingcap/tidb/testkit" + "github.com/stretchr/testify/require" ) func (s *mockGCSSuite) TestLoadCSV() { @@ -294,3 +295,63 @@ func (s *mockGCSSuite) TestMultiValueIndex() { "2 [2, 3, 4]", )) } + +func (s *mockGCSSuite) TestGBK() { + s.tk.MustExec("DROP DATABASE IF EXISTS load_charset;") + s.tk.MustExec("CREATE DATABASE load_charset;") + s.tk.MustExec(`CREATE TABLE load_charset.gbk ( + i INT, j VARCHAR(255) + ) CHARACTER SET gbk;`) + s.tk.MustExec(`CREATE TABLE load_charset.utf8mb4 ( + i INT, j VARCHAR(255) + ) CHARACTER SET utf8mb4;`) + + s.server.CreateObject(fakestorage.Object{ + ObjectAttrs: fakestorage.ObjectAttrs{ + BucketName: "test-load", + Name: "gbk.tsv", + }, + Content: []byte{ + // 1 一丁丂七丄丅丆万丈三上下丌不与丏 + 0x31, 0x09, 0xd2, 0xbb, 0xb6, 0xa1, 0x81, 0x40, 0xc6, 0xdf, 0x81, + 0x41, 0x81, 0x42, 0x81, 0x43, 0xcd, 0xf2, 0xd5, 0xc9, 0xc8, 0xfd, + 0xc9, 0xcf, 0xcf, 0xc2, 0xd8, 0xa2, 0xb2, 0xbb, 0xd3, 0xeb, 0x81, + 0x44, 0x0a, + // 2 丐丑丒专且丕世丗丘丙业丛东丝丞丢 + 0x32, 0x09, 0xd8, 0xa4, 0xb3, 0xf3, 0x81, 0x45, 0xd7, 0xa8, 0xc7, + 0xd2, 0xd8, 0xa7, 0xca, 0xc0, 0x81, 0x46, 0xc7, 0xf0, 0xb1, 0xfb, + 0xd2, 0xb5, 0xb4, 0xd4, 0xb6, 0xab, 0xcb, 0xbf, 0xd8, 0xa9, 0xb6, + 0xaa, + }, + }) + + sql := fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s' + INTO TABLE load_charset.gbk CHARACTER SET gbk`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.gbk;").Check(testkit.Rows( + "1 一丁丂七丄丅丆万丈三上下丌不与丏", + "2 丐丑丒专且丕世丗丘丙业丛东丝丞丢", + )) + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s' + INTO TABLE load_charset.utf8mb4 CHARACTER SET gbk`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows( + "1 一丁丂七丄丅丆万丈三上下丌不与丏", + "2 丐丑丒专且丕世丗丘丙业丛东丝丞丢", + )) + + s.tk.MustExec("TRUNCATE TABLE load_charset.utf8mb4;") + s.tk.MustExec("SET SESSION character_set_database = 'gbk';") + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s' + INTO TABLE load_charset.utf8mb4;`, gcsEndpoint) + s.tk.MustExec(sql) + s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows( + "1 一丁丂七丄丅丆万丈三上下丌不与丏", + "2 丐丑丒专且丕世丗丘丙业丛东丝丞丢", + )) + + sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s' + INTO TABLE load_charset.utf8mb4 CHARACTER SET unknown`, gcsEndpoint) + err := s.tk.ExecToErr(sql) + require.ErrorContains(s.T(), err, "Unknown character set: 'unknown'") +} diff --git a/parser/ast/dml.go b/parser/ast/dml.go index 19f23f69df74b..ffab15f9125a5 100644 --- a/parser/ast/dml.go +++ b/parser/ast/dml.go @@ -1824,6 +1824,7 @@ type LoadDataStmt struct { Format *string OnDuplicate OnDuplicateKeyHandlingType Table *TableName + Charset *string Columns []*ColumnName FieldsInfo *FieldsClause LinesInfo *LinesClause @@ -1857,6 +1858,10 @@ func (n *LoadDataStmt) Restore(ctx *format.RestoreCtx) error { if err := n.Table.Restore(ctx); err != nil { return errors.Annotate(err, "An error occurred while restore LoadDataStmt.Table") } + if n.Charset != nil { + ctx.WriteKeyWord(" CHARACTER SET ") + ctx.WritePlain(*n.Charset) + } if n.FieldsInfo != nil { n.FieldsInfo.Restore(ctx) } diff --git a/parser/ast/dml_test.go b/parser/ast/dml_test.go index 030469810ee5c..6b205cbde9e27 100644 --- a/parser/ast/dml_test.go +++ b/parser/ast/dml_test.go @@ -416,6 +416,14 @@ func TestLoadDataRestore(t *testing.T) { sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t`", expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t`", }, + { + sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t` character set utf8mb4", + expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t` CHARACTER SET utf8mb4", + }, + { + sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t` character set gbk", + expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t` CHARACTER SET gbk", + }, // ignore N lines { sourceSQL: "load data infile '/a.csv' into table `t` ignore 0 lines", diff --git a/parser/parser.go b/parser/parser.go index 7d506909d37c1..45621f3f29b68 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -21933,6 +21933,7 @@ yynewstate: Format: yyS[yypt-11].item.(*string), OnDuplicate: yyS[yypt-10].item.(ast.OnDuplicateKeyHandlingType), Table: yyS[yypt-7].item.(*ast.TableName), + Charset: yyS[yypt-6].item.(*string), FieldsInfo: yyS[yypt-5].item.(*ast.FieldsClause), LinesInfo: yyS[yypt-4].item.(*ast.LinesClause), IgnoreLines: yyS[yypt-3].item.(*uint64), @@ -21976,6 +21977,15 @@ yynewstate: v := getUint64FromNUM(yyS[yypt-1].item) parser.yyVAL.item = &v } + case 2491: + { + parser.yyVAL.item = (*string)(nil) + } + case 2492: + { + v := yyS[yypt-0].ident + parser.yyVAL.item = &v + } case 2493: { parser.yyVAL.item = nil diff --git a/parser/parser.y b/parser/parser.y index 57eff457db02c..ae2de0543a73a 100644 --- a/parser/parser.y +++ b/parser/parser.y @@ -1014,6 +1014,7 @@ import ( Boolean "Boolean (0, 1, false, true)" OptionalBraces "optional braces" CastType "Cast function target type" + CharsetOpt "CHARACTER SET option in LOAD DATA" ColumnDef "table column definition" ColumnDefList "table column definition list" ColumnName "column name" @@ -13723,6 +13724,7 @@ LoadDataStmt: Format: $6.(*string), OnDuplicate: $7.(ast.OnDuplicateKeyHandlingType), Table: $10.(*ast.TableName), + Charset: $11.(*string), FieldsInfo: $12.(*ast.FieldsClause), LinesInfo: $13.(*ast.LinesClause), IgnoreLines: $14.(*uint64), @@ -13770,8 +13772,14 @@ IgnoreLines: } CharsetOpt: - {} + { + $$ = (*string)(nil) + } | "CHARACTER" "SET" CharsetName + { + v := $3 + $$ = &v + } LocalOpt: { @@ -14673,5 +14681,4 @@ CalibrateResourceStmt: { $$ = &ast.CalibrateResourceStmt{} } - %% diff --git a/parser/parser_test.go b/parser/parser_test.go index ce340e08e06ef..bcf813b5fab35 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -609,7 +609,7 @@ func TestDMLStmt(t *testing.T) { // load data {"load data local infile '/tmp/t.csv' into table t1 fields terminated by ',' optionally enclosed by '\"' ignore 1 lines", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t1` FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' IGNORE 1 LINES"}, {"load data infile '/tmp/t.csv' into table t", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t`"}, - {"load data infile '/tmp/t.csv' into table t character set utf8", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t`"}, + {"load data infile '/tmp/t.csv' into table t character set utf8", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` CHARACTER SET utf8"}, {"load data infile '/tmp/t.csv' into table t fields terminated by 'ab'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab'"}, {"load data infile '/tmp/t.csv' into table t columns terminated by 'ab'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab'"}, {"load data infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b'"}, @@ -623,7 +623,7 @@ func TestDMLStmt(t *testing.T) { {"load data local infile '/tmp/t.csv' into table t columns terminated by 'ab'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab'"}, {"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b'"}, {"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"}, - {"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"}, + {"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"}, {"load data local infile '/tmp/t.csv' into table t lines starting by 'ab'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab'"}, {"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' terminated by 'xy'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' TERMINATED BY 'xy'"}, {"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' lines terminated by 'xy'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy'"}, @@ -634,10 +634,10 @@ func TestDMLStmt(t *testing.T) { {"load data local infile '/tmp/t.csv' into table t columns terminated by 'ab' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' (`a`,`b`)"}, {"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' (`a`,`b`)"}, {"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"}, - {"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"}, + {"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"}, {"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' (`a`,`b`)"}, {"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' TERMINATED BY 'xy' (`a`,`b`)"}, - {"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"}, + {"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"}, {"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"}, {"load data local infile '/tmp/t.csv' into table t (a,b) fields terminated by 'ab'", false, ""}, {"load data local infile '/tmp/t.csv' into table t ignore 1 lines", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` IGNORE 1 LINES"}, diff --git a/planner/core/common_plans.go b/planner/core/common_plans.go index 20196198e1dfd..879b30f6b51da 100644 --- a/planner/core/common_plans.go +++ b/planner/core/common_plans.go @@ -552,6 +552,7 @@ type LoadData struct { Path string Format *string Table *ast.TableName + Charset *string Columns []*ast.ColumnName FieldsInfo *ast.FieldsClause LinesInfo *ast.LinesClause diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go index a061efd39c6bf..6facdff00a8be 100644 --- a/planner/core/planbuilder.go +++ b/planner/core/planbuilder.go @@ -4224,6 +4224,7 @@ func (b *PlanBuilder) buildLoadData(ctx context.Context, ld *ast.LoadDataStmt) ( Path: ld.Path, Format: ld.Format, Table: ld.Table, + Charset: ld.Charset, Columns: ld.Columns, FieldsInfo: ld.FieldsInfo, LinesInfo: ld.LinesInfo,