Skip to content

Commit

Permalink
*: LOAD DATA support GBK character set (#42644)
Browse files Browse the repository at this point in the history
ref #40499
  • Loading branch information
lance6716 authored Mar 29, 2023
1 parent fa08c36 commit 22ff002
Show file tree
Hide file tree
Showing 10 changed files with 134 additions and 16 deletions.
1 change: 1 addition & 0 deletions executor/importer/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ go_library(
"//parser/terror",
"//planner/core",
"//sessionctx",
"//sessionctx/variable",
"//table",
"//util/chunk",
"//util/dbterror",
Expand Down
44 changes: 34 additions & 10 deletions executor/importer/import.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"path/filepath"
"runtime"
"strings"
"unicode/utf8"

"github.com/pingcap/errors"
"github.com/pingcap/log"
Expand All @@ -35,6 +36,7 @@ import (
"github.com/pingcap/tidb/parser/terror"
plannercore "github.com/pingcap/tidb/planner/core"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/table"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/dbterror"
Expand Down Expand Up @@ -168,6 +170,7 @@ type LoadDataController struct {

logger *zap.Logger
sqlMode mysql.SQLMode
charset *string
importantSysVars map[string]string
dataStore storage.ExternalStorage
dataFiles []*mydump.SourceFileMeta
Expand All @@ -193,17 +196,29 @@ func getImportantSysVars(sctx sessionctx.Context) map[string]string {
}

// NewLoadDataController create new controller.
func NewLoadDataController(sctx sessionctx.Context, plan *plannercore.LoadData, tbl table.Table) (*LoadDataController, error) {
func NewLoadDataController(userSctx sessionctx.Context, plan *plannercore.LoadData, tbl table.Table) (*LoadDataController, error) {
fullTableName := common.UniqueTable(plan.Table.Schema.L, plan.Table.Name.L)
logger := log.L().With(zap.String("table", fullTableName))
var format string
if plan.Format != nil {
format = strings.ToLower(*plan.Format)
} else {
// without FORMAT 'xxx' clause, default to DELIMITED DATA
format = LoadDataFormatDelimitedData
}
restrictive := sctx.GetSessionVars().SQLMode.HasStrictMode() &&
charset := plan.Charset
if charset == nil {
// https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-character-set
d, err2 := userSctx.GetSessionVars().GetSessionOrGlobalSystemVar(
context.Background(), variable.CharsetDatabase)
if err2 != nil {
logger.Error("LOAD DATA get charset failed", zap.Error(err2))
} else {
charset = &d
}
}
restrictive := userSctx.GetSessionVars().SQLMode.HasStrictMode() &&
plan.OnDuplicate != ast.OnDuplicateKeyHandlingIgnore
fullTableName := common.UniqueTable(plan.Table.Schema.L, plan.Table.Name.L)
c := &LoadDataController{
FileLocRef: plan.FileLocRef,
Path: plan.Path,
Expand All @@ -217,14 +232,15 @@ func NewLoadDataController(sctx sessionctx.Context, plan *plannercore.LoadData,
LineFieldsInfo: plannercore.NewLineFieldsInfo(plan.FieldsInfo, plan.LinesInfo),
Restrictive: restrictive,

logger: log.L().With(zap.String("table", fullTableName)),
sqlMode: sctx.GetSessionVars().SQLMode,
importantSysVars: getImportantSysVars(sctx),
logger: logger,
sqlMode: userSctx.GetSessionVars().SQLMode,
charset: charset,
importantSysVars: getImportantSysVars(userSctx),
}
if err := c.initFieldParams(plan); err != nil {
return nil, err
}
if err := c.initOptions(sctx, plan.Options); err != nil {
if err := c.initOptions(userSctx, plan.Options); err != nil {
return nil, err
}

Expand Down Expand Up @@ -707,16 +723,24 @@ func (e *LoadDataController) GetParser(ctx context.Context, dataFileInfo LoadDat
}()
switch e.Format {
case LoadDataFormatDelimitedData:
// CSV-like
var charsetConvertor *mydump.CharsetConvertor
if e.charset != nil {
charsetConvertor, err = mydump.NewCharsetConvertor(*e.charset, string(utf8.RuneError))
if err != nil {
return nil, err
}
}
if err != nil {
return nil, err
}
parser, err = mydump.NewCSVParser(
ctx,
e.GenerateCSVConfig(),
reader,
LoadDataReadBlockSize,
nil,
false,
// TODO: support charset conversion
nil)
charsetConvertor)
case LoadDataFormatSQLDump:
parser = mydump.NewChunkParser(
ctx,
Expand Down
61 changes: 61 additions & 0 deletions executor/loadremotetest/one_csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

"github.com/fsouza/fake-gcs-server/fakestorage"
"github.com/pingcap/tidb/testkit"
"github.com/stretchr/testify/require"
)

func (s *mockGCSSuite) TestLoadCSV() {
Expand Down Expand Up @@ -294,3 +295,63 @@ func (s *mockGCSSuite) TestMultiValueIndex() {
"2 [2, 3, 4]",
))
}

// TestGBK checks that LOAD DATA decodes a GBK-encoded source file, both when
// the encoding is given with an explicit CHARACTER SET clause and when the
// clause is omitted and the session's character_set_database is gbk. It also
// checks that an unrecognized character set name is reported as an error.
func (s *mockGCSSuite) TestGBK() {
	// Recreate the schema: one target table stored as gbk, one as utf8mb4.
	setup := []string{
		"DROP DATABASE IF EXISTS load_charset;",
		"CREATE DATABASE load_charset;",
		`CREATE TABLE load_charset.gbk (
i INT, j VARCHAR(255)
) CHARACTER SET gbk;`,
		`CREATE TABLE load_charset.utf8mb4 (
i INT, j VARCHAR(255)
) CHARACTER SET utf8mb4;`,
	}
	for _, ddl := range setup {
		s.tk.MustExec(ddl)
	}

	// Two tab-separated rows encoded in GBK. 0x09 is the field separator and
	// 0x0a the line terminator; the second row has no trailing newline.
	gbkTSV := []byte{
		// row 1 decodes to: 1 一丁丂七丄丅丆万丈三上下丌不与丏
		0x31, 0x09, 0xd2, 0xbb, 0xb6, 0xa1, 0x81, 0x40, 0xc6, 0xdf, 0x81,
		0x41, 0x81, 0x42, 0x81, 0x43, 0xcd, 0xf2, 0xd5, 0xc9, 0xc8, 0xfd,
		0xc9, 0xcf, 0xcf, 0xc2, 0xd8, 0xa2, 0xb2, 0xbb, 0xd3, 0xeb, 0x81,
		0x44, 0x0a,
		// row 2 decodes to: 2 丐丑丒专且丕世丗丘丙业丛东丝丞丢
		0x32, 0x09, 0xd8, 0xa4, 0xb3, 0xf3, 0x81, 0x45, 0xd7, 0xa8, 0xc7,
		0xd2, 0xd8, 0xa7, 0xca, 0xc0, 0x81, 0x46, 0xc7, 0xf0, 0xb1, 0xfb,
		0xd2, 0xb5, 0xb4, 0xd4, 0xb6, 0xab, 0xcb, 0xbf, 0xd8, 0xa9, 0xb6,
		0xaa,
	}
	s.server.CreateObject(fakestorage.Object{
		ObjectAttrs: fakestorage.ObjectAttrs{
			BucketName: "test-load",
			Name:       "gbk.tsv",
		},
		Content: gbkTSV,
	})

	// load fills the endpoint placeholder of a LOAD DATA statement and runs it,
	// failing the test if the statement returns an error.
	load := func(stmtFormat string) {
		s.tk.MustExec(fmt.Sprintf(stmtFormat, gcsEndpoint))
	}
	// expectDecodedRows runs query and requires both rows to come back as the
	// decoded text.
	expectDecodedRows := func(query string) {
		s.tk.MustQuery(query).Check(testkit.Rows(
			"1 一丁丂七丄丅丆万丈三上下丌不与丏",
			"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
		))
	}

	// Explicit CHARACTER SET gbk, loading into the gbk table.
	load(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.gbk CHARACTER SET gbk`)
	expectDecodedRows("SELECT * FROM load_charset.gbk;")

	// Explicit CHARACTER SET gbk, loading into the utf8mb4 table.
	load(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET gbk`)
	expectDecodedRows("SELECT * FROM load_charset.utf8mb4;")

	// No CHARACTER SET clause: the session's character_set_database is set to
	// gbk first, and the same rows are expected after the load.
	s.tk.MustExec("TRUNCATE TABLE load_charset.utf8mb4;")
	s.tk.MustExec("SET SESSION character_set_database = 'gbk';")
	load(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4;`)
	expectDecodedRows("SELECT * FROM load_charset.utf8mb4;")

	// An unknown character set name must surface as an error to the client.
	err := s.tk.ExecToErr(fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET unknown`, gcsEndpoint))
	require.ErrorContains(s.T(), err, "Unknown character set: 'unknown'")
}
5 changes: 5 additions & 0 deletions parser/ast/dml.go
Original file line number Diff line number Diff line change
Expand Up @@ -1824,6 +1824,7 @@ type LoadDataStmt struct {
Format *string
OnDuplicate OnDuplicateKeyHandlingType
Table *TableName
Charset *string
Columns []*ColumnName
FieldsInfo *FieldsClause
LinesInfo *LinesClause
Expand Down Expand Up @@ -1857,6 +1858,10 @@ func (n *LoadDataStmt) Restore(ctx *format.RestoreCtx) error {
if err := n.Table.Restore(ctx); err != nil {
return errors.Annotate(err, "An error occurred while restore LoadDataStmt.Table")
}
if n.Charset != nil {
ctx.WriteKeyWord(" CHARACTER SET ")
ctx.WritePlain(*n.Charset)
}
if n.FieldsInfo != nil {
n.FieldsInfo.Restore(ctx)
}
Expand Down
8 changes: 8 additions & 0 deletions parser/ast/dml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,14 @@ func TestLoadDataRestore(t *testing.T) {
sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t`",
expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t`",
},
{
sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t` character set utf8mb4",
expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t` CHARACTER SET utf8mb4",
},
{
sourceSQL: "load data infile '/a.csv' format 'sql file' into table `t` character set gbk",
expectSQL: "LOAD DATA INFILE '/a.csv' FORMAT 'sql file' INTO TABLE `t` CHARACTER SET gbk",
},
// ignore N lines
{
sourceSQL: "load data infile '/a.csv' into table `t` ignore 0 lines",
Expand Down
10 changes: 10 additions & 0 deletions parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -21933,6 +21933,7 @@ yynewstate:
Format: yyS[yypt-11].item.(*string),
OnDuplicate: yyS[yypt-10].item.(ast.OnDuplicateKeyHandlingType),
Table: yyS[yypt-7].item.(*ast.TableName),
Charset: yyS[yypt-6].item.(*string),
FieldsInfo: yyS[yypt-5].item.(*ast.FieldsClause),
LinesInfo: yyS[yypt-4].item.(*ast.LinesClause),
IgnoreLines: yyS[yypt-3].item.(*uint64),
Expand Down Expand Up @@ -21976,6 +21977,15 @@ yynewstate:
v := getUint64FromNUM(yyS[yypt-1].item)
parser.yyVAL.item = &v
}
case 2491:
{
parser.yyVAL.item = (*string)(nil)
}
case 2492:
{
v := yyS[yypt-0].ident
parser.yyVAL.item = &v
}
case 2493:
{
parser.yyVAL.item = nil
Expand Down
11 changes: 9 additions & 2 deletions parser/parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,7 @@ import (
Boolean "Boolean (0, 1, false, true)"
OptionalBraces "optional braces"
CastType "Cast function target type"
CharsetOpt "CHARACTER SET option in LOAD DATA"
ColumnDef "table column definition"
ColumnDefList "table column definition list"
ColumnName "column name"
Expand Down Expand Up @@ -13723,6 +13724,7 @@ LoadDataStmt:
Format: $6.(*string),
OnDuplicate: $7.(ast.OnDuplicateKeyHandlingType),
Table: $10.(*ast.TableName),
Charset: $11.(*string),
FieldsInfo: $12.(*ast.FieldsClause),
LinesInfo: $13.(*ast.LinesClause),
IgnoreLines: $14.(*uint64),
Expand Down Expand Up @@ -13770,8 +13772,14 @@ IgnoreLines:
}

CharsetOpt:
{}
{
$$ = (*string)(nil)
}
| "CHARACTER" "SET" CharsetName
{
v := $3
$$ = &v
}

LocalOpt:
{
Expand Down Expand Up @@ -14673,5 +14681,4 @@ CalibrateResourceStmt:
{
$$ = &ast.CalibrateResourceStmt{}
}

%%
8 changes: 4 additions & 4 deletions parser/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ func TestDMLStmt(t *testing.T) {
// load data
{"load data local infile '/tmp/t.csv' into table t1 fields terminated by ',' optionally enclosed by '\"' ignore 1 lines", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t1` FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' IGNORE 1 LINES"},
{"load data infile '/tmp/t.csv' into table t", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t`"},
{"load data infile '/tmp/t.csv' into table t character set utf8", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t`"},
{"load data infile '/tmp/t.csv' into table t character set utf8", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` CHARACTER SET utf8"},
{"load data infile '/tmp/t.csv' into table t fields terminated by 'ab'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab'"},
{"load data infile '/tmp/t.csv' into table t columns terminated by 'ab'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab'"},
{"load data infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b'", true, "LOAD DATA INFILE '/tmp/t.csv' INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b'"},
Expand All @@ -623,7 +623,7 @@ func TestDMLStmt(t *testing.T) {
{"load data local infile '/tmp/t.csv' into table t columns terminated by 'ab'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab'"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b'"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*'"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab'"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' terminated by 'xy'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' TERMINATED BY 'xy'"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' lines terminated by 'xy'", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy'"},
Expand All @@ -634,10 +634,10 @@ func TestDMLStmt(t *testing.T) {
{"load data local infile '/tmp/t.csv' into table t columns terminated by 'ab' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' enclosed by 'b' escaped by '*' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' ENCLOSED BY 'b' ESCAPED BY '*' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t lines starting by 'ab' terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` LINES STARTING BY 'ab' TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t character set utf8 fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` CHARACTER SET utf8 FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t fields terminated by 'ab' lines terminated by 'xy' (a,b)", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` FIELDS TERMINATED BY 'ab' LINES TERMINATED BY 'xy' (`a`,`b`)"},
{"load data local infile '/tmp/t.csv' into table t (a,b) fields terminated by 'ab'", false, ""},
{"load data local infile '/tmp/t.csv' into table t ignore 1 lines", true, "LOAD DATA LOCAL INFILE '/tmp/t.csv' IGNORE INTO TABLE `t` IGNORE 1 LINES"},
Expand Down
1 change: 1 addition & 0 deletions planner/core/common_plans.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ type LoadData struct {
Path string
Format *string
Table *ast.TableName
Charset *string
Columns []*ast.ColumnName
FieldsInfo *ast.FieldsClause
LinesInfo *ast.LinesClause
Expand Down
1 change: 1 addition & 0 deletions planner/core/planbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -4224,6 +4224,7 @@ func (b *PlanBuilder) buildLoadData(ctx context.Context, ld *ast.LoadDataStmt) (
Path: ld.Path,
Format: ld.Format,
Table: ld.Table,
Charset: ld.Charset,
Columns: ld.Columns,
FieldsInfo: ld.FieldsInfo,
LinesInfo: ld.LinesInfo,
Expand Down

0 comments on commit 22ff002

Please sign in to comment.