Skip to content
This repository has been archived by the owner on Nov 30, 2021. It is now read-only.

Commit

Permalink
importccl: DELIMITED should handle delimiters only at end of enclosed…
Browse files Browse the repository at this point in the history
… fields

Previously, DELIMITED import mode would complain about importing the field
`"This a nested single quote " inside quoted field"`
when importing with the option `WITH fields_enclosed_by '"'`.
The MySQL implementation handles this case correctly.
For more info and examples see cockroachdb#40959.

Touches cockroachdb#40374.
Fixes cockroachdb#40959.

Release note (enterprise change): Fixes incorrect behavior for enclosed fields.

Release justification: This is low risk since only the business logic of
import has changed. Moreover, this functionality has been requested
by a potential client.
  • Loading branch information
spaskob committed Sep 24, 2019
1 parent a92c7d0 commit 9f0f622
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 33 deletions.
87 changes: 83 additions & 4 deletions pkg/ccl/importccl/import_stmt_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,20 +243,88 @@ d
data: "1\t2",
err: "row 1: too many columns, expected 1",
},
{
name: "cannot parse data",
create: `i int8, j int8`,
typ: "DELIMITED",
data: "bad_int\t2",
err: "row 1: parse",
},
{
name: "unexpected number of columns",
create: `a string, b string`,
typ: "DELIMITED",
data: "1,2",
err: "row 1: unexpected number of columns, expected 2 got 1",
},
{
name: "unexpected number of columns in 1st row",
create: `a string, b string`,
typ: "DELIMITED",
data: "1,2\n3\t4",
err: "row 1: unexpected number of columns, expected 2 got 1",
},
{
name: "field enclosure",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "$foo$\tnormal",
query: map[string][][]string{
`SELECT * from t`: {{"foo", "normal"}},
},
},
{
name: "field enclosure in middle of unquoted field",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "fo$o\tb$a$z",
query: map[string][][]string{
`SELECT * from t`: {{"fo$o", "b$a$z"}},
},
},
{
name: "field enclosure in middle of quoted field",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "$fo$o$\t$b$a$z$",
query: map[string][][]string{
`SELECT * from t`: {{"fo$o", "b$a$z"}},
},
},
{
name: "unmatched field enclosure",
create: `i int8`,
with: `WITH fields_enclosed_by = '"'`,
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "$foo\tnormal",
err: "row 1: unmatched field enclosure at start of field",
},
{
name: "unmatched field enclosure at end",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "foo$\tnormal",
err: "row 1: unmatched field enclosure at end of field",
},
{
name: "unmatched field enclosure 2nd field",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "normal\t$foo",
err: "row 1: unmatched field enclosure at start of field",
},
{
name: "unmatched field enclosure at end 2nd field",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$'`,
typ: "DELIMITED",
data: "\"1",
err: "row 1: unmatched field enclosure",
data: "normal\tfoo$",
err: "row 1: unmatched field enclosure at end of field",
},
{
name: "unmatched literal",
Expand All @@ -266,6 +334,17 @@ d
data: `\`,
err: "row 1: unmatched literal",
},
{
name: "escaped field enclosure",
create: `a string, b string`,
with: `WITH fields_enclosed_by = '$', fields_escaped_by = '\',
fields_terminated_by = ','`,
typ: "DELIMITED",
data: `\$foo\$,\$baz`,
query: map[string][][]string{
`SELECT * from t`: {{"$foo$", "$baz"}},
},
},
{
name: "weird escape char",
create: `s STRING`,
Expand Down
98 changes: 69 additions & 29 deletions pkg/ccl/importccl/read_import_mysqlout.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@ func (d *mysqloutfileReader) readFile(
var count int64 = 1

var row []tree.Datum
// the current field being read.
var field []byte
// The current field being read. It is kept as a slice of runes so that a
// trailing field encloser can be dropped again at the end of the field.
var fieldParts []rune

// If we have an escaping char defined, seeing it means the next char is to be
// treated as escaped -- usually that means literal but has some specific
Expand All @@ -83,26 +84,30 @@ func (d *mysqloutfileReader) readFile(
// which means we do not look for separators until we see the end of the field
// as indicated by the matching enclosing char.
var readingField bool
// Set when we have just encountered a potential encloser symbol.
// That means that if an end of field or line comes next, we should honor it.
var gotEncloser bool

var gotNull bool

reader := bufio.NewReaderSize(input, 1024*64)
addField := func() error {
field := string(fieldParts)
if len(row) >= len(d.conv.VisibleCols) {
return makeRowErr(inputName, count, pgcode.Syntax,
"too many columns, expected %d: %#v", len(d.conv.VisibleCols), row)
}
if gotNull {
if len(field) != 0 {
return makeRowErr(inputName, count, pgcode.Syntax,
"unexpected data after null encoding: %s", field)
"unexpected data after null encoding: %v", row)
}
row = append(row, tree.DNull)
gotNull = false
} else if !d.opts.HasEscape && string(field) == "NULL" {
} else if !d.opts.HasEscape && field == "NULL" {
row = append(row, tree.DNull)
} else {
datum, err := tree.ParseStringAs(d.conv.VisibleColTypes[len(row)], string(field), d.conv.EvalCtx)
datum, err := tree.ParseStringAs(d.conv.VisibleColTypes[len(row)], field, d.conv.EvalCtx)
if err != nil {
col := d.conv.VisibleCols[len(row)]
return wrapRowErr(err, inputName, count, pgcode.Syntax,
Expand All @@ -111,7 +116,7 @@ func (d *mysqloutfileReader) readFile(

row = append(row, datum)
}
field = field[:0]
fieldParts = fieldParts[:0]
return nil
}
addRow := func() error {
Expand All @@ -138,10 +143,21 @@ func (d *mysqloutfileReader) readFile(
if nextLiteral {
return makeRowErr(inputName, count, pgcode.Syntax, "unmatched literal")
}
if readingField {
return makeRowErr(inputName, count, pgcode.Syntax, "unmatched field enclosure")
// If the previous symbol was a field encloser, it should be
// dropped as it only marks the end of the field. Otherwise
// throw an error since we don't expect an unmatched encloser.
if gotEncloser {
if readingField {
fieldParts = fieldParts[:len(fieldParts)-1]
} else {
return makeRowErr(inputName, count, pgcode.Syntax,
"unmatched field enclosure at end of field")
}
} else if readingField {
return makeRowErr(inputName, count, pgcode.Syntax,
"unmatched field enclosure at start of field")
}
if len(field) > 0 {
if len(fieldParts) > 0 {
if err := addField(); err != nil {
return err
}
Expand All @@ -158,6 +174,7 @@ func (d *mysqloutfileReader) readFile(
if err != nil {
return err
}

if c == unicode.ReplacementChar && w == 1 {
if err := reader.UnreadRune(); err != nil {
return err
Expand All @@ -166,7 +183,8 @@ func (d *mysqloutfileReader) readFile(
if err != nil {
return err
}
field = append(field, raw)
fieldParts = append(fieldParts, rune(raw))
gotEncloser = false
continue
}

Expand All @@ -177,45 +195,50 @@ func (d *mysqloutfileReader) readFile(
// See https://dev.mysql.com/doc/refman/8.0/en/load-data.html.
switch c {
case '0':
field = append(field, byte(0))
fieldParts = append(fieldParts, rune(0))
case 'b':
field = append(field, '\b')
fieldParts = append(fieldParts, rune('\b'))
case 'n':
field = append(field, '\n')
fieldParts = append(fieldParts, rune('\n'))
case 'r':
field = append(field, '\r')
fieldParts = append(fieldParts, rune('\r'))
case 't':
field = append(field, '\t')
fieldParts = append(fieldParts, rune('\t'))
case 'Z':
field = append(field, byte(26))
fieldParts = append(fieldParts, rune(byte(26)))
case 'N':
if gotNull {
return makeRowErr(inputName, count, pgcode.Syntax, "unexpected null encoding")
}
gotNull = true
default:
field = append(field, string(c)...)
fieldParts = append(fieldParts, c)
}
gotEncloser = false
continue
}

if c == d.opts.Escape {
nextLiteral = true
gotEncloser = false
continue
}
}

// If enclosing is not disabled, check for the encloser.
// Technically when it is not optional, we could _require_ it to start and
// end fields, but for the purposes of decoding, we don't actually care --
// we'll handle it if we see it either way.
if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && c == d.opts.Encloser {
readingField = !readingField
continue
}

// Are we done with the field, or even the whole row?
if !readingField && (c == d.opts.FieldSeparator || c == d.opts.RowSeparator) {
if (!readingField || gotEncloser) &&
(c == d.opts.FieldSeparator || c == d.opts.RowSeparator) {
if gotEncloser {
// If the encloser marked end of field
// drop it.
if readingField {
fieldParts = fieldParts[:len(fieldParts)-1]
} else {
// Unexpected since we did not see one at start of field.
return makeRowErr(inputName, count, pgcode.Syntax,
"unmatched field enclosure at end of field")
}
}
if err := addField(); err != nil {
return err
}
Expand All @@ -224,11 +247,28 @@ func (d *mysqloutfileReader) readFile(
return err
}
}
readingField = false
gotEncloser = false
continue
}

field = append(field, string(c)...)
}
if gotEncloser {
gotEncloser = false
}

// If enclosing is not disabled, check for the encloser.
// Technically when it is not optional, we could _require_ it to start and
// end fields, but for the purposes of decoding, we don't actually care --
// we'll handle it if we see it either way.
if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && c == d.opts.Encloser {
if !readingField && len(fieldParts) == 0 {
readingField = true
continue
}
gotEncloser = true
}

fieldParts = append(fieldParts, c)
}
return d.conv.SendBatch(ctx)
}

0 comments on commit 9f0f622

Please sign in to comment.