From 302c3a2848afe97e08e0c4f06a1142b14ef88629 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Mon, 13 Feb 2023 23:10:01 +0800 Subject: [PATCH] lighting/parser: align NULL and ESCAPED BY with LOAD DATA (#40909) ref pingcap/tidb#40499 --- br/pkg/lightning/config/config.go | 78 +++++-- br/pkg/lightning/config/config_test.go | 27 ++- br/pkg/lightning/mydump/BUILD.bazel | 1 + br/pkg/lightning/mydump/csv_parser.go | 136 +++++++---- br/pkg/lightning/mydump/csv_parser_test.go | 217 ++++++++++++++++-- br/pkg/lightning/mydump/parser.go | 26 ++- br/pkg/lightning/mydump/parser.rl | 2 +- br/pkg/lightning/mydump/parser_generated.go | 12 +- br/pkg/lightning/mydump/region_test.go | 20 +- br/pkg/lightning/restore/check_info_test.go | 14 +- .../lightning/restore/table_restore_test.go | 8 +- 11 files changed, 422 insertions(+), 119 deletions(-) diff --git a/br/pkg/lightning/config/config.go b/br/pkg/lightning/config/config.go index d535478c5f4c4..cf69ffda6c08b 100644 --- a/br/pkg/lightning/config/config.go +++ b/br/pkg/lightning/config/config.go @@ -529,20 +529,48 @@ type PostRestore struct { Compact bool `toml:"compact" json:"compact"` } +// StringOrStringSlice can unmarshal a TOML string as string slice with one element. +type StringOrStringSlice []string + +func (s *StringOrStringSlice) UnmarshalTOML(in interface{}) error { + switch v := in.(type) { + case string: + *s = []string{v} + case []interface{}: + *s = make([]string, 0, len(v)) + for _, vv := range v { + vs, ok := vv.(string) + if !ok { + return errors.Errorf("invalid string slice '%v'", in) + } + *s = append(*s, vs) + } + default: + return errors.Errorf("invalid string slice '%v'", in) + } + return nil +} + type CSVConfig struct { // Separator, Delimiter and Terminator should all be in utf8mb4 encoding. - Separator string `toml:"separator" json:"separator"` - Delimiter string `toml:"delimiter" json:"delimiter"` - Terminator string `toml:"terminator" json:"terminator"` - Null string `toml:"null" json:"null"` - Header bool `toml:"header" json:"header"` - HeaderSchemaMatch bool `toml:"header-schema-match" json:"header-schema-match"` - TrimLastSep bool `toml:"trim-last-separator" json:"trim-last-separator"` - NotNull bool `toml:"not-null" json:"not-null"` - BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"` + Separator string `toml:"separator" json:"separator"` + Delimiter string `toml:"delimiter" json:"delimiter"` + Terminator string `toml:"terminator" json:"terminator"` + Null StringOrStringSlice `toml:"null" json:"null"` + Header bool `toml:"header" json:"header"` + HeaderSchemaMatch bool `toml:"header-schema-match" json:"header-schema-match"` + TrimLastSep bool `toml:"trim-last-separator" json:"trim-last-separator"` + NotNull bool `toml:"not-null" json:"not-null"` + // deprecated, use `escaped-by` instead. + BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"` + // EscapedBy has higher priority than BackslashEscape, currently it must be a single character if set. + EscapedBy string `toml:"escaped-by" json:"escaped-by"` // hide these options for lightning configuration file, they can only be used by LOAD DATA // https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-field-line-handling StartingBy string `toml:"-" json:"-"` + // For non-empty Delimiter (for example quotes), null elements inside quotes are not considered as null except for + // `\N` (when escape-by is `\`). That is to say, `\N` is special for null because it always means null. + QuotedNullIsText bool } type MydumperRuntime struct { @@ -802,8 +830,9 @@ func NewConfig() *Config { Header: true, HeaderSchemaMatch: true, NotNull: false, - Null: `\N`, + Null: []string{`\N`}, BackslashEscape: true, + EscapedBy: `\`, TrimLastSep: false, }, StrictFormat: false, @@ -935,15 +964,30 @@ func (cfg *Config) Adjust(ctx context.Context) error { return common.ErrInvalidConfig.GenWithStack("`mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other") } - if csv.BackslashEscape { - if csv.Separator == `\` { - return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV separator when `mydumper.csv.backslash-escape` is true") + if len(csv.EscapedBy) > 1 { + return common.ErrInvalidConfig.GenWithStack("`mydumper.csv.escaped-by` must be empty or a single character") + } + if csv.BackslashEscape && csv.EscapedBy == "" { + csv.EscapedBy = `\` + } + if !csv.BackslashEscape && csv.EscapedBy == `\` { + csv.EscapedBy = "" + } + + // keep compatibility with old behaviour + if !csv.NotNull && len(csv.Null) == 0 { + csv.Null = []string{""} + } + + if len(csv.EscapedBy) > 0 { + if csv.Separator == csv.EscapedBy { + return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV separator and `mydumper.csv.escaped-by`", csv.EscapedBy) } - if csv.Delimiter == `\` { - return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV delimiter when `mydumper.csv.backslash-escape` is true") + if csv.Delimiter == csv.EscapedBy { + return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV delimiter and `mydumper.csv.escaped-by`", csv.EscapedBy) } - if csv.Terminator == `\` { - return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV terminator when `mydumper.csv.backslash-escape` is true") + if csv.Terminator == csv.EscapedBy { + return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV terminator and `mydumper.csv.escaped-by`", csv.EscapedBy) } } diff --git a/br/pkg/lightning/config/config_test.go b/br/pkg/lightning/config/config_test.go index 98635ba4674e9..ffcbaf7d0d626 100644 --- a/br/pkg/lightning/config/config_test.go +++ b/br/pkg/lightning/config/config_test.go @@ -477,15 +477,15 @@ func TestInvalidCSV(t *testing.T) { separator = '\' backslash-escape = true `, - err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' as CSV separator when `mydumper.csv.backslash-escape` is true", + err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' both as CSV separator and `mydumper.csv.escaped-by`", }, { input: ` [mydumper.csv] delimiter = '\' - backslash-escape = true + escaped-by = '\' `, - err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' as CSV delimiter when `mydumper.csv.backslash-escape` is true", + err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' both as CSV delimiter and `mydumper.csv.escaped-by`", }, { input: ` @@ -528,7 +528,7 @@ func TestInvalidCSV(t *testing.T) { if tc.err != "" { require.EqualError(t, err, tc.err, comment) } else { - require.NoError(t, err) + require.NoError(t, err, tc.input) } } } @@ -543,6 +543,25 @@ func TestInvalidTOML(t *testing.T) { require.EqualError(t, err, "toml: line 2: expected '.' or '=', but got '[' instead") } +func TestStringOrStringSlice(t *testing.T) { + cfg := &config.Config{} + err := cfg.LoadFromTOML([]byte(` + [mydumper.csv] + null = '\N' + `)) + require.NoError(t, err) + err = cfg.LoadFromTOML([]byte(` + [mydumper.csv] + null = [ '\N', 'NULL' ] + `)) + require.NoError(t, err) + err = cfg.LoadFromTOML([]byte(` + [mydumper.csv] + null = [ '\N', 123 ] + `)) + require.ErrorContains(t, err, "invalid string slice") +} + func TestTOMLUnusedKeys(t *testing.T) { cfg := &config.Config{} err := cfg.LoadFromTOML([]byte(` diff --git a/br/pkg/lightning/mydump/BUILD.bazel b/br/pkg/lightning/mydump/BUILD.bazel index a4aa1626afc46..95d61b14465e8 100644 --- a/br/pkg/lightning/mydump/BUILD.bazel +++ b/br/pkg/lightning/mydump/BUILD.bazel @@ -36,6 +36,7 @@ go_library( "@com_github_xitongsys_parquet_go//parquet", "@com_github_xitongsys_parquet_go//reader", "@com_github_xitongsys_parquet_go//source", + "@org_golang_x_exp//slices", "@org_golang_x_text//encoding", "@org_golang_x_text//encoding/simplifiedchinese", "@org_uber_go_zap//:zap", diff --git a/br/pkg/lightning/mydump/csv_parser.go b/br/pkg/lightning/mydump/csv_parser.go index d9cd033d70861..2e48a9f0503d8 100644 --- a/br/pkg/lightning/mydump/csv_parser.go +++ b/br/pkg/lightning/mydump/csv_parser.go @@ -18,6 +18,7 @@ import ( "bytes" "context" "io" + "regexp" "strings" "github.com/pingcap/errors" @@ -28,6 +29,7 @@ import ( tidbconfig "github.com/pingcap/tidb/config" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/mathutil" + "golang.org/x/exp/slices" ) var ( @@ -47,10 +49,12 @@ type CSVParser struct { blockParser cfg *config.CSVConfig - comma []byte - quote []byte - newLine []byte - startingBy []byte + comma []byte + quote []byte + newLine []byte + startingBy []byte + escapedBy string + unescapeRegexp *regexp.Regexp charsetConvertor *CharsetConvertor // These variables are used with IndexAnyByte to search a byte slice for the @@ -74,13 +78,20 @@ type CSVParser struct { // fieldIndexes is an index of fields inside recordBuffer. // The i'th field ends at offset fieldIndexes[i] in recordBuffer. - fieldIndexes []int + fieldIndexes []int + fieldIsQuoted []bool - lastRecord []string + lastRecord []field - escFlavor backslashEscapeFlavor + escFlavor escapeFlavor // if set to true, csv parser will treat the first non-empty line as header line shouldParseHeader bool + quotedNullIsText bool +} + +type field struct { + content string + quoted bool } // NewCSVParser creates a CSV parser. @@ -127,14 +138,19 @@ func NewCSVParser( } } - escFlavor := backslashEscapeFlavorNone - if cfg.BackslashEscape { - escFlavor = backslashEscapeFlavorMySQL - quoteStopSet = append(quoteStopSet, '\\') - unquoteStopSet = append(unquoteStopSet, '\\') + escFlavor := escapeFlavorNone + var r *regexp.Regexp + if len(cfg.EscapedBy) > 0 { + escFlavor = escapeFlavorMySQL + quoteStopSet = append(quoteStopSet, cfg.EscapedBy[0]) + unquoteStopSet = append(unquoteStopSet, cfg.EscapedBy[0]) // we need special treatment of the NULL value \N, used by MySQL. - if !cfg.NotNull && cfg.Null == `\N` { - escFlavor = backslashEscapeFlavorMySQLWithNull + if !cfg.NotNull && slices.Contains(cfg.Null, cfg.EscapedBy+`N`) { + escFlavor = escapeFlavorMySQLWithNull + } + r, err = regexp.Compile(`(?s)` + regexp.QuoteMeta(cfg.EscapedBy) + `.`) + if err != nil { + return nil, errors.Trace(err) } } metrics, _ := metric.FromContext(ctx) @@ -146,11 +162,14 @@ func NewCSVParser( quote: []byte(delimiter), newLine: []byte(terminator), startingBy: []byte(cfg.StartingBy), + escapedBy: cfg.EscapedBy, + unescapeRegexp: r, escFlavor: escFlavor, quoteByteSet: makeByteSet(quoteStopSet), unquoteByteSet: makeByteSet(unquoteStopSet), newLineByteSet: makeByteSet(newLineStopSet), shouldParseHeader: shouldParseHeader, + quotedNullIsText: cfg.QuotedNullIsText, }, nil } @@ -175,18 +194,30 @@ func encodeSpecialSymbols(cfg *config.CSVConfig, cc *CharsetConvertor) (separato return } -func (parser *CSVParser) unescapeString(input string) (unescaped string, isNull bool, err error) { +func (parser *CSVParser) unescapeString(input field) (unescaped string, isNull bool, err error) { // Convert the input from another charset to utf8mb4 before we return the string. - if input, err = parser.charsetConvertor.Decode(input); err != nil { + if unescaped, err = parser.charsetConvertor.Decode(input.content); err != nil { return } - if parser.escFlavor == backslashEscapeFlavorMySQLWithNull && input == `\N` { - return input, true, nil + if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` { + return input.content, true, nil + } + if len(parser.escapedBy) > 0 { + unescaped = unescape(unescaped, "", parser.escFlavor, parser.escapedBy[0], parser.unescapeRegexp) + } + if len(parser.quote) == 0 || !parser.quotedNullIsText { + isNull = parser.escFlavor != escapeFlavorMySQLWithNull && + !parser.cfg.NotNull && + slices.Contains(parser.cfg.Null, unescaped) + } else if !input.quoted { + // quoted string can never be NULL except for \N, which must be escapeFlavorMySQLWithNull + isNull = !parser.cfg.NotNull && + slices.Contains(parser.cfg.Null, unescaped) + if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` { + // avoid \\N + isNull = false + } } - unescaped = unescape(input, "", parser.escFlavor) - isNull = parser.escFlavor != backslashEscapeFlavorMySQLWithNull && - !parser.cfg.NotNull && - unescaped == parser.cfg.Null return } @@ -198,9 +229,9 @@ type csvToken int16 const ( // csvTokenAnyUnquoted is a placeholder to represent any unquoted character. csvTokenAnyUnquoted csvToken = 0 - // csvTokenWithBackslash is a mask indicating an escaped character. - // The actual token is represented like `csvTokenWithBackslash | 'n'`. - csvTokenWithBackslash csvToken = 0x100 + // csvTokenEscaped is a mask indicating an escaped character. + // The actual token is represented like `csvTokenEscaped | 'n'`. + csvTokenEscaped csvToken = 0x100 // csvTokenComma is the CSV separator token. csvTokenComma csvToken = 0x200 // csvTokenNewLine is the CSV terminator token. @@ -293,8 +324,11 @@ func (parser *CSVParser) tryReadComma(b byte) (bool, error) { return parser.tryReadExact(parser.comma[1:]) } -func (parser *CSVParser) tryReadBackslashed(bs byte) (bool, byte, error) { - if bs != '\\' || parser.escFlavor == backslashEscapeFlavorNone { +func (parser *CSVParser) tryReadEscaped(bs byte) (bool, byte, error) { + if parser.escapedBy == "" { + return false, 0, nil + } + if bs != parser.escapedBy[0] || parser.escFlavor == escapeFlavorNone { return false, 0, nil } b, err := parser.readByte() @@ -306,8 +340,8 @@ func (parser *CSVParser) readQuotedToken(b byte) (csvToken, error) { if ok, err := parser.tryReadCloseDelimiter(b); ok || err != nil { return csvTokenDelimiter, err } - if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil { - return csvTokenWithBackslash | csvToken(eb), err + if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil { + return csvTokenEscaped | csvToken(eb), err } return csvToken(b), nil } @@ -323,15 +357,15 @@ func (parser *CSVParser) readUnquoteToken(b byte) (csvToken, error) { if ok, err := parser.tryReadOpenDelimiter(b); ok || err != nil { return csvTokenDelimiter, err } - if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil { - return csvTokenWithBackslash | csvToken(eb), err + if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil { + return csvTokenEscaped | csvToken(eb), err } return csvToken(b), nil } func (parser *CSVParser) appendCSVTokenToRecordBuffer(token csvToken) { - if token&csvTokenWithBackslash != 0 { - parser.recordBuffer = append(parser.recordBuffer, '\\') + if token&csvTokenEscaped != 0 { + parser.recordBuffer = append(parser.recordBuffer, parser.escapedBy[0]) } parser.recordBuffer = append(parser.recordBuffer, byte(token)) } @@ -372,14 +406,16 @@ func (parser *CSVParser) readUntil(chars *byteSet) ([]byte, byte, error) { } } -func (parser *CSVParser) readRecord(dst []string) ([]string, error) { +func (parser *CSVParser) readRecord(dst []field) ([]field, error) { parser.recordBuffer = parser.recordBuffer[:0] parser.fieldIndexes = parser.fieldIndexes[:0] + parser.fieldIsQuoted = parser.fieldIsQuoted[:0] isEmptyLine := true whitespaceLine := true foundStartingByThisLine := false prevToken := csvTokenNewLine + fieldIsQuoted := false var firstToken csvToken outside: @@ -445,6 +481,8 @@ outside: case csvTokenComma: whitespaceLine = false parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer)) + parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted) + fieldIsQuoted = false case csvTokenDelimiter: if prevToken != csvTokenComma && prevToken != csvTokenNewLine { parser.logSyntaxError() @@ -453,6 +491,7 @@ outside: if err = parser.readQuotedField(); err != nil { return nil, err } + fieldIsQuoted = true whitespaceLine = false case csvTokenNewLine: foundStartingByThisLine = false @@ -467,6 +506,8 @@ outside: continue } parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer)) + parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted) + // the loop is end, no need to reset fieldIsQuoted break outside default: if prevToken == csvTokenDelimiter { @@ -483,12 +524,13 @@ outside: str := string(parser.recordBuffer) // Convert to string once to batch allocations dst = dst[:0] if cap(dst) < len(parser.fieldIndexes) { - dst = make([]string, len(parser.fieldIndexes)) + dst = make([]field, len(parser.fieldIndexes)) } dst = dst[:len(parser.fieldIndexes)] var preIdx int for i, idx := range parser.fieldIndexes { - dst[i] = str[preIdx:idx] + dst[i].content = str[preIdx:idx] + dst[i].quoted = parser.fieldIsQuoted[i] preIdx = idx } @@ -574,7 +616,7 @@ func (parser *CSVParser) ReadRow() error { // remove the last empty value if parser.cfg.TrimLastSep { i := len(records) - 1 - if i >= 0 && len(records[i]) == 0 { + if i >= 0 && len(records[i].content) == 0 { records = records[:i] } } @@ -586,7 +628,7 @@ func (parser *CSVParser) ReadRow() error { row.Row = make([]types.Datum, len(records)) } for i, record := range records { - row.Length += len(record) + row.Length += len(record.content) unescaped, isNull, err := parser.unescapeString(record) if err != nil { return errors.Trace(err) @@ -612,31 +654,37 @@ func (parser *CSVParser) ReadColumns() error { } parser.columns = make([]string, 0, len(columns)) for _, colName := range columns { - colName, _, err = parser.unescapeString(colName) + colNameStr, _, err := parser.unescapeString(colName) if err != nil { return errors.Trace(err) } - parser.columns = append(parser.columns, strings.ToLower(colName)) + parser.columns = append(parser.columns, strings.ToLower(colNameStr)) } return nil } // ReadUntilTerminator seeks the file until the terminator token is found, and // returns -// - the content before terminator -// - the file offset beyond the terminator +// - the content with terminator, or the content read before meet error +// - the file offset beyond the terminator, or the offset when meet error // - error // Note that the terminator string pattern may be the content of a field, which // means it's inside quotes. Caller should make sure to handle this case. func (parser *CSVParser) ReadUntilTerminator() ([]byte, int64, error) { + var ret []byte for { content, firstByte, err := parser.readUntil(&parser.newLineByteSet) + ret = append(ret, content...) if err != nil { - return content, 0, err + return ret, parser.pos, err } parser.skipBytes(1) + ret = append(ret, firstByte) if ok, err := parser.tryReadNewLine(firstByte); ok || err != nil { - return content, parser.pos, err + if len(parser.newLine) >= 1 { + ret = append(ret, parser.newLine[1:]...) + } + return ret, parser.pos, err } } } diff --git a/br/pkg/lightning/mydump/csv_parser_test.go b/br/pkg/lightning/mydump/csv_parser_test.go index 8980ce221fe75..8d0a16ccfc290 100644 --- a/br/pkg/lightning/mydump/csv_parser_test.go +++ b/br/pkg/lightning/mydump/csv_parser_test.go @@ -413,11 +413,11 @@ zzz,yyy,xxx`), int64(config.ReadBlockSize), ioWorkers, false, nil) func TestMySQL(t *testing.T) { cfg := config.CSVConfig{ - Separator: ",", - Delimiter: `"`, - BackslashEscape: true, - NotNull: false, - Null: `\N`, + Separator: ",", + Delimiter: `"`, + EscapedBy: `\`, + NotNull: false, + Null: []string{`\N`}, } parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(`"\"","\\","\?" @@ -452,12 +452,53 @@ func TestMySQL(t *testing.T) { require.ErrorIs(t, errors.Cause(parser.ReadRow()), io.EOF) } +func TestCustomEscapeChar(t *testing.T) { + cfg := config.CSVConfig{ + Separator: ",", + Delimiter: `"`, + EscapedBy: `!`, + NotNull: false, + Null: []string{`!N`}, + } + + parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(`"!"","!!","!\" +"! +",!N,!!N`), int64(config.ReadBlockSize), ioWorkers, false, nil) + require.NoError(t, err) + + require.Nil(t, parser.ReadRow()) + require.Equal(t, mydump.Row{ + RowID: 1, + Row: []types.Datum{ + types.NewStringDatum(`"`), + types.NewStringDatum(`!`), + types.NewStringDatum(`\`), + }, + Length: 6, + }, parser.LastRow()) + assertPosEqual(t, parser, 15, 1) + + require.Nil(t, parser.ReadRow()) + require.Equal(t, mydump.Row{ + RowID: 2, + Row: []types.Datum{ + types.NewStringDatum("\n"), + nullDatum, + types.NewStringDatum(`!N`), + }, + Length: 7, + }, parser.LastRow()) + assertPosEqual(t, parser, 26, 2) + + require.ErrorIs(t, errors.Cause(parser.ReadRow()), io.EOF) +} + func TestSyntaxErrorCSV(t *testing.T) { cfg := config.MydumperRuntime{ CSV: config.CSVConfig{ - Separator: ",", - Delimiter: `"`, - BackslashEscape: true, + Separator: ",", + Delimiter: `"`, + EscapedBy: `\`, }, } @@ -475,7 +516,7 @@ func TestSyntaxErrorCSV(t *testing.T) { runFailingTestCasesCSV(t, &cfg, int64(config.ReadBlockSize), inputs) - cfg.CSV.BackslashEscape = false + cfg.CSV.EscapedBy = "" runFailingTestCasesCSV(t, &cfg, int64(config.ReadBlockSize), []string{`"\`}) } @@ -485,7 +526,7 @@ func TestTSV(t *testing.T) { Delimiter: "", BackslashEscape: false, NotNull: false, - Null: "", + Null: []string{""}, Header: true, HeaderSchemaMatch: true, } @@ -549,6 +590,7 @@ func TestCsvWithWhiteSpaceLine(t *testing.T) { cfg := config.CSVConfig{ Separator: ",", Delimiter: `"`, + Null: []string{""}, } data := " \r\n\r\n0,,abc\r\n \r\n123,1999-12-31,test\r\n" parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), ioWorkers, false, nil) @@ -777,10 +819,10 @@ func TestSpecialChars(t *testing.T) { func TestContinuationCSV(t *testing.T) { cfg := config.MydumperRuntime{ CSV: config.CSVConfig{ - Separator: ",", - Delimiter: `"`, - BackslashEscape: true, - TrimLastSep: true, + Separator: ",", + Delimiter: `"`, + EscapedBy: `\`, + TrimLastSep: true, }, } @@ -814,6 +856,7 @@ func TestBackslashAsSep(t *testing.T) { CSV: config.CSVConfig{ Separator: `\`, Delimiter: `"`, + Null: []string{""}, }, } @@ -841,6 +884,7 @@ func TestBackslashAsDelim(t *testing.T) { CSV: config.CSVConfig{ Separator: ",", Delimiter: `\`, + Null: []string{""}, }, } @@ -856,6 +900,23 @@ func TestBackslashAsDelim(t *testing.T) { `"\`, } runFailingTestCasesCSV(t, &cfg, 1, failingInputs) + + cfg = config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: `\`, + Null: []string{""}, + QuotedNullIsText: true, + }, + } + + testCases = []testCase{ + { + input: `\\`, + expected: [][]types.Datum{{types.NewStringDatum("")}}, + }, + } + runTestCasesCSV(t, &cfg, 1, testCases) } // errorReader implements the Reader interface which always returns an error. @@ -967,6 +1028,134 @@ func TestTerminator(t *testing.T) { runTestCasesCSV(t, &cfg, 1, testCases) } +func TestReadUntilTerminator(t *testing.T) { + cfg := config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: "#", + Terminator: "#\n", + }, + } + parser, err := mydump.NewCSVParser( + context.Background(), + &cfg.CSV, + mydump.NewStringReader("xxx1#2#3#4#\n56#78"), + int64(config.ReadBlockSize), + ioWorkers, + false, + nil, + ) + require.NoError(t, err) + content, idx, err := parser.ReadUntilTerminator() + require.NoError(t, err) + require.Equal(t, "xxx1#2#3#4#\n", string(content)) + require.Equal(t, int64(12), idx) + content, idx, err = parser.ReadUntilTerminator() + require.ErrorIs(t, err, io.EOF) + require.Equal(t, "56#78", string(content)) + require.Equal(t, int64(17), idx) +} + +func TestNULL(t *testing.T) { + // https://dev.mysql.com/doc/refman/8.0/en/load-data.html + // - For the default FIELDS and LINES values, NULL is written as a field value of \N for output, and a field value of \N is read as NULL for input (assuming that the ESCAPED BY character is \). + // - If FIELDS ENCLOSED BY is not empty, a field containing the literal word NULL as its value is read as a NULL value. This differs from the word NULL enclosed within FIELDS ENCLOSED BY characters, which is read as the string 'NULL'. + // - If FIELDS ESCAPED BY is empty, NULL is written as the word NULL. + + cfg := config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: `"`, + Terminator: "\n", + Null: []string{`\N`, `NULL`}, + EscapedBy: `\`, + QuotedNullIsText: true, + }, + } + testCases := []testCase{ + { + input: `NULL,"NULL" +\N,"\N" +\\N,"\\N"`, + expected: [][]types.Datum{ + {nullDatum, types.NewStringDatum("NULL")}, + {nullDatum, nullDatum}, + {types.NewStringDatum(`\N`), types.NewStringDatum(`\N`)}, + }, + }, + } + runTestCasesCSV(t, &cfg, 1, testCases) + + cfg = config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: ``, + Terminator: "\n", + Null: []string{`\N`}, + EscapedBy: `\`, + }, + } + testCases = []testCase{ + { + input: `NULL,"NULL" +\N,"\N" +\\N,"\\N"`, + expected: [][]types.Datum{ + {types.NewStringDatum("NULL"), types.NewStringDatum(`"NULL"`)}, + {nullDatum, types.NewStringDatum(`"N"`)}, + {types.NewStringDatum(`\N`), types.NewStringDatum(`"\N"`)}, + }, + }, + } + runTestCasesCSV(t, &cfg, 1, testCases) + + cfg = config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: ``, + Terminator: "\n", + Null: []string{`\N`}, + EscapedBy: `\`, + }, + } + testCases = []testCase{ + { + input: `NULL,"NULL" +\N,"\N" +\\N,"\\N"`, + expected: [][]types.Datum{ + {types.NewStringDatum("NULL"), types.NewStringDatum(`"NULL"`)}, + {nullDatum, types.NewStringDatum(`"N"`)}, + {types.NewStringDatum(`\N`), types.NewStringDatum(`"\N"`)}, + }, + }, + } + runTestCasesCSV(t, &cfg, 1, testCases) + + cfg = config.MydumperRuntime{ + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: `"`, + Terminator: "\n", + Null: []string{`NULL`}, + EscapedBy: ``, + QuotedNullIsText: true, + }, + } + testCases = []testCase{ + { + input: `NULL,"NULL" +\N,"\N" +\\N,"\\N"`, + expected: [][]types.Datum{ + {nullDatum, types.NewStringDatum(`NULL`)}, + {types.NewStringDatum(`\N`), types.NewStringDatum(`\N`)}, + {types.NewStringDatum(`\\N`), types.NewStringDatum(`\\N`)}, + }, + }, + } + runTestCasesCSV(t, &cfg, 1, testCases) +} + func TestStartingBy(t *testing.T) { cfg := config.MydumperRuntime{ CSV: config.CSVConfig{ diff --git a/br/pkg/lightning/mydump/parser.go b/br/pkg/lightning/mydump/parser.go index 0ac82ce189d71..c99330a96211a 100644 --- a/br/pkg/lightning/mydump/parser.go +++ b/br/pkg/lightning/mydump/parser.go @@ -88,7 +88,7 @@ func makeBlockParser( type ChunkParser struct { blockParser - escFlavor backslashEscapeFlavor + escFlavor escapeFlavor } // Chunk represents a portion of the data file. @@ -116,12 +116,12 @@ func (row Row) MarshalLogArray(encoder zapcore.ArrayEncoder) error { return nil } -type backslashEscapeFlavor uint8 +type escapeFlavor uint8 const ( - backslashEscapeFlavorNone backslashEscapeFlavor = iota - backslashEscapeFlavorMySQL - backslashEscapeFlavorMySQLWithNull + escapeFlavorNone escapeFlavor = iota + escapeFlavorMySQL + escapeFlavorMySQLWithNull ) // Parser provides some methods to parse a source data file. @@ -153,9 +153,9 @@ func NewChunkParser( blockBufSize int64, ioWorkers *worker.Pool, ) *ChunkParser { - escFlavor := backslashEscapeFlavorMySQL + escFlavor := escapeFlavorMySQL if sqlMode.HasNoBackslashEscapesMode() { - escFlavor = backslashEscapeFlavorNone + escFlavor = escapeFlavorNone } metrics, _ := metric.FromContext(ctx) return &ChunkParser{ @@ -303,12 +303,14 @@ func (parser *blockParser) readBlock() error { } } -var unescapeRegexp = regexp.MustCompile(`(?s)\\.`) +var chunkParserUnescapeRegexp = regexp.MustCompile(`(?s)\\.`) func unescape( input string, delim string, - escFlavor backslashEscapeFlavor, + escFlavor escapeFlavor, + escChar byte, + unescapeRegexp *regexp.Regexp, ) string { if len(delim) > 0 { delim2 := delim + delim @@ -316,7 +318,7 @@ func unescape( input = strings.ReplaceAll(input, delim2, delim) } } - if escFlavor != backslashEscapeFlavorNone && strings.IndexByte(input, '\\') != -1 { + if escFlavor != escapeFlavorNone && strings.IndexByte(input, escChar) != -1 { input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string { switch substr[1] { case '0': @@ -343,9 +345,9 @@ func (parser *ChunkParser) unescapeString(input string) string { if len(input) >= 2 { switch input[0] { case '\'', '"': - return unescape(input[1:len(input)-1], input[:1], parser.escFlavor) + return unescape(input[1:len(input)-1], input[:1], parser.escFlavor, '\\', chunkParserUnescapeRegexp) case '`': - return unescape(input[1:len(input)-1], "`", backslashEscapeFlavorNone) + return unescape(input[1:len(input)-1], "`", escapeFlavorNone, '\\', chunkParserUnescapeRegexp) } } return input diff --git a/br/pkg/lightning/mydump/parser.rl b/br/pkg/lightning/mydump/parser.rl index 25627dbf2e865..edd2643a73622 100644 --- a/br/pkg/lightning/mydump/parser.rl +++ b/br/pkg/lightning/mydump/parser.rl @@ -54,7 +54,7 @@ comment = 'using utf8mb4)'i; # The patterns parse quoted strings. -bs = '\\' when { parser.escFlavor != backslashEscapeFlavorNone }; +bs = '\\' when { parser.escFlavor != escapeFlavorNone }; single_quoted = "'" (^"'" | bs any | "''")** "'"; double_quoted = '"' (^'"' | bs any | '""')** '"'; diff --git a/br/pkg/lightning/mydump/parser_generated.go b/br/pkg/lightning/mydump/parser_generated.go index c803c0c4c2e40..afb0602d822aa 100644 --- a/br/pkg/lightning/mydump/parser_generated.go +++ b/br/pkg/lightning/mydump/parser_generated.go @@ -610,7 +610,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) { _widec = int16(data[p]) if 92 <= data[p] && data[p] <= 92 { _widec = 256 + (int16(data[p]) - 0) - if parser.escFlavor != backslashEscapeFlavorNone { + if parser.escFlavor != escapeFlavorNone { _widec += 256 } } @@ -639,7 +639,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) { _widec = int16(data[p]) if 92 <= data[p] && data[p] <= 92 { _widec = 256 + (int16(data[p]) - 0) - if parser.escFlavor != backslashEscapeFlavorNone { + if parser.escFlavor != escapeFlavorNone { _widec += 256 } } @@ -685,7 +685,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) { _widec = int16(data[p]) if 92 <= data[p] && data[p] <= 92 { _widec = 256 + (int16(data[p]) - 0) - if parser.escFlavor != backslashEscapeFlavorNone { + if parser.escFlavor != escapeFlavorNone { _widec += 256 } } @@ -716,7 +716,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) { _widec = int16(data[p]) if 92 <= data[p] && data[p] <= 92 { _widec = 256 + (int16(data[p]) - 0) - if parser.escFlavor != backslashEscapeFlavorNone { + if parser.escFlavor != escapeFlavorNone { _widec += 256 } } @@ -745,7 +745,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) { _widec = int16(data[p]) if 92 <= data[p] && data[p] <= 92 { _widec = 256 + (int16(data[p]) - 0) - if parser.escFlavor != backslashEscapeFlavorNone { + if parser.escFlavor != escapeFlavorNone { _widec += 256 } } @@ -791,7 +791,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) { _widec = int16(data[p]) if 92 <= data[p] && data[p] <= 92 { _widec = 256 + (int16(data[p]) - 0) - if parser.escFlavor != backslashEscapeFlavorNone { + if parser.escFlavor != escapeFlavorNone { _widec += 256 } } diff --git a/br/pkg/lightning/mydump/region_test.go b/br/pkg/lightning/mydump/region_test.go index 362ff8603c7f9..ad23590f4c057 100644 --- a/br/pkg/lightning/mydump/region_test.go +++ b/br/pkg/lightning/mydump/region_test.go @@ -180,8 +180,8 @@ func TestMakeSourceFileRegion(t *testing.T) { HeaderSchemaMatch: true, TrimLastSep: false, NotNull: false, - Null: "NULL", - BackslashEscape: true, + Null: []string{"NULL"}, + EscapedBy: `\`, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -237,8 +237,8 @@ func TestCompressedMakeSourceFileRegion(t *testing.T) { HeaderSchemaMatch: true, TrimLastSep: false, NotNull: false, - Null: "NULL", - BackslashEscape: true, + Null: []string{"NULL"}, + EscapedBy: `\`, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -292,8 +292,8 @@ func TestSplitLargeFile(t *testing.T) { HeaderSchemaMatch: true, TrimLastSep: false, NotNull: false, - Null: "NULL", - BackslashEscape: true, + Null: []string{"NULL"}, + EscapedBy: `\`, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -351,8 +351,8 @@ func TestSplitLargeFileNoNewLineAtEOF(t *testing.T) { HeaderSchemaMatch: true, TrimLastSep: false, NotNull: false, - Null: "NULL", - BackslashEscape: true, + Null: []string{"NULL"}, + EscapedBy: `\`, }, StrictFormat: true, Filter: []string{"*.*"}, @@ -457,8 +457,8 @@ func TestSplitLargeFileOnlyOneChunk(t *testing.T) { HeaderSchemaMatch: true, TrimLastSep: false, NotNull: false, - Null: "NULL", - BackslashEscape: true, + Null: []string{"NULL"}, + EscapedBy: `\`, }, StrictFormat: true, Filter: []string{"*.*"}, diff --git a/br/pkg/lightning/restore/check_info_test.go b/br/pkg/lightning/restore/check_info_test.go index 36903ab93b22c..303a9ef929624 100644 --- a/br/pkg/lightning/restore/check_info_test.go +++ b/br/pkg/lightning/restore/check_info_test.go @@ -326,13 +326,13 @@ func TestCheckCSVHeader(t *testing.T) { Mydumper: config.MydumperRuntime{ ReadBlockSize: config.ReadBlockSize, CSV: config.CSVConfig{ - Separator: ",", - Delimiter: `"`, - Header: false, - NotNull: false, - Null: `\N`, - BackslashEscape: true, - TrimLastSep: false, + Separator: ",", + Delimiter: `"`, + Header: false, + NotNull: false, + Null: []string{`\N`}, + EscapedBy: `\`, + TrimLastSep: false, }, }, } diff --git a/br/pkg/lightning/restore/table_restore_test.go b/br/pkg/lightning/restore/table_restore_test.go index 0f6d87892e329..0f7e488415c40 100644 --- a/br/pkg/lightning/restore/table_restore_test.go +++ b/br/pkg/lightning/restore/table_restore_test.go @@ -2142,8 +2142,8 @@ func (s *tableRestoreSuite) TestSchemaIsValid() { Header: ca.hasHeader, HeaderSchemaMatch: true, NotNull: false, - Null: `\N`, - BackslashEscape: true, + Null: []string{`\N`}, + EscapedBy: `\`, TrimLastSep: false, }, IgnoreColumns: ca.ignoreColumns, @@ -2178,8 +2178,8 @@ func (s *tableRestoreSuite) TestGBKEncodedSchemaIsValid() { Header: true, HeaderSchemaMatch: true, NotNull: false, - Null: `\N`, - BackslashEscape: true, + Null: []string{`\N`}, + EscapedBy: `\`, TrimLastSep: false, }, IgnoreColumns: nil,