From 302c3a2848afe97e08e0c4f06a1142b14ef88629 Mon Sep 17 00:00:00 2001
From: lance6716 <lance6716@gmail.com>
Date: Mon, 13 Feb 2023 23:10:01 +0800
Subject: [PATCH] lighting/parser: align NULL and ESCAPED BY with LOAD DATA
 (#40909)

ref pingcap/tidb#40499
---
 br/pkg/lightning/config/config.go             |  78 +++++--
 br/pkg/lightning/config/config_test.go        |  27 ++-
 br/pkg/lightning/mydump/BUILD.bazel           |   1 +
 br/pkg/lightning/mydump/csv_parser.go         | 136 +++++++----
 br/pkg/lightning/mydump/csv_parser_test.go    | 217 ++++++++++++++++--
 br/pkg/lightning/mydump/parser.go             |  26 ++-
 br/pkg/lightning/mydump/parser.rl             |   2 +-
 br/pkg/lightning/mydump/parser_generated.go   |  12 +-
 br/pkg/lightning/mydump/region_test.go        |  20 +-
 br/pkg/lightning/restore/check_info_test.go   |  14 +-
 .../lightning/restore/table_restore_test.go   |   8 +-
 11 files changed, 422 insertions(+), 119 deletions(-)

diff --git a/br/pkg/lightning/config/config.go b/br/pkg/lightning/config/config.go
index d535478c5f4c4..cf69ffda6c08b 100644
--- a/br/pkg/lightning/config/config.go
+++ b/br/pkg/lightning/config/config.go
@@ -529,20 +529,48 @@ type PostRestore struct {
 	Compact           bool        `toml:"compact" json:"compact"`
 }
 
+// StringOrStringSlice can unmarshal a TOML string as string slice with one element.
+type StringOrStringSlice []string
+
+func (s *StringOrStringSlice) UnmarshalTOML(in interface{}) error {
+	switch v := in.(type) {
+	case string:
+		*s = []string{v}
+	case []interface{}:
+		*s = make([]string, 0, len(v))
+		for _, vv := range v {
+			vs, ok := vv.(string)
+			if !ok {
+				return errors.Errorf("invalid string slice '%v'", in)
+			}
+			*s = append(*s, vs)
+		}
+	default:
+		return errors.Errorf("invalid string slice '%v'", in)
+	}
+	return nil
+}
+
 type CSVConfig struct {
 	// Separator, Delimiter and Terminator should all be in utf8mb4 encoding.
-	Separator         string `toml:"separator" json:"separator"`
-	Delimiter         string `toml:"delimiter" json:"delimiter"`
-	Terminator        string `toml:"terminator" json:"terminator"`
-	Null              string `toml:"null" json:"null"`
-	Header            bool   `toml:"header" json:"header"`
-	HeaderSchemaMatch bool   `toml:"header-schema-match" json:"header-schema-match"`
-	TrimLastSep       bool   `toml:"trim-last-separator" json:"trim-last-separator"`
-	NotNull           bool   `toml:"not-null" json:"not-null"`
-	BackslashEscape   bool   `toml:"backslash-escape" json:"backslash-escape"`
+	Separator         string              `toml:"separator" json:"separator"`
+	Delimiter         string              `toml:"delimiter" json:"delimiter"`
+	Terminator        string              `toml:"terminator" json:"terminator"`
+	Null              StringOrStringSlice `toml:"null" json:"null"`
+	Header            bool                `toml:"header" json:"header"`
+	HeaderSchemaMatch bool                `toml:"header-schema-match" json:"header-schema-match"`
+	TrimLastSep       bool                `toml:"trim-last-separator" json:"trim-last-separator"`
+	NotNull           bool                `toml:"not-null" json:"not-null"`
+	// deprecated, use `escaped-by` instead.
+	BackslashEscape bool `toml:"backslash-escape" json:"backslash-escape"`
+	// EscapedBy has higher priority than BackslashEscape, currently it must be a single character if set.
+	EscapedBy string `toml:"escaped-by" json:"escaped-by"`
 	// hide these options for lightning configuration file, they can only be used by LOAD DATA
 	// https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-field-line-handling
 	StartingBy string `toml:"-" json:"-"`
+	// For non-empty Delimiter (for example quotes), null elements inside quotes are not considered as null except for
+	// `\N` (when escape-by is `\`). That is to say, `\N` is special for null because it always means null.
+	QuotedNullIsText bool
 }
 
 type MydumperRuntime struct {
@@ -802,8 +830,9 @@ func NewConfig() *Config {
 				Header:            true,
 				HeaderSchemaMatch: true,
 				NotNull:           false,
-				Null:              `\N`,
+				Null:              []string{`\N`},
 				BackslashEscape:   true,
+				EscapedBy:         `\`,
 				TrimLastSep:       false,
 			},
 			StrictFormat:           false,
@@ -935,15 +964,30 @@ func (cfg *Config) Adjust(ctx context.Context) error {
 		return common.ErrInvalidConfig.GenWithStack("`mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other")
 	}
 
-	if csv.BackslashEscape {
-		if csv.Separator == `\` {
-			return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV separator when `mydumper.csv.backslash-escape` is true")
+	if len(csv.EscapedBy) > 1 {
+		return common.ErrInvalidConfig.GenWithStack("`mydumper.csv.escaped-by` must be empty or a single character")
+	}
+	if csv.BackslashEscape && csv.EscapedBy == "" {
+		csv.EscapedBy = `\`
+	}
+	if !csv.BackslashEscape && csv.EscapedBy == `\` {
+		csv.EscapedBy = ""
+	}
+
+	// keep compatibility with old behaviour
+	if !csv.NotNull && len(csv.Null) == 0 {
+		csv.Null = []string{""}
+	}
+
+	if len(csv.EscapedBy) > 0 {
+		if csv.Separator == csv.EscapedBy {
+			return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV separator and `mydumper.csv.escaped-by`", csv.EscapedBy)
 		}
-		if csv.Delimiter == `\` {
-			return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV delimiter when `mydumper.csv.backslash-escape` is true")
+		if csv.Delimiter == csv.EscapedBy {
+			return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV delimiter and `mydumper.csv.escaped-by`", csv.EscapedBy)
 		}
-		if csv.Terminator == `\` {
-			return common.ErrInvalidConfig.GenWithStack("cannot use '\\' as CSV terminator when `mydumper.csv.backslash-escape` is true")
+		if csv.Terminator == csv.EscapedBy {
+			return common.ErrInvalidConfig.GenWithStack("cannot use '%s' both as CSV terminator and `mydumper.csv.escaped-by`", csv.EscapedBy)
 		}
 	}
 
diff --git a/br/pkg/lightning/config/config_test.go b/br/pkg/lightning/config/config_test.go
index 98635ba4674e9..ffcbaf7d0d626 100644
--- a/br/pkg/lightning/config/config_test.go
+++ b/br/pkg/lightning/config/config_test.go
@@ -477,15 +477,15 @@ func TestInvalidCSV(t *testing.T) {
 				separator = '\'
 				backslash-escape = true
 			`,
-			err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' as CSV separator when `mydumper.csv.backslash-escape` is true",
+			err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' both as CSV separator and `mydumper.csv.escaped-by`",
 		},
 		{
 			input: `
 				[mydumper.csv]
 				delimiter = '\'
-				backslash-escape = true
+				escaped-by = '\'
 			`,
-			err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' as CSV delimiter when `mydumper.csv.backslash-escape` is true",
+			err: "[Lightning:Config:ErrInvalidConfig]cannot use '\\' both as CSV delimiter and `mydumper.csv.escaped-by`",
 		},
 		{
 			input: `
@@ -528,7 +528,7 @@ func TestInvalidCSV(t *testing.T) {
 		if tc.err != "" {
 			require.EqualError(t, err, tc.err, comment)
 		} else {
-			require.NoError(t, err)
+			require.NoError(t, err, tc.input)
 		}
 	}
 }
@@ -543,6 +543,25 @@ func TestInvalidTOML(t *testing.T) {
 	require.EqualError(t, err, "toml: line 2: expected '.' or '=', but got '[' instead")
 }
 
+func TestStringOrStringSlice(t *testing.T) {
+	cfg := &config.Config{}
+	err := cfg.LoadFromTOML([]byte(`
+		[mydumper.csv]
+		null = '\N'
+	`))
+	require.NoError(t, err)
+	err = cfg.LoadFromTOML([]byte(`
+		[mydumper.csv]
+		null = [ '\N', 'NULL' ]
+	`))
+	require.NoError(t, err)
+	err = cfg.LoadFromTOML([]byte(`
+		[mydumper.csv]
+		null = [ '\N', 123 ]
+	`))
+	require.ErrorContains(t, err, "invalid string slice")
+}
+
 func TestTOMLUnusedKeys(t *testing.T) {
 	cfg := &config.Config{}
 	err := cfg.LoadFromTOML([]byte(`
diff --git a/br/pkg/lightning/mydump/BUILD.bazel b/br/pkg/lightning/mydump/BUILD.bazel
index a4aa1626afc46..95d61b14465e8 100644
--- a/br/pkg/lightning/mydump/BUILD.bazel
+++ b/br/pkg/lightning/mydump/BUILD.bazel
@@ -36,6 +36,7 @@ go_library(
         "@com_github_xitongsys_parquet_go//parquet",
         "@com_github_xitongsys_parquet_go//reader",
         "@com_github_xitongsys_parquet_go//source",
+        "@org_golang_x_exp//slices",
         "@org_golang_x_text//encoding",
         "@org_golang_x_text//encoding/simplifiedchinese",
         "@org_uber_go_zap//:zap",
diff --git a/br/pkg/lightning/mydump/csv_parser.go b/br/pkg/lightning/mydump/csv_parser.go
index d9cd033d70861..2e48a9f0503d8 100644
--- a/br/pkg/lightning/mydump/csv_parser.go
+++ b/br/pkg/lightning/mydump/csv_parser.go
@@ -18,6 +18,7 @@ import (
 	"bytes"
 	"context"
 	"io"
+	"regexp"
 	"strings"
 
 	"github.com/pingcap/errors"
@@ -28,6 +29,7 @@ import (
 	tidbconfig "github.com/pingcap/tidb/config"
 	"github.com/pingcap/tidb/types"
 	"github.com/pingcap/tidb/util/mathutil"
+	"golang.org/x/exp/slices"
 )
 
 var (
@@ -47,10 +49,12 @@ type CSVParser struct {
 	blockParser
 	cfg *config.CSVConfig
 
-	comma      []byte
-	quote      []byte
-	newLine    []byte
-	startingBy []byte
+	comma          []byte
+	quote          []byte
+	newLine        []byte
+	startingBy     []byte
+	escapedBy      string
+	unescapeRegexp *regexp.Regexp
 
 	charsetConvertor *CharsetConvertor
 	// These variables are used with IndexAnyByte to search a byte slice for the
@@ -74,13 +78,20 @@ type CSVParser struct {
 
 	// fieldIndexes is an index of fields inside recordBuffer.
 	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
-	fieldIndexes []int
+	fieldIndexes  []int
+	fieldIsQuoted []bool
 
-	lastRecord []string
+	lastRecord []field
 
-	escFlavor backslashEscapeFlavor
+	escFlavor escapeFlavor
 	// if set to true, csv parser will treat the first non-empty line as header line
 	shouldParseHeader bool
+	quotedNullIsText  bool
+}
+
+type field struct {
+	content string
+	quoted  bool
 }
 
 // NewCSVParser creates a CSV parser.
@@ -127,14 +138,19 @@ func NewCSVParser(
 		}
 	}
 
-	escFlavor := backslashEscapeFlavorNone
-	if cfg.BackslashEscape {
-		escFlavor = backslashEscapeFlavorMySQL
-		quoteStopSet = append(quoteStopSet, '\\')
-		unquoteStopSet = append(unquoteStopSet, '\\')
+	escFlavor := escapeFlavorNone
+	var r *regexp.Regexp
+	if len(cfg.EscapedBy) > 0 {
+		escFlavor = escapeFlavorMySQL
+		quoteStopSet = append(quoteStopSet, cfg.EscapedBy[0])
+		unquoteStopSet = append(unquoteStopSet, cfg.EscapedBy[0])
 		// we need special treatment of the NULL value \N, used by MySQL.
-		if !cfg.NotNull && cfg.Null == `\N` {
-			escFlavor = backslashEscapeFlavorMySQLWithNull
+		if !cfg.NotNull && slices.Contains(cfg.Null, cfg.EscapedBy+`N`) {
+			escFlavor = escapeFlavorMySQLWithNull
+		}
+		r, err = regexp.Compile(`(?s)` + regexp.QuoteMeta(cfg.EscapedBy) + `.`)
+		if err != nil {
+			return nil, errors.Trace(err)
 		}
 	}
 	metrics, _ := metric.FromContext(ctx)
@@ -146,11 +162,14 @@ func NewCSVParser(
 		quote:             []byte(delimiter),
 		newLine:           []byte(terminator),
 		startingBy:        []byte(cfg.StartingBy),
+		escapedBy:         cfg.EscapedBy,
+		unescapeRegexp:    r,
 		escFlavor:         escFlavor,
 		quoteByteSet:      makeByteSet(quoteStopSet),
 		unquoteByteSet:    makeByteSet(unquoteStopSet),
 		newLineByteSet:    makeByteSet(newLineStopSet),
 		shouldParseHeader: shouldParseHeader,
+		quotedNullIsText:  cfg.QuotedNullIsText,
 	}, nil
 }
 
@@ -175,18 +194,30 @@ func encodeSpecialSymbols(cfg *config.CSVConfig, cc *CharsetConvertor) (separato
 	return
 }
 
-func (parser *CSVParser) unescapeString(input string) (unescaped string, isNull bool, err error) {
+func (parser *CSVParser) unescapeString(input field) (unescaped string, isNull bool, err error) {
 	// Convert the input from another charset to utf8mb4 before we return the string.
-	if input, err = parser.charsetConvertor.Decode(input); err != nil {
+	if unescaped, err = parser.charsetConvertor.Decode(input.content); err != nil {
 		return
 	}
-	if parser.escFlavor == backslashEscapeFlavorMySQLWithNull && input == `\N` {
-		return input, true, nil
+	if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` {
+		return input.content, true, nil
+	}
+	if len(parser.escapedBy) > 0 {
+		unescaped = unescape(unescaped, "", parser.escFlavor, parser.escapedBy[0], parser.unescapeRegexp)
+	}
+	if len(parser.quote) == 0 || !parser.quotedNullIsText {
+		isNull = parser.escFlavor != escapeFlavorMySQLWithNull &&
+			!parser.cfg.NotNull &&
+			slices.Contains(parser.cfg.Null, unescaped)
+	} else if !input.quoted {
+		// quoted string can never be NULL except for \N, which must be escapeFlavorMySQLWithNull
+		isNull = !parser.cfg.NotNull &&
+			slices.Contains(parser.cfg.Null, unescaped)
+		if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` {
+			// avoid \\N
+			isNull = false
+		}
 	}
-	unescaped = unescape(input, "", parser.escFlavor)
-	isNull = parser.escFlavor != backslashEscapeFlavorMySQLWithNull &&
-		!parser.cfg.NotNull &&
-		unescaped == parser.cfg.Null
 	return
 }
 
@@ -198,9 +229,9 @@ type csvToken int16
 const (
 	// csvTokenAnyUnquoted is a placeholder to represent any unquoted character.
 	csvTokenAnyUnquoted csvToken = 0
-	// csvTokenWithBackslash is a mask indicating an escaped character.
-	// The actual token is represented like `csvTokenWithBackslash | 'n'`.
-	csvTokenWithBackslash csvToken = 0x100
+	// csvTokenEscaped is a mask indicating an escaped character.
+	// The actual token is represented like `csvTokenEscaped | 'n'`.
+	csvTokenEscaped csvToken = 0x100
 	// csvTokenComma is the CSV separator token.
 	csvTokenComma csvToken = 0x200
 	// csvTokenNewLine is the CSV terminator token.
@@ -293,8 +324,11 @@ func (parser *CSVParser) tryReadComma(b byte) (bool, error) {
 	return parser.tryReadExact(parser.comma[1:])
 }
 
-func (parser *CSVParser) tryReadBackslashed(bs byte) (bool, byte, error) {
-	if bs != '\\' || parser.escFlavor == backslashEscapeFlavorNone {
+func (parser *CSVParser) tryReadEscaped(bs byte) (bool, byte, error) {
+	if parser.escapedBy == "" {
+		return false, 0, nil
+	}
+	if bs != parser.escapedBy[0] || parser.escFlavor == escapeFlavorNone {
 		return false, 0, nil
 	}
 	b, err := parser.readByte()
@@ -306,8 +340,8 @@ func (parser *CSVParser) readQuotedToken(b byte) (csvToken, error) {
 	if ok, err := parser.tryReadCloseDelimiter(b); ok || err != nil {
 		return csvTokenDelimiter, err
 	}
-	if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil {
-		return csvTokenWithBackslash | csvToken(eb), err
+	if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil {
+		return csvTokenEscaped | csvToken(eb), err
 	}
 	return csvToken(b), nil
 }
@@ -323,15 +357,15 @@ func (parser *CSVParser) readUnquoteToken(b byte) (csvToken, error) {
 	if ok, err := parser.tryReadOpenDelimiter(b); ok || err != nil {
 		return csvTokenDelimiter, err
 	}
-	if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil {
-		return csvTokenWithBackslash | csvToken(eb), err
+	if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil {
+		return csvTokenEscaped | csvToken(eb), err
 	}
 	return csvToken(b), nil
 }
 
 func (parser *CSVParser) appendCSVTokenToRecordBuffer(token csvToken) {
-	if token&csvTokenWithBackslash != 0 {
-		parser.recordBuffer = append(parser.recordBuffer, '\\')
+	if token&csvTokenEscaped != 0 {
+		parser.recordBuffer = append(parser.recordBuffer, parser.escapedBy[0])
 	}
 	parser.recordBuffer = append(parser.recordBuffer, byte(token))
 }
@@ -372,14 +406,16 @@ func (parser *CSVParser) readUntil(chars *byteSet) ([]byte, byte, error) {
 	}
 }
 
-func (parser *CSVParser) readRecord(dst []string) ([]string, error) {
+func (parser *CSVParser) readRecord(dst []field) ([]field, error) {
 	parser.recordBuffer = parser.recordBuffer[:0]
 	parser.fieldIndexes = parser.fieldIndexes[:0]
+	parser.fieldIsQuoted = parser.fieldIsQuoted[:0]
 
 	isEmptyLine := true
 	whitespaceLine := true
 	foundStartingByThisLine := false
 	prevToken := csvTokenNewLine
+	fieldIsQuoted := false
 	var firstToken csvToken
 
 outside:
@@ -445,6 +481,8 @@ outside:
 		case csvTokenComma:
 			whitespaceLine = false
 			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
+			parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted)
+			fieldIsQuoted = false
 		case csvTokenDelimiter:
 			if prevToken != csvTokenComma && prevToken != csvTokenNewLine {
 				parser.logSyntaxError()
@@ -453,6 +491,7 @@ outside:
 			if err = parser.readQuotedField(); err != nil {
 				return nil, err
 			}
+			fieldIsQuoted = true
 			whitespaceLine = false
 		case csvTokenNewLine:
 			foundStartingByThisLine = false
@@ -467,6 +506,8 @@ outside:
 				continue
 			}
 			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
+			parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted)
+			// the loop is end, no need to reset fieldIsQuoted
 			break outside
 		default:
 			if prevToken == csvTokenDelimiter {
@@ -483,12 +524,13 @@ outside:
 	str := string(parser.recordBuffer) // Convert to string once to batch allocations
 	dst = dst[:0]
 	if cap(dst) < len(parser.fieldIndexes) {
-		dst = make([]string, len(parser.fieldIndexes))
+		dst = make([]field, len(parser.fieldIndexes))
 	}
 	dst = dst[:len(parser.fieldIndexes)]
 	var preIdx int
 	for i, idx := range parser.fieldIndexes {
-		dst[i] = str[preIdx:idx]
+		dst[i].content = str[preIdx:idx]
+		dst[i].quoted = parser.fieldIsQuoted[i]
 		preIdx = idx
 	}
 
@@ -574,7 +616,7 @@ func (parser *CSVParser) ReadRow() error {
 	// remove the last empty value
 	if parser.cfg.TrimLastSep {
 		i := len(records) - 1
-		if i >= 0 && len(records[i]) == 0 {
+		if i >= 0 && len(records[i].content) == 0 {
 			records = records[:i]
 		}
 	}
@@ -586,7 +628,7 @@ func (parser *CSVParser) ReadRow() error {
 		row.Row = make([]types.Datum, len(records))
 	}
 	for i, record := range records {
-		row.Length += len(record)
+		row.Length += len(record.content)
 		unescaped, isNull, err := parser.unescapeString(record)
 		if err != nil {
 			return errors.Trace(err)
@@ -612,31 +654,37 @@ func (parser *CSVParser) ReadColumns() error {
 	}
 	parser.columns = make([]string, 0, len(columns))
 	for _, colName := range columns {
-		colName, _, err = parser.unescapeString(colName)
+		colNameStr, _, err := parser.unescapeString(colName)
 		if err != nil {
 			return errors.Trace(err)
 		}
-		parser.columns = append(parser.columns, strings.ToLower(colName))
+		parser.columns = append(parser.columns, strings.ToLower(colNameStr))
 	}
 	return nil
 }
 
 // ReadUntilTerminator seeks the file until the terminator token is found, and
 // returns
-// - the content before terminator
-// - the file offset beyond the terminator
+// - the content with terminator, or the content read before meet error
+// - the file offset beyond the terminator, or the offset when meet error
 // - error
 // Note that the terminator string pattern may be the content of a field, which
 // means it's inside quotes. Caller should make sure to handle this case.
 func (parser *CSVParser) ReadUntilTerminator() ([]byte, int64, error) {
+	var ret []byte
 	for {
 		content, firstByte, err := parser.readUntil(&parser.newLineByteSet)
+		ret = append(ret, content...)
 		if err != nil {
-			return content, 0, err
+			return ret, parser.pos, err
 		}
 		parser.skipBytes(1)
+		ret = append(ret, firstByte)
 		if ok, err := parser.tryReadNewLine(firstByte); ok || err != nil {
-			return content, parser.pos, err
+			if len(parser.newLine) >= 1 {
+				ret = append(ret, parser.newLine[1:]...)
+			}
+			return ret, parser.pos, err
 		}
 	}
 }
diff --git a/br/pkg/lightning/mydump/csv_parser_test.go b/br/pkg/lightning/mydump/csv_parser_test.go
index 8980ce221fe75..8d0a16ccfc290 100644
--- a/br/pkg/lightning/mydump/csv_parser_test.go
+++ b/br/pkg/lightning/mydump/csv_parser_test.go
@@ -413,11 +413,11 @@ zzz,yyy,xxx`), int64(config.ReadBlockSize), ioWorkers, false, nil)
 
 func TestMySQL(t *testing.T) {
 	cfg := config.CSVConfig{
-		Separator:       ",",
-		Delimiter:       `"`,
-		BackslashEscape: true,
-		NotNull:         false,
-		Null:            `\N`,
+		Separator: ",",
+		Delimiter: `"`,
+		EscapedBy: `\`,
+		NotNull:   false,
+		Null:      []string{`\N`},
 	}
 
 	parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(`"\"","\\","\?"
@@ -452,12 +452,53 @@ func TestMySQL(t *testing.T) {
 	require.ErrorIs(t, errors.Cause(parser.ReadRow()), io.EOF)
 }
 
+func TestCustomEscapeChar(t *testing.T) {
+	cfg := config.CSVConfig{
+		Separator: ",",
+		Delimiter: `"`,
+		EscapedBy: `!`,
+		NotNull:   false,
+		Null:      []string{`!N`},
+	}
+
+	parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(`"!"","!!","!\"
+"!
+",!N,!!N`), int64(config.ReadBlockSize), ioWorkers, false, nil)
+	require.NoError(t, err)
+
+	require.Nil(t, parser.ReadRow())
+	require.Equal(t, mydump.Row{
+		RowID: 1,
+		Row: []types.Datum{
+			types.NewStringDatum(`"`),
+			types.NewStringDatum(`!`),
+			types.NewStringDatum(`\`),
+		},
+		Length: 6,
+	}, parser.LastRow())
+	assertPosEqual(t, parser, 15, 1)
+
+	require.Nil(t, parser.ReadRow())
+	require.Equal(t, mydump.Row{
+		RowID: 2,
+		Row: []types.Datum{
+			types.NewStringDatum("\n"),
+			nullDatum,
+			types.NewStringDatum(`!N`),
+		},
+		Length: 7,
+	}, parser.LastRow())
+	assertPosEqual(t, parser, 26, 2)
+
+	require.ErrorIs(t, errors.Cause(parser.ReadRow()), io.EOF)
+}
+
 func TestSyntaxErrorCSV(t *testing.T) {
 	cfg := config.MydumperRuntime{
 		CSV: config.CSVConfig{
-			Separator:       ",",
-			Delimiter:       `"`,
-			BackslashEscape: true,
+			Separator: ",",
+			Delimiter: `"`,
+			EscapedBy: `\`,
 		},
 	}
 
@@ -475,7 +516,7 @@ func TestSyntaxErrorCSV(t *testing.T) {
 
 	runFailingTestCasesCSV(t, &cfg, int64(config.ReadBlockSize), inputs)
 
-	cfg.CSV.BackslashEscape = false
+	cfg.CSV.EscapedBy = ""
 	runFailingTestCasesCSV(t, &cfg, int64(config.ReadBlockSize), []string{`"\`})
 }
 
@@ -485,7 +526,7 @@ func TestTSV(t *testing.T) {
 		Delimiter:         "",
 		BackslashEscape:   false,
 		NotNull:           false,
-		Null:              "",
+		Null:              []string{""},
 		Header:            true,
 		HeaderSchemaMatch: true,
 	}
@@ -549,6 +590,7 @@ func TestCsvWithWhiteSpaceLine(t *testing.T) {
 	cfg := config.CSVConfig{
 		Separator: ",",
 		Delimiter: `"`,
+		Null:      []string{""},
 	}
 	data := " \r\n\r\n0,,abc\r\n \r\n123,1999-12-31,test\r\n"
 	parser, err := mydump.NewCSVParser(context.Background(), &cfg, mydump.NewStringReader(data), int64(config.ReadBlockSize), ioWorkers, false, nil)
@@ -777,10 +819,10 @@ func TestSpecialChars(t *testing.T) {
 func TestContinuationCSV(t *testing.T) {
 	cfg := config.MydumperRuntime{
 		CSV: config.CSVConfig{
-			Separator:       ",",
-			Delimiter:       `"`,
-			BackslashEscape: true,
-			TrimLastSep:     true,
+			Separator:   ",",
+			Delimiter:   `"`,
+			EscapedBy:   `\`,
+			TrimLastSep: true,
 		},
 	}
 
@@ -814,6 +856,7 @@ func TestBackslashAsSep(t *testing.T) {
 		CSV: config.CSVConfig{
 			Separator: `\`,
 			Delimiter: `"`,
+			Null:      []string{""},
 		},
 	}
 
@@ -841,6 +884,7 @@ func TestBackslashAsDelim(t *testing.T) {
 		CSV: config.CSVConfig{
 			Separator: ",",
 			Delimiter: `\`,
+			Null:      []string{""},
 		},
 	}
 
@@ -856,6 +900,23 @@ func TestBackslashAsDelim(t *testing.T) {
 		`"\`,
 	}
 	runFailingTestCasesCSV(t, &cfg, 1, failingInputs)
+
+	cfg = config.MydumperRuntime{
+		CSV: config.CSVConfig{
+			Separator:        ",",
+			Delimiter:        `\`,
+			Null:             []string{""},
+			QuotedNullIsText: true,
+		},
+	}
+
+	testCases = []testCase{
+		{
+			input:    `\\`,
+			expected: [][]types.Datum{{types.NewStringDatum("")}},
+		},
+	}
+	runTestCasesCSV(t, &cfg, 1, testCases)
 }
 
 // errorReader implements the Reader interface which always returns an error.
@@ -967,6 +1028,134 @@ func TestTerminator(t *testing.T) {
 	runTestCasesCSV(t, &cfg, 1, testCases)
 }
 
+func TestReadUntilTerminator(t *testing.T) {
+	cfg := config.MydumperRuntime{
+		CSV: config.CSVConfig{
+			Separator:  "#",
+			Terminator: "#\n",
+		},
+	}
+	parser, err := mydump.NewCSVParser(
+		context.Background(),
+		&cfg.CSV,
+		mydump.NewStringReader("xxx1#2#3#4#\n56#78"),
+		int64(config.ReadBlockSize),
+		ioWorkers,
+		false,
+		nil,
+	)
+	require.NoError(t, err)
+	content, idx, err := parser.ReadUntilTerminator()
+	require.NoError(t, err)
+	require.Equal(t, "xxx1#2#3#4#\n", string(content))
+	require.Equal(t, int64(12), idx)
+	content, idx, err = parser.ReadUntilTerminator()
+	require.ErrorIs(t, err, io.EOF)
+	require.Equal(t, "56#78", string(content))
+	require.Equal(t, int64(17), idx)
+}
+
+func TestNULL(t *testing.T) {
+	// https://dev.mysql.com/doc/refman/8.0/en/load-data.html
+	// - For the default FIELDS and LINES values, NULL is written as a field value of \N for output, and a field value of \N is read as NULL for input (assuming that the ESCAPED BY character is \).
+	// - If FIELDS ENCLOSED BY is not empty, a field containing the literal word NULL as its value is read as a NULL value. This differs from the word NULL enclosed within FIELDS ENCLOSED BY characters, which is read as the string 'NULL'.
+	// - If FIELDS ESCAPED BY is empty, NULL is written as the word NULL.
+
+	cfg := config.MydumperRuntime{
+		CSV: config.CSVConfig{
+			Separator:        ",",
+			Delimiter:        `"`,
+			Terminator:       "\n",
+			Null:             []string{`\N`, `NULL`},
+			EscapedBy:        `\`,
+			QuotedNullIsText: true,
+		},
+	}
+	testCases := []testCase{
+		{
+			input: `NULL,"NULL"
+\N,"\N"
+\\N,"\\N"`,
+			expected: [][]types.Datum{
+				{nullDatum, types.NewStringDatum("NULL")},
+				{nullDatum, nullDatum},
+				{types.NewStringDatum(`\N`), types.NewStringDatum(`\N`)},
+			},
+		},
+	}
+	runTestCasesCSV(t, &cfg, 1, testCases)
+
+	cfg = config.MydumperRuntime{
+		CSV: config.CSVConfig{
+			Separator:  ",",
+			Delimiter:  ``,
+			Terminator: "\n",
+			Null:       []string{`\N`},
+			EscapedBy:  `\`,
+		},
+	}
+	testCases = []testCase{
+		{
+			input: `NULL,"NULL"
+\N,"\N"
+\\N,"\\N"`,
+			expected: [][]types.Datum{
+				{types.NewStringDatum("NULL"), types.NewStringDatum(`"NULL"`)},
+				{nullDatum, types.NewStringDatum(`"N"`)},
+				{types.NewStringDatum(`\N`), types.NewStringDatum(`"\N"`)},
+			},
+		},
+	}
+	runTestCasesCSV(t, &cfg, 1, testCases)
+
+	cfg = config.MydumperRuntime{
+		CSV: config.CSVConfig{
+			Separator:  ",",
+			Delimiter:  ``,
+			Terminator: "\n",
+			Null:       []string{`\N`},
+			EscapedBy:  `\`,
+		},
+	}
+	testCases = []testCase{
+		{
+			input: `NULL,"NULL"
+\N,"\N"
+\\N,"\\N"`,
+			expected: [][]types.Datum{
+				{types.NewStringDatum("NULL"), types.NewStringDatum(`"NULL"`)},
+				{nullDatum, types.NewStringDatum(`"N"`)},
+				{types.NewStringDatum(`\N`), types.NewStringDatum(`"\N"`)},
+			},
+		},
+	}
+	runTestCasesCSV(t, &cfg, 1, testCases)
+
+	cfg = config.MydumperRuntime{
+		CSV: config.CSVConfig{
+			Separator:        ",",
+			Delimiter:        `"`,
+			Terminator:       "\n",
+			Null:             []string{`NULL`},
+			EscapedBy:        ``,
+			QuotedNullIsText: true,
+		},
+	}
+	testCases = []testCase{
+		{
+			input: `NULL,"NULL"
+\N,"\N"
+\\N,"\\N"`,
+			expected: [][]types.Datum{
+				{nullDatum, types.NewStringDatum(`NULL`)},
+				{types.NewStringDatum(`\N`), types.NewStringDatum(`\N`)},
+				{types.NewStringDatum(`\\N`), types.NewStringDatum(`\\N`)},
+			},
+		},
+	}
+	runTestCasesCSV(t, &cfg, 1, testCases)
+}
+
 func TestStartingBy(t *testing.T) {
 	cfg := config.MydumperRuntime{
 		CSV: config.CSVConfig{
diff --git a/br/pkg/lightning/mydump/parser.go b/br/pkg/lightning/mydump/parser.go
index 0ac82ce189d71..c99330a96211a 100644
--- a/br/pkg/lightning/mydump/parser.go
+++ b/br/pkg/lightning/mydump/parser.go
@@ -88,7 +88,7 @@ func makeBlockParser(
 type ChunkParser struct {
 	blockParser
 
-	escFlavor backslashEscapeFlavor
+	escFlavor escapeFlavor
 }
 
 // Chunk represents a portion of the data file.
@@ -116,12 +116,12 @@ func (row Row) MarshalLogArray(encoder zapcore.ArrayEncoder) error {
 	return nil
 }
 
-type backslashEscapeFlavor uint8
+type escapeFlavor uint8
 
 const (
-	backslashEscapeFlavorNone backslashEscapeFlavor = iota
-	backslashEscapeFlavorMySQL
-	backslashEscapeFlavorMySQLWithNull
+	escapeFlavorNone escapeFlavor = iota
+	escapeFlavorMySQL
+	escapeFlavorMySQLWithNull
 )
 
 // Parser provides some methods to parse a source data file.
@@ -153,9 +153,9 @@ func NewChunkParser(
 	blockBufSize int64,
 	ioWorkers *worker.Pool,
 ) *ChunkParser {
-	escFlavor := backslashEscapeFlavorMySQL
+	escFlavor := escapeFlavorMySQL
 	if sqlMode.HasNoBackslashEscapesMode() {
-		escFlavor = backslashEscapeFlavorNone
+		escFlavor = escapeFlavorNone
 	}
 	metrics, _ := metric.FromContext(ctx)
 	return &ChunkParser{
@@ -303,12 +303,14 @@ func (parser *blockParser) readBlock() error {
 	}
 }
 
-var unescapeRegexp = regexp.MustCompile(`(?s)\\.`)
+var chunkParserUnescapeRegexp = regexp.MustCompile(`(?s)\\.`)
 
 func unescape(
 	input string,
 	delim string,
-	escFlavor backslashEscapeFlavor,
+	escFlavor escapeFlavor,
+	escChar byte,
+	unescapeRegexp *regexp.Regexp,
 ) string {
 	if len(delim) > 0 {
 		delim2 := delim + delim
@@ -316,7 +318,7 @@ func unescape(
 			input = strings.ReplaceAll(input, delim2, delim)
 		}
 	}
-	if escFlavor != backslashEscapeFlavorNone && strings.IndexByte(input, '\\') != -1 {
+	if escFlavor != escapeFlavorNone && strings.IndexByte(input, escChar) != -1 {
 		input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string {
 			switch substr[1] {
 			case '0':
@@ -343,9 +345,9 @@ func (parser *ChunkParser) unescapeString(input string) string {
 	if len(input) >= 2 {
 		switch input[0] {
 		case '\'', '"':
-			return unescape(input[1:len(input)-1], input[:1], parser.escFlavor)
+			return unescape(input[1:len(input)-1], input[:1], parser.escFlavor, '\\', chunkParserUnescapeRegexp)
 		case '`':
-			return unescape(input[1:len(input)-1], "`", backslashEscapeFlavorNone)
+			return unescape(input[1:len(input)-1], "`", escapeFlavorNone, '\\', chunkParserUnescapeRegexp)
 		}
 	}
 	return input
diff --git a/br/pkg/lightning/mydump/parser.rl b/br/pkg/lightning/mydump/parser.rl
index 25627dbf2e865..edd2643a73622 100644
--- a/br/pkg/lightning/mydump/parser.rl
+++ b/br/pkg/lightning/mydump/parser.rl
@@ -54,7 +54,7 @@ comment =
 	'using utf8mb4)'i;
 
 # The patterns parse quoted strings.
-bs = '\\' when { parser.escFlavor != backslashEscapeFlavorNone };
+bs = '\\' when { parser.escFlavor != escapeFlavorNone };
 
 single_quoted = "'" (^"'" | bs any | "''")** "'";
 double_quoted = '"' (^'"' | bs any | '""')** '"';
diff --git a/br/pkg/lightning/mydump/parser_generated.go b/br/pkg/lightning/mydump/parser_generated.go
index c803c0c4c2e40..afb0602d822aa 100644
--- a/br/pkg/lightning/mydump/parser_generated.go
+++ b/br/pkg/lightning/mydump/parser_generated.go
@@ -610,7 +610,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) {
 			_widec = int16(data[p])
 			if 92 <= data[p] && data[p] <= 92 {
 				_widec = 256 + (int16(data[p]) - 0)
-				if parser.escFlavor != backslashEscapeFlavorNone {
+				if parser.escFlavor != escapeFlavorNone {
 					_widec += 256
 				}
 			}
@@ -639,7 +639,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) {
 			_widec = int16(data[p])
 			if 92 <= data[p] && data[p] <= 92 {
 				_widec = 256 + (int16(data[p]) - 0)
-				if parser.escFlavor != backslashEscapeFlavorNone {
+				if parser.escFlavor != escapeFlavorNone {
 					_widec += 256
 				}
 			}
@@ -685,7 +685,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) {
 			_widec = int16(data[p])
 			if 92 <= data[p] && data[p] <= 92 {
 				_widec = 256 + (int16(data[p]) - 0)
-				if parser.escFlavor != backslashEscapeFlavorNone {
+				if parser.escFlavor != escapeFlavorNone {
 					_widec += 256
 				}
 			}
@@ -716,7 +716,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) {
 			_widec = int16(data[p])
 			if 92 <= data[p] && data[p] <= 92 {
 				_widec = 256 + (int16(data[p]) - 0)
-				if parser.escFlavor != backslashEscapeFlavorNone {
+				if parser.escFlavor != escapeFlavorNone {
 					_widec += 256
 				}
 			}
@@ -745,7 +745,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) {
 			_widec = int16(data[p])
 			if 92 <= data[p] && data[p] <= 92 {
 				_widec = 256 + (int16(data[p]) - 0)
-				if parser.escFlavor != backslashEscapeFlavorNone {
+				if parser.escFlavor != escapeFlavorNone {
 					_widec += 256
 				}
 			}
@@ -791,7 +791,7 @@ func (parser *ChunkParser) lex() (token, []byte, error) {
 			_widec = int16(data[p])
 			if 92 <= data[p] && data[p] <= 92 {
 				_widec = 256 + (int16(data[p]) - 0)
-				if parser.escFlavor != backslashEscapeFlavorNone {
+				if parser.escFlavor != escapeFlavorNone {
 					_widec += 256
 				}
 			}
diff --git a/br/pkg/lightning/mydump/region_test.go b/br/pkg/lightning/mydump/region_test.go
index 362ff8603c7f9..ad23590f4c057 100644
--- a/br/pkg/lightning/mydump/region_test.go
+++ b/br/pkg/lightning/mydump/region_test.go
@@ -180,8 +180,8 @@ func TestMakeSourceFileRegion(t *testing.T) {
 				HeaderSchemaMatch: true,
 				TrimLastSep:       false,
 				NotNull:           false,
-				Null:              "NULL",
-				BackslashEscape:   true,
+				Null:              []string{"NULL"},
+				EscapedBy:         `\`,
 			},
 			StrictFormat: true,
 			Filter:       []string{"*.*"},
@@ -237,8 +237,8 @@ func TestCompressedMakeSourceFileRegion(t *testing.T) {
 				HeaderSchemaMatch: true,
 				TrimLastSep:       false,
 				NotNull:           false,
-				Null:              "NULL",
-				BackslashEscape:   true,
+				Null:              []string{"NULL"},
+				EscapedBy:         `\`,
 			},
 			StrictFormat: true,
 			Filter:       []string{"*.*"},
@@ -292,8 +292,8 @@ func TestSplitLargeFile(t *testing.T) {
 				HeaderSchemaMatch: true,
 				TrimLastSep:       false,
 				NotNull:           false,
-				Null:              "NULL",
-				BackslashEscape:   true,
+				Null:              []string{"NULL"},
+				EscapedBy:         `\`,
 			},
 			StrictFormat: true,
 			Filter:       []string{"*.*"},
@@ -351,8 +351,8 @@ func TestSplitLargeFileNoNewLineAtEOF(t *testing.T) {
 				HeaderSchemaMatch: true,
 				TrimLastSep:       false,
 				NotNull:           false,
-				Null:              "NULL",
-				BackslashEscape:   true,
+				Null:              []string{"NULL"},
+				EscapedBy:         `\`,
 			},
 			StrictFormat:  true,
 			Filter:        []string{"*.*"},
@@ -457,8 +457,8 @@ func TestSplitLargeFileOnlyOneChunk(t *testing.T) {
 				HeaderSchemaMatch: true,
 				TrimLastSep:       false,
 				NotNull:           false,
-				Null:              "NULL",
-				BackslashEscape:   true,
+				Null:              []string{"NULL"},
+				EscapedBy:         `\`,
 			},
 			StrictFormat:  true,
 			Filter:        []string{"*.*"},
diff --git a/br/pkg/lightning/restore/check_info_test.go b/br/pkg/lightning/restore/check_info_test.go
index 36903ab93b22c..303a9ef929624 100644
--- a/br/pkg/lightning/restore/check_info_test.go
+++ b/br/pkg/lightning/restore/check_info_test.go
@@ -326,13 +326,13 @@ func TestCheckCSVHeader(t *testing.T) {
 		Mydumper: config.MydumperRuntime{
 			ReadBlockSize: config.ReadBlockSize,
 			CSV: config.CSVConfig{
-				Separator:       ",",
-				Delimiter:       `"`,
-				Header:          false,
-				NotNull:         false,
-				Null:            `\N`,
-				BackslashEscape: true,
-				TrimLastSep:     false,
+				Separator:   ",",
+				Delimiter:   `"`,
+				Header:      false,
+				NotNull:     false,
+				Null:        []string{`\N`},
+				EscapedBy:   `\`,
+				TrimLastSep: false,
 			},
 		},
 	}
diff --git a/br/pkg/lightning/restore/table_restore_test.go b/br/pkg/lightning/restore/table_restore_test.go
index 0f6d87892e329..0f7e488415c40 100644
--- a/br/pkg/lightning/restore/table_restore_test.go
+++ b/br/pkg/lightning/restore/table_restore_test.go
@@ -2142,8 +2142,8 @@ func (s *tableRestoreSuite) TestSchemaIsValid() {
 					Header:            ca.hasHeader,
 					HeaderSchemaMatch: true,
 					NotNull:           false,
-					Null:              `\N`,
-					BackslashEscape:   true,
+					Null:              []string{`\N`},
+					EscapedBy:         `\`,
 					TrimLastSep:       false,
 				},
 				IgnoreColumns: ca.ignoreColumns,
@@ -2178,8 +2178,8 @@ func (s *tableRestoreSuite) TestGBKEncodedSchemaIsValid() {
 				Header:            true,
 				HeaderSchemaMatch: true,
 				NotNull:           false,
-				Null:              `\N`,
-				BackslashEscape:   true,
+				Null:              []string{`\N`},
+				EscapedBy:         `\`,
 				TrimLastSep:       false,
 			},
 			IgnoreColumns: nil,