Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

loaddata, lightning: support ascii, latin1, utf8 charset of source file #42699

Merged
merged 2 commits into from
Apr 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion br/pkg/lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,8 @@ const (
UTF8MB4
GB18030
GBK
Latin1
ASCII
)

// String returns the string value of the charset.
Expand All @@ -779,6 +781,10 @@ func (c Charset) String() string {
return "gb18030"
case GBK:
return "gbk"
case Latin1:
return "latin1"
case ASCII:
return "ascii"
default:
return "unknown_charset"
}
Expand All @@ -789,12 +795,16 @@ func ParseCharset(dataCharacterSet string) (Charset, error) {
switch strings.ToLower(dataCharacterSet) {
case "", "binary":
return Binary, nil
case "utf8mb4":
case "utf8", "utf8mb4":
return UTF8MB4, nil
case "gb18030":
return GB18030, nil
case "gbk":
return GBK, nil
case "latin1":
return Latin1, nil
case "ascii":
return ASCII, nil
default:
return Binary, errors.Errorf("found unsupported data-character-set: %s", dataCharacterSet)
}
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/mydump/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ go_library(
"@com_github_xitongsys_parquet_go//source",
"@org_golang_x_exp//slices",
"@org_golang_x_text//encoding",
"@org_golang_x_text//encoding/charmap",
"@org_golang_x_text//encoding/simplifiedchinese",
"@org_uber_go_zap//:zap",
"@org_uber_go_zap//zapcore",
Expand Down
19 changes: 16 additions & 3 deletions br/pkg/lightning/mydump/charset_convertor.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/pingcap/errors"
"github.com/pingcap/tidb/br/pkg/lightning/config"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/simplifiedchinese"
)

Expand Down Expand Up @@ -60,28 +61,38 @@ func NewCharsetConvertor(dataCharacterSet, dataInvalidCharReplace string) (*Char

// initDecoder installs the decoder that matches cc.sourceCharacterSet.
//
// For Binary, UTF8MB4 and ASCII sources no decoder is installed and nil is
// returned right away. For GB18030, GBK and Latin1 the corresponding decoder
// is stored in cc.decoder. Any other charset yields an error.
func (cc *CharsetConvertor) initDecoder() error {
	switch cc.sourceCharacterSet {
	// A single case lists every charset that needs no decoder; listing the
	// same constant in two cases is rejected by the compiler as a duplicate.
	case config.Binary, config.UTF8MB4, config.ASCII:
		return nil
	case config.GB18030:
		cc.decoder = simplifiedchinese.GB18030.NewDecoder()
		return nil
	case config.GBK:
		cc.decoder = simplifiedchinese.GBK.NewDecoder()
		return nil
	case config.Latin1:
		// use Windows1252 (not ISO 8859-1) to decode Latin1
		// https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
		cc.decoder = charmap.Windows1252.NewDecoder()
		return nil
	}
	return errors.Errorf("not support %s as the conversion source yet", cc.sourceCharacterSet)
}

// initEncoder installs the encoder that matches cc.sourceCharacterSet.
//
// For Binary, UTF8MB4 and ASCII sources no encoder is installed and nil is
// returned right away. For GB18030, GBK and Latin1 the corresponding encoder
// is stored in cc.encoder. Any other charset yields an error.
func (cc *CharsetConvertor) initEncoder() error {
	switch cc.sourceCharacterSet {
	// A single case lists every charset that needs no encoder; listing the
	// same constant in two cases is rejected by the compiler as a duplicate.
	case config.Binary, config.UTF8MB4, config.ASCII:
		return nil
	case config.GB18030:
		cc.encoder = simplifiedchinese.GB18030.NewEncoder()
		return nil
	case config.GBK:
		cc.encoder = simplifiedchinese.GBK.NewEncoder()
		return nil
	case config.Latin1:
		// use Windows1252 (not ISO 8859-1) to encode Latin1
		// https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
		cc.encoder = charmap.Windows1252.NewEncoder()
		return nil
	}
	return errors.Errorf("not support %s as the conversion source yet", cc.sourceCharacterSet)
}
Expand All @@ -105,7 +116,9 @@ func (cc *CharsetConvertor) Decode(src string) (string, error) {
func (cc *CharsetConvertor) precheck(src string) bool {
// No need to convert the charset encoding, just return the original data.
if len(src) == 0 || cc == nil ||
cc.sourceCharacterSet == config.Binary || cc.sourceCharacterSet == config.UTF8MB4 ||
cc.sourceCharacterSet == config.Binary ||
cc.sourceCharacterSet == config.UTF8MB4 ||
cc.sourceCharacterSet == config.ASCII ||
cc.decoder == nil || cc.encoder == nil {
return false
}
Expand Down
132 changes: 131 additions & 1 deletion executor/loadremotetest/one_csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,138 @@ func (s *mockGCSSuite) TestGBK() {
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))

s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "utf8mb4.tsv",
},
Content: []byte("1\t一丁丂七丄丅丆万丈三上下丌不与丏\n" +
"2\t丐丑丒专且丕世丗丘丙业丛东丝丞丢"),
})

s.tk.MustExec("TRUNCATE TABLE load_charset.gbk;")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8mb4.tsv?endpoint=%s'
INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.gbk;").Check(testkit.Rows(
"1 一丁丂七丄丅丆万丈三上下丌不与丏",
"2 丐丑丒专且丕世丗丘丙业丛东丝丞丢",
))

s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "emoji.tsv",
},
Content: []byte("1\t一丁丂七😀😁😂😃\n" +
"2\t丐丑丒专😄😅😆😇"),
})

s.tk.MustExec("TRUNCATE TABLE load_charset.gbk;")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/emoji.tsv?endpoint=%s'
INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint)
err := s.tk.ExecToErr(sql)
checkClientErrorMessage(s.T(), err, `ERROR 1366 (HY000): Incorrect string value '\xF0\x9F\x98\x80' for column 'j'`)

sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/emoji.tsv?endpoint=%s'
IGNORE INTO TABLE load_charset.gbk CHARACTER SET utf8mb4`, gcsEndpoint)
s.tk.MustExec(sql)
require.Equal(s.T(), "Records: 2 Deleted: 0 Skipped: 0 Warnings: 2", s.tk.Session().GetSessionVars().StmtCtx.GetMessage())
s.tk.MustQuery("SELECT HEX(j) FROM load_charset.gbk;").Check(testkit.Rows(
"D2BBB6A18140C6DF3F3F3F3F",
"D8A4B3F38145D7A83F3F3F3F",
))

sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/gbk.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET unknown`, gcsEndpoint)
err := s.tk.ExecToErr(sql)
err = s.tk.ExecToErr(sql)
require.ErrorContains(s.T(), err, "Unknown character set: 'unknown'")
}

// TestOtherCharset checks LOAD DATA ... CHARACTER SET for the utf8, latin1,
// ascii and binary source charsets. Each source file is uploaded to the fake
// storage server in bucket "test-load", loaded through a gs:// URL, and the
// resulting table content is compared with the expected rows.
func (s *mockGCSSuite) TestOtherCharset() {
// Start from a fresh load_charset database with utf8 and utf8mb4 target tables.
s.tk.MustExec("DROP DATABASE IF EXISTS load_charset;")
s.tk.MustExec("CREATE DATABASE load_charset;")
s.tk.MustExec(`CREATE TABLE load_charset.utf8 (
i INT, j VARCHAR(255)
) CHARACTER SET utf8;`)
s.tk.MustExec(`CREATE TABLE load_charset.utf8mb4 (
i INT, j VARCHAR(255)
) CHARACTER SET utf8mb4;`)

// utf8 source: two tab-separated rows containing non-ASCII text.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "utf8.tsv",
},
Content: []byte("1\tကခဂဃ\n2\tငစဆဇ"),
})

// Load the utf8 file into the utf8 table, then into the utf8mb4 table; both
// must hold the text unchanged.
sql := fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8.tsv?endpoint=%s'
INTO TABLE load_charset.utf8 CHARACTER SET utf8`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8;").Check(testkit.Rows(
"1 ကခဂဃ",
"2 ငစဆဇ",
))
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/utf8.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET utf8`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows(
"1 ကခဂဃ",
"2 ငစဆဇ",
))

// latin1 source given as raw bytes. 0x91-0x94 map to the curly quotes in
// Windows-1252 (they are control codes in ISO 8859-1), so the expected rows
// below only match when latin1 is decoded as Windows-1252.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "latin1.tsv",
},
// "1\t‘’“”\n2\t¡¢£¤"
Content: []byte{0x31, 0x09, 0x91, 0x92, 0x93, 0x94, 0x0a, 0x32, 0x09, 0xa1, 0xa2, 0xa3, 0xa4},
})
s.tk.MustExec(`CREATE TABLE load_charset.latin1 (
i INT, j VARCHAR(255)
) CHARACTER SET latin1;`)
// latin1 file into a latin1 table.
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/latin1.tsv?endpoint=%s'
INTO TABLE load_charset.latin1 CHARACTER SET latin1`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.latin1;").Check(testkit.Rows(
"1 ‘’“”",
"2 ¡¢£¤",
))

// latin1 file into the utf8mb4 table; the table is truncated first because it
// still holds the rows from the utf8 load above.
s.tk.MustExec("TRUNCATE TABLE load_charset.utf8mb4;")
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/latin1.tsv?endpoint=%s'
INTO TABLE load_charset.utf8mb4 CHARACTER SET latin1`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT * FROM load_charset.utf8mb4;").Check(testkit.Rows(
"1 ‘’“”",
"2 ¡¢£¤",
))

// ascii/binary source: the eight bytes 0x00-0x07, with no tab or newline, so
// the file is expected to load as a single one-column row.
s.server.CreateObject(fakestorage.Object{
ObjectAttrs: fakestorage.ObjectAttrs{
BucketName: "test-load",
Name: "ascii.tsv",
},
Content: []byte{0, 1, 2, 3, 4, 5, 6, 7},
})
s.tk.MustExec(`CREATE TABLE load_charset.ascii (
j VARCHAR(255)
) CHARACTER SET ascii;`)
s.tk.MustExec(`CREATE TABLE load_charset.binary (
j VARCHAR(255)
) CHARACTER SET binary;`)
// Same file loaded as ascii and as binary; HEX(j) must show the bytes untouched.
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/ascii.tsv?endpoint=%s'
INTO TABLE load_charset.ascii CHARACTER SET ascii`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT HEX(j) FROM load_charset.ascii;").Check(testkit.Rows(
"0001020304050607",
))
sql = fmt.Sprintf(`LOAD DATA INFILE 'gs://test-load/ascii.tsv?endpoint=%s'
INTO TABLE load_charset.binary CHARACTER SET binary`, gcsEndpoint)
s.tk.MustExec(sql)
s.tk.MustQuery("SELECT HEX(j) FROM load_charset.binary;").Check(testkit.Rows(
"0001020304050607",
))
}