Skip to content

Commit

Permalink
charset: implement utf8_unicode_ci and utf8mb4_unicode_ci collation (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
ti-srebot authored Jan 28, 2021
1 parent 80d420d commit 1f5b303
Show file tree
Hide file tree
Showing 14 changed files with 1,017 additions and 62 deletions.
29 changes: 23 additions & 6 deletions ddl/db_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
"github.com/pingcap/tidb/tablecodec"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/admin"
"github.com/pingcap/tidb/util/collate"
"github.com/pingcap/tidb/util/domainutil"
"github.com/pingcap/tidb/util/israce"
"github.com/pingcap/tidb/util/mock"
Expand Down Expand Up @@ -2205,19 +2206,35 @@ func (s *testDBSuite1) TestCreateTable(c *C) {
s.tk.MustExec("use test")
failSQL := "create table t_enum (a enum('e','e'));"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('e','E'));"
}

func (s *testSerialDBSuite) TestCreateTableWithCollation(c *C) {
collate.SetNewCollationEnabledForTest(true)
defer collate.SetNewCollationEnabledForTest(false)
s.tk.MustExec("use test")
failSQL := "create table t_enum (a enum('e','E')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('abc','Abc')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('abc','Abc'));"
failSQL = "create table t_enum (a enum('e','E')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
// test for set column
failSQL = "create table t_enum (a set('e','e'));"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a set('e','E'));"
failSQL = "create table t_enum (a set('e','E')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a set('abc','Abc')) charset=utf8 collate=utf8_general_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
_, err := s.tk.Exec("create table t_enum (a enum('B','b')) charset=utf8 collate=utf8_general_ci;")
c.Assert(err.Error(), Equals, "[types:1291]Column 'a' has duplicated value 'b' in ENUM")
failSQL = "create table t_enum (a set('e','E')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
failSQL = "create table t_enum (a set('abc','Abc'));"
failSQL = "create table t_enum (a set('ss','ß')) charset=utf8 collate=utf8_unicode_ci;"
s.tk.MustGetErrCode(failSQL, errno.ErrDuplicatedValueInType)
_, err = s.tk.Exec("create table t_enum (a enum('B','b'));")
c.Assert(err.Error(), Equals, "[types:1291]Column 'a' has duplicated value 'B' in ENUM")
_, err = s.tk.Exec("create table t_enum (a enum('ss','ß')) charset=utf8 collate=utf8_unicode_ci;")
c.Assert(err.Error(), Equals, "[types:1291]Column 'a' has duplicated value 'ß' in ENUM")
}

func (s *testDBSuite5) TestRepairTable(c *C) {
Expand Down
11 changes: 11 additions & 0 deletions executor/collation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,17 @@ func (s *testCollationSuite) TestVecGroupChecker(c *C) {
}
c.Assert(groupChecker.isExhausted(), IsTrue)

tp.Collate = "utf8_unicode_ci"
groupChecker.reset()
_, err = groupChecker.splitIntoGroups(chk)
c.Assert(err, IsNil)
for i := 0; i < 3; i++ {
b, e := groupChecker.getNextGroup()
c.Assert(b, Equals, i*2)
c.Assert(e, Equals, i*2+2)
}
c.Assert(groupChecker.isExhausted(), IsTrue)

// test padding
tp.Collate = "utf8_bin"
tp.Flen = 6
Expand Down
63 changes: 43 additions & 20 deletions expression/builtin_like_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,26 +59,37 @@ func (s *testEvaluatorSerialSuites) TestCILike(c *C) {
collate.SetNewCollationEnabledForTest(true)
defer collate.SetNewCollationEnabledForTest(false)
tests := []struct {
input string
pattern string
match int
input string
pattern string
generalMatch int
unicodeMatch int
}{
{"a", "", 0},
{"a", "a", 1},
{"a", "á", 1},
{"a", "b", 0},
{"aA", "Aa", 1},
{"áAb", `Aa%`, 1},
{"áAb", `%ab%`, 1},
{"áAb", `%ab`, 1},
{"ÀAb", "aA_", 1},
{"áééá", "a_%a", 1},
{"áééá", "a%_a", 1},
{"áéá", "a_%a", 1},
{"áéá", "a%_a", 1},
{"áá", "a_%a", 0},
{"áá", "a%_a", 0},
{"áééáííí", "a_%a%", 1},
{"a", "", 0, 0},
{"a", "a", 1, 1},
{"a", "á", 1, 1},
{"a", "b", 0, 0},
{"aA", "Aa", 1, 1},
{"áAb", `Aa%`, 1, 1},
{"áAb", `%ab%`, 1, 1},
{"áAb", `%ab`, 1, 1},
{"ÀAb", "aA_", 1, 1},
{"áééá", "a_%a", 1, 1},
{"áééá", "a%_a", 1, 1},
{"áéá", "a_%a", 1, 1},
{"áéá", "a%_a", 1, 1},
{"áá", "a_%a", 0, 0},
{"áá", "a%_a", 0, 0},
{"áééáííí", "a_%a%", 1, 1},

// performs matching on a per-character basis
// https://dev.mysql.com/doc/refman/5.7/en/string-comparison-functions.html#operator_like
{"ß", "s%", 1, 0},
{"ß", "%s", 1, 0},
{"ß", "ss", 0, 0},
{"ß", "s", 1, 0},
{"ss", "%ß%", 1, 0},
{"ß", "_", 1, 1},
{"ß", "__", 0, 0},
}
for _, tt := range tests {
commentf := Commentf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
Expand All @@ -88,8 +99,20 @@ func (s *testEvaluatorSerialSuites) TestCILike(c *C) {
c.Assert(err, IsNil, commentf)
f.setCollator(collate.GetCollator("utf8mb4_general_ci"))
r, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil)
c.Assert(r, testutil.DatumEquals, types.NewDatum(tt.generalMatch))
}

for _, tt := range tests {
commentf := Commentf(`for input = "%s", pattern = "%s"`, tt.input, tt.pattern)
fc := funcs[ast.Like]
inputs := s.datumsToConstants(types.MakeDatums(tt.input, tt.pattern, 0))
f, err := fc.getFunction(s.ctx, inputs)
c.Assert(err, IsNil, commentf)
f.setCollator(collate.GetCollator("utf8mb4_unicode_ci"))
r, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil, commentf)
c.Assert(r, testutil.DatumEquals, types.NewDatum(tt.match), commentf)
c.Assert(r, testutil.DatumEquals, types.NewDatum(tt.unicodeMatch), commentf)
}
}

Expand Down
72 changes: 48 additions & 24 deletions expression/builtin_string_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2375,13 +2375,40 @@ func (s *testEvaluatorSerialSuites) TestCIWeightString(c *C) {
collate.SetNewCollationEnabledForTest(true)
defer collate.SetNewCollationEnabledForTest(false)

fc := funcs[ast.WeightString]
tests := []struct {
type weightStringTest struct {
str string
padding string
length int
expect interface{}
}{
}

checkResult := func(collation string, tests []weightStringTest) {
fc := funcs[ast.WeightString]
for _, test := range tests {
str := types.NewCollationStringDatum(test.str, collation, utf8.RuneCountInString(test.str))
var f builtinFunc
var err error
if test.padding == "NONE" {
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str}))
} else {
padding := types.NewDatum(test.padding)
length := types.NewDatum(test.length)
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str, padding, length}))
}
c.Assert(err, IsNil)
result, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil)
if result.IsNull() {
c.Assert(test.expect, IsNil)
continue
}
res, err := result.ToString()
c.Assert(err, IsNil)
c.Assert(res, Equals, test.expect)
}
}

generalTests := []weightStringTest{
{"aAÁàãăâ", "NONE", 0, "\x00A\x00A\x00A\x00A\x00A\x00A\x00A"},
{"中", "NONE", 0, "\x4E\x2D"},
{"a", "CHAR", 5, "\x00A"},
Expand All @@ -2398,26 +2425,23 @@ func (s *testEvaluatorSerialSuites) TestCIWeightString(c *C) {
{"中", "BINARY", 5, "中\x00\x00"},
}

for _, test := range tests {
str := types.NewCollationStringDatum(test.str, "utf8mb4_general_ci", utf8.RuneCountInString(test.str))
var f builtinFunc
var err error
if test.padding == "NONE" {
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str}))
} else {
padding := types.NewDatum(test.padding)
length := types.NewDatum(test.length)
f, err = fc.getFunction(s.ctx, s.datumsToConstants([]types.Datum{str, padding, length}))
}
c.Assert(err, IsNil)
result, err := evalBuiltinFunc(f, chunk.Row{})
c.Assert(err, IsNil)
if result.IsNull() {
c.Assert(test.expect, IsNil)
continue
}
res, err := result.ToString()
c.Assert(err, IsNil)
c.Assert(res, Equals, test.expect)
unicodeTests := []weightStringTest{
{"aAÁàãăâ", "NONE", 0, "\x0e3\x0e3\x0e3\x0e3\x0e3\x0e3\x0e3"},
{"中", "NONE", 0, "\xfb\x40\xce\x2d"},
{"a", "CHAR", 5, "\x0e3"},
{"a ", "CHAR", 5, "\x0e3"},
{"中", "CHAR", 5, "\xfb\x40\xce\x2d"},
{"中 ", "CHAR", 5, "\xfb\x40\xce\x2d"},
{"a", "BINARY", 1, "a"},
{"ab", "BINARY", 1, "a"},
{"a", "BINARY", 5, "a\x00\x00\x00\x00"},
{"a ", "BINARY", 5, "a \x00\x00\x00"},
{"中", "BINARY", 1, "\xe4"},
{"中", "BINARY", 2, "\xe4\xb8"},
{"中", "BINARY", 3, "中"},
{"中", "BINARY", 5, "中\x00\x00"},
}

checkResult("utf8mb4_general_ci", generalTests)
checkResult("utf8mb4_unicode_ci", unicodeTests)
}
10 changes: 10 additions & 0 deletions expression/collation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ func (s *testCollationSuites) TestCompareString(c *C) {
c.Assert(types.CompareString("À", "A", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("😜", "😃", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("a ", "a ", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("ß", "s", "utf8_general_ci"), Equals, 0)
c.Assert(types.CompareString("ß", "ss", "utf8_general_ci"), Not(Equals), 0)

c.Assert(types.CompareString("a", "A", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("À", "A", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("😜", "😃", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("a ", "a ", "utf8_unicode_ci"), Equals, 0)
c.Assert(types.CompareString("ß", "s", "utf8_unicode_ci"), Not(Equals), 0)
c.Assert(types.CompareString("ß", "ss", "utf8_unicode_ci"), Equals, 0)

c.Assert(types.CompareString("a", "A", "binary"), Not(Equals), 0)
c.Assert(types.CompareString("À", "A", "binary"), Not(Equals), 0)
c.Assert(types.CompareString("😜", "😃", "binary"), Not(Equals), 0)
Expand Down
29 changes: 27 additions & 2 deletions expression/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6140,6 +6140,7 @@ func (s *testIntegrationSerialSuite) TestWeightString(c *C) {
// test explicit collation
c.Assert(tk.MustQuery("select weight_string('中 ' collate utf8mb4_general_ci);").Rows()[0][0], Equals, "\x4E\x2D")
c.Assert(tk.MustQuery("select weight_string('中 ' collate utf8mb4_bin);").Rows()[0][0], Equals, "中")
c.Assert(tk.MustQuery("select weight_string('中 ' collate utf8mb4_unicode_ci);").Rows()[0][0], Equals, "\xFB\x40\xCE\x2D")
c.Assert(tk.MustQuery("select collation(a collate utf8mb4_general_ci) from t order by id").Rows()[0][0], Equals, "utf8mb4_general_ci")
c.Assert(tk.MustQuery("select collation('中 ' collate utf8mb4_general_ci);").Rows()[0][0], Equals, "utf8mb4_general_ci")
rows = tk.MustQuery("select weight_string(a collate utf8mb4_bin) from t order by id").Rows()
Expand All @@ -6163,8 +6164,23 @@ func (s *testIntegrationSerialSuite) TestCollationCreateIndex(c *C) {
tk.MustExec("insert into t values ('B');")
tk.MustExec("insert into t values ('a');")
tk.MustExec("insert into t values ('A');")
tk.MustExec("insert into t values ('ß');")
tk.MustExec("insert into t values ('sa');")
tk.MustExec("create index idx on t(a);")
tk.MustQuery("select * from t order by a").Check(testkit.Rows("a", "A", "a", "A", "b", "B"))
tk.MustQuery("select * from t order by a").Check(testkit.Rows("a", "A", "a", "A", "b", "B", "ß", "sa"))

tk.MustExec("drop table if exists t")
tk.MustExec("create table t (a varchar(10) collate utf8mb4_unicode_ci);")
tk.MustExec("insert into t values ('a');")
tk.MustExec("insert into t values ('A');")
tk.MustExec("insert into t values ('b');")
tk.MustExec("insert into t values ('B');")
tk.MustExec("insert into t values ('a');")
tk.MustExec("insert into t values ('A');")
tk.MustExec("insert into t values ('ß');")
tk.MustExec("insert into t values ('sa');")
tk.MustExec("create index idx on t(a);")
tk.MustQuery("select * from t order by a").Check(testkit.Rows("a", "A", "a", "A", "b", "B", "sa", "ß"))
}

func (s *testIntegrationSerialSuite) TestCollateConstantPropagation(c *C) {
Expand Down Expand Up @@ -6332,6 +6348,8 @@ func (s *testIntegrationSerialSuite) TestCollateSort(c *C) {
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustQuery("select * from t order by a collate utf8mb4_bin").Check(testkit.Rows("A", "A", "A", "a", "a", "a", "b", "b", "b"))
tk.MustQuery("select * from t order by a collate utf8mb4_general_ci").Check(testkit.Rows("a", "A", "a", "A", "a", "A", "b", "b", "b"))
tk.MustQuery("select * from t order by a collate utf8mb4_unicode_ci").Check(testkit.Rows("a", "A", "a", "A", "a", "A", "b", "b", "b"))
}

func (s *testIntegrationSerialSuite) TestCollateHashAgg(c *C) {
Expand All @@ -6352,7 +6370,10 @@ func (s *testIntegrationSerialSuite) TestCollateHashAgg(c *C) {
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustExec("insert into t values ('a'), ('A'), ('b')")
tk.MustQuery("select count(1) from t group by a collate utf8mb4_bin").Check(testkit.Rows("3", "3", "3"))
tk.MustExec("insert into t values ('s'), ('ss'), ('ß')")
tk.MustQuery("select count(1) from t group by a collate utf8mb4_bin order by a collate utf8mb4_bin").Check(testkit.Rows("3", "3", "3", "1", "1", "1"))
tk.MustQuery("select count(1) from t group by a collate utf8mb4_unicode_ci order by a collate utf8mb4_unicode_ci").Check(testkit.Rows("6", "3", "1", "2"))
tk.MustQuery("select count(1) from t group by a collate utf8mb4_general_ci order by a collate utf8mb4_general_ci").Check(testkit.Rows("6", "3", "2", "1"))
}

func (s *testIntegrationSerialSuite) TestCollateStreamAgg(c *C) {
Expand Down Expand Up @@ -6418,6 +6439,8 @@ func (s *testIntegrationSerialSuite) TestCollateStringFunction(c *C) {
tk.MustQuery("select field('a', 'b', 'A');").Check(testkit.Rows("0"))
tk.MustQuery("select field('a', 'b', 'A' collate utf8mb4_bin);").Check(testkit.Rows("0"))
tk.MustQuery("select field('a', 'b', 'a ' collate utf8mb4_bin);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'A' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'a ' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'A' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select field('a', 'b', 'a ' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))

Expand All @@ -6433,6 +6456,8 @@ func (s *testIntegrationSerialSuite) TestCollateStringFunction(c *C) {
tk.MustQuery("select FIND_IN_SET('a','b,a ,c,d' collate utf8mb4_bin);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,A,c,d' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,a ,c,d' collate utf8mb4_general_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,A,c,d' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))
tk.MustQuery("select FIND_IN_SET('a','b,a ,c,d' collate utf8mb4_unicode_ci);").Check(testkit.Rows("2"))

tk.MustExec("select concat('a' collate utf8mb4_bin, 'b' collate utf8mb4_bin);")
tk.MustGetErrMsg("select concat('a' collate utf8mb4_bin, 'b' collate utf8mb4_general_ci);", "[expression:1267]Illegal mix of collations (utf8mb4_bin,EXPLICIT) and (utf8mb4_general_ci,EXPLICIT) for operation 'concat'")
Expand Down
15 changes: 15 additions & 0 deletions types/enum_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,21 @@ func (s *testEnumSuite) TestEnum(c *C) {
c.Assert(e.String(), Equals, t.Elems[t.Expected-1])
c.Assert(e.ToNumber(), Equals, float64(t.Expected))
}

for _, t := range tbl {
e, err := ParseEnumName(t.Elems, t.Name, "utf8_unicode_ci")
if t.Expected == 0 {
c.Assert(err, NotNil)
c.Assert(e.ToNumber(), Equals, float64(0))
c.Assert(e.String(), Equals, "")
continue
}

c.Assert(err, IsNil)
c.Assert(e.String(), Equals, t.Elems[t.Expected-1])
c.Assert(e.ToNumber(), Equals, float64(t.Expected))
}

for _, t := range citbl {
e, err := ParseEnumName(t.Elems, t.Name, "utf8_general_ci")
if t.Expected == 0 {
Expand Down
8 changes: 8 additions & 0 deletions types/set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ func (s *testSetSuite) TestSet(c *C) {
c.Assert(e.ToNumber(), Equals, float64(t.ExpectedValue))
c.Assert(e.String(), Equals, t.ExpectedName)
}

for _, t := range tbl {
e, err := ParseSetName(elems, t.Name, "utf8_unicode_ci")
c.Assert(err, IsNil)
c.Assert(e.ToNumber(), Equals, float64(t.ExpectedValue))
c.Assert(e.String(), Equals, t.ExpectedName)
}

for _, t := range citbl {
e, err := ParseSetName(elems, t.Name, "utf8_general_ci")
c.Assert(err, IsNil)
Expand Down
Loading

0 comments on commit 1f5b303

Please sign in to comment.