Skip to content

Commit

Permalink
Bug #25167284: LIKE FUNCTION GIVES WRONG RESULT FOR ACCENTED CHARACTERS
Browse files Browse the repository at this point in the history
For ai_ci collations, accented characters compare equal to their base
character, but the LIKE function treats them differently. This is because
my_uca_charcmp_900 compares characters without handling ignorable weights.

Solution:
Change the my_uca_charcmp_900 function to compare characters' weights correctly.

This patch also fixes a bug where the LIKE function returns a wrong result
when the last two pattern characters are w_many followed by escape
(e.g., 'a%\\'), and reduces duplicated mb_wc calls on the same character.

Change-Id: I960bb14d6b239cf3f504f2f703bf69cd58ecb1e8
  • Loading branch information
Xing Zhang committed Dec 12, 2016
1 parent 294a327 commit c5c1459
Show file tree
Hide file tree
Showing 2 changed files with 223 additions and 97 deletions.
256 changes: 159 additions & 97 deletions strings/ctype-uca.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2240,46 +2240,61 @@ my_strnxfrm_uca(const CHARSET_INFO *cs, Mb_wc mb_wc,

static int my_uca_charcmp_900(const CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
{
uint16 *weight1= my_char_weight_addr_900(cs->uca, wc1); /* W3-TODO */
uint16 *weight2= my_char_weight_addr_900(cs->uca, wc2);
uint16 *weight1_ptr= my_char_weight_addr_900(cs->uca, wc1); /* W3-TODO */
uint16 *weight2_ptr= my_char_weight_addr_900(cs->uca, wc2);

/* Check if some of the characters does not have implicit weights */
if (!weight1 || !weight2)
if (!weight1_ptr|| !weight2_ptr)
return wc1 != wc2;

/* Quickly compare first weights */
if (weight1[0] != weight2[0])
if (weight1_ptr[0] && weight2_ptr[0] && weight1_ptr[0] != weight2_ptr[0])
return 1;

/* Thoroughly compare all weights */
size_t length1= weight1[-UCA900_DISTANCE_BETWEEN_LEVELS];
size_t length2= weight2[-UCA900_DISTANCE_BETWEEN_LEVELS];

if (length1 != length2)
return 1;
size_t length1= weight1_ptr[-UCA900_DISTANCE_BETWEEN_LEVELS];
size_t length2= weight2_ptr[-UCA900_DISTANCE_BETWEEN_LEVELS];

if (cs->state & MY_CS_CSSORT)
for (int level= 0; level< cs->levels_for_compare; ++level)
{
for (size_t weightind= 0; weightind < length1 * MY_UCA_900_CE_SIZE;
++weightind)
size_t wt_ind1= 0;
size_t wt_ind2= 0;
uint16 *weight1= weight1_ptr + level * UCA900_DISTANCE_BETWEEN_LEVELS;
uint16 *weight2= weight2_ptr + level * UCA900_DISTANCE_BETWEEN_LEVELS;
while (wt_ind1 < length1 && wt_ind2 < length2)
{
// Zero weight is ignorable.
for (; wt_ind1 < length1 && !*weight1; wt_ind1++)
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
if (wt_ind1 == length1)
break;
for (; wt_ind2 < length2 && !*weight2; wt_ind2++)
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
if (wt_ind2 == length2)
break;

// Check if these two non-ignorable weights are equal.
if (*weight1 != *weight2)
return 1;
weight1+= UCA900_DISTANCE_BETWEEN_LEVELS;
weight2+= UCA900_DISTANCE_BETWEEN_LEVELS;
wt_ind1++;
wt_ind2++;
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
}
}
else
{
for (size_t weightind= 0; weightind < length1; ++weightind)
/*
If either character is out of weights but we have equality so far,
check if the other character has any non-ignorable weights left.
*/
for (; wt_ind1 < length1; wt_ind1++)
{
if (*weight1 != *weight2)
return 1;
if (*weight1) return 1;
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
}
for (; wt_ind2 < length2; wt_ind2++)
{
if (*weight2) return 1;
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
}
}

return 0;
}

Expand Down Expand Up @@ -2339,129 +2354,176 @@ int my_wildcmp_uca_impl(const CHARSET_INFO *cs,
const char *wildstr,const char *wildend,
int escape, int w_one, int w_many, int recurse_level)
{
int result= -1; /* Not found, using wildcards */
my_wc_t s_wc, w_wc;
int scan;
int (*mb_wc)(const struct charset_info_st *, my_wc_t *,
const uchar *, const uchar *);
mb_wc= cs->cset->mb_wc;

if (my_string_stack_guard && my_string_stack_guard(recurse_level))
return 1;
if (my_string_stack_guard && my_string_stack_guard(recurse_level))
return 1;
while (wildstr != wildend)
{
int result= -1; /* Not found, using wildcards */
auto mb_wc= cs->cset->mb_wc;

/*
Compare the expression and pattern strings character-by-character until
we find a '%' (w_many) in the pattern string. Once we do, we break out
of the loop and try increasingly large widths for the '%' match,
calling ourselves recursively until we find a match. (As an
optimization, we test for the character immediately after '%' before we
recurse.) This takes exponential time in the worst case.
Example: Say we are trying to match the pattern 'ab%cd' against the
string 'ab..c.cd'. We first match the initial 'ab' against each other,
and then see the '%' in the pattern. Since the first character after
'%' is 'c', we skip to the first 'c' in the expression string, and try
to match 'c.cd' against 'cd' by a recursive call. Since this failed, we
scan for the next 'c', and try to match 'cd' against 'cd', which works.
*/
my_wc_t w_wc;
while (1)
{
my_bool escaped= 0;
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;
int mb_len;
if ((mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;

wildstr+= mb_len;
// If we found '%' (w_many), break out this loop.
if (w_wc == (my_wc_t)w_many)
{
result= 1; /* Found an anchor char */
result= 1;
break;
}

wildstr+= scan;
if (w_wc == (my_wc_t)escape && wildstr < wildend)
/*
If the character we just read was an escape character, skip it and
read the next character instead. This character is used verbatim
without checking if it is a wildcard (% or _). However, as a
special exception, a lone escape character at the end of a string is
treated as itself.
*/
bool escaped= false;
if (w_wc == (my_wc_t)escape && wildstr < wildend)
{
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
if ((mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;
wildstr+= scan;
wildstr+= mb_len;
escaped= 1;
}

if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <= 0)

my_wc_t s_wc;
if ((mb_len= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <= 0)
return 1;
str+= scan;

str+= mb_len;

// If we found '_' (w_one), skip one character in expression string.
if (!escaped && w_wc == (my_wc_t)w_one)
{
result= 1; /* Found an anchor char */
result= 1;
}
else
{
if (my_uca_charcmp(cs, s_wc, w_wc))
return 1;
}
if (wildstr == wildend)
return (str != str_end); /* Match if both are at end */
return (str != str_end); /* Match if both are at end */
}


if (w_wc == (my_wc_t)w_many)
{ /* Found w_many */

/* Remove any '%' and '_' from the wild search string */
for ( ; wildstr != wildend ; )
{
// Remove any '%' and '_' following w_many in the pattern string.
for ( ;; )
{
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
if (wildstr == wildend)
{
/*
The previous w_many (%) was the last character in the pattern
string, so we have a match no matter what the rest of the
expression string looks like (even empty).
*/
return 0;
}
int mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (mb_len <= 0)
return 1;

if (w_wc == (my_wc_t)w_many)
{
wildstr+= scan;
continue;
}

if (w_wc == (my_wc_t)w_one)
{
wildstr+= scan;
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <= 0)
wildstr+= mb_len;
if (w_wc == (my_wc_t)w_many)
continue;

if (w_wc == (my_wc_t)w_one)
{
/*
Skip one character in expression string because '_' needs to
match one.
*/
my_wc_t s_wc;
int mb_len= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end);
if (mb_len <= 0)
return 1;
str+= scan;
continue;
}
break; /* Not a wild character */
str+= mb_len;
continue;
}
break; /* Not a wild character */
}

if (wildstr == wildend)
return 0; /* Ok if w_many is last */


// No character in the expression string to match w_wc.
if (str == str_end)
return -1;

if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
return 1;

if (w_wc == (my_wc_t)escape)
return -1;

// Skip the escape character ('\') in the pattern if needed.
if (w_wc == (my_wc_t)escape && wildstr < wildend)
{
wildstr+= scan;
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend)) <= 0)
int mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
(const uchar*)wildend);
if (mb_len <= 0)
return 1;
wildstr+= mb_len;
}


/*
w_wc is now the character following w_many (e.g., if the pattern is
"a%c", w_wc is 'c').
*/
while (1)
{
/* Skip until the first character from wildstr is found */
/*
Skip until we find a character in the expression string that is
equal to w_wc.
*/
int mb_len;
while (str != str_end)
{
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <= 0)
my_wc_t s_wc;
if ((mb_len= mb_wc(cs, &s_wc, (const uchar*)str,
(const uchar*)str_end)) <= 0)
return 1;

if (!my_uca_charcmp(cs, s_wc, w_wc))
break;
str+= scan;
str+= mb_len;
}
// No character in the expression string is equal to w_wc.
if (str == str_end)
return -1;

str+= mb_len;

/*
The strings match up until the first character after w_many in the
pattern string. For the rest part of pattern string and expression
string, we recursively call to get wild compare result.
Example, wildcmp(..., "abcdefg", "a%de%g", ...), we'll run again on
wildcmp(..., "efg", "e%g", ...).
*/
result= my_wildcmp_uca_impl(cs, str, str_end, wildstr, wildend,
escape, w_one, w_many, recurse_level + 1);

if (result <= 0)
return result;

str+= scan;
}

}
}
}
return (str != str_end ? 1 : 0);
Expand Down
Loading

0 comments on commit c5c1459

Please sign in to comment.