Skip to content

Commit c5c1459

Browse files
author
Xing Zhang
committed
Bug #25167284: LIKE FUNCTION GIVES WRONG RESULT FOR ACCENTED CHARACTERS
For ai_ci collations, accented characters compare equal to their base character, but the LIKE function treats them differently. This is because my_uca_charcmp_900 compares characters without handling ignorable weights. Solution: Change the my_uca_charcmp_900 function to compare characters' weights correctly. This patch also fixes the bug where the LIKE function returns a wrong result when the last two pattern characters are w_many followed by escape (e.g., 'a%\\'), and reduces duplicated mb_wc calls on the same character. Change-Id: I960bb14d6b239cf3f504f2f703bf69cd58ecb1e8
1 parent 294a327 commit c5c1459

File tree

2 files changed

+223
-97
lines changed

2 files changed

+223
-97
lines changed

strings/ctype-uca.cc

Lines changed: 159 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -2240,46 +2240,61 @@ my_strnxfrm_uca(const CHARSET_INFO *cs, Mb_wc mb_wc,
22402240

22412241
static int my_uca_charcmp_900(const CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
22422242
{
2243-
uint16 *weight1= my_char_weight_addr_900(cs->uca, wc1); /* W3-TODO */
2244-
uint16 *weight2= my_char_weight_addr_900(cs->uca, wc2);
2243+
uint16 *weight1_ptr= my_char_weight_addr_900(cs->uca, wc1); /* W3-TODO */
2244+
uint16 *weight2_ptr= my_char_weight_addr_900(cs->uca, wc2);
22452245

22462246
/* Check if either of the characters does not have implicit weights */
2247-
if (!weight1 || !weight2)
2247+
if (!weight1_ptr|| !weight2_ptr)
22482248
return wc1 != wc2;
22492249

2250-
/* Quickly compare first weights */
2251-
if (weight1[0] != weight2[0])
2250+
if (weight1_ptr[0] && weight2_ptr[0] && weight1_ptr[0] != weight2_ptr[0])
22522251
return 1;
22532252

22542253
/* Thoroughly compare all weights */
2255-
size_t length1= weight1[-UCA900_DISTANCE_BETWEEN_LEVELS];
2256-
size_t length2= weight2[-UCA900_DISTANCE_BETWEEN_LEVELS];
2257-
2258-
if (length1 != length2)
2259-
return 1;
2254+
size_t length1= weight1_ptr[-UCA900_DISTANCE_BETWEEN_LEVELS];
2255+
size_t length2= weight2_ptr[-UCA900_DISTANCE_BETWEEN_LEVELS];
22602256

2261-
if (cs->state & MY_CS_CSSORT)
2257+
for (int level= 0; level< cs->levels_for_compare; ++level)
22622258
{
2263-
for (size_t weightind= 0; weightind < length1 * MY_UCA_900_CE_SIZE;
2264-
++weightind)
2259+
size_t wt_ind1= 0;
2260+
size_t wt_ind2= 0;
2261+
uint16 *weight1= weight1_ptr + level * UCA900_DISTANCE_BETWEEN_LEVELS;
2262+
uint16 *weight2= weight2_ptr + level * UCA900_DISTANCE_BETWEEN_LEVELS;
2263+
while (wt_ind1 < length1 && wt_ind2 < length2)
22652264
{
2265+
// Zero weight is ignorable.
2266+
for (; wt_ind1 < length1 && !*weight1; wt_ind1++)
2267+
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2268+
if (wt_ind1 == length1)
2269+
break;
2270+
for (; wt_ind2 < length2 && !*weight2; wt_ind2++)
2271+
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2272+
if (wt_ind2 == length2)
2273+
break;
2274+
2275+
// Check if these two non-ignorable weights are equal.
22662276
if (*weight1 != *weight2)
22672277
return 1;
2268-
weight1+= UCA900_DISTANCE_BETWEEN_LEVELS;
2269-
weight2+= UCA900_DISTANCE_BETWEEN_LEVELS;
2278+
wt_ind1++;
2279+
wt_ind2++;
2280+
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2281+
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
22702282
}
2271-
}
2272-
else
2273-
{
2274-
for (size_t weightind= 0; weightind < length1; ++weightind)
2283+
/*
2284+
If either character is out of weights but we have equality so far,
2285+
check if the other character has any non-ignorable weights left.
2286+
*/
2287+
for (; wt_ind1 < length1; wt_ind1++)
22752288
{
2276-
if (*weight1 != *weight2)
2277-
return 1;
2289+
if (*weight1) return 1;
22782290
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2291+
}
2292+
for (; wt_ind2 < length2; wt_ind2++)
2293+
{
2294+
if (*weight2) return 1;
22792295
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
22802296
}
22812297
}
2282-
22832298
return 0;
22842299
}
22852300

@@ -2339,129 +2354,176 @@ int my_wildcmp_uca_impl(const CHARSET_INFO *cs,
23392354
const char *wildstr,const char *wildend,
23402355
int escape, int w_one, int w_many, int recurse_level)
23412356
{
2342-
int result= -1; /* Not found, using wildcards */
2343-
my_wc_t s_wc, w_wc;
2344-
int scan;
2345-
int (*mb_wc)(const struct charset_info_st *, my_wc_t *,
2346-
const uchar *, const uchar *);
2347-
mb_wc= cs->cset->mb_wc;
2348-
2349-
if (my_string_stack_guard && my_string_stack_guard(recurse_level))
2350-
return 1;
2357+
if (my_string_stack_guard && my_string_stack_guard(recurse_level))
2358+
return 1;
23512359
while (wildstr != wildend)
23522360
{
2361+
int result= -1; /* Not found, using wildcards */
2362+
auto mb_wc= cs->cset->mb_wc;
2363+
2364+
/*
2365+
Compare the expression and pattern strings character-by-character until
2366+
we find a '%' (w_many) in the pattern string. Once we do, we break out
2367+
of the loop and try increasingly large widths for the '%' match,
2368+
calling ourselves recursively until we find a match. (As an
2369+
optimization, we test for the character immediately after '%' before we
2370+
recurse.) This takes exponential time in the worst case.
2371+
2372+
Example: Say we are trying to match the pattern 'ab%cd' against the
2373+
string 'ab..c.cd'. We first match the initial 'ab' against each other,
2374+
and then see the '%' in the pattern. Since the first character after
2375+
'%' is 'c', we skip to the first 'c' in the expression string, and try
2376+
to match 'c.cd' against 'cd' by a recursive call. Since this failed, we
2377+
scan for the next 'c', and try to match 'cd' against 'cd', which works.
2378+
*/
2379+
my_wc_t w_wc;
23532380
while (1)
23542381
{
2355-
my_bool escaped= 0;
2356-
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2357-
(const uchar*)wildend)) <= 0)
2358-
return 1;
2382+
int mb_len;
2383+
if ((mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2384+
(const uchar*)wildend)) <= 0)
2385+
return 1;
23592386

2387+
wildstr+= mb_len;
2388+
// If we found '%' (w_many), break out of this loop.
23602389
if (w_wc == (my_wc_t)w_many)
23612390
{
2362-
result= 1; /* Found an anchor char */
2391+
result= 1;
23632392
break;
23642393
}
23652394

2366-
wildstr+= scan;
2367-
if (w_wc == (my_wc_t)escape && wildstr < wildend)
2395+
/*
2396+
If the character we just read was an escape character, skip it and
2397+
read the next character instead. This character is used verbatim
2398+
without checking if it is a wildcard (% or _). However, as a
2399+
special exception, a lone escape character at the end of a string is
2400+
treated as itself.
2401+
*/
2402+
bool escaped= false;
2403+
if (w_wc == (my_wc_t)escape && wildstr < wildend)
23682404
{
2369-
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2370-
(const uchar*)wildend)) <= 0)
2405+
if ((mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2406+
(const uchar*)wildend)) <= 0)
23712407
return 1;
2372-
wildstr+= scan;
2408+
wildstr+= mb_len;
23732409
escaped= 1;
23742410
}
2375-
2376-
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
2377-
(const uchar*)str_end)) <= 0)
2411+
2412+
my_wc_t s_wc;
2413+
if ((mb_len= mb_wc(cs, &s_wc, (const uchar*)str,
2414+
(const uchar*)str_end)) <= 0)
23782415
return 1;
2379-
str+= scan;
2380-
2416+
str+= mb_len;
2417+
2418+
// If we found '_' (w_one), skip one character in expression string.
23812419
if (!escaped && w_wc == (my_wc_t)w_one)
23822420
{
2383-
result= 1; /* Found an anchor char */
2421+
result= 1;
23842422
}
23852423
else
23862424
{
23872425
if (my_uca_charcmp(cs, s_wc, w_wc))
23882426
return 1;
23892427
}
23902428
if (wildstr == wildend)
2391-
return (str != str_end); /* Match if both are at end */
2429+
return (str != str_end); /* Match if both are at end */
23922430
}
2393-
2394-
2431+
2432+
23952433
if (w_wc == (my_wc_t)w_many)
2396-
{ /* Found w_many */
2397-
2398-
/* Remove any '%' and '_' from the wild search string */
2399-
for ( ; wildstr != wildend ; )
2434+
{
2435+
// Remove any '%' and '_' following w_many in the pattern string.
2436+
for ( ;; )
24002437
{
2401-
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2402-
(const uchar*)wildend)) <= 0)
2438+
if (wildstr == wildend)
2439+
{
2440+
/*
2441+
The previous w_many (%) was the last character in the pattern
2442+
string, so we have a match no matter what the rest of the
2443+
expression string looks like (even empty).
2444+
*/
2445+
return 0;
2446+
}
2447+
int mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2448+
(const uchar*)wildend);
2449+
if (mb_len <= 0)
24032450
return 1;
2404-
2405-
if (w_wc == (my_wc_t)w_many)
2406-
{
2407-
wildstr+= scan;
2408-
continue;
2409-
}
2410-
2411-
if (w_wc == (my_wc_t)w_one)
2412-
{
2413-
wildstr+= scan;
2414-
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
2415-
(const uchar*)str_end)) <= 0)
2451+
wildstr+= mb_len;
2452+
if (w_wc == (my_wc_t)w_many)
2453+
continue;
2454+
2455+
if (w_wc == (my_wc_t)w_one)
2456+
{
2457+
/*
2458+
Skip one character in expression string because '_' needs to
2459+
match one.
2460+
*/
2461+
my_wc_t s_wc;
2462+
int mb_len= mb_wc(cs, &s_wc, (const uchar*)str,
2463+
(const uchar*)str_end);
2464+
if (mb_len <= 0)
24162465
return 1;
2417-
str+= scan;
2418-
continue;
2419-
}
2420-
break; /* Not a wild character */
2466+
str+= mb_len;
2467+
continue;
2468+
}
2469+
break; /* Not a wild character */
24212470
}
2422-
2423-
if (wildstr == wildend)
2424-
return 0; /* Ok if w_many is last */
2425-
2471+
2472+
// No character in the expression string to match w_wc.
24262473
if (str == str_end)
2427-
return -1;
2428-
2429-
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2430-
(const uchar*)wildend)) <= 0)
2431-
return 1;
2432-
2433-
if (w_wc == (my_wc_t)escape)
2474+
return -1;
2475+
2476+
// Skip the escape character ('\') in the pattern if needed.
2477+
if (w_wc == (my_wc_t)escape && wildstr < wildend)
24342478
{
2435-
wildstr+= scan;
2436-
if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2437-
(const uchar*)wildend)) <= 0)
2479+
int mb_len= mb_wc(cs, &w_wc, (const uchar*)wildstr,
2480+
(const uchar*)wildend);
2481+
if (mb_len <= 0)
24382482
return 1;
2483+
wildstr+= mb_len;
24392484
}
2440-
2485+
2486+
/*
2487+
w_wc is now the character following w_many (e.g., if the pattern is
2488+
"a%c", w_wc is 'c').
2489+
*/
24412490
while (1)
24422491
{
2443-
/* Skip until the first character from wildstr is found */
2492+
/*
2493+
Skip until we find a character in the expression string that is
2494+
equal to w_wc.
2495+
*/
2496+
int mb_len;
24442497
while (str != str_end)
24452498
{
2446-
if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
2447-
(const uchar*)str_end)) <= 0)
2499+
my_wc_t s_wc;
2500+
if ((mb_len= mb_wc(cs, &s_wc, (const uchar*)str,
2501+
(const uchar*)str_end)) <= 0)
24482502
return 1;
2449-
2503+
24502504
if (!my_uca_charcmp(cs, s_wc, w_wc))
24512505
break;
2452-
str+= scan;
2506+
str+= mb_len;
24532507
}
2508+
// No character in the expression string is equal to w_wc.
24542509
if (str == str_end)
24552510
return -1;
2456-
2511+
str+= mb_len;
2512+
2513+
/*
2514+
The strings match up until the first character after w_many in the
2515+
pattern string. For the remainder of the pattern string and expression
2516+
string, we call ourselves recursively to get the wildcard comparison result.
2517+
For example, for wildcmp(..., "abcdefg", "a%de%g", ...), we run again on
2518+
wildcmp(..., "efg", "e%g", ...).
2519+
*/
24572520
result= my_wildcmp_uca_impl(cs, str, str_end, wildstr, wildend,
24582521
escape, w_one, w_many, recurse_level + 1);
2459-
2522+
24602523
if (result <= 0)
24612524
return result;
2462-
2463-
str+= scan;
2464-
}
2525+
2526+
}
24652527
}
24662528
}
24672529
return (str != str_end ? 1 : 0);

0 commit comments

Comments
 (0)