@@ -2240,46 +2240,61 @@ my_strnxfrm_uca(const CHARSET_INFO *cs, Mb_wc mb_wc,
2240
2240
2241
2241
static int my_uca_charcmp_900 (const CHARSET_INFO *cs, my_wc_t wc1, my_wc_t wc2)
2242
2242
{
2243
- uint16 *weight1 = my_char_weight_addr_900 (cs->uca , wc1); /* W3-TODO */
2244
- uint16 *weight2 = my_char_weight_addr_900 (cs->uca , wc2);
2243
+ uint16 *weight1_ptr = my_char_weight_addr_900 (cs->uca , wc1); /* W3-TODO */
2244
+ uint16 *weight2_ptr = my_char_weight_addr_900 (cs->uca , wc2);
2245
2245
2246
2246
/* Check if some of the characters does not have implicit weights */
2247
- if (!weight1 || !weight2 )
2247
+ if (!weight1_ptr || !weight2_ptr )
2248
2248
return wc1 != wc2;
2249
2249
2250
- /* Quickly compare first weights */
2251
- if (weight1[0 ] != weight2[0 ])
2250
+ if (weight1_ptr[0 ] && weight2_ptr[0 ] && weight1_ptr[0 ] != weight2_ptr[0 ])
2252
2251
return 1 ;
2253
2252
2254
2253
/* Thoroughly compare all weights */
2255
- size_t length1= weight1[-UCA900_DISTANCE_BETWEEN_LEVELS];
2256
- size_t length2= weight2[-UCA900_DISTANCE_BETWEEN_LEVELS];
2257
-
2258
- if (length1 != length2)
2259
- return 1 ;
2254
+ size_t length1= weight1_ptr[-UCA900_DISTANCE_BETWEEN_LEVELS];
2255
+ size_t length2= weight2_ptr[-UCA900_DISTANCE_BETWEEN_LEVELS];
2260
2256
2261
- if ( cs->state & MY_CS_CSSORT )
2257
+ for ( int level= 0 ; level< cs->levels_for_compare ; ++level )
2262
2258
{
2263
- for (size_t weightind= 0 ; weightind < length1 * MY_UCA_900_CE_SIZE;
2264
- ++weightind)
2259
+ size_t wt_ind1= 0 ;
2260
+ size_t wt_ind2= 0 ;
2261
+ uint16 *weight1= weight1_ptr + level * UCA900_DISTANCE_BETWEEN_LEVELS;
2262
+ uint16 *weight2= weight2_ptr + level * UCA900_DISTANCE_BETWEEN_LEVELS;
2263
+ while (wt_ind1 < length1 && wt_ind2 < length2)
2265
2264
{
2265
+ // Zero weight is ignorable.
2266
+ for (; wt_ind1 < length1 && !*weight1; wt_ind1++)
2267
+ weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2268
+ if (wt_ind1 == length1)
2269
+ break ;
2270
+ for (; wt_ind2 < length2 && !*weight2; wt_ind2++)
2271
+ weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2272
+ if (wt_ind2 == length2)
2273
+ break ;
2274
+
2275
+ // Check if these two non-ignorable weights are equal.
2266
2276
if (*weight1 != *weight2)
2267
2277
return 1 ;
2268
- weight1+= UCA900_DISTANCE_BETWEEN_LEVELS;
2269
- weight2+= UCA900_DISTANCE_BETWEEN_LEVELS;
2278
+ wt_ind1++;
2279
+ wt_ind2++;
2280
+ weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2281
+ weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2270
2282
}
2271
- }
2272
- else
2273
- {
2274
- for (size_t weightind= 0 ; weightind < length1; ++weightind)
2283
+ /*
2284
+ If either character is out of weights but we have equality so far,
2285
+ check if the other character has any non-ignorable weights left.
2286
+ */
2287
+ for (; wt_ind1 < length1; wt_ind1++)
2275
2288
{
2276
- if (*weight1 != *weight2)
2277
- return 1 ;
2289
+ if (*weight1) return 1 ;
2278
2290
weight1+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2291
+ }
2292
+ for (; wt_ind2 < length2; wt_ind2++)
2293
+ {
2294
+ if (*weight2) return 1 ;
2279
2295
weight2+= UCA900_DISTANCE_BETWEEN_WEIGHTS;
2280
2296
}
2281
2297
}
2282
-
2283
2298
return 0 ;
2284
2299
}
2285
2300
@@ -2339,129 +2354,176 @@ int my_wildcmp_uca_impl(const CHARSET_INFO *cs,
2339
2354
const char *wildstr,const char *wildend,
2340
2355
int escape, int w_one, int w_many, int recurse_level)
2341
2356
{
2342
- int result= -1 ; /* Not found, using wildcards */
2343
- my_wc_t s_wc, w_wc;
2344
- int scan;
2345
- int (*mb_wc)(const struct charset_info_st *, my_wc_t *,
2346
- const uchar *, const uchar *);
2347
- mb_wc= cs->cset ->mb_wc ;
2348
-
2349
- if (my_string_stack_guard && my_string_stack_guard (recurse_level))
2350
- return 1 ;
2357
+ if (my_string_stack_guard && my_string_stack_guard (recurse_level))
2358
+ return 1 ;
2351
2359
while (wildstr != wildend)
2352
2360
{
2361
+ int result= -1 ; /* Not found, using wildcards */
2362
+ auto mb_wc= cs->cset ->mb_wc ;
2363
+
2364
+ /*
2365
+ Compare the expression and pattern strings character-by-character until
2366
+ we find a '%' (w_many) in the pattern string. Once we do, we break out
2367
+ of the loop and try increasingly large widths for the '%' match,
2368
+ calling ourselves recursively until we find a match. (As an
2369
+ optimization, we test for the character immediately after '%' before we
2370
+ recurse.) This takes exponential time in the worst case.
2371
+
2372
+ Example: Say we are trying to match the pattern 'ab%cd' against the
2373
+ string 'ab..c.cd'. We first match the initial 'ab' against each other,
2374
+ and then see the '%' in the pattern. Since the first character after
2375
+ '%' is 'c', we skip to the first 'c' in the expression string, and try
2376
+ to match 'c.cd' against 'cd' by a recursive call. Since this failed, we
2377
+ scan for the next 'c', and try to match 'cd' against 'cd', which works.
2378
+ */
2379
+ my_wc_t w_wc;
2353
2380
while (1 )
2354
2381
{
2355
- my_bool escaped= 0 ;
2356
- if ((scan = mb_wc (cs, &w_wc, (const uchar*)wildstr,
2357
- (const uchar*)wildend)) <= 0 )
2358
- return 1 ;
2382
+ int mb_len ;
2383
+ if ((mb_len = mb_wc (cs, &w_wc, (const uchar*)wildstr,
2384
+ (const uchar*)wildend)) <= 0 )
2385
+ return 1 ;
2359
2386
2387
+ wildstr+= mb_len;
2388
+ // If we found '%' (w_many), break out of this loop.
2360
2389
if (w_wc == (my_wc_t )w_many)
2361
2390
{
2362
- result= 1 ; /* Found an anchor char */
2391
+ result= 1 ;
2363
2392
break ;
2364
2393
}
2365
2394
2366
- wildstr+= scan;
2367
- if (w_wc == (my_wc_t )escape && wildstr < wildend)
2395
+ /*
2396
+ If the character we just read was an escape character, skip it and
2397
+ read the next character instead. This character is used verbatim
2398
+ without checking if it is a wildcard (% or _). However, as a
2399
+ special exception, a lone escape character at the end of a string is
2400
+ treated as itself.
2401
+ */
2402
+ bool escaped= false ;
2403
+ if (w_wc == (my_wc_t )escape && wildstr < wildend)
2368
2404
{
2369
- if ((scan = mb_wc (cs, &w_wc, (const uchar*)wildstr,
2370
- (const uchar*)wildend)) <= 0 )
2405
+ if ((mb_len = mb_wc (cs, &w_wc, (const uchar*)wildstr,
2406
+ (const uchar*)wildend)) <= 0 )
2371
2407
return 1 ;
2372
- wildstr+= scan ;
2408
+ wildstr+= mb_len ;
2373
2409
escaped= 1 ;
2374
2410
}
2375
-
2376
- if ((scan= mb_wc (cs, &s_wc, (const uchar*)str,
2377
- (const uchar*)str_end)) <= 0 )
2411
+
2412
+ my_wc_t s_wc;
2413
+ if ((mb_len= mb_wc (cs, &s_wc, (const uchar*)str,
2414
+ (const uchar*)str_end)) <= 0 )
2378
2415
return 1 ;
2379
- str+= scan;
2380
-
2416
+ str+= mb_len;
2417
+
2418
+ // If we found '_' (w_one), skip one character in expression string.
2381
2419
if (!escaped && w_wc == (my_wc_t )w_one)
2382
2420
{
2383
- result= 1 ; /* Found an anchor char */
2421
+ result= 1 ;
2384
2422
}
2385
2423
else
2386
2424
{
2387
2425
if (my_uca_charcmp (cs, s_wc, w_wc))
2388
2426
return 1 ;
2389
2427
}
2390
2428
if (wildstr == wildend)
2391
- return (str != str_end); /* Match if both are at end */
2429
+ return (str != str_end); /* Match if both are at end */
2392
2430
}
2393
-
2394
-
2431
+
2432
+
2395
2433
if (w_wc == (my_wc_t )w_many)
2396
- { /* Found w_many */
2397
-
2398
- /* Remove any '%' and '_' from the wild search string */
2399
- for ( ; wildstr != wildend ; )
2434
+ {
2435
+ // Remove any '%' and '_' following w_many in the pattern string.
2436
+ for ( ;; )
2400
2437
{
2401
- if ((scan= mb_wc (cs, &w_wc, (const uchar*)wildstr,
2402
- (const uchar*)wildend)) <= 0 )
2438
+ if (wildstr == wildend)
2439
+ {
2440
+ /*
2441
+ The previous w_many (%) was the last character in the pattern
2442
+ string, so we have a match no matter what the rest of the
2443
+ expression string looks like (even empty).
2444
+ */
2445
+ return 0 ;
2446
+ }
2447
+ int mb_len= mb_wc (cs, &w_wc, (const uchar*)wildstr,
2448
+ (const uchar*)wildend);
2449
+ if (mb_len <= 0 )
2403
2450
return 1 ;
2404
-
2405
- if (w_wc == (my_wc_t )w_many)
2406
- {
2407
- wildstr+= scan;
2408
- continue ;
2409
- }
2410
-
2411
- if (w_wc == (my_wc_t )w_one)
2412
- {
2413
- wildstr+= scan;
2414
- if ((scan= mb_wc (cs, &s_wc, (const uchar*)str,
2415
- (const uchar*)str_end)) <= 0 )
2451
+ wildstr+= mb_len;
2452
+ if (w_wc == (my_wc_t )w_many)
2453
+ continue ;
2454
+
2455
+ if (w_wc == (my_wc_t )w_one)
2456
+ {
2457
+ /*
2458
+ Skip one character in expression string because '_' needs to
2459
+ match one.
2460
+ */
2461
+ my_wc_t s_wc;
2462
+ int mb_len= mb_wc (cs, &s_wc, (const uchar*)str,
2463
+ (const uchar*)str_end);
2464
+ if (mb_len <= 0 )
2416
2465
return 1 ;
2417
- str+= scan ;
2418
- continue ;
2419
- }
2420
- break ; /* Not a wild character */
2466
+ str+= mb_len ;
2467
+ continue ;
2468
+ }
2469
+ break ; /* Not a wild character */
2421
2470
}
2422
-
2423
- if (wildstr == wildend)
2424
- return 0 ; /* Ok if w_many is last */
2425
-
2471
+
2472
+ // No character in the expression string to match w_wc.
2426
2473
if (str == str_end)
2427
- return -1 ;
2428
-
2429
- if ((scan= mb_wc (cs, &w_wc, (const uchar*)wildstr,
2430
- (const uchar*)wildend)) <= 0 )
2431
- return 1 ;
2432
-
2433
- if (w_wc == (my_wc_t )escape)
2474
+ return -1 ;
2475
+
2476
+ // Skip the escape character ('\') in the pattern if needed.
2477
+ if (w_wc == (my_wc_t )escape && wildstr < wildend)
2434
2478
{
2435
- wildstr+= scan;
2436
- if ((scan= mb_wc (cs, &w_wc, (const uchar*)wildstr,
2437
- ( const uchar*)wildend)) <= 0 )
2479
+ int mb_len= mb_wc (cs, &w_wc, ( const uchar*)wildstr,
2480
+ (const uchar*)wildend);
2481
+ if (mb_len <= 0 )
2438
2482
return 1 ;
2483
+ wildstr+= mb_len;
2439
2484
}
2440
-
2485
+
2486
+ /*
2487
+ w_wc is now the character following w_many (e.g., if the pattern is
2488
+ "a%c", w_wc is 'c').
2489
+ */
2441
2490
while (1 )
2442
2491
{
2443
- /* Skip until the first character from wildstr is found */
2492
+ /*
2493
+ Skip until we find a character in the expression string that is
2494
+ equal to w_wc.
2495
+ */
2496
+ int mb_len;
2444
2497
while (str != str_end)
2445
2498
{
2446
- if ((scan= mb_wc (cs, &s_wc, (const uchar*)str,
2447
- (const uchar*)str_end)) <= 0 )
2499
+ my_wc_t s_wc;
2500
+ if ((mb_len= mb_wc (cs, &s_wc, (const uchar*)str,
2501
+ (const uchar*)str_end)) <= 0 )
2448
2502
return 1 ;
2449
-
2503
+
2450
2504
if (!my_uca_charcmp (cs, s_wc, w_wc))
2451
2505
break ;
2452
- str+= scan ;
2506
+ str+= mb_len ;
2453
2507
}
2508
+ // No character in the expression string is equal to w_wc.
2454
2509
if (str == str_end)
2455
2510
return -1 ;
2456
-
2511
+ str+= mb_len;
2512
+
2513
+ /*
2514
+ The strings match up until the first character after w_many in the
2515
+ pattern string. For the rest part of pattern string and expression
2516
+ string, we recursively call to get wild compare result.
2517
+ Example, wildcmp(..., "abcdefg", "a%de%g", ...), we'll run again on
2518
+ wildcmp(..., "efg", "e%g", ...).
2519
+ */
2457
2520
result= my_wildcmp_uca_impl (cs, str, str_end, wildstr, wildend,
2458
2521
escape, w_one, w_many, recurse_level + 1 );
2459
-
2522
+
2460
2523
if (result <= 0 )
2461
2524
return result;
2462
-
2463
- str+= scan;
2464
- }
2525
+
2526
+ }
2465
2527
}
2466
2528
}
2467
2529
return (str != str_end ? 1 : 0 );
0 commit comments