Skip to content

Commit eea410b

Browse files
committed
Improve code for "starts with" optimization in the interpreters.
1 parent d5a61ee commit eea410b

File tree

3 files changed

+95
-52
lines changed

3 files changed

+95
-52
lines changed

ChangeLog

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ quote old; it was released in 2014.
4141
detecting symlink loops. This is dependent on the availability of realpath(),
4242
which is now tested for in ./configure and CMakeLists.txt.
4343

44+
5. Implemented a modified version of Thomas Tempelmann's patch for handling
45+
case-independent "first code unit" searches for unanchored patterns in 8-bit
46+
mode in the interpreters. Instead of just remembering whether one case matched
47+
or not, it remembers the position of a previous match so as to avoid
48+
unnecessary repeated searching.
4449

4550

4651
Version 10.37 26-May-2021

src/pcre2_dfa_match.c

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2016-2020 University of Cambridge
10+
New API code Copyright (c) 2016-2021 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -3256,8 +3256,8 @@ BOOL has_first_cu = FALSE;
32563256
BOOL has_req_cu = FALSE;
32573257

32583258
#if PCRE2_CODE_UNIT_WIDTH == 8
3259-
BOOL memchr_not_found_first_cu = FALSE;
3260-
BOOL memchr_not_found_first_cu2 = FALSE;
3259+
PCRE2_SPTR memchr_found_first_cu = NULL;
3260+
PCRE2_SPTR memchr_found_first_cu2 = NULL;
32613261
#endif
32623262

32633263
PCRE2_UCHAR first_cu = 0;
@@ -3648,57 +3648,76 @@ for (;;)
36483648
}
36493649
}
36503650

3651-
/* Not anchored. Advance to a unique first code unit if there is one. In
3652-
8-bit mode, the use of memchr() gives a big speed up, even though we have
3653-
to call it twice in caseless mode, in order to find the earliest occurrence
3654-
of the character in either of its cases. If a call to memchr() that
3655-
searches the rest of the subject fails to find one case, remember that in
3656-
order not to keep on repeating the search. This can make a huge difference
3657-
when the strings are very long and only one case is present. */
3651+
/* Not anchored. Advance to a unique first code unit if there is one. */
36583652

36593653
else
36603654
{
36613655
if (has_first_cu)
36623656
{
36633657
if (first_cu != first_cu2) /* Caseless */
36643658
{
3659+
/* In 16-bit and 32-bit modes we have to do our own search, so can
3660+
look for both cases at once. */
3661+
36653662
#if PCRE2_CODE_UNIT_WIDTH != 8
36663663
PCRE2_UCHAR smc;
36673664
while (start_match < end_subject &&
36683665
(smc = UCHAR21TEST(start_match)) != first_cu &&
3669-
smc != first_cu2)
3666+
smc != first_cu2)
36703667
start_match++;
3668+
#else
3669+
/* In 8-bit mode, the use of memchr() gives a big speed up, even
3670+
though we have to call it twice in order to find the earliest
3671+
occurrence of the code unit in either of its cases. Caching is used
3672+
to remember the positions of previously found code units. This can
3673+
make a huge difference when the strings are very long and only one
3674+
case is actually present. */
36713675

3672-
#else /* 8-bit code units */
36733676
PCRE2_SPTR pp1 = NULL;
36743677
PCRE2_SPTR pp2 = NULL;
3675-
PCRE2_SIZE cu2size = end_subject - start_match;
3678+
PCRE2_SIZE searchlength = end_subject - start_match;
36763679

3677-
if (!memchr_not_found_first_cu)
3680+
/* If we haven't got a previously found position for first_cu, or if
3681+
the current starting position is later, we need to do a search. If
3682+
the code unit is not found, set the remembered position to end_subject. */
3683+
3684+
if (memchr_found_first_cu == NULL ||
3685+
start_match > memchr_found_first_cu)
36783686
{
3679-
pp1 = memchr(start_match, first_cu, end_subject - start_match);
3680-
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
3681-
else cu2size = pp1 - start_match;
3687+
pp1 = memchr(start_match, first_cu, searchlength);
3688+
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
36823689
}
36833690

3684-
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
3685-
to see if the other case is earlier, so we can set "not found" only
3686-
when both searches have returned NULL. */
3691+
/* If the start is not after a previously found position, use the
3692+
previous position, or NULL if a previous search failed. */
3693+
3694+
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3695+
memchr_found_first_cu;
36873696

3688-
if (!memchr_not_found_first_cu2)
3697+
/* Do the same thing for the other case. */
3698+
3699+
if (memchr_found_first_cu2 == NULL ||
3700+
start_match > memchr_found_first_cu2)
36893701
{
3690-
pp2 = memchr(start_match, first_cu2, cu2size);
3691-
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
3702+
pp2 = memchr(start_match, first_cu2, searchlength);
3703+
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
36923704
}
36933705

3706+
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3707+
memchr_found_first_cu2;
3708+
3709+
/* Set the start to the end of the subject if neither case was found.
3710+
Otherwise, use the earlier found point. */
3711+
36943712
if (pp1 == NULL)
36953713
start_match = (pp2 == NULL)? end_subject : pp2;
36963714
else
36973715
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3698-
#endif
3716+
3717+
#endif /* 8-bit handling */
36993718
}
37003719

3701-
/* The caseful case */
3720+
/* The caseful case is much simpler. */
37023721

37033722
else
37043723
{

src/pcre2_match.c

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
77
88
Written by Philip Hazel
99
Original API code Copyright (c) 1997-2012 University of Cambridge
10-
New API code Copyright (c) 2015-2020 University of Cambridge
10+
New API code Copyright (c) 2015-2021 University of Cambridge
1111
1212
-----------------------------------------------------------------------------
1313
Redistribution and use in source and binary forms, with or without
@@ -6117,8 +6117,8 @@ BOOL has_req_cu = FALSE;
61176117
BOOL startline;
61186118

61196119
#if PCRE2_CODE_UNIT_WIDTH == 8
6120-
BOOL memchr_not_found_first_cu;
6121-
BOOL memchr_not_found_first_cu2;
6120+
PCRE2_SPTR memchr_found_first_cu;
6121+
PCRE2_SPTR memchr_found_first_cu2;
61226122
#endif
61236123

61246124
PCRE2_UCHAR first_cu = 0;
@@ -6712,8 +6712,8 @@ start_partial = match_partial = NULL;
67126712
mb->hitend = FALSE;
67136713

67146714
#if PCRE2_CODE_UNIT_WIDTH == 8
6715-
memchr_not_found_first_cu = FALSE;
6716-
memchr_not_found_first_cu2 = FALSE;
6715+
memchr_found_first_cu = NULL;
6716+
memchr_found_first_cu2 = NULL;
67176717
#endif
67186718

67196719
for(;;)
@@ -6782,57 +6782,76 @@ for(;;)
67826782
}
67836783
}
67846784

6785-
/* Not anchored. Advance to a unique first code unit if there is one. In
6786-
8-bit mode, the use of memchr() gives a big speed up, even though we have
6787-
to call it twice in caseless mode, in order to find the earliest occurrence
6788-
of the character in either of its cases. If a call to memchr() that
6789-
searches the rest of the subject fails to find one case, remember that in
6790-
order not to keep on repeating the search. This can make a huge difference
6791-
when the strings are very long and only one case is present. */
6785+
/* Not anchored. Advance to a unique first code unit if there is one. */
67926786

67936787
else
67946788
{
67956789
if (has_first_cu)
67966790
{
67976791
if (first_cu != first_cu2) /* Caseless */
67986792
{
6793+
/* In 16-bit and 32-bit modes we have to do our own search, so can
6794+
look for both cases at once. */
6795+
67996796
#if PCRE2_CODE_UNIT_WIDTH != 8
68006797
PCRE2_UCHAR smc;
68016798
while (start_match < end_subject &&
68026799
(smc = UCHAR21TEST(start_match)) != first_cu &&
6803-
smc != first_cu2)
6800+
smc != first_cu2)
68046801
start_match++;
6802+
#else
6803+
/* In 8-bit mode, the use of memchr() gives a big speed up, even
6804+
though we have to call it twice in order to find the earliest
6805+
occurrence of the code unit in either of its cases. Caching is used
6806+
to remember the positions of previously found code units. This can
6807+
make a huge difference when the strings are very long and only one
6808+
case is actually present. */
68056809

6806-
#else /* 8-bit code units */
68076810
PCRE2_SPTR pp1 = NULL;
68086811
PCRE2_SPTR pp2 = NULL;
6809-
PCRE2_SIZE cu2size = end_subject - start_match;
6812+
PCRE2_SIZE searchlength = end_subject - start_match;
68106813

6811-
if (!memchr_not_found_first_cu)
6814+
/* If we haven't got a previously found position for first_cu, or if
6815+
the current starting position is later, we need to do a search. If
6816+
the code unit is not found, set the remembered position to end_subject. */
6817+
6818+
if (memchr_found_first_cu == NULL ||
6819+
start_match > memchr_found_first_cu)
68126820
{
6813-
pp1 = memchr(start_match, first_cu, end_subject - start_match);
6814-
if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
6815-
else cu2size = pp1 - start_match;
6821+
pp1 = memchr(start_match, first_cu, searchlength);
6822+
memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
68166823
}
68176824

6818-
/* If pp1 is not NULL, we have arranged to search only as far as pp1,
6819-
to see if the other case is earlier, so we can set "not found" only
6820-
when both searches have returned NULL. */
6825+
/* If the start is not after a previously found position, use the
6826+
previous position, or NULL if a previous search failed. */
6827+
6828+
else pp1 = (memchr_found_first_cu == end_subject)? NULL :
6829+
memchr_found_first_cu;
68216830

6822-
if (!memchr_not_found_first_cu2)
6831+
/* Do the same thing for the other case. */
6832+
6833+
if (memchr_found_first_cu2 == NULL ||
6834+
start_match > memchr_found_first_cu2)
68236835
{
6824-
pp2 = memchr(start_match, first_cu2, cu2size);
6825-
memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
6836+
pp2 = memchr(start_match, first_cu2, searchlength);
6837+
memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
68266838
}
68276839

6840+
else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
6841+
memchr_found_first_cu2;
6842+
6843+
/* Set the start to the end of the subject if neither case was found.
6844+
Otherwise, use the earlier found point. */
6845+
68286846
if (pp1 == NULL)
68296847
start_match = (pp2 == NULL)? end_subject : pp2;
68306848
else
68316849
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
6832-
#endif
6850+
6851+
#endif /* 8-bit handling */
68336852
}
68346853

6835-
/* The caseful case */
6854+
/* The caseful case is much simpler. */
68366855

68376856
else
68386857
{

0 commit comments

Comments
 (0)