@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
7
7
8
8
Written by Philip Hazel
9
9
Original API code Copyright (c) 1997-2012 University of Cambridge
10
- New API code Copyright (c) 2015-2020 University of Cambridge
10
+ New API code Copyright (c) 2015-2021 University of Cambridge
11
11
12
12
-----------------------------------------------------------------------------
13
13
Redistribution and use in source and binary forms, with or without
@@ -6117,8 +6117,8 @@ BOOL has_req_cu = FALSE;
6117
6117
BOOL startline ;
6118
6118
6119
6119
#if PCRE2_CODE_UNIT_WIDTH == 8
6120
- BOOL memchr_not_found_first_cu ;
6121
- BOOL memchr_not_found_first_cu2 ;
6120
+ PCRE2_SPTR memchr_found_first_cu ;
6121
+ PCRE2_SPTR memchr_found_first_cu2 ;
6122
6122
#endif
6123
6123
6124
6124
PCRE2_UCHAR first_cu = 0 ;
@@ -6712,8 +6712,8 @@ start_partial = match_partial = NULL;
6712
6712
mb -> hitend = FALSE;
6713
6713
6714
6714
#if PCRE2_CODE_UNIT_WIDTH == 8
6715
- memchr_not_found_first_cu = FALSE ;
6716
- memchr_not_found_first_cu2 = FALSE ;
6715
+ memchr_found_first_cu = NULL ;
6716
+ memchr_found_first_cu2 = NULL ;
6717
6717
#endif
6718
6718
6719
6719
for (;;)
@@ -6782,57 +6782,76 @@ for(;;)
6782
6782
}
6783
6783
}
6784
6784
6785
- /* Not anchored. Advance to a unique first code unit if there is one. In
6786
- 8-bit mode, the use of memchr() gives a big speed up, even though we have
6787
- to call it twice in caseless mode, in order to find the earliest occurrence
6788
- of the character in either of its cases. If a call to memchr() that
6789
- searches the rest of the subject fails to find one case, remember that in
6790
- order not to keep on repeating the search. This can make a huge difference
6791
- when the strings are very long and only one case is present. */
6785
+ /* Not anchored. Advance to a unique first code unit if there is one. */
6792
6786
6793
6787
else
6794
6788
{
6795
6789
if (has_first_cu )
6796
6790
{
6797
6791
if (first_cu != first_cu2 ) /* Caseless */
6798
6792
{
6793
+ /* In 16-bit and 32-bit modes we have to do our own search, so can
6794
+ look for both cases at once. */
6795
+
6799
6796
#if PCRE2_CODE_UNIT_WIDTH != 8
6800
6797
PCRE2_UCHAR smc ;
6801
6798
while (start_match < end_subject &&
6802
6799
(smc = UCHAR21TEST (start_match )) != first_cu &&
6803
- smc != first_cu2 )
6800
+ smc != first_cu2 )
6804
6801
start_match ++ ;
6802
+ #else
6803
+ /* In 8-bit mode, the use of memchr() gives a big speed up, even
6804
+ though we have to call it twice in order to find the earliest
6805
+ occurrence of the code unit in either of its cases. Caching is used
6806
+ to remember the positions of previously found code units. This can
6807
+ make a huge difference when the strings are very long and only one
6808
+ case is actually present. */
6805
6809
6806
- #else /* 8-bit code units */
6807
6810
PCRE2_SPTR pp1 = NULL ;
6808
6811
PCRE2_SPTR pp2 = NULL ;
6809
- PCRE2_SIZE cu2size = end_subject - start_match ;
6812
+ PCRE2_SIZE searchlength = end_subject - start_match ;
6810
6813
6811
- if (!memchr_not_found_first_cu )
6814
+ /* If we haven't got a previously found position for first_cu, or if
6815
+ the current starting position is later, we need to do a search. If
6816
+ the code unit is not found, set the cached position to end_subject. */
6817
+
6818
+ if (memchr_found_first_cu == NULL ||
6819
+ start_match > memchr_found_first_cu )
6812
6820
{
6813
- pp1 = memchr (start_match , first_cu , end_subject - start_match );
6814
- if (pp1 == NULL ) memchr_not_found_first_cu = TRUE;
6815
- else cu2size = pp1 - start_match ;
6821
+ pp1 = memchr (start_match , first_cu , searchlength );
6822
+ memchr_found_first_cu = (pp1 == NULL )? end_subject : pp1 ;
6816
6823
}
6817
6824
6818
- /* If pp1 is not NULL, we have arranged to search only as far as pp1,
6819
- to see if the other case is earlier, so we can set "not found" only
6820
- when both searches have returned NULL. */
6825
+ /* If the start is not after a previously found position, use the
6826
+ previous position, or NULL if a previous search failed. */
6827
+
6828
+ else pp1 = (memchr_found_first_cu == end_subject )? NULL :
6829
+ memchr_found_first_cu ;
6821
6830
6822
- if (!memchr_not_found_first_cu2 )
6831
+ /* Do the same thing for the other case. */
6832
+
6833
+ if (memchr_found_first_cu2 == NULL ||
6834
+ start_match > memchr_found_first_cu2 )
6823
6835
{
6824
- pp2 = memchr (start_match , first_cu2 , cu2size );
6825
- memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL ) ;
6836
+ pp2 = memchr (start_match , first_cu2 , searchlength );
6837
+ memchr_found_first_cu2 = (pp2 == NULL )? end_subject : pp2 ;
6826
6838
}
6827
6839
6840
+ else pp2 = (memchr_found_first_cu2 == end_subject )? NULL :
6841
+ memchr_found_first_cu2 ;
6842
+
6843
+ /* Set the start to the end of the subject if neither case was found.
6844
+ Otherwise, use the earlier found point. */
6845
+
6828
6846
if (pp1 == NULL )
6829
6847
start_match = (pp2 == NULL )? end_subject : pp2 ;
6830
6848
else
6831
6849
start_match = (pp2 == NULL || pp1 < pp2 )? pp1 : pp2 ;
6832
- #endif
6850
+
6851
+ #endif /* 8-bit handling */
6833
6852
}
6834
6853
6835
- /* The caseful case */
6854
+ /* The caseful case is much simpler. */
6836
6855
6837
6856
else
6838
6857
{
0 commit comments