Skip to content

Commit bd3f5d9

Browse files
committed
S_parse_ident: Add ability to parse only IDFIRST strings
An identifier parsed by this function can be of the kind most people would expect, but it can also be one that begins with a digit followed by ASCII \w characters. This commit adds a flag so that the function doesn't recognize the latter type as an identifier.
1 parent 8fd4cbe commit bd3f5d9

File tree

1 file changed

+14
-7
lines changed

1 file changed

+14
-7
lines changed

toke.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th
177177
#define CHECK_KEYWORD (1 << 0)
178178
#define ALLOW_PACKAGE (1 << 1)
179179
#define CHECK_DOLLAR (1 << 2)
180+
#define IDFIRST_ONLY (1 << 3)
180181

181182
#ifdef DEBUGGING
182183
static const char* const lex_state_names[] = {
@@ -10552,9 +10553,12 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end,
1055210553
* 1) A normal identifier whose first character matches IDFIRST followed
1055310554
* by any number of characters which match IDCONT.
1055410555
* 2) An identifier that begins with an ASCII digit followed by any number
10555-
* of ASCII \w characters
10556-
*
10557-
* The function copies the identifier into the destination starting at *d
10556+
* of ASCII \w characters. This type can be prohibited, so that
10557+
* anything that doesn't match type 1) is not considered an identifier.
10558+
*/
10559+
const bool idfirst_only = flags & IDFIRST_ONLY;
10560+
10561+
/* The function copies the identifier into the destination starting at *d
1055810562
* (whose upper bound is 'e') and advances *d to point to just beyond the
1055910563
* end of the identifier, setting **d to a NUL character. The reason it
1056010564
* needs to copy is that it may convert apostrophe package separators into
@@ -10585,15 +10589,18 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end,
1058510589
* Unicode definition only when UTF-8 is in effect. We have to check
1058610590
* for the subset before checking for the superset. */
1058710591
Size_t advance;
10588-
if (is_utf8 && (advance = isIDFIRST_utf8_safe(s, s_end))) {
10592+
if ( (advance = isIDFIRST_lazy_if_safe(s, s_end, is_utf8))
10593+
&& (is_utf8 || idfirst_only))
10594+
{
1058910595
const char *this_start = s;
1059010596
s += advance;
1059110597

1059210598
/* Find the end of the identifier by accumulating characters until
1059310599
* we find a non-identifier character */
1059410600
while (s < s_end) {
10595-
advance = isIDCONT_utf8_safe((const U8*) s,
10596-
(const U8*) s_end);
10601+
advance = isIDCONT_lazy_if_safe((const U8*) s,
10602+
(const U8*) s_end,
10603+
is_utf8);
1059710604
if (advance == 0) { /* Not an identifier character */
1059810605
break;
1059910606
}
@@ -10612,7 +10619,7 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end,
1061210619
Copy(this_start, *d, this_length, char);
1061310620
*d += this_length;
1061410621
}
10615-
else if ( isWORDCHAR_A(*s) ) {
10622+
else if (! idfirst_only && isWORDCHAR_A(*s) ) {
1061610623

1061710624
/* This is the superset; it accepts \w+, including an initial
1061810625
* digit */

0 commit comments

Comments
 (0)