From d4aa8578b18d12380c841de96e8f80cac52bf61a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Wed, 19 Apr 2023 14:51:31 -0600 Subject: [PATCH] gh-102856: Clean some of the PEP 701 tokenizer implementation (#103634) --- Parser/tokenizer.c | 136 ++++++++++++++++++++++----------------------- Parser/tokenizer.h | 5 +- 2 files changed, 67 insertions(+), 74 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 1dfd2d6e5e1717..f992e55dcac488 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -11,11 +11,6 @@ #include "tokenizer.h" #include "errcode.h" -#include "unicodeobject.h" -#include "bytesobject.h" -#include "fileobject.h" -#include "abstract.h" - /* Alternate tab spacing */ #define ALTTABSIZE 1 @@ -43,6 +38,8 @@ tok->lineno++; \ tok->col_offset = 0; +#define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) +#define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) #ifdef Py_DEBUG static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); @@ -54,15 +51,9 @@ static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index < MAXLEVEL); return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); } -static inline int *TOK_GET_BRACKET_MARK(tokenizer_mode* mode) { - assert(mode->bracket_mark_index >= 0); - assert(mode->bracket_mark_index < MAX_EXPR_NESTING); - return &(mode->bracket_mark[mode->bracket_mark_index]); -} #else #define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index])) #define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index])) -#define TOK_GET_BRACKET_MARK(mode) (&(mode->bracket_mark[mode->bracket_mark_index])) #endif /* Forward */ @@ -398,20 +389,7 @@ update_fstring_expr(struct tok_state *tok, char cur) tokenizer_mode *tok_mode = TOK_GET_MODE(tok); switch (cur) { - case '{': - if (tok_mode->last_expr_buffer != NULL) { - PyMem_Free(tok_mode->last_expr_buffer); - } - tok_mode->last_expr_buffer = PyMem_Malloc(size); - if (tok_mode->last_expr_buffer == NULL) { - tok->done = E_NOMEM; - return 0; - } - tok_mode->last_expr_size = size; - tok_mode->last_expr_end = -1; - strncpy(tok_mode->last_expr_buffer, tok->cur, size); - break; - case 0: + case 0: if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { return 1; } @@ -421,13 +399,24 @@ update_fstring_expr(struct tok_state *tok, char cur) ); if (new_buffer == NULL) { PyMem_Free(tok_mode->last_expr_buffer); - tok->done = E_NOMEM; - return 0; + goto error; } tok_mode->last_expr_buffer = new_buffer; strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); tok_mode->last_expr_size += size; break; + case '{': + if (tok_mode->last_expr_buffer != NULL) { + PyMem_Free(tok_mode->last_expr_buffer); + } + tok_mode->last_expr_buffer = PyMem_Malloc(size); + if (tok_mode->last_expr_buffer == NULL) { + goto error; + } + tok_mode->last_expr_size = size; + tok_mode->last_expr_end = -1; + strncpy(tok_mode->last_expr_buffer, tok->cur, size); + break; case '}': case '!': case ':': @@ -435,9 +424,13 @@ update_fstring_expr(struct tok_state *tok, char cur) tok_mode->last_expr_end = strlen(tok->start); } break; + default: + Py_UNREACHABLE(); } - return 1; +error: + tok->done = E_NOMEM; + return 0; } static void @@ -1766,7 +1759,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Skip comment, unless it's a type comment */ if (c == '#') { - if (tok->tok_mode_stack_index > 0) { + if (INSIDE_FSTRING(tok)) { return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'")); } @@ -2208,32 +2201,31 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t p_start = tok->start; p_end = tok->cur; - tokenizer_mode *current_tok = TOK_NEXT_MODE(tok); - current_tok->kind = TOK_FSTRING_MODE; - current_tok->f_string_quote = quote; - current_tok->f_string_quote_size = quote_size; - current_tok->f_string_start = tok->start; - current_tok->f_string_multi_line_start = tok->line_start; - current_tok->last_expr_buffer = NULL; - current_tok->last_expr_size = 0; - current_tok->last_expr_end = -1; + tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); + the_current_tok->kind = TOK_FSTRING_MODE; + the_current_tok->f_string_quote = quote; + the_current_tok->f_string_quote_size = quote_size; + the_current_tok->f_string_start = tok->start; + the_current_tok->f_string_multi_line_start = tok->line_start; + the_current_tok->last_expr_buffer = NULL; + the_current_tok->last_expr_size = 0; + the_current_tok->last_expr_end = -1; switch (*tok->start) { case 'F': case 'f': - current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; + the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; break; case 'R': case 'r': - current_tok->f_string_raw = 1; + the_current_tok->f_string_raw = 1; break; default: Py_UNREACHABLE(); } - current_tok->bracket_stack = 0; - current_tok->bracket_mark[0] = 0; - current_tok->bracket_mark_index = -1; + the_current_tok->curly_bracket_depth = 0; + the_current_tok->curly_bracket_expr_start_depth = -1; return MAKE_TOKEN(FSTRING_START); } @@ -2282,15 +2274,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t int start = tok->lineno; tok->lineno = tok->first_lineno; - if (tok->tok_mode_stack_index > 0) { + if (INSIDE_FSTRING(tok)) { /* When we are in an f-string, before raising the * unterminated string literal error, check whether * does the initial quote matches with f-strings quotes * and if it is, then this must be a missing '}' token * so raise the proper error */ - tokenizer_mode *current_tok = TOK_GET_MODE(tok); - if (current_tok->f_string_quote == quote && - current_tok->f_string_quote_size == quote_size) { + tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); + if (the_current_tok->f_string_quote == quote && + the_current_tok->f_string_quote_size == quote_size) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start)); } } @@ -2339,18 +2331,17 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Punctuation character */ int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); - if (is_punctuation && tok->tok_mode_stack_index > 0 && current_tok->bracket_mark_index >= 0) { - int mark = *TOK_GET_BRACKET_MARK(current_tok); - /* This code block gets executed before the bracket_stack is incremented + if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { + /* This code block gets executed before the curly_bracket_depth is incremented * by the `{` case, so for ensuring that we are on the 0th level, we need * to adjust it manually */ - int cursor = current_tok->bracket_stack - (c != '{'); + int cursor = current_tok->curly_bracket_depth - (c != '{'); if (cursor == 0 && !update_fstring_expr(tok, c)) { return MAKE_TOKEN(ENDMARKER); } - if (c == ':' && cursor == mark) { + if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { current_tok->kind = TOK_FSTRING_MODE; p_start = tok->start; p_end = tok->cur; @@ -2390,16 +2381,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->parenlinenostack[tok->level] = tok->lineno; tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); tok->level++; - - if (tok->tok_mode_stack_index > 0) { - current_tok->bracket_stack++; + if (INSIDE_FSTRING(tok)) { + current_tok->curly_bracket_depth++; } break; case ')': case ']': case '}': if (!tok->level) { - if (tok->tok_mode_stack_index > 0 && !current_tok->bracket_stack && c == '}') { + if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed")); } return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c)); @@ -2415,10 +2405,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t nested expression, then instead of matching a different syntactical construct with it; we'll throw an unmatched parentheses error. */ - if (tok->tok_mode_stack_index > 0 && opening == '{') { - assert(current_tok->bracket_stack >= 0); - int previous_bracket = current_tok->bracket_stack - 1; - if (previous_bracket == *TOK_GET_BRACKET_MARK(current_tok)) { + if (INSIDE_FSTRING(tok) && opening == '{') { + assert(current_tok->curly_bracket_depth >= 0); + int previous_bracket = current_tok->curly_bracket_depth - 1; + if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c)); } } @@ -2436,14 +2426,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } - if (tok->tok_mode_stack_index > 0) { - current_tok->bracket_stack--; - if (c == '}' && current_tok->bracket_stack == *TOK_GET_BRACKET_MARK(current_tok)) { - current_tok->bracket_mark_index--; + if (INSIDE_FSTRING(tok)) { + current_tok->curly_bracket_depth--; + if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { + current_tok->curly_bracket_expr_start_depth--; current_tok->kind = TOK_FSTRING_MODE; } } break; + default: + break; } if (!Py_UNICODE_ISPRINTABLE(c)) { @@ -2479,11 +2471,10 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct if ((start_char == '{' && peek1 != '{') || (start_char == '}' && peek1 != '}')) { if (start_char == '{') { - current_tok->bracket_mark_index++; - if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) { + current_tok->curly_bracket_expr_start_depth++; + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } - *TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack; } TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; return tok_get_normal_mode(tok, current_tok, token); @@ -2544,17 +2535,20 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct end_quote_size = 0; } - int in_format_spec = current_tok->last_expr_end != -1 && current_tok->bracket_mark_index >= 0; + int in_format_spec = ( + current_tok->last_expr_end != -1 + && + INSIDE_FSTRING_EXPR(current_tok) + ); if (c == '{') { int peek = tok_nextc(tok); if (peek != '{' || in_format_spec) { tok_backup(tok, peek); tok_backup(tok, c); - current_tok->bracket_mark_index++; - if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) { + current_tok->curly_bracket_expr_start_depth++; + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } - *TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack; TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; p_start = tok->start; p_end = tok->cur; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index f67e0cd1ac2f4a..b75e4e8293d3d6 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -43,9 +43,8 @@ enum tokenizer_mode_kind_t { typedef struct _tokenizer_mode { enum tokenizer_mode_kind_t kind; - int bracket_stack; - int bracket_mark[MAX_EXPR_NESTING]; - int bracket_mark_index; + int curly_bracket_depth; + int curly_bracket_expr_start_depth; char f_string_quote; int f_string_quote_size;