From cd8dcbc851fcc312722cdb5544c2f25cf46b3f8a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 14 Mar 2021 04:38:40 +0100 Subject: [PATCH] bpo-43410: Fix crash in the parser when producing syntax errors when reading from stdin (GH-24763) --- Lib/test/test_cmd_line.py | 9 ++- .../2021-03-05-17-23-36.bpo-43410.lCzIg0.rst | 2 + Parser/pegen.c | 14 +++- Parser/tokenizer.c | 78 ++++++++++++------- Parser/tokenizer.h | 4 +- 5 files changed, 76 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index f12dff3202fe31..95ab9d8c139656 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -816,9 +816,16 @@ def test_sys_flags_not_set(self): PYTHONVERBOSE="1", ) +class SyntaxErrorTests(unittest.TestCase): + def test_tokenizer_error_with_stdin(self): + proc = subprocess.run([sys.executable, "-"], input = b"(1+2+3", + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self.assertNotEqual(proc.returncode, 0) + self.assertNotEqual(proc.stderr, None) + self.assertIn(b"\nSyntaxError", proc.stderr) def test_main(): - support.run_unittest(CmdLineTest, IgnoreEnvironmentTest) + support.run_unittest(CmdLineTest, IgnoreEnvironmentTest, SyntaxErrorTests) support.reap_children() if __name__ == "__main__": diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst b/Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst new file mode 100644 index 00000000000000..245bda5ff72dd6 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst @@ -0,0 +1,2 @@ +Fix a bug that was causing the parser to crash when emitting syntax errors +when reading input from stdin. 
Patch by Pablo Galindo diff --git a/Parser/pegen.c b/Parser/pegen.c index 68f0e329f083d1..301199368651da 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -397,7 +397,8 @@ get_error_line(Parser *p, Py_ssize_t lineno) are stored in p->tok->stdin_content */ assert(p->tok->fp == NULL || p->tok->fp == stdin); - char *cur_line = p->tok->fp == NULL ? p->tok->str : p->tok->stdin_content; + char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str; + for (int i = 0; i < lineno - 1; i++) { cur_line = strchr(cur_line, '\n') + 1; } @@ -440,7 +441,10 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype, goto error; } - if (p->start_rule == Py_file_input) { + if (p->tok->fp_interactive) { + error_line = get_error_line(p, lineno); + } + else if (p->start_rule == Py_file_input) { error_line = PyErr_ProgramTextObject(p->tok->filename, (int) lineno); } @@ -1232,7 +1236,7 @@ _PyPegen_run_parser(Parser *p) if (p->fill == 0) { RAISE_SYNTAX_ERROR("error at start before reading any input"); } - else if (p->tok->done == E_EOF) { + else if (p->tok->done == E_EOF) { if (p->tok->level) { raise_unclosed_parentheses_error(p); } else { @@ -1287,6 +1291,10 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena } return NULL; } + if (!tok->fp || ps1 != NULL || ps2 != NULL || + PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) { + tok->fp_interactive = 1; + } // This transfers the ownership to the tokenizer tok->filename = filename_ob; Py_INCREF(filename_ob); diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index d9334aaf148ba2..09d8b88cadf357 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -56,6 +56,9 @@ tok_new(void) if (tok == NULL) return NULL; tok->buf = tok->cur = tok->inp = NULL; + tok->fp_interactive = 0; + tok->interactive_src_start = NULL; + tok->interactive_src_end = NULL; tok->start = NULL; tok->end = NULL; tok->done = E_OK; @@ -80,8 +83,6 @@ tok_new(void) tok->decoding_readline = NULL; 
tok->decoding_buffer = NULL; tok->type_comments = 0; - tok->stdin_content = NULL; - tok->async_hacks = 0; tok->async_def = 0; tok->async_def_indent = 0; @@ -323,6 +324,35 @@ check_bom(int get_char(struct tok_state *), return 1; } +static int tok_concatenate_interactive_new_line(struct tok_state* tok, char* line) { + assert(tok->fp_interactive); + + if (!line) { + return 0; + } + + Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; + Py_ssize_t line_size = strlen(line); + char* new_str = tok->interactive_src_start; + + new_str = PyMem_Realloc(new_str, current_size + line_size + 1); + if (!new_str) { + if (tok->interactive_src_start) { + PyMem_Free(tok->interactive_src_start); + } + tok->interactive_src_start = NULL; + tok->interactive_src_end = NULL; + tok->done = E_NOMEM; + return -1; + } + strcpy(new_str + current_size, line); + + tok->interactive_src_start = new_str; + tok->interactive_src_end = new_str + current_size + line_size; + return 0; +} + + /* Read a line of text from TOK into S, using the stream in TOK. Return NULL on failure, else S. 
@@ -552,6 +582,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok) badchar, tok->filename, tok->lineno + 1); return error_ret(tok); } + + if (tok->fp_interactive && + tok_concatenate_interactive_new_line(tok, line) == -1) { + return NULL; + } + return line; } @@ -807,17 +843,21 @@ PyTokenizer_FromFile(FILE *fp, const char* enc, void PyTokenizer_Free(struct tok_state *tok) { - if (tok->encoding != NULL) + if (tok->encoding != NULL) { PyMem_Free(tok->encoding); + } Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); Py_XDECREF(tok->filename); - if (tok->fp != NULL && tok->buf != NULL) + if (tok->fp != NULL && tok->buf != NULL) { PyMem_Free(tok->buf); - if (tok->input) + } + if (tok->input) { PyMem_Free(tok->input); - if (tok->stdin_content) - PyMem_Free(tok->stdin_content); + } + if (tok->interactive_src_start != NULL) { + PyMem_Free(tok->interactive_src_start); + } PyMem_Free(tok); } @@ -858,24 +898,6 @@ tok_nextc(struct tok_state *tok) if (translated == NULL) return EOF; newtok = translated; - if (tok->stdin_content == NULL) { - tok->stdin_content = PyMem_Malloc(strlen(translated) + 1); - if (tok->stdin_content == NULL) { - tok->done = E_NOMEM; - return EOF; - } - sprintf(tok->stdin_content, "%s", translated); - } - else { - char *new_str = PyMem_Malloc(strlen(tok->stdin_content) + strlen(translated) + 1); - if (new_str == NULL) { - tok->done = E_NOMEM; - return EOF; - } - sprintf(new_str, "%s%s", tok->stdin_content, translated); - PyMem_Free(tok->stdin_content); - tok->stdin_content = new_str; - } } if (tok->encoding && newtok && *newtok) { /* Recode to UTF-8 */ @@ -898,6 +920,10 @@ tok_nextc(struct tok_state *tok) strcpy(newtok, buf); Py_DECREF(u); } + if (tok->fp_interactive && + tok_concatenate_interactive_new_line(tok, newtok) == -1) { + return EOF; + } if (tok->nextprompt != NULL) tok->prompt = tok->nextprompt; if (newtok == NULL) @@ -958,7 +984,7 @@ tok_nextc(struct tok_state *tok) } if (decoding_fgets(tok->buf, (int)(tok->end - 
tok->buf), tok) == NULL) { - if (!tok->decoding_erred) + if (!tok->decoding_erred && !(tok->done == E_NOMEM)) tok->done = E_EOF; done = 1; } diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 56074b61ae100e..111126c67f2d52 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -26,6 +26,9 @@ struct tok_state { char *buf; /* Input buffer, or NULL; malloc'ed if fp != NULL */ char *cur; /* Next character in buffer */ char *inp; /* End of data in buffer */ + int fp_interactive; /* If the file descriptor is interactive */ + char *interactive_src_start; /* The start of the source parsed so far in interactive mode */ + char *interactive_src_end; /* The end of the source parsed so far in interactive mode */ const char *end; /* End of input buffer if buf != NULL */ const char *start; /* Start of current token if not NULL */ int done; /* E_OK normally, E_EOF at EOF, otherwise error code */ @@ -37,7 +40,6 @@ struct tok_state { int atbol; /* Nonzero if at begin of new line */ int pendin; /* Pending indents (if > 0) or dedents (if < 0) */ const char *prompt, *nextprompt; /* For interactive prompting */ - char *stdin_content; int lineno; /* Current line number */ int first_lineno; /* First line of a single line or multi line string expression (cf. issue 16806) */