@@ -31,6 +31,11 @@ typedef struct
 {
     PyObject_HEAD struct tok_state *tok;
     int done;
+
+    /* Needed to cache line for performance */
+    PyObject *last_line;
+    Py_ssize_t last_lineno;
+    Py_ssize_t byte_col_offset_diff;
 } tokenizeriterobject;
 
 /*[clinic input]
@@ -67,6 +72,11 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
         self->tok->tok_extra_tokens = 1;
     }
     self->done = 0;
+
+    self->last_line = NULL;
+    self->byte_col_offset_diff = 0;
+    self->last_lineno = 0;
+
     return (PyObject *)self;
 }
 
@@ -209,7 +219,18 @@ tokenizeriter_next(tokenizeriterobject *it)
         if (size >= 1 && it->tok->implicit_newline) {
             size -= 1;
         }
-        line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+
+        if (it->tok->lineno != it->last_lineno) {
+            // Line has changed since last token, so we fetch the new line and cache it
+            // in the iter object.
+            Py_XDECREF(it->last_line);
+            line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+            it->last_line = line;
+            it->byte_col_offset_diff = 0;
+        } else {
+            // Line hasn't changed so we reuse the cached one.
+            line = it->last_line;
+        }
     }
     if (line == NULL) {
         Py_DECREF(str);
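The hunk above replaces the unconditional PyUnicode_DecodeUTF8() call with a per-line cache: the decoded line is kept on the iterator in last_line and reused for every token that starts on the same physical line, so a long line is decoded once rather than once per token, and byte_col_offset_diff is reset whenever the cached line changes. Below is a minimal standalone sketch of that caching pattern, not the CPython code itself; the LineCache type and get_line() helper are hypothetical and assume callers ask for lines in token order.

/* Sketch only: reuse the most recently "decoded" line while the line number
 * stays the same, mirroring last_line / last_lineno above. */
#include <stdlib.h>
#include <string.h>

typedef struct {
    char *last_line;       /* owned copy of the last processed line */
    long last_lineno;      /* line number the cached copy belongs to */
} LineCache;

static const char *
get_line(LineCache *c, long lineno, const char *raw, size_t size)
{
    if (lineno != c->last_lineno) {
        /* Line has changed: drop the stale copy and cache the new one. */
        free(c->last_line);
        c->last_line = malloc(size + 1);
        if (c->last_line != NULL) {
            memcpy(c->last_line, raw, size);
            c->last_line[size] = '\0';
        }
        c->last_lineno = lineno;
    }
    /* Same line as the previous call: reuse the cached copy. */
    return c->last_line;
}
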
@@ -218,13 +239,28 @@ tokenizeriter_next(tokenizeriterobject *it)
 
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
+    it->last_lineno = lineno;
+
     Py_ssize_t col_offset = -1;
     Py_ssize_t end_col_offset = -1;
+    Py_ssize_t byte_offset = -1;
     if (token.start != NULL && token.start >= line_start) {
-        col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
+        byte_offset = token.start - line_start;
+        col_offset = byte_offset - it->byte_col_offset_diff;
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
+        Py_ssize_t end_byte_offset = token.end - it->tok->line_start;
+        if (lineno == end_lineno) {
+            // If the whole token is on one line, we can use the token's byte
+            // offsets into `line` to figure out the new column offset, since
+            // scanning the whole line is not performant for very long lines.
+            Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset);
+            end_col_offset = col_offset + token_col_offset;
+            it->byte_col_offset_diff += token.end - token.start - token_col_offset;
+        } else {
+            end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset);
+            it->byte_col_offset_diff += end_byte_offset - end_col_offset;
+        }
     }
 
     if (it->tok->tok_extra_tokens) {
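The bookkeeping in this hunk relies on a simple invariant: on a pure-ASCII line, byte offsets and character offsets coincide, and each multi-byte UTF-8 character pushes the byte offset ahead of the character offset by (bytes - 1). byte_col_offset_diff accumulates that running difference for the cached line, so each token's start column can be recovered as byte_offset - byte_col_offset_diff without rescanning the line, and only the token's own byte range has to be examined to get its character width. As a worked example, assuming the line is "é = 1" (U+00E9 is two bytes in UTF-8): the token "é" starts at byte 0 with character width 1, so the diff grows by 2 - 1 = 1; the next token "=" starts at byte 3, and 3 - 1 = 2 is exactly its character column. The sketch below replays that arithmetic with a plain UTF-8 character counter; count_chars() and the hard-coded token table are illustrative only and are not CPython's pegen helpers.

/* Sketch only: recover character columns from byte offsets with a running diff. */
#include <stdio.h>
#include <stddef.h>

/* Count UTF-8 characters in buf[0..n): bytes that are not continuation bytes. */
static ptrdiff_t
count_chars(const char *buf, ptrdiff_t n)
{
    ptrdiff_t chars = 0;
    for (ptrdiff_t i = 0; i < n; i++) {
        if (((unsigned char)buf[i] & 0xC0) != 0x80) {
            chars++;
        }
    }
    return chars;
}

int main(void)
{
    const char *line = "\xC3\xA9 = 1";              /* "é = 1" encoded as UTF-8 */
    struct { ptrdiff_t start, end; } tokens[] = {   /* byte ranges of "é", "=", "1" */
        {0, 2}, {3, 4}, {5, 6},
    };
    ptrdiff_t byte_col_offset_diff = 0;             /* bytes minus characters so far */

    for (size_t i = 0; i < sizeof tokens / sizeof tokens[0]; i++) {
        ptrdiff_t byte_offset = tokens[i].start;
        ptrdiff_t col_offset = byte_offset - byte_col_offset_diff;
        ptrdiff_t width = count_chars(line + tokens[i].start,
                                      tokens[i].end - tokens[i].start);
        ptrdiff_t end_col_offset = col_offset + width;
        byte_col_offset_diff += (tokens[i].end - tokens[i].start) - width;
        printf("token %zu: col %td..%td\n", i, col_offset, end_col_offset);
    }
    return 0;
}
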
@@ -264,7 +300,7 @@ tokenizeriter_next(tokenizeriterobject *it)
         }
     }
 
-    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
+    result = Py_BuildValue("(iN(nn)(nn)O)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
 exit:
     _PyToken_Free(&token);
     if (type == ENDMARKER) {
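The one-character change in the Py_BuildValue() format string ("N" to "O" for the final line element) follows from the caching: "N" steals the caller's reference and hands it to the result tuple, which was fine while a fresh line object was decoded for every token, but would now give away the reference the iterator still holds in last_line and leave it pointing at an object it no longer owns. "O" makes Py_BuildValue() take its own new reference and leaves the iterator's reference untouched. A minimal sketch of the difference, using a hypothetical helper rather than the tokenizer code:

/* Sketch only: "N" vs "O" reference handling in Py_BuildValue(). */
#include <Python.h>

static PyObject *
build_examples(PyObject *cached_line)
{
    /* "N" steals the reference: the tuple owns `fresh` afterwards,
     * so the caller must not DECREF it again. */
    PyObject *fresh = PyUnicode_FromString("x");
    if (fresh == NULL) {
        return NULL;
    }
    PyObject *stolen = Py_BuildValue("(N)", fresh);
    Py_XDECREF(stolen);

    /* "O" adds a new reference: `cached_line` keeps its original owner
     * (the caller), so it can safely be reused for the next token. */
    return Py_BuildValue("(O)", cached_line);
}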