Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-89855: Improve support of non-ASCII identifiers in IDLE #29381

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions Lib/idlelib/autocomplete.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@
TRY_A = False, False, False, ATTRS # '.' for attributes.
TRY_F = False, False, False, FILES # '/' in quotes for file name.

# This string includes all chars that may be in an identifier.
# TODO Update this here and elsewhere.
ID_CHARS = string.ascii_letters + string.digits + "_"
# all ASCII chars that may be in an identifier
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")

SEPS = f"{os.sep}{os.altsep if os.altsep else ''}"
TRIGGERS = f".{SEPS}"
Expand Down Expand Up @@ -134,7 +133,11 @@ def open_completions(self, args):
elif hp.is_in_code() and (not mode or mode==ATTRS):
self._remove_autocomplete_window()
mode = ATTRS
while i and (curline[i-1] in ID_CHARS or ord(curline[i-1]) > 127):
while i:
c = curline[i-1]
if c not in _ASCII_ID_CHARS:
if c <= '\x7f' or not ('a' + c).isidentifier():
break
i -= 1
comp_start = curline[i:j]
if i and curline[i-1] == '.': # Need object with attributes.
Expand Down
10 changes: 3 additions & 7 deletions Lib/idlelib/autoexpand.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
There is only one instance of Autoexpand.
'''
import re
import string

_LAST_WORD_RE = re.compile(r'\b\w+\Z')

class AutoExpand:
wordchars = string.ascii_letters + string.digits + "_"

def __init__(self, editwin):
self.text = editwin.text
self.bell = self.text.bell
Expand Down Expand Up @@ -85,10 +83,8 @@ def getwords(self):
def getprevword(self):
"Return the word prefix before the cursor."
line = self.text.get("insert linestart", "insert")
i = len(line)
while i > 0 and line[i-1] in self.wordchars:
i = i-1
return line[i:]
m = _LAST_WORD_RE.search(line)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to test with long lines.

return m[0] if m else ''


if __name__ == '__main__':
Expand Down
7 changes: 2 additions & 5 deletions Lib/idlelib/editor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import platform
import re
import string
import sys
import tokenize
import traceback
Expand Down Expand Up @@ -806,14 +805,12 @@ def ResetColorizer(self):
if self.line_numbers is not None:
self.line_numbers.update_colors()

IDENTCHARS = string.ascii_letters + string.digits + "_"

def colorize_syntax_error(self, text, pos):
text.tag_add("ERROR", pos)
char = text.get(pos)
if char and char in self.IDENTCHARS:
if char and ('a' + char).isidentifier():
text.tag_add("ERROR", pos + " wordstart", pos)
if '\n' == text.get(pos): # error at line end
if char == '\n': # error at line end
text.mark_set("insert", pos)
else:
text.mark_set("insert", pos + "+1c")
Expand Down
35 changes: 11 additions & 24 deletions Lib/idlelib/hyperparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,6 @@
# all ASCII chars that may be the first char of an identifier
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_IS_ASCII_ID_CHAR[ord(c)] is slower than c in _ASCII_ID_CHARS.

_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
# lookup table for whether 7-bit ASCII chars are valid as the first
# char in a Python identifier
_IS_ASCII_ID_FIRST_CHAR = \
[(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]


class HyperParser:
def __init__(self, editwin, index):
Expand Down Expand Up @@ -166,53 +159,47 @@ def _eat_identifier(cls, str, limit, pos):

Keywords are not considered identifiers here, except for True, False and None.
"""
is_ascii_id_char = _IS_ASCII_ID_CHAR

# Start at the end (pos) and work backwards.
i = pos

# Go backwards as long as the characters are valid ASCII
# identifier characters. This is an optimization, since it
# is faster in the common case where most of the characters
# are ASCII.
while i > limit and (
ord(str[i - 1]) < 128 and
is_ascii_id_char[ord(str[i - 1])]
):
while i > limit and str[i - 1] in _ASCII_ID_CHARS:
i -= 1

# If the above loop ended due to reaching a non-ASCII
# character, continue going backwards using the most generic
# test for whether a string contains only valid identifier
# characters.
if i > limit and ord(str[i - 1]) >= 128:
while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
if i > limit and str[i - 1] > '\x7f':
while i - 4 >= limit and ('a' + str[i - 4:i]).isidentifier():
i -= 4
if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
if i - 2 >= limit and ('a' + str[i - 2:i]).isidentifier():
i -= 2
if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
if i - 1 >= limit and ('a' + str[i - 1]).isidentifier():
i -= 1

# The identifier candidate starts here. If it isn't a valid
# identifier, don't eat anything. At this point that is only
# possible if the first character isn't a valid first
# character for an identifier.
if not str[i:pos].isidentifier():
if i < pos and not str[i].isidentifier():
return 0
elif i < pos:
# All characters in str[i:pos] are valid ASCII identifier
# characters, so it is enough to check that the first is
# valid as the first character of an identifier.
if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
if str[i] not in _ASCII_ID_FIRST_CHARS:
return 0

# All keywords are valid identifiers, but should not be
# considered identifiers here, except for True, False and None.
if i < pos and (
iskeyword(str[i:pos]) and
str[i:pos] not in cls._ID_KEYWORDS
):
return 0
if i < pos:
word = str[i:pos]
if iskeyword(word) and word not in cls._ID_KEYWORDS:
return 0

return pos - i

Expand Down
6 changes: 1 addition & 5 deletions Lib/idlelib/undo.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import string

from idlelib.delegator import Delegator

# tkinter import not needed because module does not create widgets,
Expand Down Expand Up @@ -251,10 +249,8 @@ def merge(self, cmd):
self.chars = self.chars + cmd.chars
return True

alphanumeric = string.ascii_letters + string.digits + "_"

def classify(self, c):
if c in self.alphanumeric:
if ('a' + c).isidentifier():
return "alphanumeric"
if c == "\n":
return "newline"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Improve support of non-ASCII identifiers in IDLE
(autoexpanding, autocompletion, undo, etc).