Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Correctly cast from UTF16 positions #304

Merged
merged 1 commit into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning][semver].

### Changed
### Fixed
[#304]: https://github.com/openlawlibrary/pygls/issues/304

- `pygls` no longer overrides the event loop for the current thread when given an explicit loop to use. ([#334])
- Fixed `MethodTypeNotRegisteredError` when registering a `TEXT_DOCUMENT_DID_SAVE` feature with options. ([#338])
Expand Down
57 changes: 46 additions & 11 deletions pygls/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,17 @@
log = logging.getLogger(__name__)


def is_char_beyond_multilingual_plane(char: str) -> bool:
return ord(char) > 0xFFFF


def utf16_unit_offset(chars: str):
"""Calculate the number of characters which need two utf-16 code units.

Arguments:
chars (str): The string to count occurrences of utf-16 code units for.
"""
return sum(ord(ch) > 0xFFFF for ch in chars)
return sum(is_char_beyond_multilingual_plane(ch) for ch in chars)


def utf16_num_units(chars: str):
Expand All @@ -59,7 +63,7 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
"""Convert the position.character from utf-16 code units to utf-32.

A python application can't use the character member of `Position`
directly as per specification it is represented as a zero-based line and
directly. As per specification it is represented as a zero-based line and
character offset based on a UTF-16 string representation.

All characters whose code point exceeds the Basic Multilingual Plane are
Expand All @@ -80,14 +84,44 @@ def position_from_utf16(lines: List[str], position: Position) -> Position:
Returns:
The position with `character` being converted to utf-32 code units.
"""
try:
return Position(
line=position.line,
character=position.character
- utf16_unit_offset(lines[position.line][:position.character])
if len(lines) == 0:
return Position(0, 0)
if position.line >= len(lines):
return Position(len(lines) - 1, utf16_num_units(lines[-1]))

_line = lines[position.line]
_line = _line.replace('\r\n', '\n') # TODO: it's a bit of a hack
_utf16_len = utf16_num_units(_line)
_utf32_len = len(_line)

if _utf16_len == 0:
return Position(position.line, 0)

_utf16_end_of_line = utf16_num_units(_line)
if position.character > _utf16_end_of_line:
position.character = _utf16_end_of_line - 1

Copy link

@pyscripter pyscripter Dec 15, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

position can be greater than utf16_num_units(line), if for example the cursor is at the end of the line.
I would just drop this whole if block not least because utf16_num_units slows things down.
_apply_incremental_change is designed to work correctly with positions beyond the end of the line.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i've kept this just to satisfy this test assert doc.offset_at_position(Position(line=1, character=5)) == 12, which seems to make sense to me

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pyscripter I need some help to understand why you consider that existing test line to be wrong. Do you disagree with this assertion assert doc.offset_at_position(Position(line=1, character=5)) == 12?

_utf16_index = 0
utf32_index = 0
while True:
_is_searching_queried_position = _utf16_index < position.character
_is_before_end_of_line = utf32_index < _utf32_len
_is_searching_for_position = (
_is_searching_queried_position and _is_before_end_of_line
)
except IndexError:
return Position(line=len(lines), character=0)
if not _is_searching_for_position:
break

_current_char = _line[utf32_index]
_is_double_width = is_char_beyond_multilingual_plane(_current_char)
if _is_double_width:
_utf16_index += 2
else:
_utf16_index += 1
utf32_index += 1

position = Position(line=position.line, character=utf32_index)
return position


def position_to_utf16(lines: List[str], position: Position) -> Position:
Expand Down Expand Up @@ -137,10 +171,11 @@ def range_from_utf16(lines: List[str], range: Range) -> Range:
Returns:
The range with `character` offsets being converted to utf-16 code units.
"""
return Range(
range_new = Range(
start=position_from_utf16(lines, range.start),
end=position_from_utf16(lines, range.end)
)
return range_new


def range_to_utf16(lines: List[str], range: Range) -> Range:
Expand Down Expand Up @@ -280,7 +315,7 @@ def offset_at_position(self, position: Position) -> int:
lines = self.lines
pos = position_from_utf16(lines, position)
row, col = pos.line, pos.character
return col + sum(len(line) for line in lines[:row])
return col + sum(utf16_num_units(line) for line in lines[:row])

@property
def source(self) -> str:
Expand Down
40 changes: 29 additions & 11 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,11 @@ def test_range_from_utf16():
range = Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
)
range_from_utf16(['x="😋"'], range)
assert range == Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
actual = range_from_utf16(['x="😋😋"'], range)
expected = Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
)
assert actual == expected


def test_range_to_utf16():
Expand All @@ -239,23 +240,40 @@ def test_range_to_utf16():
range = Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
)
range_to_utf16(['x="😋"'], range)
assert range == Range(
start=Position(line=0, character=3), end=Position(line=0, character=4)
actual = range_to_utf16(['x="😋😋"'], range)
expected = Range(
start=Position(line=0, character=3), end=Position(line=0, character=5)
)
assert actual == expected


def test_offset_at_position(doc):
assert doc.offset_at_position(Position(line=0, character=8)) == 8
assert doc.offset_at_position(Position(line=1, character=5)) == 14
assert doc.offset_at_position(Position(line=1, character=5)) == 12
assert doc.offset_at_position(Position(line=2, character=0)) == 13
assert doc.offset_at_position(Position(line=2, character=4)) == 17
assert doc.offset_at_position(Position(line=3, character=6)) == 27
assert doc.offset_at_position(Position(line=3, character=7)) == 27
assert doc.offset_at_position(Position(line=3, character=7)) == 28
assert doc.offset_at_position(Position(line=3, character=8)) == 28
assert doc.offset_at_position(Position(line=4, character=0)) == 39
assert doc.offset_at_position(Position(line=5, character=0)) == 39

assert doc.offset_at_position(Position(line=4, character=0)) == 40
assert doc.offset_at_position(Position(line=5, character=0)) == 40

def test_utf16_to_utf32_position_cast(doc):
lines = ['', '😋😋', '']
assert position_from_utf16(lines, Position(line=0, character=0)) == Position(line=0, character=0)
assert position_from_utf16(lines, Position(line=0, character=1)) == Position(line=0, character=0)
assert position_from_utf16(lines, Position(line=1, character=0)) == Position(line=1, character=0)
assert position_from_utf16(lines, Position(line=1, character=2)) == Position(line=1, character=1)
assert position_from_utf16(lines, Position(line=1, character=3)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=1, character=4)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=1, character=100)) == Position(line=1, character=2)
assert position_from_utf16(lines, Position(line=3, character=0)) == Position(line=2, character=0)
assert position_from_utf16(lines, Position(line=4, character=10)) == Position(line=2, character=0)

def test_position_for_line_endings(doc):
lines = ['x\r\n', 'y\n']
assert position_from_utf16(lines, Position(line=0, character=10)) == Position(line=0, character=1)
assert position_from_utf16(lines, Position(line=1, character=10)) == Position(line=1, character=1)

def test_word_at_position(doc):
"""
Expand Down