Skip to content

Commit 5fe0e9d

Browse files
zeertzjq and yegappan
authored and committed
vim-patch:9.0.1485: no functions for converting from/to UTF-16 index (neovim#23318)
Problem: no functions for converting from/to UTF-16 index. Solution: Add UTF-16 flag to existing functions and add strutf16len() and utf16idx(). (Yegappan Lakshmanan, closes vim/vim#12216) vim/vim@67672ef Co-authored-by: Yegappan Lakshmanan <yegappan@yahoo.com>
1 parent e3bbf6f commit 5fe0e9d

File tree

6 files changed

+664
-44
lines changed

6 files changed

+664
-44
lines changed

runtime/doc/builtin.txt

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,10 @@ bufnr([{buf} [, {create}]]) Number Number of the buffer {buf}
6969
bufwinid({buf}) Number window ID of buffer {buf}
7070
bufwinnr({buf}) Number window number of buffer {buf}
7171
byte2line({byte}) Number line number at byte count {byte}
72-
byteidx({expr}, {nr}) Number byte index of {nr}th char in {expr}
73-
byteidxcomp({expr}, {nr}) Number byte index of {nr}th char in {expr}
72+
byteidx({expr}, {nr} [, {utf16}])
73+
Number byte index of {nr}th char in {expr}
74+
byteidxcomp({expr}, {nr} [, {utf16}])
75+
Number byte index of {nr}th char in {expr}
7476
call({func}, {arglist} [, {dict}])
7577
any call {func} with arguments {arglist}
7678
ceil({expr}) Float round {expr} up
@@ -80,7 +82,7 @@ chansend({id}, {data}) Number Writes {data} to channel
8082
char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr}
8183
charclass({string}) Number character class of {string}
8284
charcol({expr} [, {winid}]) Number column number of cursor or mark
83-
charidx({string}, {idx} [, {countcc}])
85+
charidx({string}, {idx} [, {countcc} [, {utf16}]])
8486
Number char index of byte {idx} in {string}
8587
chdir({dir}) String change current working directory
8688
cindent({lnum}) Number C indent for line {lnum}
@@ -501,6 +503,8 @@ strptime({format}, {timestring})
501503
strridx({haystack}, {needle} [, {start}])
502504
Number last index of {needle} in {haystack}
503505
strtrans({expr}) String translate string to make it printable
506+
strutf16len({string} [, {countcc}])
507+
Number number of UTF-16 code units in {string}
504508
strwidth({expr}) Number display cell length of the String {expr}
505509
submatch({nr} [, {list}]) String or List
506510
specific match in ":s" or substitute()
@@ -545,6 +549,8 @@ undofile({name}) String undo file name for {name}
545549
undotree() List undo file tree
546550
uniq({list} [, {func} [, {dict}]])
547551
List remove adjacent duplicates from a list
552+
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
553+
Number UTF-16 index of byte {idx} in {string}
548554
values({dict}) List values in {dict}
549555
virtcol({expr} [, {list}]) Number or List
550556
screen column of cursor or mark
@@ -982,7 +988,7 @@ byte2line({byte}) *byte2line()*
982988
Can also be used as a |method|: >
983989
GetOffset()->byte2line()
984990
985-
byteidx({expr}, {nr}) *byteidx()*
991+
byteidx({expr}, {nr} [, {utf16}]) *byteidx()*
986992
Return byte index of the {nr}th character in the String
987993
{expr}. Use zero for the first character, it then returns
988994
zero.
@@ -992,6 +998,13 @@ byteidx({expr}, {nr}) *byteidx()*
992998
length is added to the preceding base character. See
993999
|byteidxcomp()| below for counting composing characters
9941000
separately.
1001+
When {utf16} is present and TRUE, {nr} is used as the UTF-16
1002+
index in the String {expr} instead of as the character index.
1003+
The UTF-16 index is the index in the string when it is encoded
1004+
with 16-bit words. If the specified UTF-16 index is in the
1005+
middle of a character (e.g. in a 4-byte character), then the
1006+
byte index of the first byte in the character is returned.
1007+
Refer to |string-offset-encoding| for more information.
9951008
Example: >
9961009
echo matchstr(str, ".", byteidx(str, 3))
9971010
< will display the fourth character. Another way to do the
@@ -1003,11 +1016,17 @@ byteidx({expr}, {nr}) *byteidx()*
10031016
If there are fewer than {nr} characters -1 is returned.
10041017
If there are exactly {nr} characters the length of the string
10051018
in bytes is returned.
1006-
1019+
See |charidx()| and |utf16idx()| for getting the character and
1020+
UTF-16 index respectively from the byte index.
1021+
Examples: >
1022+
echo byteidx('a😊😊', 2) returns 5
1023+
echo byteidx('a😊😊', 2, 1) returns 1
1024+
echo byteidx('a😊😊', 3, 1) returns 5
1025+
<
10071026
Can also be used as a |method|: >
10081027
GetName()->byteidx(idx)
10091028
1010-
byteidxcomp({expr}, {nr}) *byteidxcomp()*
1029+
byteidxcomp({expr}, {nr} [, {utf16}]) *byteidxcomp()*
10111030
Like byteidx(), except that a composing character is counted
10121031
as a separate character. Example: >
10131032
let s = 'e' .. nr2char(0x301)
@@ -1131,27 +1150,36 @@ charcol({expr} [, {winid}]) *charcol()*
11311150
GetPos()->col()
11321151
<
11331152
*charidx()*
1134-
charidx({string}, {idx} [, {countcc}])
1153+
charidx({string}, {idx} [, {countcc} [, {utf16}]])
11351154
Return the character index of the byte at {idx} in {string}.
11361155
The index of the first character is zero.
11371156
If there are no multibyte characters the returned value is
11381157
equal to {idx}.
1158+
11391159
When {countcc} is omitted or |FALSE|, then composing characters
1140-
are not counted separately, their byte length is
1141-
added to the preceding base character.
1160+
are not counted separately, their byte length is added to the
1161+
preceding base character.
11421162
When {countcc} is |TRUE|, then composing characters are
11431163
counted as separate characters.
1164+
1165+
When {utf16} is present and TRUE, {idx} is used as the UTF-16
1166+
index in the String {string} instead of as the byte index.
1167+
11441168
Returns -1 if the arguments are invalid or if {idx} is greater
11451169
than the index of the last byte in {string}. An error is
11461170
given if the first argument is not a string, the second
11471171
argument is not a number or when the third argument is present
11481172
and is not zero or one.
1173+
11491174
See |byteidx()| and |byteidxcomp()| for getting the byte index
1150-
from the character index.
1175+
from the character index and |utf16idx()| for getting the
1176+
UTF-16 index from the character index.
1177+
Refer to |string-offset-encoding| for more information.
11511178
Examples: >
11521179
echo charidx('áb́ć', 3) returns 1
11531180
echo charidx('áb́ć', 6, 1) returns 4
11541181
echo charidx('áb́ć', 16) returns -1
1182+
echo charidx('a😊😊', 4, 0, 1) returns 2
11551183
<
11561184
Can also be used as a |method|: >
11571185
GetName()->charidx(idx)
@@ -8332,6 +8360,28 @@ strtrans({string}) *strtrans()*
83328360
Can also be used as a |method|: >
83338361
GetString()->strtrans()
83348362
8363+
strutf16len({string} [, {countcc}]) *strutf16len()*
8364+
The result is a Number, which is the number of UTF-16 code
8365+
units in String {string} (after converting it to UTF-16).
8366+
8367+
When {countcc} is TRUE, composing characters are counted
8368+
separately.
8369+
When {countcc} is omitted or FALSE, composing characters are
8370+
ignored.
8371+
8372+
Returns zero on error.
8373+
8374+
Also see |strlen()| and |strcharlen()|.
8375+
Examples: >
8376+
echo strutf16len('a') returns 1
8377+
echo strutf16len('©') returns 1
8378+
echo strutf16len('😊') returns 2
8379+
echo strutf16len('ą́') returns 1
8380+
echo strutf16len('ą́', v:true) returns 3
8381+
<
Can also be used as a |method|: >
8383+
GetText()->strutf16len()
8384+
<
83358385
strwidth({string}) *strwidth()*
83368386
The result is a Number, which is the number of display cells
83378387
String {string} occupies. A Tab character is counted as one
@@ -9063,6 +9113,34 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*
90639113

90649114
Can also be used as a |method|: >
90659115
mylist->uniq()
9116+
<
9117+
*utf16idx()*
9118+
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
9119+
Same as |charidx()| but returns the UTF-16 index of the byte
9120+
at {idx} in {string} (after converting it to UTF-16).
9121+
9122+
When {charidx} is present and TRUE, {idx} is used as the
9123+
character index in the String {string} instead of as the byte
9124+
index.
9125+
An {idx} in the middle of a UTF-8 sequence is rounded upwards
9126+
to the end of that sequence.
9127+
9128+
See |byteidx()| and |byteidxcomp()| for getting the byte index
9129+
from the UTF-16 index and |charidx()| for getting the
9130+
character index from the UTF-16 index.
9131+
Refer to |string-offset-encoding| for more information.
9132+
Examples: >
9133+
echo utf16idx('a😊😊', 3) returns 2
9134+
echo utf16idx('a😊😊', 7) returns 4
9135+
echo utf16idx('a😊😊', 1, 0, 1) returns 2
9136+
echo utf16idx('a😊😊', 2, 0, 1) returns 4
9137+
echo utf16idx('aą́c', 6) returns 2
9138+
echo utf16idx('aą́c', 6, 1) returns 4
9139+
echo utf16idx('a😊😊', 9) returns -1
9140+
<
9141+
Can also be used as a |method|: >
9142+
GetName()->utf16idx(idx)
9143+
90669144
90679145
values({dict}) *values()*
90689146
Return a |List| with all the values of {dict}. The |List| is

runtime/doc/eval.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,6 +1433,32 @@ Examples: >
14331433
echo $"The square root of {{9}} is {sqrt(9)}"
14341434
< The square root of {9} is 3.0 ~
14351435

1436+
*string-offset-encoding*
1437+
A string consists of multiple characters. UTF-8 uses one byte for ASCII
1438+
characters, two bytes for other Latin characters and more bytes for other
1439+
characters.
1440+
1441+
A string offset can count characters or bytes. Other programs may use
1442+
UTF-16 encoding (16-bit words) and an offset of UTF-16 words. Some functions
1443+
use byte offsets, usually for UTF-8 encoding. Other functions use character
1444+
offsets, in which case the encoding doesn't matter.
1445+
1446+
The different offsets for the string "a©😊" are below:
1447+
1448+
UTF-8 offsets:
1449+
[0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
1450+
UTF-16 offsets:
1451+
[0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
1452+
UTF-32 (character) offsets:
1453+
[0]: 00000061, [1]: 000000A9, [2]: 0001F60A
1454+
1455+
You can use the "g8" and "ga" commands on a character to see the
1456+
decimal/hex/octal values.
1457+
1458+
The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
1459+
between these indices. The functions |strlen()|, |strutf16len()| and
1460+
|strcharlen()| return the number of bytes, UTF-16 code units and characters in
1461+
a string respectively.
14361462

14371463
------------------------------------------------------------------------------
14381464
option *expr-option* *E112* *E113*

runtime/doc/usr_41.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,7 @@ String manipulation: *string-functions*
621621
strlen() length of a string in bytes
622622
strcharlen() length of a string in characters
623623
strchars() number of characters in a string
624+
strutf16len() number of UTF-16 code units in a string
624625
strwidth() size of string when displayed
625626
strdisplaywidth() size of string when displayed, deals with tabs
626627
setcellwidths() set character cell width overrides
@@ -636,6 +637,7 @@ String manipulation: *string-functions*
636637
byteidx() byte index of a character in a string
637638
byteidxcomp() like byteidx() but count composing characters
638639
charidx() character index of a byte in a string
640+
utf16idx() UTF-16 index of a byte in a string
639641
repeat() repeat a string multiple times
640642
eval() evaluate a string expression
641643
execute() execute an Ex command and get the output

src/nvim/eval.lua

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ return {
6565
bufwinid={args=1, base=1},
6666
bufwinnr={args=1, base=1},
6767
byte2line={args=1, base=1},
68-
byteidx={args=2, base=1, fast=true},
69-
byteidxcomp={args=2, base=1, fast=true},
68+
byteidx={args={2, 3}, base=1, fast=true},
69+
byteidxcomp={args={2, 3}, base=1, fast=true},
7070
call={args={2, 3}, base=1},
7171
ceil={args=1, base=1, float_func="ceil"},
7272
changenr={},
@@ -75,7 +75,7 @@ return {
7575
char2nr={args={1, 2}, base=1, fast=true},
7676
charclass={args=1, base=1},
7777
charcol={args={1, 2}, base=1},
78-
charidx={args={2, 3}, base=1},
78+
charidx={args={2, 4}, base=1},
7979
chdir={args=1, base=1},
8080
cindent={args=1, base=1},
8181
clearmatches={args={0, 1}, base=1},
@@ -397,6 +397,7 @@ return {
397397
strptime={args=2, base=1},
398398
strridx={args={2, 3}, base=1},
399399
strtrans={args=1, base=1, fast=true},
400+
strutf16len={args={1, 2}, base=1},
400401
strwidth={args=1, base=1, fast=true},
401402
submatch={args={1, 2}, base=1},
402403
substitute={args=4, base=1},
@@ -435,6 +436,7 @@ return {
435436
undofile={args=1, base=1},
436437
undotree={},
437438
uniq={args={1, 3}, base=1},
439+
utf16idx={args={2, 4}, base=1},
438440
values={args=1, base=1},
439441
virtcol={args={1, 2}, base=1},
440442
virtcol2col={args=3, base=1},

0 commit comments

Comments
 (0)