Skip to content

Commit a68fd42

Browse files
Fix: use correct bytesize of UTF-16 strings
1 parent be3115e commit a68fd42

File tree

5 files changed

+81
-17
lines changed

5 files changed

+81
-17
lines changed

spec/std/env_spec.cr

+18
Original file line number | Diff line number | Diff line change
@@ -95,4 +95,22 @@ describe "ENV" do
9595
end
9696
end
9797
end
98+
99+
it "handles unicode" do
100+
ENV["TEST_UNICODE_1"] = "bar\u{d7ff}\u{10000}"
101+
ENV["TEST_UNICODE_2"] = "\u{1234}"
102+
ENV["TEST_UNICODE_1"].should eq "bar\u{d7ff}\u{10000}"
103+
ENV["TEST_UNICODE_2"].should eq "\u{1234}"
104+
105+
values = {} of String => String
106+
ENV.each do |key, value|
107+
if key.starts_with?("TEST_UNICODE_")
108+
values[key] = value
109+
end
110+
end
111+
values.should eq({
112+
"TEST_UNICODE_1" => "bar\u{d7ff}\u{10000}",
113+
"TEST_UNICODE_2" => "\u{1234}"
114+
})
115+
end
98116
end

spec/std/string/utf16_spec.cr

+21-2
Original file line number | Diff line number | Diff line change
@@ -23,31 +23,50 @@ describe "String UTF16" do
2323
end
2424
end
2525

26-
describe "from_utf16" do
26+
describe ".from_utf16" do
2727
it "in the range U+0000..U+D7FF" do
2828
input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
2929
String.from_utf16(input).should eq("\u{0}hello\u{d7ff}")
30+
String.from_utf16(input.to_unsafe).should eq({"", input.to_unsafe + 1})
3031
end
3132

3233
it "in the range U+E000 to U+FFFF" do
3334
input = Slice[0xe000_u16, 0xffff_u16]
3435
String.from_utf16(input).should eq("\u{e000}\u{ffff}")
36+
37+
pointer = Slice[0xe000_u16, 0xffff_u16, 0_u16].to_unsafe
38+
String.from_utf16(pointer).should eq({"\u{e000}\u{ffff}", pointer + 3})
3539
end
3640

3741
it "in the range U+10000..U+10FFFF" do
3842
input = Slice[0xd800_u16, 0xdc00_u16]
3943
String.from_utf16(input).should eq("\u{10000}")
44+
45+
pointer = Slice[0xd800_u16, 0xdc00_u16, 0_u16].to_unsafe
46+
String.from_utf16(pointer).should eq({"\u{10000}", pointer + 3})
4047
end
4148

4249
it "in the range U+D800..U+DFFF" do
4350
input = Slice[0xdc00_u16, 0xd800_u16]
4451
String.from_utf16(input).should eq("\u{fffd}\u{fffd}")
52+
53+
pointer = Slice[0xdc00_u16, 0xd800_u16, 0_u16].to_unsafe
54+
String.from_utf16(pointer).should eq({"\u{fffd}\u{fffd}", pointer + 3})
4555
end
4656

4757
it "handles null bytes" do
4858
slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16]
4959
String.from_utf16(slice).should eq("hi\0000𐂥")
50-
String.from_utf16(slice.to_unsafe).should eq("hi")
60+
String.from_utf16(slice.to_unsafe).should eq({"hi", slice.to_unsafe + 3})
61+
end
62+
63+
it "with pointer reads multiple strings" do
64+
input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16, 0_u16]
65+
pointer = input.to_unsafe
66+
string, pointer = String.from_utf16(pointer)
67+
string.should eq("")
68+
string, pointer = String.from_utf16(pointer)
69+
string.should eq("hello\u{d7ff}")
5170
end
5271
end
5372
end

src/crystal/system/win32/env.cr

+1-3
Original file line number | Diff line number | Diff line change
@@ -54,13 +54,11 @@ module Crystal::System::Env
5454

5555
begin
5656
while !pointer.value.zero?
57-
string = String.from_utf16(pointer)
57+
string, pointer = String.from_utf16(pointer)
5858
key_value = string.split('=', 2)
5959
key = key_value[0]
6060
value = key_value[1]? || ""
6161
yield key, value
62-
63-
pointer += string.bytesize + 1
6462
end
6563
ensure
6664
LibC.FreeEnvironmentStringsW(orig_pointer)

src/crystal/system/win32/time.cr

+2-2
Original file line number | Diff line number | Diff line change
@@ -141,13 +141,13 @@ module Crystal::System::Time
141141

142142
# Normalizes the names of the standard and dst zones.
143143
private def self.normalize_zone_names(info : LibC::TIME_ZONE_INFORMATION) : Tuple(String, String)
144-
stdname = String.from_utf16(info.standardName.to_unsafe)
144+
stdname = String.from_utf16(info.standardName.to_slice)
145145

146146
if normalized_names = WINDOWS_ZONE_NAMES[stdname]?
147147
return normalized_names
148148
end
149149

150-
dstname = String.from_utf16(info.daylightName.to_unsafe)
150+
dstname = String.from_utf16(info.daylightName.to_slice)
151151

152152
if english_name = translate_zone_name(stdname, dstname)
153153
if normalized_names = WINDOWS_ZONE_NAMES[english_name]?

src/string/utf16.cr

+39-10
Original file line number | Diff line number | Diff line change
@@ -53,15 +53,7 @@ class String
5353
# slice = Slice[104_u16, 105_u16, 32_u16, 55296_u16, 56485_u16]
5454
# String.from_utf16(slice) # => "hi 𐂥"
5555
# ```
56-
#
57-
# If *slice* is a pointer, the string ends when a zero value is found.
58-
#
59-
# ```
60-
# slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16]
61-
# String.from_utf16(slice) # => "hi\0000𐂥"
62-
# String.from_utf16(slice.to_unsafe) # => "hi"
63-
# ```
64-
def self.from_utf16(slice : Slice(UInt16) | Pointer(UInt16)) : String
56+
def self.from_utf16(slice : Slice(UInt16)) : String
6557
bytesize = 0
6658
size = 0
6759

@@ -81,6 +73,41 @@ class String
8173
end
8274
end
8375

76+
# Decodes the UTF-16 sequence starting at *pointer* into a String and returns
77+
# it together with the pointer just past the terminating zero value.
78+
#
79+
# ```
80+
# slice = Slice[104_u16, 105_u16, 0_u16, 55296_u16, 56485_u16, 0_u16]
81+
# String.from_utf16(slice) # => "hi\0000𐂥\u0000"
82+
# pointer = slice.to_unsafe
83+
# string, pointer = String.from_utf16(pointer) # => "hi"
84+
# string, pointer = String.from_utf16(pointer) # => "𐂥"
85+
# ```
86+
#
87+
# Invalid values are encoded using the unicode replacement char with
88+
# codepoint `0xfffd`.
89+
def self.from_utf16(pointer : Pointer(UInt16)) : {String, Pointer(UInt16)}
90+
bytesize = 0
91+
size = 0
92+
93+
each_utf16_char(pointer) do |char|
94+
bytesize += char.bytesize
95+
size += 1
96+
end
97+
98+
string = String.new(bytesize) do |buffer|
99+
pointer = each_utf16_char(pointer) do |char|
100+
char.each_byte do |byte|
101+
buffer.value = byte
102+
buffer += 1
103+
end
104+
end
105+
{bytesize, size}
106+
end
107+
108+
{string, pointer + 1}
109+
end
110+
84111
# Yields each decoded char in the given slice.
85112
private def self.each_utf16_char(slice : Slice(UInt16))
86113
i = 0
@@ -107,7 +134,7 @@ class String
107134
end
108135

109136
# Yields each decoded char at the given pointer, stopping at the first zero value, and returns the pointer at that point.
110-
private def self.each_utf16_char(pointer : Pointer(UInt16))
137+
private def self.each_utf16_char(pointer : Pointer(UInt16)) : Pointer(UInt16)
111138
loop do
112139
byte = pointer.value.to_i
113140
break if byte == 0
@@ -129,5 +156,7 @@ class String
129156

130157
pointer = pointer + 1
131158
end
159+
160+
pointer
132161
end
133162
end

0 commit comments

Comments
 (0)