Skip to content

Commit 5839b9a

Browse files
committed
fix(builtins): ARK-350, string:ord needs to check it gets only one utf8 char; add string:utf8len
1 parent 4dd99f4 commit 5839b9a

File tree

14 files changed

+106
-33
lines changed

14 files changed

+106
-33
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@
1212
- `append`, `concat`, and `pop` can be used as values
1313
- new `ptr` command for the debugger, printing the VM pointers (ip, pp, sp)
1414
- compile time arity check when performing a tail call
15+
- `string:utf8len` to compute the number of codepoints in a string
1516

1617
### Changed
1718
- all paths inside `if` should return a value, when used as an expression. If an `else` branch is missing, `nil` will be returned
1819
- new compile time error when trying to use `append!`, `concat!`, `pop!`, `@=` and `@@=` as values
1920
- arguments in tail calls are loaded by value and not by reference
21+
- `string:ord` checks that it gets exactly one UTF-8 character
2022

2123
### Removed
2224

include/Ark/Builtins/Builtins.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ namespace Ark::internal::Builtins
7878
ARK_BUILTIN(format);
7979
ARK_BUILTIN(findSubStr);
8080
ARK_BUILTIN(removeAtStr);
81+
ARK_BUILTIN(utf8len);
8182
ARK_BUILTIN(ord);
8283
ARK_BUILTIN(chr);
8384
ARK_BUILTIN(setStringAt);

include/utf8.hpp

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ namespace utf8
133133
/**
134134
* @brief Check the validity of a given string in UTF8
135135
* @param str
136-
* @return true if the given string is a valid UTF88 string
136+
* @return true if the given string is a valid UTF8 string
137137
*/
138138
inline bool isValid(const char* str)
139139
{
@@ -183,46 +183,69 @@ namespace utf8
183183
return true;
184184
}
185185

186+
/**
 * @brief Count the number of UTF8 codepoints in a null-terminated string
 * @details Counting stops at the first malformed sequence: an invalid lead byte,
 *          or a multi-byte sequence whose continuation bytes are missing or are not
 *          of the form 10xxxxxx. Continuation bytes are inspected one at a time, so
 *          a sequence truncated by the terminator never moves the cursor past the
 *          end of the buffer.
 * @param str null-terminated string, may be nullptr
 * @return number of well-formed codepoints found before the terminator or the first
 *         malformed sequence; 0 if str is nullptr or empty
 */
inline std::size_t length(const char* str)
{
    if (str == nullptr)
        return 0;

    std::size_t count = 0;
    const char* s = str;

    while (*s != 0)
    {
        // work on an unsigned value so the masks below do not depend on char signedness
        const unsigned char lead = static_cast<unsigned char>(*s);

        std::size_t width = 0;
        if (0xf0 == (0xf8 & lead))
            width = 4;
        else if (0xe0 == (0xf0 & lead))
            width = 3;
        else if (0xc0 == (0xe0 & lead))
            width = 2;
        else if (0x00 == (0x80 & lead))
            width = 1;
        else
            break;  // stray continuation byte or invalid lead byte

        // Every trailing byte must look like 10xxxxxx. A null terminator fails this
        // test, so the scan stops before reading anything beyond it.
        bool complete = true;
        for (std::size_t i = 1; i < width; ++i)
        {
            if (0x80 != (0xc0 & static_cast<unsigned char>(s[i])))
            {
                complete = false;
                break;
            }
        }
        if (!complete)
            break;

        ++count;
        s += width;
    }

    return count;
}
222+
186223
/**
187224
* @brief Compute the UTF8 codepoint for a given UTF8 char
188225
* @param str
189226
* @return UTF8 codepoint if valid, -1 otherwise
190227
*/
191228
inline int32_t codepoint(const char* str)
192229
{
193-
int32_t codepoint = 0;
194230
const char* s = str;
195231

196232
if (isValid(str))
197233
{
198-
while (*s != 0)
199-
{
200-
if (0xf0 == (0xf8 & *s))
201-
{
202-
codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | ((0x3f & s[2]) << 6) | (0x3f & s[3]);
203-
s += 4;
204-
}
205-
else if (0xe0 == (0xf0 & *s))
206-
{
207-
codepoint = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
208-
s += 3;
209-
}
210-
else if (0xc0 == (0xe0 & *s))
211-
{
212-
codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
213-
s += 2;
214-
}
215-
else if (0x00 == (0x80 & *s))
216-
{
217-
codepoint = s[0];
218-
++s;
219-
}
220-
else
221-
return -1;
222-
}
223-
}
234+
int32_t c = 0;
224235

225-
return codepoint;
236+
if (0xf0 == (0xf8 & *s))
237+
c = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | ((0x3f & s[2]) << 6) | (0x3f & s[3]);
238+
else if (0xe0 == (0xf0 & *s))
239+
c = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
240+
else if (0xc0 == (0xe0 & *s))
241+
c = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
242+
else if (0x00 == (0x80 & *s))
243+
c = s[0];
244+
else
245+
return -1;
246+
return c;
247+
}
248+
return -1;
226249
}
227250

228251
/**

lib/std

src/arkreactor/Builtins/Builtins.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ namespace Ark::internal::Builtins
6363
{ "format", Value(String::format) },
6464
{ "builtin__string:find", Value(String::findSubStr) },
6565
{ "builtin__string:removeAt", Value(String::removeAtStr) },
66+
{ "builtin__string:utf8len", Value(String::utf8len) },
6667
{ "builtin__string:ord", Value(String::ord) },
6768
{ "builtin__string:chr", Value(String::chr) },
6869
{ "builtin__string:setAt", Value(String::setStringAt) },

src/arkreactor/Builtins/String.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,18 @@ namespace Ark::internal::Builtins::String
288288
throw std::runtime_error(fmt::format("string:removeAt: index {} out of range (length: {})", num, n[0].stringRef().size()));
289289
}
290290

291+
/**
 * @brief Builtin backing string:utf8len, returns the number of UTF8 codepoints in a string
 * @param n argument list, expected to hold a single String value
 * @param vm unused
 * @return Value wrapping the codepoint count computed by utf8::length, as a double
 * @throws types::TypeCheckingError if the arguments do not match the (string) contract
 */
Value utf8len(std::vector<Value>& n, VM* vm [[maybe_unused]])
{
    const bool args_match_contract = types::check(n, ValueType::String);
    if (!args_match_contract)
    {
        throw types::TypeCheckingError(
            "string:utf8len",
            { { types::Contract { { types::Typedef("string", ValueType::String) } } } },
            n);
    }

    // the count is converted to double before being wrapped in a Value
    return Value(static_cast<double>(utf8::length(n[0].stringRef().c_str())));
}
302+
291303
Value ord(std::vector<Value>& n, VM* vm [[maybe_unused]])
292304
{
293305
if (!types::check(n, ValueType::String))
@@ -296,7 +308,13 @@ namespace Ark::internal::Builtins::String
296308
{ { types::Contract { { types::Typedef("string", ValueType::String) } } } },
297309
n);
298310

299-
return Value(utf8::codepoint(n[0].stringRef().c_str()));
311+
if (const std::size_t len = utf8::length(n[0].stringRef().c_str()); len != 1)
312+
throw std::runtime_error(fmt::format("string:ord: invalid string '{}', expected a single character, got {}", n[0].string(), len));
313+
314+
const int32_t codepoint = utf8::codepoint(n[0].stringRef().c_str());
315+
if (codepoint == -1)
316+
throw std::runtime_error(fmt::format("string:ord: invalid string '{}'", n[0].string()));
317+
return Value(codepoint);
300318
}
301319

302320
// cppcheck-suppress constParameterReference

tests/unittests/resources/CompilerSuite/ir/operators_as_builtins.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
page_0
22
PUSH_RETURN_ADDRESS L0
3-
BUILTIN 69
43
BUILTIN 70
54
BUILTIN 71
65
BUILTIN 72
@@ -25,6 +24,7 @@ page_0
2524
BUILTIN 91
2625
BUILTIN 92
2726
BUILTIN 93
27+
BUILTIN 94
2828
CALL_BUILTIN 9, 25
2929
.L0:
3030
POP 0

tests/unittests/resources/CompilerSuite/optimized_ir/builtins.expected

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ page_0
55
HALT 0
66

77
page_1
8-
CALL_BUILTIN_WITHOUT_RETURN_ADDRESS 55, 1
8+
CALL_BUILTIN_WITHOUT_RETURN_ADDRESS 56, 1
99
.L0:
1010
RET 0
1111
HALT 0
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
(print (builtin__string:ord ""))
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
string:ord: invalid string '', expected a single character, got 0
2+
3+
In file tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark:1
4+
1 | (print (builtin__string:ord ""))
5+
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6+
2 |

0 commit comments

Comments
 (0)