Skip to content

Commit

Permalink
wasm: unicode fixes for count, indexof, and substring builtins.
Browse files Browse the repository at this point in the history
These builtins should operate on codepoints, not on characters. This
also improves the string conversion from a memory byte array holding
UTF-8 string to a valid JavaScript string which is UTF-16.

The improved unicode tests were provided by Anders Eknert
<anders.eknert@bisnode.com>.

Signed-off-by: Teemu Koponen <koponen@styra.com>
  • Loading branch information
anderseknert authored and tsandall committed Oct 23, 2020
1 parent 6ce7a6b commit e8c5e12
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 20 deletions.
2 changes: 1 addition & 1 deletion internal/compiler/wasm/opa/opa.go

Large diffs are not rendered by default.

Binary file modified internal/compiler/wasm/opa/opa.wasm
Binary file not shown.
59 changes: 58 additions & 1 deletion test/wasm/assets/018_builtins.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ cases:
- note: count built-in
query: count([1,2,3],x)
want_result: [{'x': 3}]
- note: count built-in string
query: count("abc", x)
want_result: [{'x': 3}]
- note: count built-in string unicode
query: count("åäö", x)
want_result: [{'x': 3}]
- note: sum built-in
query: sum([1,2,3],x)
want_result: [{'x': 6}]
Expand Down Expand Up @@ -134,57 +140,108 @@ cases:
- note: concat built-in
query: concat(",",["a","b"],x)
want_result: [{'x': "a,b"}]
- note: concat built-in unicode
query: concat("ä",["å","ö"],x)
want_result: [{'x': "åäö"}]
- note: contains built-in
query: contains("abc","b",x)
want_result: [{'x': true}]
- note: contains built-in unicode
query: contains("åäö","ä",x)
want_result: [{'x': true}]
- note: endswith built-in
query: endswith("abc","c",x)
want_result: [{'x': true}]
- note: endswith built-in unicode
query: endswith("åäö","ö",x)
want_result: [{'x': true}]
- note: format_int built-in
query: format_int(10,16,x)
want_result: [{'x': "a"}]
- note: indexof built-in
query: indexof("abc","b",x)
want_result: [{'x': 1}]
- note: indexof built-in unicode
query: indexof("åäö","ä",x)
want_result: [{'x': 1}]
- note: lower built-in
query: lower("A",x)
want_result: [{'x': "a"}]
- note: lower built-in unicode
query: lower("Å",x)
want_result: [{'x': "å"}]
- note: replace built-in
query: replace("abc","b","d",x)
want_result: [{'x': "adc"}]
- note: replace built-in unicode
query: replace("åäö","å","ö",x)
want_result: [{'x': "öäö"}]
- note: replace_n built-in
query: strings.replace_n({"b":"d"},"abc",x)
want_result: [{'x': "adc"}]
- note: replace_n built-in unicode
query: strings.replace_n({"b":"ö"},"abc",x)
want_result: [{'x': "aöc"}]
- note: split built-in
query: split("a,b",",",x)
want_result: [{'x': ["a","b"]}]
- note: split built-in unicode
query: split("aöb","ö",x)
want_result: [{'x': ["a","b"]}]
- note: startswith built-in
query: startswith("abc","a",x)
want_result: [{'x': true}]
- note: startswith built-in unicode
query: startswith("åäö","å",x)
want_result: [{'x': true}]
- note: substring built-in
query: substring("abcd",1,2,x)
want_result: [{'x': "bc"}]
- note: substring built-in unicode
query: substring("åäö",1,2,x)
want_result: [ { 'x': "äö" } ]
- note: trim built-in
query: trim("abc","ac",x)
want_result: [{'x': "b"}]
- note: trim built-in unicode
query: trim("åäö","åö",x)
want_result: [{'x': "ä"}]
- note: trim_left built-in
query: trim_left("abc","ba",x)
want_result: [{'x': "c"}]
- note: trim_left built-in unicode
query: trim_left("åäö","äå",x)
want_result: [{'x': "ö"}]
- note: trim_prefix built-in
query: trim_prefix("abc","ab",x)
want_result: [{'x': "c"}]
- note: trim_prefix built-in unicode
query: trim_prefix("åäö","åä",x)
want_result: [{'x': "ö"}]
- note: trim_right built-in
query: trim_right("abc","cb",x)
want_result: [{'x': "a"}]
- note: trim_right built-in unicode
query: trim_right("åäö","öä",x)
want_result: [{'x': "å"}]
- note: trim_suffix built-in
query: trim_suffix("abc","bc",x)
want_result: [{'x': "a"}]
- note: trim_suffix built-in unicode
query: trim_suffix("åäö","äö",x)
want_result: [{'x': "å"}]
- note: trim_space built-in
query: trim_space(" abc ",x)
want_result: [{'x': "abc"}]
- note: trim_space built-in unicode
query: trim_space(" åäö ",x)
want_result: [{'x': "åäö"}]
- note: upper built-in
query: upper("a",x)
want_result: [{'x': "A"}]
- note: upper built-in unicode
query: upper("å",x)
want_result: [{'x': "Å"}]
- note: numbers.range built-in
query: numbers.range(10, 12, x)
want_result: [{'x': [10, 11, 12]}]
Expand Down Expand Up @@ -244,4 +301,4 @@ cases:
"x": "c",
"y": 4,
}
]
]
5 changes: 2 additions & 3 deletions test/wasm/assets/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,12 @@ function dumpJSON(mod, memory, addr) {
const buf = new Uint8Array(memory.buffer);

// NOTE(tsandall): There must be a better way of doing this...
let s = '';
let idx = rawAddr;
while (buf[idx] != 0) {
s += String.fromCharCode(buf[idx++]);
idx++;
}

return JSON.parse(s);
return JSON.parse(decodeURIComponent(escape(String.fromCharCode.apply(null, buf.slice(rawAddr, idx)))));
}

function builtinCustomTest(a) {
Expand Down
17 changes: 15 additions & 2 deletions wasm/src/aggregates.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
#include "aggregates.h"
#include "mpd.h"
#include "unicode.h"

opa_value *opa_agg_count(opa_value *v)
{
switch (opa_value_type(v))
{
case OPA_STRING:
return opa_number_int(opa_cast_string(v)->len);
case OPA_STRING: {
opa_string_t *s = opa_cast_string(v);
int units = 0;

for (int i = 0, len = 0; i < s->len; units++, i += len)
{
if (opa_unicode_decode_utf8(s->v, i, s->len, &len) == -1)
{
opa_abort("string: invalid unicode");
}
}

return opa_number_int(units);
}
case OPA_ARRAY:
return opa_number_int(opa_cast_array(v)->len);
case OPA_OBJECT:
Expand Down
64 changes: 51 additions & 13 deletions wasm/src/strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,23 @@ opa_value *opa_strings_indexof(opa_value *a, opa_value *b)

opa_string_t *s = opa_cast_string(a);
opa_string_t *substr = opa_cast_string(b);
int n = strings_indexof(s, 0, substr);

return opa_number_int(strings_indexof(s, 0, substr));
if (n < 0)
{
return opa_number_int(n);
}

int units = 0;
for (int i = 0, len = 0; i < n; units++, i += len)
{
if (opa_unicode_decode_utf8(s->v, i, s->len, &len) == -1)
{
opa_abort("string: invalid unicode");
}
}

return opa_number_int(units);
}

opa_value *opa_strings_replace(opa_value *a, opa_value *b, opa_value *c)
Expand Down Expand Up @@ -427,29 +442,52 @@ opa_value *opa_strings_substring(opa_value *a, opa_value *b, opa_value *c)
return NULL;
}

if (start >= base->len)
{
return opa_string_terminated("");
}

if (start < 0)
{
return NULL;
}

if (length < 0)
if (length == 0)
{
length = base->len - start;
return opa_string_terminated("");
}

if (base->len < (start + length))
size_t spos = base->len, epos = base->len;
for (int i = 0, units = 0, len = 0; i < base->len; units++, i += len)
{
length = base->len - start;
if (units == start)
{
spos = i;
}

if (opa_unicode_decode_utf8(base->v, i, base->len, &len) == -1)
{
opa_abort("string: invalid unicode");
}

if (units < start)
{
// Start index not reached yet.
continue;
}

if (length < 0)
{
// Everything from start to end.
break;
}

if (length == (units - start))
{
epos = i;
break;
}
}

char *str = opa_malloc(length + 1);
memcpy(str, &base->v[start], length + 1);
return opa_string_allocated(str, length);
char *str = opa_malloc(epos - spos + 1);
memcpy(str, &base->v[spos], epos - spos);
str[epos - spos] = 0;
return opa_string_allocated(str, epos - spos);
}

opa_value *opa_strings_trim(opa_value *a, opa_value *b)
Expand Down
6 changes: 6 additions & 0 deletions wasm/tests/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -1504,6 +1504,7 @@ void test_aggregates(void)
opa_set_add(set, opa_number_int(4));

test("count/string", opa_value_compare(opa_agg_count(opa_string("foo", 3)), opa_number_int(3)) == 0);
test("count/unicode string", opa_value_compare(opa_agg_count(opa_string("\xC3\xA5\xC3\xA4\xC3\xB6", 6)), opa_number_int(3)) == 0);
test("count/array", opa_value_compare(opa_agg_count(&arr->hdr), opa_number_int(3)) == 0);
test("count/object", opa_value_compare(opa_agg_count(&obj->hdr), opa_number_int(3)) == 0);
test("count/set", opa_value_compare(opa_agg_count(&set->hdr), opa_number_int(3)) == 0);
Expand Down Expand Up @@ -1698,6 +1699,7 @@ void test_strings(void)
test("indexof/abaa", opa_value_compare(opa_strings_indexof(opa_string_terminated("ab"), opa_string_terminated("aa")), opa_number_int(-1)) == 0);
test("indexof/abcbc", opa_value_compare(opa_strings_indexof(opa_string_terminated("abc"), opa_string_terminated("bc")), opa_number_int(1)) == 0);
test("indexof/abcbd", opa_value_compare(opa_strings_indexof(opa_string_terminated("abc"), opa_string_terminated("bd")), opa_number_int(-1)) == 0);
test("indexof/unicode", opa_value_compare(opa_strings_indexof(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6"), opa_string_terminated("\xC3\xB6")), opa_number_int(2)) == 0);

test("replace/___", opa_value_compare(opa_strings_replace(opa_string_terminated(""), opa_string_terminated(""), opa_string_terminated("")), opa_string_terminated("")) == 0);
test("replace/_ab", opa_value_compare(opa_strings_replace(opa_string_terminated(""), opa_string_terminated("a"), opa_string_terminated("b")), opa_string_terminated("")) == 0);
Expand Down Expand Up @@ -1769,6 +1771,10 @@ void test_strings(void)
test("substring/abc10", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(1), opa_number_int(0)), opa_string_terminated("")) == 0);
test("substring/abc11", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(1), opa_number_int(1)), opa_string_terminated("b")) == 0);
test("substring/abc12", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(1), opa_number_int(2)), opa_string_terminated("bc")) == 0);
test("substring/abc41", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(4), opa_number_int(1)), opa_string_terminated("")) == 0);
test("substring/unicode", opa_value_compare(opa_strings_substring(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6\x7A"), opa_number_int(1), opa_number_int(1)), opa_string_terminated("\xC3\xA4")) == 0);
test("substring/unicode", opa_value_compare(opa_strings_substring(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6\x7A"), opa_number_int(1), opa_number_int(2)), opa_string_terminated("\xC3\xA4\xC3\xB6")) == 0);
test("substring/unicode", opa_value_compare(opa_strings_substring(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6\x7A"), opa_number_int(2), opa_number_int(-1)), opa_string_terminated("\xC3\xB6\x7A")) == 0);

test("trim/__", opa_value_compare(opa_strings_trim(opa_string_terminated(""), opa_string_terminated("")), opa_string_terminated("")) == 0);
test("trim/abcba", opa_value_compare(opa_strings_trim(opa_string_terminated("abc"), opa_string_terminated("ba")), opa_string_terminated("c")) == 0);
Expand Down

0 comments on commit e8c5e12

Please sign in to comment.