wasm: unicode fixes for count, indexof, and substring builtins.

These builtins should operate on codepoints, not on characters. This also improves the string conversion from a memory byte array holding UTF-8 string to a valid JavaScript string which is UTF-16. The improved unicode tests were provided by Anders Eknert <anders.eknert@bisnode.com>. Signed-off-by: Teemu Koponen <koponen@styra.com>
pcaruana · Oct 23, 2020 · e8c5e12 · e8c5e12
1 parent 6ce7a6b
commit e8c5e12
Show file tree

Hide file tree

Showing 7 changed files with 133 additions and 20 deletions.
diff --git a/internal/compiler/wasm/opa/opa.go b/internal/compiler/wasm/opa/opa.go
diff --git a/internal/compiler/wasm/opa/opa.wasm b/internal/compiler/wasm/opa/opa.wasm
diff --git a/test/wasm/assets/018_builtins.yaml b/test/wasm/assets/018_builtins.yaml
@@ -86,6 +86,12 @@ cases:
   - note: count built-in
     query: count([1,2,3],x)
     want_result: [{'x': 3}]
+  - note: count built-in string
+    query: count("abc", x)
+    want_result: [{'x': 3}]
+  - note: count built-in string unicode
+    query: count("åäö", x)
+    want_result: [{'x': 3}]
   - note: sum built-in
     query: sum([1,2,3],x)
     want_result: [{'x': 6}]
@@ -134,57 +140,108 @@ cases:
   - note: concat built-in
     query: concat(",",["a","b"],x)
     want_result: [{'x': "a,b"}]
+  - note: concat built-in unicode
+    query: concat("ä",["å","ö"],x)
+    want_result: [{'x': "åäö"}]
   - note: contains built-in
     query: contains("abc","b",x)
     want_result: [{'x': true}]
+  - note: contains built-in unicode
+    query: contains("åäö","ä",x)
+    want_result: [{'x': true}]
   - note: endswith built-in
     query: endswith("abc","c",x)
     want_result: [{'x': true}]
+  - note: endswith built-in unicode
+    query: endswith("åäö","ö",x)
+    want_result: [{'x': true}]
   - note: format_int built-in
     query: format_int(10,16,x)
     want_result: [{'x': "a"}]
   - note: indexof built-in
     query: indexof("abc","b",x)
     want_result: [{'x': 1}]
+  - note: indexof built-in unicode
+    query: indexof("åäö","ä",x)
+    want_result: [{'x': 1}]
   - note: lower built-in
     query: lower("A",x)
     want_result: [{'x': "a"}]
+  - note: lower built-in unicode
+    query: lower("Å",x)
+    want_result: [{'x': "å"}]
   - note: replace built-in
     query: replace("abc","b","d",x)
     want_result: [{'x': "adc"}]
+  - note: replace built-in unicode
+    query: replace("åäö","å","ö",x)
+    want_result: [{'x': "öäö"}]
   - note: replace_n built-in
     query: strings.replace_n({"b":"d"},"abc",x)
     want_result: [{'x': "adc"}]
+  - note: replace_n built-in unicode
+    query: strings.replace_n({"b":"ö"},"abc",x)
+    want_result: [{'x': "aöc"}]
   - note: split built-in
     query: split("a,b",",",x)
     want_result: [{'x': ["a","b"]}]
+  - note: split built-in unicode
+    query: split("aöb","ö",x)
+    want_result: [{'x': ["a","b"]}]
   - note: startswith built-in
     query: startswith("abc","a",x)
     want_result: [{'x': true}]
+  - note: startswith built-in unicode
+    query: startswith("åäö","å",x)
+    want_result: [{'x': true}]
   - note: substring built-in
     query: substring("abcd",1,2,x)
     want_result: [{'x': "bc"}]
+  - note: substring built-in unicode
+    query: substring("åäö",1,2,x)
+    want_result: [ { 'x': "äö" } ]
   - note: trim built-in
     query: trim("abc","ac",x)
     want_result: [{'x': "b"}]
+  - note: trim built-in unicode
+    query: trim("åäö","åö",x)
+    want_result: [{'x': "ä"}]
   - note: trim_left built-in
     query: trim_left("abc","ba",x)
     want_result: [{'x': "c"}]
+  - note: trim_left built-in unicode
+    query: trim_left("åäö","äå",x)
+    want_result: [{'x': "ö"}]
   - note: trim_prefix built-in
     query: trim_prefix("abc","ab",x)
     want_result: [{'x': "c"}]
+  - note: trim_prefix built-in unicode
+    query: trim_prefix("åäö","åä",x)
+    want_result: [{'x': "ö"}]
   - note: trim_right built-in
     query: trim_right("abc","cb",x)
     want_result: [{'x': "a"}]
+  - note: trim_right built-in unicode
+    query: trim_right("åäö","öä",x)
+    want_result: [{'x': "å"}]
   - note: trim_suffix built-in
     query: trim_suffix("abc","bc",x)
     want_result: [{'x': "a"}]
+  - note: trim_suffix built-in unicode
+    query: trim_suffix("åäö","äö",x)
+    want_result: [{'x': "å"}]
   - note: trim_space built-in
     query: trim_space(" abc ",x)
     want_result: [{'x': "abc"}]
+  - note: trim_space built-in unicode
+    query: trim_space(" åäö ",x)
+    want_result: [{'x': "åäö"}]
   - note: upper built-in
     query: upper("a",x)
     want_result: [{'x': "A"}]
+  - note: upper built-in unicode
+    query: upper("å",x)
+    want_result: [{'x': "Å"}]
   - note: numbers.range built-in
     query: numbers.range(10, 12, x)
     want_result: [{'x': [10, 11, 12]}]
@@ -244,4 +301,4 @@ cases:
         "x": "c",
         "y": 4,
       }
-    ]
+    ]
diff --git a/test/wasm/assets/test.js b/test/wasm/assets/test.js
@@ -98,13 +98,12 @@ function dumpJSON(mod, memory, addr) {
     const buf = new Uint8Array(memory.buffer);
 
     // NOTE(tsandall): There must be a better way of doing this...
-    let s = '';
     let idx = rawAddr;
     while (buf[idx] != 0) {
-        s += String.fromCharCode(buf[idx++]);
+        idx++;
     }
 
-    return JSON.parse(s);
+    return JSON.parse(decodeURIComponent(escape(String.fromCharCode.apply(null, buf.slice(rawAddr, idx)))));
 }
 
 function builtinCustomTest(a) {

diff --git a/wasm/src/aggregates.c b/wasm/src/aggregates.c
@@ -1,12 +1,25 @@
 #include "aggregates.h"
 #include "mpd.h"
+#include "unicode.h"
 
 opa_value *opa_agg_count(opa_value *v)
 {
     switch (opa_value_type(v))
     {
-    case OPA_STRING:
-        return opa_number_int(opa_cast_string(v)->len);
+    case OPA_STRING: {
+        opa_string_t *s = opa_cast_string(v);
+        int units = 0;
+
+        for (int i = 0, len = 0; i < s->len; units++, i += len)
+        {
+            if (opa_unicode_decode_utf8(s->v, i, s->len, &len) == -1)
+            {
+                opa_abort("string: invalid unicode");
+            }
+        }
+
+        return opa_number_int(units);
+    }
     case OPA_ARRAY:
         return opa_number_int(opa_cast_array(v)->len);
     case OPA_OBJECT:

diff --git a/wasm/src/strings.c b/wasm/src/strings.c
@@ -241,8 +241,23 @@ opa_value *opa_strings_indexof(opa_value *a, opa_value *b)
 
     opa_string_t *s = opa_cast_string(a);
     opa_string_t *substr = opa_cast_string(b);
+    int n = strings_indexof(s, 0, substr);
 
-    return opa_number_int(strings_indexof(s, 0, substr));
+    if (n < 0)
+    {
+        return opa_number_int(n);
+    }
+
+    int units = 0;
+    for (int i = 0, len = 0; i < n; units++, i += len)
+    {
+        if (opa_unicode_decode_utf8(s->v, i, s->len, &len) == -1)
+        {
+            opa_abort("string: invalid unicode");
+        }
+    }
+
+    return opa_number_int(units);
 }
 
 opa_value *opa_strings_replace(opa_value *a, opa_value *b, opa_value *c)
@@ -427,29 +442,52 @@ opa_value *opa_strings_substring(opa_value *a, opa_value *b, opa_value *c)
         return NULL;
     }
 
-    if (start >= base->len)
-    {
-        return opa_string_terminated("");
-    }
-
     if (start < 0)
     {
         return NULL;
     }
 
-    if (length < 0)
+    if (length == 0)
     {
-        length = base->len - start;
+        return opa_string_terminated("");
     }
 
-    if (base->len < (start + length))
+    size_t spos = base->len, epos = base->len;
+    for (int i = 0, units = 0, len = 0; i < base->len; units++, i += len)
     {
-        length = base->len - start;
+        if (units == start)
+        {
+            spos = i;
+        }
+
+        if (opa_unicode_decode_utf8(base->v, i, base->len, &len) == -1)
+        {
+            opa_abort("string: invalid unicode");
+        }
+
+        if (units < start)
+        {
+            // Start index not reached yet.
+            continue;
+        }
+
+        if (length < 0)
+        {
+            // Everything from start to end.
+            break;
+        }
+
+        if (length == (units - start))
+        {
+            epos = i;
+            break;
+        }
     }
 
-    char *str = opa_malloc(length + 1);
-    memcpy(str, &base->v[start], length + 1);
-    return opa_string_allocated(str, length);
+    char *str = opa_malloc(epos - spos + 1);
+    memcpy(str, &base->v[spos], epos - spos);
+    str[epos - spos] = 0;
+    return opa_string_allocated(str, epos - spos);
 }
 
 opa_value *opa_strings_trim(opa_value *a, opa_value *b)

diff --git a/wasm/tests/test.c b/wasm/tests/test.c
@@ -1504,6 +1504,7 @@ void test_aggregates(void)
     opa_set_add(set, opa_number_int(4));
 
     test("count/string", opa_value_compare(opa_agg_count(opa_string("foo", 3)), opa_number_int(3)) == 0);
+    test("count/unicode string", opa_value_compare(opa_agg_count(opa_string("\xC3\xA5\xC3\xA4\xC3\xB6", 6)), opa_number_int(3)) == 0);
     test("count/array", opa_value_compare(opa_agg_count(&arr->hdr), opa_number_int(3)) == 0);
     test("count/object", opa_value_compare(opa_agg_count(&obj->hdr), opa_number_int(3)) == 0);
     test("count/set", opa_value_compare(opa_agg_count(&set->hdr), opa_number_int(3)) == 0);
@@ -1698,6 +1699,7 @@ void test_strings(void)
     test("indexof/abaa", opa_value_compare(opa_strings_indexof(opa_string_terminated("ab"), opa_string_terminated("aa")), opa_number_int(-1)) == 0);
     test("indexof/abcbc", opa_value_compare(opa_strings_indexof(opa_string_terminated("abc"), opa_string_terminated("bc")), opa_number_int(1)) == 0);
     test("indexof/abcbd", opa_value_compare(opa_strings_indexof(opa_string_terminated("abc"), opa_string_terminated("bd")), opa_number_int(-1)) == 0);
+    test("indexof/unicode", opa_value_compare(opa_strings_indexof(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6"), opa_string_terminated("\xC3\xB6")), opa_number_int(2)) == 0);
 
     test("replace/___", opa_value_compare(opa_strings_replace(opa_string_terminated(""), opa_string_terminated(""), opa_string_terminated("")), opa_string_terminated("")) == 0);
     test("replace/_ab", opa_value_compare(opa_strings_replace(opa_string_terminated(""), opa_string_terminated("a"), opa_string_terminated("b")), opa_string_terminated("")) == 0);
@@ -1769,6 +1771,10 @@ void test_strings(void)
     test("substring/abc10", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(1), opa_number_int(0)), opa_string_terminated("")) == 0);
     test("substring/abc11", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(1), opa_number_int(1)), opa_string_terminated("b")) == 0);
     test("substring/abc12", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(1), opa_number_int(2)), opa_string_terminated("bc")) == 0);
+    test("substring/abc41", opa_value_compare(opa_strings_substring(opa_string_terminated("abc"), opa_number_int(4), opa_number_int(1)), opa_string_terminated("")) == 0);
+    test("substring/unicode", opa_value_compare(opa_strings_substring(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6\x7A"), opa_number_int(1), opa_number_int(1)), opa_string_terminated("\xC3\xA4")) == 0);
+    test("substring/unicode", opa_value_compare(opa_strings_substring(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6\x7A"), opa_number_int(1), opa_number_int(2)), opa_string_terminated("\xC3\xA4\xC3\xB6")) == 0);
+    test("substring/unicode", opa_value_compare(opa_strings_substring(opa_string_terminated("\xC3\xA5\xC3\xA4\xC3\xB6\x7A"), opa_number_int(2), opa_number_int(-1)), opa_string_terminated("\xC3\xB6\x7A")) == 0);
 
     test("trim/__", opa_value_compare(opa_strings_trim(opa_string_terminated(""), opa_string_terminated("")), opa_string_terminated("")) == 0);
     test("trim/abcba", opa_value_compare(opa_strings_trim(opa_string_terminated("abc"), opa_string_terminated("ba")), opa_string_terminated("c")) == 0);