Correct UTF-8 and UTF-16 errors during concatenation

UTF-8 errors and UTF-16 errors that were previously encoded into the ends of strings will now potentially be used to form correct code points. This is mostly a matter of making string equality behave as expected: without this normalisation, it is possible to produce `jv` strings that are converted to UTF-8 or UTF-16 the same way but are not equal, because well-formed code units may or may not be encoded as errors.
jqlang · May 30, 2021 · 223d6c4 · 223d6c4
1 parent 43c526f
commit 223d6c4
Show file tree

Hide file tree

Showing 4 changed files with 231 additions and 50 deletions.
diff --git a/src/jv.c b/src/jv.c
@@ -852,20 +852,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
   jvp_string* s = jvp_string_ptr(string);
   uint32_t currlen = jvp_string_length(s);
 
+  char join_buf[4];
+  int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf);
+
   if (jvp_refcnt_unshared(string.u.ptr) &&
-      jvp_string_remaining_space(s) >= len) {
+      jvp_string_remaining_space(s) >= join_len + len) {
     // the next string fits at the end of a
+    memcpy(s->data + currlen, join_buf, join_len);
+    currlen += join_len;
     memcpy(s->data + currlen, data, len);
     s->data[currlen + len] = 0;
     s->length_hashed = (currlen + len) << 1;
     return string;
   } else {
     // allocate a bigger buffer and copy
-    uint32_t allocsz = (currlen + len) * 2;
+    uint32_t allocsz = (currlen + join_len + len) * 2;
     if (allocsz < 32) allocsz = 32;
     jvp_string* news = jvp_string_alloc(allocsz);
-    news->length_hashed = (currlen + len) << 1;
+    news->length_hashed = (currlen + join_len + len) << 1;
     memcpy(news->data, s->data, currlen);
+    memcpy(news->data + currlen, join_buf, join_len);
+    currlen += join_len;
     memcpy(news->data + currlen, data, len);
     news->data[currlen + len] = 0;
     jvp_string_free(string);

diff --git a/src/jv_unicode.c b/src/jv_unicode.c
@@ -1,8 +1,72 @@
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 #include "jv_unicode.h"
 #include "jv_utf8_tables.h"
 
+// length in bytes of the encoding of an erroneous UTF-8 byte (stored as a 2-byte sequence)
+#define UTF8_ERR_LEN 2
+// length in bytes of the encoding of an erroneous UTF-16 surrogate (stored as a 3-byte sequence)
+#define UTF16_ERR_LEN 3
+// U32 packs four byte values into one uint32_t: `a` in bits 0-7, `b` in bits 8-15, `c` in bits 16-23, `d` in bits 24-31
+#define U32(a, b, c, d) ( \
+  (uint32_t) (a) << 0 | \
+  (uint32_t) (b) << 8 | \
+  (uint32_t) (c) << 16 | \
+  (uint32_t) (d) << 24 \
+)
+// BYTE extracts byte `n` of `u32`, where n == 0 is the lowest 8 bits (the inverse of U32's packing)
+#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF))
+// bit patterns used by `decode` as masks and as expected values for the fixed high bits of each byte
+#define B0 0x00 // 00000000
+#define B1 0x80 // 10000000
+#define B2 0xC0 // 11000000
+#define B3 0xE0 // 11100000
+#define B4 0xF0 // 11110000
+#define B5 0xF8 // 11111000
+
+// bit flags selecting which sequence lengths `decode` is allowed to match. NOTE: these flags are likely to be optimised out as `decode` gets inlined
+enum decode_flags {
+  DECODE_1 = 1, // allow the 1-byte (ASCII) pattern
+  DECODE_2 = 2, // allow the 2-byte pattern
+  DECODE_3 = 8, // allow the 3-byte pattern (value 4 is skipped; the flags only need to be distinct bits)
+  DECODE_4 = 16 // allow the 4-byte pattern
+};
+
+// decode matches `data` (up to 4 bytes, first byte in the lowest 8 bits)
+// against the "generalised UTF-8" patterns enabled in `flags`. on a match the
+// value bits go to `*codepoint_ret` and the pattern length is returned, else
+// -1 is stored and 1 is returned. overlong/out-of-range values are NOT rejected
+static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
+  const uint32_t first = BYTE(data, 0), second = BYTE(data, 1);
+  const uint32_t third = BYTE(data, 2), fourth = BYTE(data, 3);
+  int value = -1; // stays -1 when no enabled pattern matches
+  int matched = 1; // a failed match still reports a length of one byte
+  if ((flags & DECODE_1) && !(data & U32(B1, B0, B0, B0))) {
+    value = first;
+  } else if ((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)) {
+    value =
+      (first & ~B3) << 6 |
+      (second & ~B2);
+    matched = 2;
+  } else if ((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)) {
+    value =
+      (first & ~B4) << 12 |
+      (second & ~B2) << 6 |
+      (third & ~B2);
+    matched = 3;
+  } else if ((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)) {
+    value =
+      (first & ~B5) << 18 |
+      (second & ~B2) << 12 |
+      (third & ~B2) << 6 |
+      (fourth & ~B2);
+    matched = 4;
+  }
+  *codepoint_ret = value;
+  return matched;
+}
+
 // jvp_utf8_backtrack returns the beginning of the last codepoint in the
 // string, assuming that start is the last byte in the string.
 // If the last codepoint is incomplete, returns the number of missing bytes via
@@ -15,7 +79,7 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_
   }
   int length = 0;
   int seen = 1;
-  while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) {
+  while (start >= min && (length = utf8_coding_length[*start & 0xFF]) == UTF8_CONTINUATION_BYTE) {
     start--;
     seen++;
   }
@@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
   if (in == end) {
     return 0;
   }
-  int codepoint = -1;
-  unsigned char first = (unsigned char)in[0];
-  int length = utf8_coding_length[first];
-  if ((first & 0x80) == 0) {
+  uint32_t data = in[0] & 0xFF;
+  if ((data & B1) == 0) {
     /* Fast-path for ASCII */
-    codepoint = first;
-    length = 1;
-  } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
-    /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
-    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
-    length = 1;
-  } else if (in + length > end) {
-    /* String ends before UTF8 sequence ends */
-    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
-    length = end - in;
-  } else {
-    codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
-    for (int i=1; i<length; i++) {
-      unsigned ch = (unsigned char)in[i];
-      if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
-        /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
-        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
-        codepoint = -1;
-        length = i;
-        break;
-      }
-      codepoint = (codepoint << 6) | (ch & 0x3f);
-    }
-    if (codepoint < utf8_first_codepoint[length]) {
-      /* Overlong UTF8 sequence */
-      if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
-        /* UTF-8 error is emitted as a negative codepoint */
-        codepoint = -(codepoint + 0x80);
-      } else {
-        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
-        codepoint = -1;
-      }
-    }
-    if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
-      /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
-      if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
-        /* Surrogate codepoints can't be encoded in UTF8 */
-        codepoint = -1;
-      }
+    *codepoint_ret = data;
+    return in + 1;
+  }
+  switch (end - in) {
+    default: // fall through
+    case 4: data |= (uint32_t)(in[3] & 0xFF) << 24; // fall through
+    case 3: data |= (uint32_t)(in[2] & 0xFF) << 16; // fall through
+    case 2: data |= (uint32_t)(in[1] & 0xFF) << 8; // fall through
+    case 1: break;
+  }
+  int codepoint;
+  int length = decode(DECODE_2 | DECODE_3 | DECODE_4, data, &codepoint);
+  if (codepoint == -1) {
+    if (flags & JVP_UTF8_ERRORS_UTF8) assert(0 && "Invalid WTF-8b sequence: no match");
+  } else if (codepoint < utf8_first_codepoint[length]) {
+    /* Overlong UTF-8 sequence */
+    if ((flags & JVP_UTF8_ERRORS_UTF8) && length == UTF8_ERR_LEN && 0x00 <= codepoint && codepoint <= 0x7F) {
+      /* UTF-8 error is emitted as a negative codepoint */
+      codepoint = -(codepoint + 0x80);
+    } else {
+      if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
+      codepoint = -1;
     }
-    if (codepoint > 0x10FFFF) {
-      /* Outside Unicode range */
-      if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
+  } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
+    /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
+    if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
+      /* Surrogate codepoints can't be encoded in UTF8 */
       codepoint = -1;
     }
+  } else if (codepoint > 0x10FFFF) {
+    /* Outside Unicode range */
+    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
+    codepoint = -1;
   }
   if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
     codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
@@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf
   return in + length;
 }
 
+// reads the two bytes at `in` (both must be readable); returns the erroneous
+// byte (0x80..0xFF) if they form an encoded UTF-8 error, otherwise -1
+static int decode_utf8_error(const char* in) {
+  int value;
+  uint32_t packed = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0);
+  int matched = decode(DECODE_2, packed, &value);
+  return (matched == UTF8_ERR_LEN && value < 0x80) ? value + 0x80 : -1;
+}
+
+// assumes three bytes are readable from `in`; returns the surrogate they encode (U+D800..U+DFFF, both ends inclusive) or -1 if they are not an encoded surrogate
+static int decode_utf16_error(const char* in) {
+  uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0);
+  int codepoint;
+  if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint <= 0xDFFF) // U+DFFF is the last tail surrogate, so the upper bound must be inclusive
+    return codepoint;
+  return -1;
+}
+
+// jvp_utf8_extended_join attempts to turn errors at the end of `a` (`astart`,
+// `*alen_io` bytes) and the beginning of `b` (`*bstart_io`, `*blen_io` bytes)
+// into a valid code point. if a correction is possible, `*alen_io`,
+// `*bstart_io` and `*blen_io` are updated to exclude the existing errors, and
+// the UTF-8 encoding of the code point to insert is stored in `out`. the
+// number of bytes that should be inserted from `out` into the middle of the
+// strings is returned (up to 4); 0 means nothing to insert, arguments untouched.
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) {
+  const char* aend = astart + *alen_io;
+  const char* bstart = *bstart_io;
+  const char* bend = bstart + *blen_io;
+  int bcp;
+  bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp);
+  if (!bstart) {
+    // end of string: `b` is empty, nothing to join
+    return 0;
+  }
+  if (bcp >= 0xDC00 && bcp <= 0xDFFF) {
+    // UTF-16 tail surrogate, look for lead surrogate at the end of `a`
+    assert(bstart == *bstart_io + UTF16_ERR_LEN);
+    if (aend - astart < UTF16_ERR_LEN)
+      return 0;
+    int acp = decode_utf16_error(aend - UTF16_ERR_LEN);
+    if (acp >= 0xD800 && acp <= 0xDBFF) {
+      // UTF-16 lead surrogate, decode matching UTF-16 pair
+      *alen_io -= UTF16_ERR_LEN;
+      *blen_io -= UTF16_ERR_LEN;
+      *bstart_io += UTF16_ERR_LEN;
+      int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00));
+      return jvp_utf8_encode(codepoint, out);
+    }
+    return 0;
+  }
+  if (bcp >= -0xFF && bcp <= -0x80) {
+    // UTF-8 error (reported as the negated byte value), if it's a continuation byte, search backwards in `a` for the leading byte
+    bcp = -bcp;
+    assert(bstart == *bstart_io + UTF8_ERR_LEN);
+    if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
+      return 0;
+    // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b`
+    unsigned char buf[6];
+    unsigned char* bufstart = buf + 3; // bytes recovered from `a` are prepended below this position
+    unsigned char* bufend = bufstart; // bytes recovered from `b` are appended from this position
+    *bufend++ = bcp;
+    int length;
+    // search backwards in `a` for a leading byte
+    for (;;) {
+      if (aend - astart < UTF8_ERR_LEN)
+        return 0; // `a` is too short
+      int acp = decode_utf8_error(aend - UTF8_ERR_LEN);
+      if (acp == -1)
+        return 0; // not a UTF-8 error
+      aend -= UTF8_ERR_LEN;
+      length = utf8_coding_length[acp];
+      if (length == 0)
+        return 0; // not a possible UTF-8 byte
+      *--bufstart = acp;
+      if (length != UTF8_CONTINUATION_BYTE)
+        break; // found leading byte
+      if (bufstart == buf)
+        return 0; // too many continuation bytes
+    }
+    if (bufend - bufstart > length)
+      return 0; // too many continuation bytes
+    // search forwards in `b` for any more needed continuation bytes
+    while (bufend - bufstart < length) {
+      if (bend - bstart < UTF8_ERR_LEN)
+        return 0; // `b` is too short
+      bcp = decode_utf8_error(bstart);
+      if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
+        return 0; // not a UTF-8 error, didn't find enough continuation bytes
+      bstart += UTF8_ERR_LEN;
+      *bufend++ = bcp;
+    }
+    int codepoint;
+    // check that the bytes are strict UTF-8 (with flags == 0, overlong forms, surrogates and out-of-range values all yield -1)
+    jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint);
+    if (codepoint != -1) {
+      memcpy(out, bufstart, bufend - bufstart); // copy only the assembled bytes; the rest of `buf` is uninitialised
+      *alen_io = aend - astart;
+      *blen_io = bend - bstart;
+      *bstart_io = bstart;
+      return bufend - bufstart;
+    }
+  }
+  return 0;
+}
+
 int jvp_utf8_is_valid(const char* in, const char* end) {
   int codepoint;
   while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) {

diff --git a/src/jv_unicode.h b/src/jv_unicode.h
@@ -1,6 +1,8 @@
 #ifndef JV_UNICODE_H
 #define JV_UNICODE_H
 
+#include <stdint.h>
+
 enum jvp_utf8_flags {
   /* Emit replacement character instead of -1 for errors */
   JVP_UTF8_REPLACE = 1,
@@ -14,6 +16,7 @@ enum jvp_utf8_flags {
 const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
 const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
+int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out);
 int jvp_utf8_is_valid(const char* in, const char* end);
 
 int jvp_utf8_decode_length(char startchar);

diff --git a/tests/jq.test b/tests/jq.test
@@ -62,6 +62,11 @@ null
 null
 "∀\ud800∃\udc00∅\udfff"
 
+# Check that unpaired surrogates are paired when concatenated
+add
+["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"]
+"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼"
+
 "inter\("pol" + "ation")"
 null
 "interpolation"
@@ -87,6 +92,16 @@ null
 "Zm/Ds2Jhcgo="
 "foóbar\n"
 
+# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points)
+. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
+"򍨼衍򙮬񪜁򻴠󖂡󔁰񗏷󛊭񢠃򍧝𭌞󹰞󙴋𿋓󧜹򳔎񦰓򅆹򽐟󂑛򶃯㾱ꕽ񂊛򉙲򅤎􃖣󻣸󁸦򴏜򽃿􄑏󠦱񄛲񄕵񡿚򮩒񡏂򨆯򶚒󎮆󉨗򡮟򆿴񬏪򻀅㫑񉒗󴍶󬪸񝶑񂾑򇔣򉩉􂞇𲡀𨫆򤵇𲺝\u001c񖂟񳐉󲔹𳨬􀮔𸒙񜶻㊬񓐊񽒬󑀧󗧚󞌶󦥥𗌽𘀍󴼹􌇺򫗛񂷶󏷕񜁍񥬟󼁁󓺉𗟒򷝊𩕃񞝏񧄀󁲩򐀄򳂸񲊷򃀋񃫫𝷏򏖝򷂍󢭣􋛨𞪒򁁅勸󯩥󵪭񚮚򻡍騎񾊯򪓚񗡈񎕫򡯬񋫠ᕴ𞨹󾄇񩠶𙯾񢥱𚯴񬥷󢶖񾹌񡈟򧓑񒾘𚸯񳗺񭟡𫸬񷤖񷆐𖋌񦰃椀𫎾󗚋𿋆󈝰񺥲򝕊𵯮򙧚󬱃󍗞󱆃󂟙󟆺񻢬󸮤󗗉񉛮𺵡𰣒􁋙񻍛􇡘ᮍ񕥸񨵂盕嗪𻸮򶆍򊈤񽓎󙴐𗬜󾱒󷹰􇡈񨦎􏥩񴲡𨑮򱏝𭢊󕁶򣙥󶡮󮰌󿙾氕񼻘􆔪񢕀񊿃󮨝񑛖󣴊󎎏򳞓㊁󒭀󇜳𯄌𻙩"
+true
+
+# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors)
+@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
+"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK"
+true
+
 @uri
 "\u03bc"
 "%CE%BC"