From 8829368f14943b8d2674c75805b27e56a569ad2c Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Tue, 25 May 2021 22:59:59 +1200 Subject: [PATCH] Correct UTF-8 and UTF-16 errors during concatenation UTF-8 errors and UTF-16 errors that were previously encoded into the ends of strings will now potentially be used to form correct code points. This is mostly a matter of making string equality behave expectedly, since without this normalisation, it is possible to produce `jv` strings that are converted to UTF-8 or UTF-16 the same way but are not equal due well-formed code units that may or may not be encoded as errors. --- src/jv.c | 13 ++- src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++--------- src/jv_unicode.h | 3 + tests/jq.test | 15 +++ 4 files changed, 230 insertions(+), 49 deletions(-) diff --git a/src/jv.c b/src/jv.c index 171a47cbe4..49faebcd62 100644 --- a/src/jv.c +++ b/src/jv.c @@ -852,20 +852,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) { jvp_string* s = jvp_string_ptr(string); uint32_t currlen = jvp_string_length(s); + char join_buf[4]; + int join_len = jvp_utf8_extended_join(s->data, &currlen, &data, &len, join_buf); + if (jvp_refcnt_unshared(string.u.ptr) && - jvp_string_remaining_space(s) >= len) { + jvp_string_remaining_space(s) >= join_len + len) { // the next string fits at the end of a + memcpy(s->data + currlen, join_buf, join_len); + currlen += join_len; memcpy(s->data + currlen, data, len); s->data[currlen + len] = 0; s->length_hashed = (currlen + len) << 1; return string; } else { // allocate a bigger buffer and copy - uint32_t allocsz = (currlen + len) * 2; + uint32_t allocsz = (currlen + join_len + len) * 2; if (allocsz < 32) allocsz = 32; jvp_string* news = jvp_string_alloc(allocsz); - news->length_hashed = (currlen + len) << 1; + news->length_hashed = (currlen + join_len + len) << 1; memcpy(news->data, s->data, currlen); + memcpy(news->data + currlen, join_buf, join_len); + currlen += join_len; memcpy(news->data + currlen, data, len); news->data[currlen + len] = 0; jvp_string_free(string); diff --git a/src/jv_unicode.c b/src/jv_unicode.c index 8c475365e6..7d67300a41 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -1,8 +1,72 @@ #include +#include #include #include "jv_unicode.h" #include "jv_utf8_tables.h" +// length of encoding of erroneous UTF-8 byte +#define UTF8_ERR_LEN 2 +// length of encoding of erroneous UTF-16 surrogate +#define UTF16_ERR_LEN 3 + +#define U32(a, b, c, d) ( \ + (uint32_t) (a) << 0 | \ + (uint32_t) (b) << 8 | \ + (uint32_t) (c) << 16 | \ + (uint32_t) (d) << 24 \ +) + +#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF)) + +#define B0 0x00 // 00000000 +#define B1 0x80 // 10000000 +#define B2 0xC0 // 11000000 +#define B3 0xE0 // 11100000 +#define B4 0xF0 // 11110000 +#define B5 0xF8 // 11111000 + +// NOTE: these flags are likely to be optimised out as `decode` gets inlined +enum decode_flags { + DECODE_1 = 1, + DECODE_2 = 2, + DECODE_3 = 8, + DECODE_4 = 16 +}; + +// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong +// codings or out-of-range code points, works by testing all fixed bits in each +// of the 4 coding patterns, then shifting the value bits according to the +// pattern +static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) { + if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){ + *codepoint_ret = BYTE(data, 0); + return 1; + } + if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){ + *codepoint_ret = + (BYTE(data, 0) & ~B3) << 6 | + (BYTE(data, 1) & ~B2) << 0; + return 2; + } + if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){ + *codepoint_ret = + (BYTE(data, 0) & ~B4) << 12 | + (BYTE(data, 1) & ~B2) << 6 | + (BYTE(data, 2) & ~B2) << 0; + return 3; + } + if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){ + *codepoint_ret = + (BYTE(data, 0) & ~B5) << 18 | + (BYTE(data, 1) & ~B2) << 12 | + (BYTE(data, 2) & ~B2) << 6 | + (BYTE(data, 3) & ~B2) << 0; + return 4; + } + *codepoint_ret = -1; + return 1; +} + // jvp_utf8_backtrack returns the beginning of the last codepoint in the // string, assuming that start is the last byte in the string. // If the last codepoint is incomplete, returns the number of missing bytes via @@ -81,56 +145,42 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf if (in == end) { return 0; } - int codepoint = -1; - unsigned char first = (unsigned char)in[0]; - int length = utf8_coding_length[first]; - if ((first & 0x80) == 0) { + uint32_t data = in[0] & 0xFF; + if ((data & B1) == 0) { /* Fast-path for ASCII */ - codepoint = first; - length = 1; - } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { - /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ - if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); - length = 1; - } else if (in + length > end) { - /* String ends before UTF8 sequence ends */ - if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); - length = end - in; - } else { - codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; - for (int i=1; i 0x10FFFF) { - /* Outside Unicode range */ - if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); + } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) { + /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ + if (!(flags & JVP_UTF8_ERRORS_UTF16)) { + /* Surrogate codepoints can't be encoded in UTF8 */ codepoint = -1; } + } else if (codepoint > 0x10FFFF) { + /* Outside Unicode range */ + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); + codepoint = -1; } if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER @@ -139,6 +189,112 @@ const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf return in + length; } +// assumes two bytes are readable from `in` +static int decode_utf8_error(const char* in) { + uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0); + int codepoint; + if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80) + return codepoint + 0x80; + return -1; +} + +// assumes three bytes are readable from `in` +static int decode_utf16_error(const char* in) { + uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0); + int codepoint; + if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint < 0xDFFF) + return codepoint; + return -1; +} + +// jvp_utf8_extended_join attempts to turn errors at the end of `a` and the +// beginning of `b` into a valid code point. if a correction is possible, +// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing +// errors, and the UTF-8 encoding of the code point to insert is stored in +// `out`. the number of bytes that should be inserted from `out` into the +// middle of the strings is returned (up to 4). this will be 0 if there are no +// bytes to insert. +int jvp_utf8_extended_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) { + const char* aend = astart + *alen_io; + const char* bstart = *bstart_io; + const char* bend = bstart + *blen_io; + int bcp; + bstart = jvp_utf8_extended_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp); + if (!bstart) { + // end of string + return 0; + } + if (bcp >= 0xDC00 && bcp <= 0xDFFF) { + // UTF-16 tail surrogate, look for lead surrogate at the end of `a` + assert(bstart == *bstart_io + UTF16_ERR_LEN); + if (aend - astart < UTF16_ERR_LEN) + return 0; + int acp = decode_utf16_error(aend - UTF16_ERR_LEN); + if (acp >= 0xD800 && acp <= 0xDBFF) { + // UTF-16 lead surrogate, decode matching UTF-16 pair + *alen_io -= UTF16_ERR_LEN; + *blen_io -= UTF16_ERR_LEN; + *bstart_io += UTF16_ERR_LEN; + int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00)); + return jvp_utf8_encode(codepoint, out); + } + return 0; + } + if (bcp >= -0xFF && bcp <= -0x80) { + // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte + bcp = -bcp; + assert(bstart == *bstart_io + UTF8_ERR_LEN); + if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) + return 0; + // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b` + unsigned char buf[6]; + unsigned char* bufstart = buf + 3; + unsigned char* bufend = bufstart; + *bufend++ = bcp; + int length; + // search backwards in `a` for a leading byte + for (;;) { + if (aend - astart < UTF8_ERR_LEN) + return 0; // `a` is too short + int acp = decode_utf8_error(aend - UTF8_ERR_LEN); + if (acp == -1) + return 0; // not a UTF-8 error + aend -= UTF8_ERR_LEN; + length = utf8_coding_length[acp]; + if (length == 0) + return 0; // not a possible UTF-8 byte + *--bufstart = acp; + if (length != UTF8_CONTINUATION_BYTE) + break; // found leading byte + if (bufstart == buf) + return 0; // too many continuation bytes + } + if (bufend - bufstart > length) + return 0; // too many continuation bytes + // search forwards in `b` for any more needed continuation bytes + while (bufend - bufstart < length) { + if (bend - bstart < UTF8_ERR_LEN) + return 0; // `b` is too short + bcp = decode_utf8_error(bstart); + if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) + return 0; // not a UTF-8 error, didn't find enough continuation bytes + bstart += UTF8_ERR_LEN; + *bufend++ = bcp; + } + int codepoint; + // check that the bytes are strict UTF-8 + jvp_utf8_extended_next((char*)bufstart, (char*)bufend, 0, &codepoint); + if (codepoint != -1) { + memcpy(out, bufstart, 4); + *alen_io = aend - astart; + *blen_io = bend - bstart; + *bstart_io = bstart; + return bufend - bufstart; + } + } + return 0; +} + int jvp_utf8_is_valid(const char* in, const char* end) { int codepoint; while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) { diff --git a/src/jv_unicode.h b/src/jv_unicode.h index 37c7fc08f6..ff2a4377ee 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -1,6 +1,8 @@ #ifndef JV_UNICODE_H #define JV_UNICODE_H +#include + enum jvp_utf8_flags { /* Emit replacement character instead of -1 for errors */ JVP_UTF8_REPLACE = 1, @@ -14,6 +16,7 @@ enum jvp_utf8_flags { const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); +int jvp_utf8_extended_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out); int jvp_utf8_is_valid(const char* in, const char* end); int jvp_utf8_decode_length(char startchar); diff --git a/tests/jq.test b/tests/jq.test index f7f5e1d3e3..80c736fb9b 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -62,6 +62,11 @@ null null "โˆ€\ud800โˆƒ\udc00โˆ…\udfff" +# Check that unpaired surrogates are paired when concatenated +add +["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"] +"๐Ÿ™ƒ๐Ÿค‘๐Ÿค—๐Ÿค”๐Ÿค๐Ÿ™„๐Ÿค’๐Ÿค•๐Ÿค“๐Ÿค–๐Ÿค˜๐Ÿป๐Ÿผ" + "inter\("pol" + "ation")" null "interpolation" @@ -87,6 +92,16 @@ null "Zm/Ds2Jhcgo=" "foรณbar\n" +# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points) +. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text +"๒จผ่ก๒™ฎฌ๑ชœ๒ปด ๓–‚ก๓”ฐ๑—ท๓›Šญ๑ข ƒ๒ง๐ญŒž๓นฐž๓™ด‹๐ฟ‹“๓งœน๒ณ”Ž๑ฆฐ“๒…†น๒ฝŸ๓‚‘›๒ถƒฏใพฑ๊•ฝ๑‚Š›๒‰™ฒ๒…คŽ๔ƒ–ฃ๓ปฃธ๓ธฆ๒ดœ๒ฝƒฟ๔„‘๓ ฆฑ๑„›ฒ๑„•ต๑กฟš๒ฎฉ’๑ก‚๒จ†ฏ๒ถš’๓Žฎ†๓‰จ—๒กฎŸ๒†ฟด๑ฌช๒ป€…ใซ‘๑‰’—๓ดถ๓ฌชธ๑ถ‘๑‚พ‘๒‡”ฃ๒‰ฉ‰๔‚ž‡๐ฒก€๐จซ†๒คต‡๐ฒบ\u001c๑–‚Ÿ๑ณ‰๓ฒ”น๐ณจฌ๔€ฎ”๐ธ’™๑œถปใŠฌ๑“Š๑ฝ’ฌ๓‘€ง๓—งš๓žŒถ๓ฆฅฅ๐—Œฝ๐˜€๓ดผน๔Œ‡บ๒ซ—›๑‚ทถ๓ท•๑œ๑ฅฌŸ๓ผ๓“บ‰๐—Ÿ’๒ทŠ๐ฉ•ƒ๑ž๑ง„€๓ฒฉ๒€„๒ณ‚ธ๑ฒŠท๒ƒ€‹๑ƒซซ๐ท๒–๒ท‚๓ขญฃ๔‹›จ๐žช’๒…ๅ‹ธ๓ฏฉฅ๓ตชญ๑šฎš๒ปก้จŽ๑พŠฏ๒ช“š๑—กˆ๑Ž•ซ๒กฏฌ๑‹ซ แ•ด๐žจน๓พ„‡๑ฉ ถ๐™ฏพ๑ขฅฑ๐šฏด๑ฌฅท๓ขถ–๑พนŒ๑กˆŸ๒ง“‘๑’พ˜๐šธฏ๑ณ—บ๑ญŸก๐ซธฌ๑ทค–๑ท†๐–‹Œ๑ฆฐƒๆค€๐ซŽพ๓—š‹๐ฟ‹†๓ˆฐ๑บฅฒ๒•Š๐ตฏฎ๒™งš๓ฌฑƒ๓—ž๓ฑ†ƒ๓‚Ÿ™๓Ÿ†บ๑ปขฌ๓ธฎค๓——‰๑‰›ฎ๐บตก๐ฐฃ’๔‹™๑ป›๔‡ก˜แฎ๑•ฅธ๑จต‚็›•ๅ—ช๐ปธฎ๒ถ†๒Šˆค๑ฝ“Ž๓™ด๐—ฌœ๓พฑ’๓ทนฐ๔‡กˆ๑จฆŽ๔ฅฉ๑ดฒก๐จ‘ฎ๒ฑ๐ญขŠ๓•ถ๒ฃ™ฅ๓ถกฎ๓ฎฐŒ๓ฟ™พๆฐ•๑ผป˜๔†”ช๑ข•€๑Šฟƒ๓ฎจ๑‘›–๓ฃดŠ๓ŽŽ๒ณž“ใŠ๓’ญ€๓‡œณ๐ฏ„Œ๐ป™ฉ" +true + +# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors) +@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text +"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK" +true + @uri "\u03bc" "%CE%BC"