Skip to content

Commit

Permalink
Binary strings: preserve UTF-8 and UTF-16 errors
Browse files Browse the repository at this point in the history
The internal string representation is changed from UTF-8 with replacement
characters to a modified form of "WTF-8" that is able to distinctly encode
UTF-8 errors and UTF-16 errors.

This handles UTF-8 errors in raw string inputs and handles UTF-8 and UTF-16
errors in JSON input. UTF-16 errors (using "\uXXXX") and UTF-8 errors (using
the original raw bytes) are maintained when emitting JSON. When emitting raw
strings, UTF-8 errors are maintained and UTF-16 errors are converted into
replacement characters.
  • Loading branch information
Maxdamantus committed May 20, 2021
1 parent 6129abd commit 0b2cff4
Show file tree
Hide file tree
Showing 11 changed files with 231 additions and 52 deletions.
3 changes: 1 addition & 2 deletions scripts/gen_utf8_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ def print_table(type, name, t):
def utf8info(c):
if c < 0x80: return 1, mask(7)
if 0x80 <= c <= 0xBF: return 255, mask(6)
if 0xC0 <= c <= 0xC1: return 0, 0
if 0xC2 <= c <= 0xDF: return 2, mask(5)
if 0xC0 <= c <= 0xDF: return 2, mask(5)
if 0xE0 <= c <= 0xEF: return 3, mask(4)
if 0xF0 <= c <= 0xF4: return 4, mask(3)
if 0xF4 <= c <= 0xFF: return 0, 0
Expand Down
28 changes: 16 additions & 12 deletions src/jv.c
Original file line number Diff line number Diff line change
Expand Up @@ -782,20 +782,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
return s;
}

/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */
static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
const char* end = data + length;
const char* i = data;
const char* cstart;

uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX
jvp_string* s = jvp_string_alloc(maxlength);
char* out = s->data;
int c = 0;

while ((i = jvp_utf8_next((cstart = i), end, &c))) {
while ((i = jvp_utf8_extended_next((cstart = i), end, 0, &c))) {
if (c == -1) {
c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
int error = (unsigned char)*cstart;
assert(error >= 0x80 && error <= 0xFF);
c = -error;
/* Ensure each UTF-8 error byte is consumed separately */
i = cstart + 1;
}
out += jvp_utf8_encode(c, out);
assert(out < s->data + maxlength);
Expand All @@ -807,8 +811,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
return r;
}

/* Assumes valid UTF8 */
static jv jvp_string_new(const char* data, uint32_t length) {
/* Assumes valid WTF-8b */
jv jv_string_extended_sized(const char* data, int length) {
jvp_string* s = jvp_string_alloc(length);
s->length_hashed = length << 1;
if (data != NULL)
Expand Down Expand Up @@ -949,7 +953,7 @@ static int jvp_string_equal(jv a, jv b) {
jv jv_string_sized(const char* str, int len) {
return
jvp_utf8_is_valid(str, str+len) ?
jvp_string_new(str, len) :
jv_string_extended_sized(str, len) :
jvp_string_copy_replace_bad(str, len);
}

Expand Down Expand Up @@ -1015,14 +1019,14 @@ jv jv_string_split(jv j, jv sep) {

if (seplen == 0) {
int c;
while ((jstr = jvp_utf8_next(jstr, jend, &c)))
while ((jstr = jvp_utf8_extended_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c)))
a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c));
} else {
for (p = jstr; p < jend; p = s + seplen) {
s = _jq_memmem(p, jend - p, sepstr, seplen);
if (s == NULL)
s = jend;
a = jv_array_append(a, jv_string_sized(p, s - p));
a = jv_array_append(a, jv_string_extended_sized(p, s - p));
// Add an empty string to denote that j ends on a sep
if (s + seplen == jend && seplen != 0)
a = jv_array_append(a, jv_string(""));
Expand Down Expand Up @@ -1094,7 +1098,7 @@ jv jv_string_slice(jv j, int start, int end) {

/* Look for byte offset corresponding to start codepoints */
for (p = s, i = 0; i < start; i++) {
p = jvp_utf8_next(p, s + len, &c);
p = jvp_utf8_extended_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c);
if (p == NULL) {
jv_free(j);
return jv_string_empty(16);
Expand All @@ -1106,7 +1110,7 @@ jv jv_string_slice(jv j, int start, int end) {
}
/* Look for byte offset corresponding to end codepoints */
for (e = p; e != NULL && i < end; i++) {
e = jvp_utf8_next(e, s + len, &c);
e = jvp_utf8_extended_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c);
if (e == NULL) {
e = s + len;
break;
Expand All @@ -1124,7 +1128,7 @@ jv jv_string_slice(jv j, int start, int end) {
* memory like a drunken navy programmer. There's probably nothing we
* can do about it.
*/
res = jv_string_sized(p, e - p);
res = jv_string_extended_sized(p, e - p);
jv_free(j);
return res;
}
Expand Down
1 change: 1 addition & 0 deletions src/jv.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ jv jv_array_indexes(jv, jv);

jv jv_string(const char*);
jv jv_string_sized(const char*, int);
jv jv_string_extended_sized(const char*, int);
jv jv_string_empty(int len);
int jv_string_length_bytes(jv);
int jv_string_length_codepoints(jv);
Expand Down
80 changes: 55 additions & 25 deletions src/jv_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ static void tokenadd(struct jv_parser* p, char c) {
p->tokenbuf[p->tokenpos++] = c;
}

static int unhex4(char* hex) {
static int unhex4(const char* hex) {
int r = 0;
for (int i=0; i<4; i++) {
char c = *hex++;
Expand All @@ -421,16 +421,24 @@ static int unhex4(char* hex) {
return r;
}

static pfunc found_string(struct jv_parser* p) {
char* in = p->tokenbuf;
char* out = p->tokenbuf;
char* end = p->tokenbuf + p->tokenpos;
/*
 * Free buf unless it is the parser's own token buffer (p->tokenbuf).
 * A buf that aliases p->tokenbuf is left untouched.
 */
static void found_string_cleanup(struct jv_parser* p, char* buf) {
  if (buf == p->tokenbuf)
    return;  /* same storage as the token buffer: nothing to free here */
  jv_mem_free(buf);
}

while (in < end) {
char c = *in++;
static pfunc found_string(struct jv_parser* p) {
const char* in = p->tokenbuf;
// Start by writing to tokenbuf; allocate a separate buffer only if the output would grow larger than the input (possible only when the input has UTF-8 errors)
char* buf = p->tokenbuf;
char* out = buf;
const char* end = p->tokenbuf + p->tokenpos;
const char* cstart;
int c;

while ((in = jvp_utf8_extended_next((cstart = in), end, 0, &c))) {
if (c == '\\') {
if (in >= end)
return "Expected escape character at end of string";
return found_string_cleanup(p, buf), "Expected escape character at end of string";
c = *in++;
switch (c) {
case '\\':
Expand All @@ -445,38 +453,60 @@ static pfunc found_string(struct jv_parser* p) {
case 'u':
/* ahh, the complicated case */
if (in + 4 > end)
return "Invalid \\uXXXX escape";
return found_string_cleanup(p, buf), "Invalid \\uXXXX escape";
int hexvalue = unhex4(in);
if (hexvalue < 0)
return "Invalid characters in \\uXXXX escape";
return found_string_cleanup(p, buf), "Invalid characters in \\uXXXX escape";
unsigned long codepoint = (unsigned long)hexvalue;
in += 4;
// leading surrogate
if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
/* who thought UTF-16 surrogate pairs were a good idea? */
if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
unsigned long surrogate = unhex4(in+2);
if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
in += 6;
codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
|(surrogate - 0xDC00));
// look ahead for trailing surrogate and decode as UTF-16, otherwise encode this lone surrogate as WTF-8
if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') {
unsigned long surrogate = unhex4(in+2);
if (0xDC00 <= surrogate && surrogate <= 0xDFFF) {
in += 6;
codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
|(surrogate - 0xDC00));
}
}
}
if (codepoint > 0x10FFFF)
codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
// UTF-16 surrogates can not encode a greater codepoint
assert(codepoint <= 0x10FFFF);
// NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8
out += jvp_utf8_encode(codepoint, out);
break;

default:
return "Invalid escape";
return found_string_cleanup(p, buf), "Invalid escape";
}
} else {
if (c > 0 && c < 0x001f)
return "Invalid string: control characters from U+0000 through U+001F must be escaped";
*out++ = c;
return found_string_cleanup(p, buf), "Invalid string: control characters from U+0000 through U+001F must be escaped";
if (c == -1) {
int error = (unsigned char)*cstart;
assert(error >= 0x80 && error <= 0xFF);
c = -error;
/* Ensure each UTF-8 error byte is consumed separately */
const int wtf8_length = 2;
assert(jvp_utf8_encode_length(c) == wtf8_length);
in = cstart + 1;
if (buf == p->tokenbuf && out + wtf8_length > in) {
/* Output is about to overflow input, move output to temporary buffer */
int current_size = out - buf;
int remaining = end - cstart;
buf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX
memcpy(buf, p->tokenbuf, current_size);
out = buf + current_size;
}
} else
assert(jvp_utf8_encode_length(c) == in - cstart);
out += jvp_utf8_encode(c, out);
}
}
TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
jv v = jv_string_extended_sized(buf, out - buf);
found_string_cleanup(p, buf);
TRY(value(p, v));
p->tokenpos = 0;
return 0;
}
Expand Down
26 changes: 25 additions & 1 deletion src/jv_print.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) {
put_buf(&c, 1, fout, strout, T);
}

/*
 * Emit a single invalid UTF-8 byte c, which must lie in 0x80..0xFF.
 * With a string sink (strout non-NULL) the negated value -c is appended
 * through jv_string_append_codepoint; otherwise the byte is written as-is
 * via put_char.
 */
static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) {
  assert(c >= 0x80 && c <= 0xFF);
  if (!strout) {
    /* no string sink: hand the raw byte to put_char */
    put_char(c, fout, strout, T);
    return;
  }
  /* string sink: record the byte as the negative codepoint -c */
  *strout = jv_string_append_codepoint(*strout, -c);
}

/* Write the NUL-terminated string s through put_buf, using strlen(s) as its length. */
static void put_str(const char* s, FILE* fout, jv* strout, int T) {
  size_t nbytes = strlen(s);
  put_buf(s, nbytes, fout, strout, T);
}
Expand All @@ -123,7 +133,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
int c = 0;
char buf[32];
put_char('"', F, S, T);
while ((i = jvp_utf8_next((cstart = i), end, &c))) {
while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
assert(c != -1);
int unicode_escape = 0;
if (0x20 <= c && c <= 0x7E) {
Expand All @@ -132,6 +142,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
put_char('\\', F, S, T);
}
put_char(c, F, S, T);
} else if (c >= -0xFF && c <= -0x80) {
// Invalid UTF-8 byte
if (ascii_only) {
// refusing to emit invalid UTF-8
// TODO: convince the world to adopt a "\xXX" notation for JSON?
c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
unicode_escape = 1;
} else {
// pass through
put_invalid_utf8_byte(-c, F, S, T);
}
} else if (c < 0x20 || c == 0x7F) {
// ASCII control character
switch (c) {
Expand Down Expand Up @@ -162,6 +183,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
} else {
if (ascii_only) {
unicode_escape = 1;
} else if (c >= 0xD800 && c <= 0xDFFF) {
// lone surrogate; can't be encoded to UTF-8
unicode_escape = 1;
} else {
put_buf(cstart, i - cstart, F, S, T);
}
Expand Down
Loading

0 comments on commit 0b2cff4

Please sign in to comment.