Skip to content

Commit

Permalink
Remove UTF-8 backtracking workaround
Browse files Browse the repository at this point in the history
This is no longer needed as strings are capable of storing partial UTF-8
sequences.
  • Loading branch information
Maxdamantus committed Jul 22, 2023
1 parent 7ba471a commit a98f863
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 34 deletions.
12 changes: 2 additions & 10 deletions src/jv_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,13 @@ jv jv_load_file(const char* filename, int raw) {
parser = jv_parser_new(0);
}

// To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
// buffer, we need to be able to read the remainder of a sequence and add that
// before appending.
const int max_utf8_len = 4;
char buf[4096+max_utf8_len];
char buf[4096];
while (!feof(file) && !ferror(file)) {
size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
size_t n = fread(buf, 1, sizeof(buf), file);
int len = 0;

if (n == 0)
continue;
if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0 &&
!feof(file) && !ferror(file)) {
n += fread(buf+n, 1, len, file);
}

if (raw) {
data = jv_string_append_buf(data, buf, n);
Expand Down
23 changes: 0 additions & 23 deletions src/jv_unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,29 +67,6 @@ static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
return 1;
}

// jvp_utf8_backtrack locates the beginning of the last codepoint in a buffer.
//
//   start         - points at the LAST byte of the buffer (not one past it).
//   min           - points at the first byte of the buffer; must be <= start.
//   missing_bytes - optional out-parameter (may be NULL).
//
// Walks backwards from start over continuation bytes until a non-continuation
// byte is found, then returns a pointer to that byte. If the sequence it
// begins is incomplete, the number of bytes still needed to complete it is
// stored in *missing_bytes (0 when the sequence is already complete).
//
// Returns NULL, leaving *missing_bytes untouched, when:
//   - only continuation bytes are found all the way back to min,
//   - the byte found has a coding length of 0 in utf8_coding_length, or
//   - more continuation bytes were seen than the leading byte allows.
//
// NOTE: when the buffer holds exactly one byte (min == start) that byte is
// returned as-is without being classified, and *missing_bytes is not altered.
const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) {
  assert(min <= start);
  if (min == start) {
    return min;
  }
  int length = utf8_coding_length[(unsigned char)*start];
  int seen = 1;  // bytes inspected so far, including the one at start
  while (length == UTF8_CONTINUATION_BYTE) {
    // Check the lower bound BEFORE stepping back: forming the pointer min-1
    // would be undefined behaviour even if it were never dereferenced.
    if (start == min) {
      return NULL;
    }
    start--;
    seen++;
    length = utf8_coding_length[(unsigned char)*start];
  }
  // length is no longer a continuation marker here; reject a zero coding
  // length and sequences that are longer than their leading byte announces.
  if (length == 0 || length - seen < 0) {
    return NULL;
  }
  if (missing_bytes) *missing_bytes = length - seen;
  return start;
}

// Convenience wrapper: forwards to jvp_utf8_wtf_next() with the
// JVP_UTF8_REPLACE flag and hands back whatever that call returns.
// codepoint_ret is passed through unchanged as the output parameter.
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
  const char* next_pos = jvp_utf8_wtf_next(in, end, JVP_UTF8_REPLACE, codepoint_ret);
  return next_pos;
}
Expand Down
1 change: 0 additions & 1 deletion src/jv_unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ enum jvp_utf8_flags {
JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8
};

const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len);
Expand Down

0 comments on commit a98f863

Please sign in to comment.