Remove UTF-8 backtracking workaround

The workaround is no longer needed now that strings are capable of storing partial UTF-8 sequences.
jqlang · Jul 22, 2023 · a98f863 · a98f863
1 parent 7ba471a
commit a98f863
Show file tree

Hide file tree

Showing 3 changed files with 2 additions and 34 deletions.
diff --git a/src/jv_file.c b/src/jv_file.c
@@ -39,21 +39,13 @@ jv jv_load_file(const char* filename, int raw) {
     parser = jv_parser_new(0);
   }
 
-  // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
-  // buffer, we need to be able to read the remainder of a sequence and add that
-  // before appending.
-  const int max_utf8_len = 4;
-  char buf[4096+max_utf8_len];
+  char buf[4096];
   while (!feof(file) && !ferror(file)) {
-    size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
+    size_t n = fread(buf, 1, sizeof(buf), file);
     int len = 0;
 
     if (n == 0)
       continue;
-    if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0 &&
-        !feof(file) && !ferror(file)) {
-      n += fread(buf+n, 1, len, file);
-    }
 
     if (raw) {
       data = jv_string_append_buf(data, buf, n);

diff --git a/src/jv_unicode.c b/src/jv_unicode.c
@@ -67,29 +67,6 @@ static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
   return 1;
 }
 
-// jvp_utf8_backtrack returns the beginning of the last codepoint in the
-// string, assuming that start is the last byte in the string.
-// If the last codepoint is incomplete, returns the number of missing bytes via
-// *missing_bytes.  If there are no leading bytes or an invalid byte is
-// encountered, NULL is returned and *missing_bytes is not altered.
-const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) {
-  assert(min <= start);
-  if (min == start) {
-    return min;
-  }
-  int length = 0;
-  int seen = 1;
-  while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) {
-    start--;
-    seen++;
-  }
-  if (length == 0 || length == UTF8_CONTINUATION_BYTE || length - seen < 0) {
-    return NULL;
-  }
-  if (missing_bytes) *missing_bytes = length - seen;
-  return start;
-}
-
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
   return jvp_utf8_wtf_next(in, end, JVP_UTF8_REPLACE, codepoint_ret);
 }

diff --git a/src/jv_unicode.h b/src/jv_unicode.h
@@ -13,7 +13,6 @@ enum jvp_utf8_flags {
   JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8
 };
 
-const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
 const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
 const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len);