diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py
index 6fe0a5312b..7706462351 100644
--- a/scripts/gen_utf8_tables.py
+++ b/scripts/gen_utf8_tables.py
@@ -16,8 +16,7 @@ def print_table(type, name, t):
 def utf8info(c):
     if c < 0x80: return 1, mask(7)
     if 0x80 <= c <= 0xBF: return 255, mask(6)
-    if 0xC0 <= c <= 0xC1: return 0, 0
-    if 0xC2 <= c <= 0xDF: return 2, mask(5)
+    if 0xC0 <= c <= 0xDF: return 2, mask(5)
     if 0xE0 <= c <= 0xEF: return 3, mask(4)
     if 0xF0 <= c <= 0xF4: return 4, mask(3)
     if 0xF4 <= c <= 0xFF: return 0, 0
diff --git a/src/jv.c b/src/jv.c
index f1201d29e1..c08f15491e 100644
--- a/src/jv.c
+++ b/src/jv.c
@@ -782,20 +782,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
   return s;
 }
 
-/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
+/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */
 static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
   const char* end = data + length;
   const char* i = data;
   const char* cstart;
 
-  uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
+  uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX
   jvp_string* s = jvp_string_alloc(maxlength);
   char* out = s->data;
   int c = 0;
 
-  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+  while ((i = jvp_utf8_extended_next((cstart = i), end, 0, &c))) {
     if (c == -1) {
-      c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+      int error = (unsigned char)*cstart;
+      assert(error >= 0x80 && error <= 0xFF);
+      c = -error;
+      /* Ensure each UTF-8 error byte is consumed separately */
+      i = cstart + 1;
     }
     out += jvp_utf8_encode(c, out);
     assert(out < s->data + maxlength);
@@ -807,8 +811,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
   return r;
 }
 
-/* Assumes valid UTF8 */
-static jv jvp_string_new(const char* data, uint32_t length) {
+/* Assumes valid WTF-8b */
+jv jv_string_extended_sized(const char* data, int length) {
   jvp_string* s = jvp_string_alloc(length);
   s->length_hashed = length << 1;
   if (data != NULL)
@@ -949,7 +953,7 @@ static int jvp_string_equal(jv a, jv b) {
 jv jv_string_sized(const char* str, int len) {
   return
     jvp_utf8_is_valid(str, str+len) ?
-    jvp_string_new(str, len) :
+    jv_string_extended_sized(str, len) :
     jvp_string_copy_replace_bad(str, len);
 }
 
@@ -1015,14 +1019,14 @@ jv jv_string_split(jv j, jv sep) {
 
   if (seplen == 0) {
     int c;
-    while ((jstr = jvp_utf8_next(jstr, jend, &c)))
+    while ((jstr = jvp_utf8_extended_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c)))
       a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c));
   } else {
     for (p = jstr; p < jend; p = s + seplen) {
       s = _jq_memmem(p, jend - p, sepstr, seplen);
       if (s == NULL)
         s = jend;
-      a = jv_array_append(a, jv_string_sized(p, s - p));
+      a = jv_array_append(a, jv_string_extended_sized(p, s - p));
       // Add an empty string to denote that j ends on a sep
       if (s + seplen == jend && seplen != 0)
         a = jv_array_append(a, jv_string(""));
@@ -1094,7 +1098,7 @@ jv jv_string_slice(jv j, int start, int end) {
 
   /* Look for byte offset corresponding to start codepoints */
   for (p = s, i = 0; i < start; i++) {
-    p = jvp_utf8_next(p, s + len, &c);
+    p = jvp_utf8_extended_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c);
     if (p == NULL) {
       jv_free(j);
       return jv_string_empty(16);
@@ -1106,7 +1110,7 @@ jv jv_string_slice(jv j, int start, int end) {
   }
   /* Look for byte offset corresponding to end codepoints */
   for (e = p; e != NULL && i < end; i++) {
-    e = jvp_utf8_next(e, s + len, &c);
+    e = jvp_utf8_extended_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c);
     if (e == NULL) {
       e = s + len;
       break;
@@ -1124,7 +1128,7 @@ jv jv_string_slice(jv j, int start, int end) {
    * memory like a drunken navy programmer.  There's probably nothing we
    * can do about it.
    */
-  res = jv_string_sized(p, e - p);
+  res = jv_string_extended_sized(p, e - p);
   jv_free(j);
   return res;
 }
diff --git a/src/jv.h b/src/jv.h
index 8c96f822f0..aae15afb5a 100644
--- a/src/jv.h
+++ b/src/jv.h
@@ -107,6 +107,7 @@ jv jv_array_indexes(jv, jv);
 
 jv jv_string(const char*);
 jv jv_string_sized(const char*, int);
+jv jv_string_extended_sized(const char*, int);
 jv jv_string_empty(int len);
 int jv_string_length_bytes(jv);
 int jv_string_length_codepoints(jv);
diff --git a/src/jv_parse.c b/src/jv_parse.c
index 9ced9f6d23..b5f5502ea9 100644
--- a/src/jv_parse.c
+++ b/src/jv_parse.c
@@ -406,7 +406,7 @@ static void tokenadd(struct jv_parser* p, char c) {
   p->tokenbuf[p->tokenpos++] = c;
 }
 
-static int unhex4(char* hex) {
+static int unhex4(const char* hex) {
   int r = 0;
   for (int i=0; i<4; i++) {
     char c = *hex++;
@@ -421,16 +421,24 @@ static int unhex4(char* hex) {
   return r;
 }
 
-static pfunc found_string(struct jv_parser* p) {
-  char* in = p->tokenbuf;
-  char* out = p->tokenbuf;
-  char* end = p->tokenbuf + p->tokenpos;
+static void found_string_cleanup(struct jv_parser* p, char* buf) {
+  if (buf != p->tokenbuf)
+    jv_mem_free(buf);
+}
 
-  while (in < end) {
-    char c = *in++;
+static pfunc found_string(struct jv_parser* p) {
+  const char* in = p->tokenbuf;
+  // start by writing to tokenbuf, only allocate in case that output size is greater than input size (possible only when input has UTF-8 errors)
+  char* buf = p->tokenbuf;
+  char* out = buf;
+  const char* end = p->tokenbuf + p->tokenpos;
+  const char* cstart;
+  int c;
+
+  while ((in = jvp_utf8_extended_next((cstart = in), end, 0, &c))) {
     if (c == '\\') {
       if (in >= end)
-        return "Expected escape character at end of string";
+        return found_string_cleanup(p, buf), "Expected escape character at end of string";
       c = *in++;
       switch (c) {
       case '\\':
@@ -445,38 +453,60 @@ static pfunc found_string(struct jv_parser* p) {
       case 'u':
         /* ahh, the complicated case */
         if (in + 4 > end)
-          return "Invalid \\uXXXX escape";
+          return found_string_cleanup(p, buf), "Invalid \\uXXXX escape";
         int hexvalue = unhex4(in);
         if (hexvalue < 0)
-          return "Invalid characters in \\uXXXX escape";
+          return found_string_cleanup(p, buf), "Invalid characters in \\uXXXX escape";
         unsigned long codepoint = (unsigned long)hexvalue;
         in += 4;
+        // leading surrogate
         if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
-          /* who thought UTF-16 surrogate pairs were a good idea? */
-          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
-            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
-          unsigned long surrogate = unhex4(in+2);
-          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
-            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
-          in += 6;
-          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
-                                 |(surrogate - 0xDC00));
+          // look ahead for a "\uXXXX" trailing surrogate and decode the pair as UTF-16; otherwise this lone surrogate is kept and encoded as WTF-8
+          if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') {
+            unsigned long surrogate = unhex4(in+2);
+            if (0xDC00 <= surrogate && surrogate <= 0xDFFF) {
+              in += 6;
+              codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
+                                     |(surrogate - 0xDC00));
+            }
+          }
         }
-        if (codepoint > 0x10FFFF)
-          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+        // UTF-16 surrogates can not encode a greater codepoint
+        assert(codepoint <= 0x10FFFF);
+        // NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8
         out += jvp_utf8_encode(codepoint, out);
         break;
 
       default:
-        return "Invalid escape";
+        return found_string_cleanup(p, buf), "Invalid escape";
       }
     } else {
       if (c > 0 && c < 0x001f)
-        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
-      *out++ = c;
+        return found_string_cleanup(p, buf), "Invalid string: control characters from U+0000 through U+001F must be escaped";
+      if (c == -1) {
+        int error = (unsigned char)*cstart;
+        assert(error >= 0x80 && error <= 0xFF);
+        c = -error;
+        /* Ensure each UTF-8 error byte is consumed separately */
+        const int wtf8_length = 2;
+        assert(jvp_utf8_encode_length(c) == wtf8_length);
+        in = cstart + 1;
+        if (buf == p->tokenbuf && out + wtf8_length > in) {
+          /* Output is about to overflow input, move output to temporary buffer */
+          int current_size = out - buf;
+          int remaining = end - cstart;
+          buf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX
+          memcpy(buf, p->tokenbuf, current_size);
+          out = buf + current_size;
+        }
+      } else
+        assert(jvp_utf8_encode_length(c) == in - cstart);
+      out += jvp_utf8_encode(c, out);
     }
   }
-  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
+  jv v = jv_string_extended_sized(buf, out - buf);
+  found_string_cleanup(p, buf);
+  TRY(value(p, v));
   p->tokenpos = 0;
   return 0;
 }
diff --git a/src/jv_print.c b/src/jv_print.c
index 2e781bb8b4..d08eec478b 100644
--- a/src/jv_print.c
+++ b/src/jv_print.c
@@ -100,6 +100,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) {
   put_buf(&c, 1, fout, strout, T);
 }
 
+static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) {
+  assert(c >= 0x80 && c <= 0xFF);
+  if (strout) {
+    // string output: append the negated byte value, i.e. the negative code point form used to denote a UTF-8 error
+    *strout = jv_string_append_codepoint(*strout, -c);
+  } else {
+    put_char(c, fout, strout, T);
+  }
+}
+
 static void put_str(const char* s, FILE* fout, jv* strout, int T) {
   put_buf(s, strlen(s), fout, strout, T);
 }
@@ -123,7 +133,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
   int c = 0;
   char buf[32];
   put_char('"', F, S, T);
-  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+  while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
     assert(c != -1);
     int unicode_escape = 0;
     if (0x20 <= c && c <= 0x7E) {
@@ -132,6 +142,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
         put_char('\\', F, S, T);
       }
       put_char(c, F, S, T);
+    } else if (c >= -0xFF && c <= -0x80) {
+      // Invalid UTF-8 byte
+      if (ascii_only) {
+        // refusing to emit invalid UTF-8
+        // TODO: convince the world to adopt a "\xXX" notation for JSON?
+        c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+        unicode_escape = 1;
+      } else {
+        // pass through
+        put_invalid_utf8_byte(-c, F, S, T);
+      }
     } else if (c < 0x20 || c == 0x7F) {
       // ASCII control character
       switch (c) {
@@ -162,6 +183,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
     } else {
       if (ascii_only) {
         unicode_escape = 1;
+      } else if (c >= 0xD800 && c <= 0xDFFF) {
+        // lone surrogate; can't be encoded to UTF-8
+        unicode_escape = 1;
       } else {
         put_buf(cstart, i - cstart, F, S, T);
       }
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index d197349f48..df872de87f 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -27,6 +27,56 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_
 }
 
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
+  return jvp_utf8_extended_next(in, end, JVP_UTF8_REPLACE, codepoint_ret);
+}
+
+/*
+  The internal representation of jv strings uses an encoding that is hereby
+  referred to as "WTF-8b" (until someone demonstrates use of another term to
+  refer to the same encoding).
+
+  WTF-8b is an extension of WTF-8, which is an extension of UTF-8. Any sequence
+  of Unicode scalar values is represented by the same bytes in UTF-8, WTF-8 and
+  WTF-8b, therefore any well-formed UTF-8 string is interpreted as the same
+  sequence of Unicode scalar values (roughly, code points) in WTF-8b.
+
+  Like WTF-8, WTF-8b is able to encode UTF-16 errors (lone surrogates) using
+  the "generalized UTF-8" representation of code points between U+D800 and
+  U+DFFF. These errors occur in JSON terms such as:
+    "_\uD8AB_\uDBCD_"
+
+  Unlike WTF-8, WTF-8b is also able to encode UTF-8 errors (bytes 0x80 to 0xFF
+  that are not part of a valid UTF-8 sequence) using the first 128 "overlong"
+  codings (unused 2-byte representations of U+00 to U+7F). These errors can
+  occur in any byte stream that is interpreted as UTF-8, for example:
+    "\xED\xA2\xAB"
+  The above example is in fact the WTF-8b (and WTF-8) encoding for the lone
+  UTF-16 surrogate "\uD8AB", which demonstrates the need for a distinct
+  encoding of UTF-8 errors. If a distinction were not made, then "\xED\xA2\xAB"
+  and "\uD8AB" would be interpreted as the same string, so at least one of the
+  forms would not be preserved when printed as JSON output.
+
+  It should also be noted that the process of converting from invalid UTF-8 to
+  WTF-8b is not (and cannot be) idempotent, since the "generalized UTF-8"
+  representation of a UTF-16 surrogate is intentionally never generated from
+  invalid UTF-8 input, only through some other means (usually "\uXXXX"
+  notation).
+
+  Each UTF-16 error is encoded as 3 WTF-8b (or WTF-8) bytes.
+  Each UTF-8 error is encoded as 2 WTF-8b bytes.
+
+  When iterating over code points using `JVP_UTF8_ERRORS_UTF16`, encoded UTF-16
+  errors are emitted in the form of code points in the range U+D800 to U+DFFF.
+  These code points can be reencoded as usual using `jvp_utf8_encode`.
+
+  When iterating over code points using `JVP_UTF8_ERRORS_UTF8`, encoded UTF-8
+  errors are emitted in the form of code points in the negative range -0x80 to
+  -0xFF. These negative code points can be negated to determine the original
+  error bytes. These code points can be reencoded as usual using
+  `jvp_utf8_encode`.
+*/
+
+const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint_ret) {
   assert(in <= end);
   if (in == end) {
     return 0;
@@ -40,9 +90,11 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
     length = 1;
   } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
     /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
+    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
     length = 1;
   } else if (in + length > end) {
     /* String ends before UTF8 sequence ends */
+    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
     length = end - in;
   } else {
     codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
@@ -50,6 +102,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
       unsigned ch = (unsigned char)in[i];
       if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
         /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
+        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
         codepoint = -1;
         length = i;
         break;
@@ -58,17 +111,29 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
     }
     if (codepoint < utf8_first_codepoint[length]) {
       /* Overlong UTF8 sequence */
-      codepoint = -1;
+      if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
+        /* UTF-8 error is emitted as a negative codepoint */
+        codepoint = -(codepoint + 0x80);
+      } else {
+        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
+        codepoint = -1;
+      }
     }
     if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
-      /* Surrogate codepoints can't be encoded in UTF8 */
-      codepoint = -1;
+      /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
+      if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
+        /* Surrogate codepoints can't be encoded in UTF8 */
+        codepoint = -1;
+      }
     }
     if (codepoint > 0x10FFFF) {
       /* Outside Unicode range */
+      if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
       codepoint = -1;
     }
   }
+  if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
+    codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
   assert(length > 0);
   *codepoint_ret = codepoint;
   return in + length;
@@ -76,7 +141,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
 
 int jvp_utf8_is_valid(const char* in, const char* end) {
   int codepoint;
-  while ((in = jvp_utf8_next(in, end, &codepoint))) {
+  while ((in = jvp_utf8_extended_next(in, end, 0, &codepoint))) {
     if (codepoint == -1) return 0;
   }
   return 1;
@@ -91,20 +156,24 @@ int jvp_utf8_decode_length(char startchar) {
 }
 
 int jvp_utf8_encode_length(int codepoint) {
-  if (codepoint <= 0x7F) return 1;
+  if (codepoint >= 0 && codepoint <= 0x7F) return 1;
   else if (codepoint <= 0x7FF) return 2;
   else if (codepoint <= 0xFFFF) return 3;
   else return 4;
 }
 
 int jvp_utf8_encode(int codepoint, char* out) {
-  assert(codepoint >= 0 && codepoint <= 0x10FFFF);
+  assert((codepoint >= -0xFF && codepoint <= -0x80) || (codepoint >= 0 && codepoint <= 0x10FFFF));
   char* start = out;
-  if (codepoint <= 0x7F) {
+  if (codepoint >= 0 && codepoint <= 0x7F) {
     *out++ = codepoint;
   } else if (codepoint <= 0x7FF) {
-    *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6);
-    *out++ = 0x80 + ((codepoint & 0x03F));
+    // encode UTF-8 errors as overlong representations of U+00 to U+7F
+    int cp = codepoint >= -0xFF && codepoint <= -0x80?
+      -codepoint - 0x80 :
+      codepoint;
+    *out++ = 0xC0 + ((cp & 0x7C0) >> 6);
+    *out++ = 0x80 + ((cp & 0x03F));
   } else if(codepoint <= 0xFFFF) {
     *out++ = 0xE0 + ((codepoint & 0xF000) >> 12);
     *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6);
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 558721a8fd..37c7fc08f6 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -1,7 +1,18 @@
 #ifndef JV_UNICODE_H
 #define JV_UNICODE_H
 
+enum jvp_utf8_flags {
+  /* Emit replacement character instead of -1 for errors */
+  JVP_UTF8_REPLACE = 1,
+  /* Treat input as WTF-8b, emit 0xD800 to 0xDFFF to denote encoded UTF-16 errors */
+  JVP_UTF8_ERRORS_UTF16 = 2,
+  /* Treat input as WTF-8b, emit -0x80 to -0xFF to denote encoded UTF-8 errors */
+  JVP_UTF8_ERRORS_UTF8 = 4,
+  JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8
+};
+
 const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
+const char* jvp_utf8_extended_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
 int jvp_utf8_is_valid(const char* in, const char* end);
 
diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h
index f1a4252fce..7c68749e97 100644
--- a/src/jv_utf8_tables.h
+++ b/src/jv_utf8_tables.h
@@ -12,7 +12,7 @@ static const unsigned char utf8_coding_length[] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
   0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
   0x04, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
@@ -29,7 +29,7 @@ static const unsigned char utf8_coding_bits[] =
   0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
   0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
   0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
-  0x00, 0x00, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
   0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
   0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
   0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
diff --git a/src/main.c b/src/main.c
index 7022f19ebf..e536518e1d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -30,6 +30,7 @@
 #include "jv.h"
 #include "jq.h"
 #include "jv_alloc.h"
+#include "jv_unicode.h"
 #include "util.h"
 #include "src/version.h"
 
@@ -173,6 +174,30 @@ static const char *skip_shebang(const char *p) {
   return n+1;
 }
 
+static void jvp_dump_raw_string(const char* start, const char* end, FILE* f) {
+  static const unsigned char UTF8_REPLACEMENT[] = {0xEF,0xBF,0xBD}; // U+FFFD REPLACEMENT CHARACTER
+
+  const char* i = start;
+  const char* cstart;
+  int c;
+
+  while ((i = jvp_utf8_extended_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
+    if (c >= -0xFF && c <= -0x80) {
+      // encoded UTF-8 error: flush the pending run [start, cstart), then write the original raw byte (-c)
+      fwrite(start, 1, cstart - start, f);
+      start = i;
+      fputc(-c, f);
+    } else if ((c >= 0xD800 && c <= 0xDFFF) || c == -1) {
+      // lone surrogate (or a decode error): can't be encoded to UTF-8, so flush the pending run and write U+FFFD instead
+      fwrite(start, 1, cstart - start, f);
+      start = i;
+      fwrite(UTF8_REPLACEMENT, 1, sizeof(UTF8_REPLACEMENT), f);
+    } else
+      continue;
+  }
+  fwrite(start, 1, end - start, f);
+}
+
 static int process(jq_state *jq, jv value, int flags, int dumpopts) {
   int ret = JQ_OK_NO_OUTPUT; // No valid results && -e -> exit(4)
   jq_start(jq, value, flags);
@@ -182,7 +207,9 @@ static int process(jq_state *jq, jv value, int flags, int dumpopts) {
       if (options & ASCII_OUTPUT) {
         jv_dumpf(jv_copy(result), stdout, JV_PRINT_ASCII);
       } else {
-        fwrite(jv_string_value(result), 1, jv_string_length_bytes(jv_copy(result)), stdout);
+        const char *start = jv_string_value(result);
+        const char *end = start + jv_string_length_bytes(jv_copy(result));
+        jvp_dump_raw_string(start, end, stdout);
       }
       ret = JQ_OK;
       jv_free(result);
diff --git a/tests/jq.test b/tests/jq.test
index 2d5c36b887..f7f5e1d3e3 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -57,6 +57,11 @@ null
 "Aa\r\n\t\b\f\u03bc"
 "Aa\u000d\u000a\u0009\u0008\u000c\u03bc"
 
+# Check that unpaired surrogates are preserved in output
+"\u2200\ud800\u2203\udc00\u2205\udfff"
+null
+"∀\ud800∃\udc00∅\udfff"
+
 "inter\("pol" + "ation")"
 null
 "interpolation"
diff --git a/tests/shtest b/tests/shtest
index 8ed62b2213..eabdf26275 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -122,6 +122,15 @@ fi
 cmp $d/out $d/expected
 
 
+clean=false
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
+dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
+$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
+$VALGRIND $Q $JQ -j . $d/out.json >$d/out
+cmp $d/out $d/rand
+clean=true
+
+
 ## Test --exit-status
 data='{"i": 1}\n{"i": 2}\n{"i": 3}\n'
 printf "$data" | $JQ --exit-status 'select(.i==1)' > /dev/null 2>&1