Skip to content

Commit

Permalink
feat: support parse with utf8 lossy
Browse files Browse the repository at this point in the history
  • Loading branch information
liuq19 committed Nov 8, 2024
1 parent 25f4f87 commit fe77809
Show file tree
Hide file tree
Showing 12 changed files with 296 additions and 127 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v1
- run: ./scripts/test_all.sh
- run: ./scripts/test.sh

test-stable-self:
name: Rust stable on self-hosted
Expand All @@ -29,7 +29,7 @@ jobs:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@stable
- uses: Swatinem/rust-cache@v1
- run: ./scripts/test_all.sh
- run: ./scripts/test.sh

test-nightly:
name: Rust nightly ${{matrix.os}}
Expand All @@ -43,7 +43,7 @@ jobs:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
- uses: Swatinem/rust-cache@v1
- run: ./scripts/test_all.sh
- run: ./scripts/test.sh

test-nightlye-self:
name: Rust nightly on self-hosted
Expand All @@ -52,7 +52,7 @@ jobs:
- uses: actions/checkout@v4
- uses: dtolnay/rust-toolchain@nightly
- uses: Swatinem/rust-cache@v1
- run: ./scripts/test_all.sh
- run: ./scripts/test.sh

clippy_lint:
name: Format check ${{matrix.os}}
Expand Down
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,6 @@ sort_keys = []

# Records the raw text of numbers and strings when parsing JSON into `sonic::Value`, and uses that raw text when serializing the value
use_raw = []

# Allows parsing JSON that contains invalid UTF-8 or UTF-16 sequences. They are replaced with `\uFFFD` (displayed as �).
utf8_lossy = []
23 changes: 11 additions & 12 deletions scripts/sanitize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,16 @@ set -ex

export ASAN_OPTIONS="disable_coredump=0:unmap_shadow_on_exit=1:abort_on_error=1"

for san in address leak; do
echo "Running tests with $san"
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --target x86_64-unknown-linux-gnu -- --test-threads=1
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --doc --package sonic-rs --target x86_64-unknown-linux-gnu -- --show-output --test-threads=1

RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features arbitrary_precision --target x86_64-unknown-linux-gnu -- --test-threads=1
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features arbitrary_precision --doc --package sonic-rs --target x86_64-unknown-linux-gnu -- --show-output --test-threads=1

# Run the unit tests and the doc tests once for a given feature set.
#   $1 - sanitizer name (stored locally; the sanitizer flags themselves are
#        passed by the caller through RUSTFLAGS / RUSTDOCFLAGS)
#   $2 - value forwarded to cargo's --features (may be empty)
run_tests() {
    local san="$1" features="$2"
    local target="x86_64-unknown-linux-gnu"

    # Unit and integration tests, single-threaded.
    cargo +nightly test --target "$target" --features "$features" -- --test-threads=1
    # Doc tests for the sonic-rs package only.
    cargo +nightly test --doc --package sonic-rs --target "$target" --features "$features" -- --show-output --test-threads=1
}

RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features use_raw --target x86_64-unknown-linux-gnu -- --test-threads=1
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" cargo +nightly test --features use_raw --doc --package sonic-rs --target x86_64-unknown-linux-gnu -- --show-output --test-threads=1
for san in address leak; do
for feature in "" "arbitrary_precision" "sort_keys" "use_raw" "utf8_lossy"; do
echo "Running tests with $san and $feature"
RUSTFLAGS="-Zsanitizer=${san}" RUSTDOCFLAGS="-Zsanitizer=${san}" run_tests $san $feature
done
done


2 changes: 2 additions & 0 deletions scripts/test_all.sh → scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ cargo test --features sort_keys

cargo test --features use_raw

cargo test --features utf8_lossy

examples=$(cargo build --example 2>&1 | grep -v ":")

for example in $examples; do
Expand Down
5 changes: 3 additions & 2 deletions src/config.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct DeserializeCfg {
pub(crate) use_rawnumber: bool,
pub(crate) use_raw: bool,
pub use_rawnumber: bool,
pub use_raw: bool,
pub utf8_lossy: bool,
}
164 changes: 126 additions & 38 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ use std::{
use faststr::FastStr;
use serde::de::{self, Expected, Unexpected};

use super::reader::{Reader, Reference};
use super::reader::Reader;
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
use crate::util::simd::bits::NeonBits;
use crate::{
config::DeserializeCfg,
error::{
Error,
invalid_utf8, Error,
ErrorCode::{self, *},
Result,
},
Expand All @@ -34,6 +34,48 @@ use crate::{
LazyValue,
};

/// A shared reference with one of two possible origins.
///
/// `Borrowed` carries a reference with lifetime `'b`, `Copied` one with
/// lifetime `'c`. Both variants dereference to the same `T`, so code that
/// only needs to read the value can ignore where it came from.
pub(crate) enum Reference<'b, 'c, T: ?Sized + 'static> {
    Borrowed(&'b T),
    Copied(&'c T),
}

impl<'b, 'c, T: ?Sized + 'static> Deref for Reference<'b, 'c, T> {
    type Target = T;

    /// Returns the wrapped reference regardless of the variant.
    fn deref(&self) -> &T {
        match self {
            Self::Borrowed(inner) => *inner,
            Self::Copied(inner) => *inner,
        }
    }
}

/// Bytes of a parsed string together with the scratch buffer used while parsing.
///
/// * `Borrowed` holds a slice with lifetime `'b` and still carries the
///   mutable scratch buffer, so a caller can write into it afterwards.
/// * `Copied` means the bytes live in the scratch buffer itself.
///
/// Dereferencing yields the string bytes in both cases.
pub(crate) enum ParsedSlice<'b, 'c> {
    Borrowed {
        slice: &'b [u8],
        buf: &'c mut Vec<u8>,
    },
    Copied(&'c mut Vec<u8>),
}

impl Deref for ParsedSlice<'_, '_> {
    type Target = [u8];

    /// Returns the borrowed slice, or the contents of the scratch buffer.
    fn deref(&self) -> &[u8] {
        match self {
            Self::Borrowed { slice, .. } => slice,
            Self::Copied(scratch) => &scratch[..],
        }
    }
}

pub(crate) const DEFAULT_KEY_BUF_CAPACITY: usize = 128;
pub(crate) fn as_str(data: &[u8]) -> &str {
unsafe { from_utf8_unchecked(data) }
Expand Down Expand Up @@ -252,19 +294,6 @@ where
}
}

/// Parses the string at the reader's current pointer with the free function
/// `parse_string_inplace` and passes the result to `vis` as a borrowed `str`.
///
/// An error returned by `parse_string_inplace` is converted with `self.error`
/// and propagated. The final result comes from the `check_visit!` macro
/// applied to `vis.visit_borrowed_str`.
#[inline(always)]
fn parse_string_inplace_impl<V: JsonVisitor<'de>>(&mut self, vis: &mut V) -> Result<()> {
unsafe {
// `src` is handed to the helper by `&mut` and written back to the reader
// afterwards; `start` remembers where the string began.
let mut src = self.read.cur_ptr();
let start = self.read.cur_ptr();
// `cnt` is used below as the byte length of the parsed string at `start`.
let cnt = parse_string_inplace(&mut src).map_err(|e| self.error(e))?;
// Move the reader to wherever the helper left `src`.
self.read.set_ptr(src);
// NOTE(review): this relies on `parse_string_inplace` leaving `cnt` readable,
// valid UTF-8 bytes at `start` — confirm against the helper's contract.
let slice = from_raw_parts(start, cnt);
let s = from_utf8_unchecked(slice);
check_visit!(self, vis.visit_borrowed_str(s))
}
}

fn check_string_eof_inpadding(&self) -> Result<usize> {
let json = self.read.as_u8_slice();
let cur = self.read.index();
Expand All @@ -278,8 +307,18 @@ where
#[inline(always)]
fn parse_string_inplace<V: JsonVisitor<'de>>(&mut self, vis: &mut V) -> Result<()> {
if !self.cfg.use_raw {
return self.parse_string_inplace_impl(vis);
unsafe {
let mut src = self.read.cur_ptr();
let start = self.read.cur_ptr();
let cnt = parse_string_inplace(&mut src, self.cfg.utf8_lossy)
.map_err(|e| self.error(e))?;
self.read.set_ptr(src);
let slice = from_raw_parts(start, cnt);
let s = from_utf8_unchecked(slice);
return check_visit!(self, vis.visit_borrowed_str(s));
}
}

unsafe {
let start_idx = self.read.index();
let mut src = self.read.cur_ptr();
Expand All @@ -290,7 +329,8 @@ where
let raw = as_str(&self.read.as_u8_slice()[start_idx - 1..end]);
let alloc = vis.allocator().unwrap();
let raw = RawStr::new_in(alloc, raw);
let cnt = parse_string_inplace(&mut src).map_err(|e| self.error(e))?;
let cnt = parse_string_inplace(&mut src, self.cfg.utf8_lossy)
.map_err(|e| self.error(e))?;
self.read.set_ptr(src);
let s = str_from_raw_parts(start, cnt);
check_visit!(self, vis.visit_raw_str(s, raw))
Expand Down Expand Up @@ -621,11 +661,45 @@ where
&mut self,
buf: &'own mut Vec<u8>,
) -> Result<Reference<'de, 'own, str>> {
let slice = self.parse_string_raw(buf)?;
Ok(match slice {
Reference::Copied(buf) => Reference::Copied(unsafe { from_utf8_unchecked(buf) }),
Reference::Borrowed(buf) => Reference::Borrowed(unsafe { from_utf8_unchecked(buf) }),
})
match self.parse_string_raw(buf) {
Ok(ParsedSlice::Copied(buf)) => {
if self.check_invalid_utf8(self.cfg.utf8_lossy)? {
// replace the invalid UTF-8 sequences with U+FFFD
let repr = String::from_utf8_lossy(buf.as_ref()).into_owned();
*buf = repr.into_bytes();
}
let slice = unsafe { from_utf8_unchecked(buf.as_slice()) };
Ok(Reference::Copied(slice))
}
Ok(ParsedSlice::Borrowed { slice, buf }) => {
if self.check_invalid_utf8(self.cfg.utf8_lossy)? {
// replace the invalid UTF-8 sequences with U+FFFD
let repr = String::from_utf8_lossy(slice).into_owned();
*buf = repr.into_bytes();
let slice = unsafe { from_utf8_unchecked(buf) };
Ok(Reference::Copied(slice))
} else {
Ok(Reference::Borrowed(unsafe { from_utf8_unchecked(slice) }))
}
}
Err(e) => Err(e),
}
}

/// Checks whether invalid UTF-8 lies before the reader's current index.
///
/// Returns `Ok(false)` when the reader's next invalid position is at or past
/// the current index. Otherwise:
/// * if `allowed` is `false`, returns the error built by `invalid_utf8` for
///   that position;
/// * if `allowed` is `true`, calls `self.read.check_invalid_utf8()` and
///   returns `Ok(true)`.
pub(crate) fn check_invalid_utf8(&mut self, allowed: bool) -> Result<bool> {
    // Invalid UTF-8 located before the string must already have been checked.
    let first_bad = self.read.next_invalid_utf8();
    if first_bad >= self.read.index() {
        return Ok(false);
    }

    if allowed {
        // This span is tolerated; presumably this call refreshes the reader's
        // record of the next invalid UTF-8 position.
        self.read.check_invalid_utf8();
        return Ok(true);
    }

    Err(invalid_utf8(self.read.as_u8_slice(), first_bad))
}

pub(crate) fn parse_escaped_utf8(&mut self) -> Result<u32> {
Expand All @@ -641,24 +715,40 @@ where
// parse the second utf8 code point of surrogate
let point2 = if let Some(asc) = self.read.next_n(6) {
if asc[0] != b'\\' || asc[1] != b'u' {
return perr!(self, InvalidSurrogateUnicodeCodePoint);
if self.cfg.utf8_lossy {
return Ok(0xFFFD);
} else {
// invalid surrogate
return perr!(self, InvalidSurrogateUnicodeCodePoint);
}
}
unsafe { hex_to_u32_nocheck(&*(asc.as_ptr().add(2) as *const _ as *const [u8; 4])) }
} else if self.cfg.utf8_lossy {
return Ok(0xFFFD);
} else {
return perr!(self, EofWhileParsing);
// invalid surrogate
return perr!(self, InvalidSurrogateUnicodeCodePoint);
};

/* calculate the real code point */
let low_bit = point2.wrapping_sub(0xdc00);
if (low_bit >> 10) != 0 {
// invalid surrogate
return perr!(self, InvalidSurrogateUnicodeCodePoint);
if self.cfg.utf8_lossy {
return Ok(0xFFFD);
} else {
// invalid surrogate
return perr!(self, InvalidSurrogateUnicodeCodePoint);
}
}

Ok((((point1 - 0xd800) << 10) | low_bit).wrapping_add(0x10000))
} else if (0xDC00..0xE000).contains(&point1) {
// invalid surrogate
perr!(self, InvalidSurrogateUnicodeCodePoint)
if self.cfg.utf8_lossy {
return Ok(0xFFFD);
} else {
// invalid surrogate
return perr!(self, InvalidSurrogateUnicodeCodePoint);
}
} else {
Ok(point1)
}
Expand Down Expand Up @@ -697,7 +787,7 @@ where
pub(crate) unsafe fn parse_string_escaped<'own>(
&mut self,
buf: &'own mut Vec<u8>,
) -> Result<Reference<'de, 'own, [u8]>> {
) -> Result<ParsedSlice<'de, 'own>> {
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
let mut block: StringBlock<NeonBits>;
#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))]
Expand Down Expand Up @@ -725,7 +815,7 @@ where

// skip the right quote
self.read.eat(cnt + 1);
return Ok(Reference::Copied(buf.as_slice()));
return Ok(ParsedSlice::Copied(buf));
}

if block.has_backslash() {
Expand All @@ -746,7 +836,7 @@ where
match c {
b'"' => {
self.read.eat(1);
return Ok(Reference::Copied(buf.as_slice()));
return Ok(ParsedSlice::Copied(buf));
}
b'\\' => {
// skip the backslash
Expand All @@ -769,7 +859,7 @@ where
pub(crate) fn parse_string_raw<'own>(
&mut self,
buf: &'own mut Vec<u8>,
) -> Result<Reference<'de, 'own, [u8]>> {
) -> Result<ParsedSlice<'de, 'own>> {
// the reader is now positioned right after the opening `"`, so we can skip over the string directly
let start = self.read.index();
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
Expand All @@ -784,9 +874,8 @@ where
if block.has_quote_first() {
let cnt = block.quote_index();
self.read.eat(cnt + 1);
return Ok(Reference::Borrowed(
self.read.slice_unchecked(start, self.read.index() - 1),
));
let slice = self.read.slice_unchecked(start, self.read.index() - 1);
return Ok(ParsedSlice::Borrowed { slice, buf });
}

if block.has_unescaped() {
Expand Down Expand Up @@ -815,9 +904,8 @@ where
match c {
b'"' => {
self.read.eat(1);
return Ok(Reference::Borrowed(
self.read.slice_unchecked(start, self.read.index() - 1),
));
let slice = self.read.slice_unchecked(start, self.read.index() - 1);
return Ok(ParsedSlice::Borrowed { slice, buf });
}
b'\\' => {
buf.clear();
Expand Down
Loading

0 comments on commit fe77809

Please sign in to comment.