Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,21 @@ jobs:
run: cargo +stable install rustfilt
- name: Check x86_64 inlining
run: |
./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt
./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp"
RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt
RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx2.txt --no-default-features
RUSTFLAGS="-C target-feature=+sse4.2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-sse42.txt --no-default-features
- name: Check x86_64 inlining with avx2 autoselection
run: |
./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt
./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt "--features public_imp"
RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt
if: ${{ matrix.toolchain == '1.38.0' }}
- name: Check x86_64 inlining with avx512 autoselection
run: |
./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt
./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp"
RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx512.txt
RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx512.txt --no-default-features
if: ${{ matrix.toolchain != '1.38.0' }}
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ matrix.toolchain }}
Expand Down
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,6 @@ targets = ["aarch64-unknown-linux-gnu", "wasm32-unknown-unknown", "wasm32-wasip1

[dependencies]
flexpect = "0.1.1"

[build-dependencies]
rustversion = "1.0.22"
16 changes: 10 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ This library has been thoroughly tested with sample data as well as fuzzing and
## Features
* `basic` API for the fastest validation, optimized for valid UTF-8
* `compat` API as a fully compatible replacement for `std::str::from_utf8()`
* 🆕 AVX 512 support on modern x86/x86-64 CPUs since Rust 1.89
* Supports AVX 2 and SSE 4.2 implementations on x86 and x86-64
* ARM64 (aarch64) SIMD is supported since Rust 1.61
* WASM (wasm32) SIMD is supported
* 🆕 armv7 NEON support with the `armv7_neon` feature on nightly Rust
* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCI
* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII
* aarch64: Up to eleven times faster than the std library on valid non-ASCII, up to four times faster on ASCII (Apple Silicon)
* Faster than the original simdjson implementation
* Selects the fastest implementation at runtime based on CPU support (on x86)
Expand Down Expand Up @@ -71,14 +72,17 @@ This comes at a slight performance penalty compared to the `basic` API even if t
## Implementation selection

### X86
The fastest implementation is selected at runtime using the `std::is_x86_feature_detected!` macro, unless the CPU
targeted by the compiler supports the fastest available implementation.
So if you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine, the AVX 2 implementation is selected at
compile-time and runtime selection is disabled.
The fastest implementation is usually selected at runtime using the `std::is_x86_feature_detected!` macro. The AVX 512
implementation is, however, only selected if the CPU supports the VBMI2 features, to avoid the throttling that occurs on CPUs
older than Intel's Ice Lake microarchitecture.

If you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine which supports AVX 512 with Rust 1.89 or later,
the AVX 512 implementation is selected at compile-time and runtime selection is disabled.

For no-std support (compiled with `--no-default-features`) the implementation is always selected at compile time based on
the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"`
for the SSE 4.2 implementation.
for the SSE 4.2 implementation. For AVX 512 use `RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2"` with
Rust 1.89 or later.

### ARM64
The SIMD implementation is used automatically since Rust 1.61.
Expand Down
17 changes: 17 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/// Build script entry point.
///
/// Declares the custom `avx512_stable` cfg to the compiler's cfg checker and
/// enables it when `avx512_stable()` reports that the compiler is new enough.
fn main() {
    // Use the single-colon `cargo:` form here, exactly like the `rustc-cfg`
    // directive below: the double-colon `cargo::` form was only introduced in
    // Cargo 1.77 and is rejected for packages declaring an older MSRV, while
    // this crate is still built with much older toolchains.
    println!("cargo:rustc-check-cfg=cfg(avx512_stable)");
    // `if rustversion::cfg!(...)` is not supported in older Rust versions
    if avx512_stable() {
        println!("cargo:rustc-cfg=avx512_stable");
    }
}

/// Variant compiled when the build script itself is built with Rust 1.89 or
/// later: reports that the `avx512_stable` cfg should be enabled.
#[rustversion::since(1.89)]
fn avx512_stable() -> bool {
true
}

/// Variant compiled when the build script itself is built with a Rust version
/// older than 1.89: reports that the `avx512_stable` cfg must stay disabled.
#[rustversion::before(1.89)]
fn avx512_stable() -> bool {
false
}
5 changes: 5 additions & 0 deletions inlining/expected-methods-x86-nostd-avx512.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
simdutf8::implementation::helpers::get_compat_error
simdutf8::implementation::x86::validate_utf8_basic
simdutf8::implementation::x86::validate_utf8_basic_avx512
simdutf8::implementation::x86::validate_utf8_compat
simdutf8::implementation::x86::validate_utf8_compat_avx512
5 changes: 5 additions & 0 deletions inlining/expected-methods-x86-std-avx512.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
simdutf8::implementation::helpers::get_compat_error
simdutf8::implementation::x86::validate_utf8_basic
simdutf8::implementation::x86::validate_utf8_basic_avx512
simdutf8::implementation::x86::validate_utf8_compat
simdutf8::implementation::x86::validate_utf8_compat_avx512
9 changes: 9 additions & 0 deletions inlining/expected-methods-x86-std-old.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
simdutf8::implementation::helpers::get_compat_error
simdutf8::implementation::validate_utf8_basic_fallback
simdutf8::implementation::validate_utf8_compat_fallback
simdutf8::implementation::x86::avx2::validate_utf8_basic
simdutf8::implementation::x86::avx2::validate_utf8_compat
simdutf8::implementation::x86::sse42::validate_utf8_basic
simdutf8::implementation::x86::sse42::validate_utf8_compat
simdutf8::implementation::x86::validate_utf8_basic::get_fastest
simdutf8::implementation::x86::validate_utf8_compat::get_fastest
2 changes: 2 additions & 0 deletions inlining/expected-methods-x86-std.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ simdutf8::implementation::validate_utf8_basic_fallback
simdutf8::implementation::validate_utf8_compat_fallback
simdutf8::implementation::x86::avx2::validate_utf8_basic
simdutf8::implementation::x86::avx2::validate_utf8_compat
simdutf8::implementation::x86::avx512::validate_utf8_basic
simdutf8::implementation::x86::avx512::validate_utf8_compat
simdutf8::implementation::x86::sse42::validate_utf8_basic
simdutf8::implementation::x86::sse42::validate_utf8_compat
simdutf8::implementation::x86::validate_utf8_basic::get_fastest
Expand Down
10 changes: 10 additions & 0 deletions src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,16 @@ pub mod imp {
/// Includes the x86/x86-64 SIMD implementations.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod x86 {
/// Includes the validation implementation for AVX 512-compatible CPUs.
///
/// Using the provided functionality on CPUs which do not support AVX 512 is undefined
/// behavior and will very likely cause a crash.
///
/// This module only exists when the crate is compiled with the `avx512_stable` cfg.
#[cfg(avx512_stable)]
pub mod avx512 {
    pub use crate::implementation::x86::avx512::{
        validate_utf8_basic as validate_utf8, ChunkedUtf8ValidatorImp, Utf8ValidatorImp,
    };
}
/// Includes the validation implementation for AVX 2-compatible CPUs.
///
/// Using the provided functionality on CPUs which do not support AVX 2 is undefined
Expand Down
5 changes: 5 additions & 0 deletions src/compat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,11 @@ pub mod imp {
/// Includes the x86/x86-64 SIMD implementations.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod x86 {
/// Includes the validation implementation for AVX 512-compatible CPUs.
///
/// This module only exists when the crate is compiled with the `avx512_stable` cfg.
#[cfg(avx512_stable)]
pub mod avx512 {
// Re-exported under the common `validate_utf8` name, as the sibling `avx2` module does.
pub use crate::implementation::x86::avx512::validate_utf8_compat as validate_utf8;
}
/// Includes the validation implementation for AVX 2-compatible CPUs.
pub mod avx2 {
pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8;
Expand Down
84 changes: 68 additions & 16 deletions src/implementation/algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,10 @@ macro_rules! algorithm_simd {
unsafe fn check_block(&mut self, input: SimdInput) {
// WORKAROUND
// necessary because the for loop is not unrolled on ARM64
if input.vals.len() == 2 {
if input.vals.len() == 1 {
self.check_bytes(*input.vals.as_ptr());
self.incomplete = Self::is_incomplete(*input.vals.as_ptr());
} else if input.vals.len() == 2 {
self.check_bytes(*input.vals.as_ptr());
self.check_bytes(*input.vals.as_ptr().add(1));
self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1));
Expand Down Expand Up @@ -237,13 +240,7 @@ macro_rules! algorithm_simd {
}

if idx < len {
let mut tmpbuf = TempSimdChunk::new();
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
input.as_ptr().add(idx),
tmpbuf.0.as_mut_ptr(),
len - idx,
);
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx);
algorithm.check_utf8(simd_input);
}
algorithm.check_incomplete_pending();
Expand Down Expand Up @@ -329,14 +326,7 @@ macro_rules! algorithm_simd {
break;
}
if idx < len {
let mut tmpbuf = TempSimdChunk::new();
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
input.as_ptr().add(idx),
tmpbuf.0.as_mut_ptr(),
len - idx,
);
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());

let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx);
algorithm.check_utf8(simd_input);
}
algorithm.check_incomplete_pending();
Expand Down Expand Up @@ -534,6 +524,18 @@ macro_rules! simd_input_128_bit {
}
}

$(#[$feat])*
#[inline]
unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
let mut tmpbuf = TempSimdChunk::new();
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
ptr,
tmpbuf.0.as_mut_ptr(),
len,
);
Self::new(tmpbuf.0.as_ptr())
}

$(#[$feat])*
#[inline]
unsafe fn is_ascii(&self) -> bool {
Expand Down Expand Up @@ -565,6 +567,18 @@ macro_rules! simd_input_256_bit {
}
}

$(#[$feat])*
#[inline]
unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
let mut tmpbuf = TempSimdChunk::new();
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
ptr,
tmpbuf.0.as_mut_ptr(),
len,
);
Self::new(tmpbuf.0.as_ptr())
}

$(#[$feat])*
#[inline]
unsafe fn is_ascii(&self) -> bool {
Expand All @@ -573,3 +587,41 @@ macro_rules! simd_input_256_bit {
}
};
}

// Generates the `SimdInput` type for 512-bit implementations, where the input
// is held in a single `SimdU8Value`.
//
// Every attribute passed to the macro is applied to each generated method.
macro_rules! simd_input_512_bit {
    ($(#[$feat:meta])*) => {
        /// Input held as exactly one SIMD value.
        #[repr(C)]
        struct SimdInput {
            vals: [SimdU8Value; 1],
        }

        impl SimdInput {
            /// Loads the input from `ptr` via `SimdU8Value::load_from`.
            $(#[$feat])*
            #[inline]
            unsafe fn new(ptr: *const u8) -> Self {
                let chunk = SimdU8Value::load_from(ptr);
                Self { vals: [chunk] }
            }

            /// Loads only the first `len` bytes at `ptr` via
            /// `SimdU8Value::load_from_partial`, without going through a
            /// temporary buffer.
            $(#[$feat])*
            #[inline]
            unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
                let chunk = SimdU8Value::load_from_partial(ptr, len);
                Self { vals: [chunk] }
            }

            /// Reports whether the single stored value is ASCII-only, as
            /// decided by `SimdU8Value::is_ascii`.
            $(#[$feat])*
            #[inline]
            unsafe fn is_ascii(&self) -> bool {
                let only = &self.vals[0];
                only.is_ascii()
            }
        }
    };
}
13 changes: 13 additions & 0 deletions src/implementation/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ impl TempSimdChunkA16 {
#[allow(dead_code)] // only used if a 256-bit SIMD implementation is used
pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]);

/// Byte buffer of `SIMD_CHUNK_SIZE` bytes with 64-byte alignment.
#[repr(C, align(64))]
#[allow(dead_code)] // only used if a 512-bit SIMD implementation is used
pub(crate) struct TempSimdChunkA64(pub(crate) [u8; SIMD_CHUNK_SIZE]);

#[allow(dead_code)] // only used if there is a SIMD implementation
impl TempSimdChunkA32 {
#[flexpect::e(clippy::inline_always)]
Expand All @@ -148,6 +152,15 @@ impl TempSimdChunkA32 {
}
}

#[allow(dead_code)] // only used if there is a SIMD implementation
impl TempSimdChunkA64 {
/// Creates a chunk with every byte set to zero.
#[flexpect::e(clippy::inline_always)]
#[inline(always)] // needs to be forced because otherwise it is not inlined on armv7 neon
pub(crate) const fn new() -> Self {
Self([0; SIMD_CHUNK_SIZE])
}
}

#[derive(Clone, Copy)]
#[allow(dead_code)] // only used if there is a SIMD implementation
pub(crate) struct SimdU8Value<T>(pub(crate) T)
Expand Down
Loading
Loading