rusticstuff · hkratz · Nov 20, 2025 · Nov 21, 2025 · Nov 21, 2025 · Nov 21, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -138,11 +138,21 @@ jobs:
         run: cargo +stable install rustfilt
       - name: Check x86_64 inlining
         run: |
-          ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt
-          ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp"
-          RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt
           RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx2.txt --no-default-features
           RUSTFLAGS="-C target-feature=+sse4.2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-sse42.txt --no-default-features
+      - name: Check x86_64 inlining with avx2 autoselection
+        run: |
+          ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt
+          ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt "--features public_imp"
+          RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt
+        if: ${{ matrix.toolchain == '1.38.0' }}
+      - name: Check x86_64 inlining with avx512 autoselection
+        run: |
+          ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt
+          ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp"
+          RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx512.txt
+          RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx512.txt --no-default-features
+        if: ${{ matrix.toolchain != '1.38.0' }}
       - uses: dtolnay/rust-toolchain@master
         with:
           toolchain: ${{ matrix.toolchain }}

diff --git a/Cargo.toml b/Cargo.toml
@@ -53,3 +53,6 @@ targets = ["aarch64-unknown-linux-gnu", "wasm32-unknown-unknown", "wasm32-wasip1
 
 [dependencies]
 flexpect = "0.1.1"
+
+[build-dependencies]
+rustversion = "1.0.22"
diff --git a/README.md b/README.md
@@ -13,11 +13,12 @@ This library has been thoroughly tested with sample data as well as fuzzing and
 ## Features
 * `basic` API for the fastest validation, optimized for valid UTF-8
 * `compat` API as a fully compatible replacement for `std::str::from_utf8()`
+* 🆕 AVX 512 support on modern x86/x86-64 CPUs since Rust 1.89
 * Supports AVX 2 and SSE 4.2 implementations on x86 and x86-64
 * ARM64 (aarch64) SIMD is supported since Rust 1.61
 * WASM (wasm32) SIMD is supported
 * 🆕 armv7 NEON support with the `armv7_neon` feature on nightly Rust
-* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCI
+* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII
 * aarch64: Up to eleven times faster than the std library on valid non-ASCII, up to four times faster on ASCII (Apple Silicon)
 * Faster than the original simdjson implementation
 * Selects the fastest implementation at runtime based on CPU support (on x86)
@@ -71,14 +72,17 @@ This comes at a slight performance penalty compared to the `basic` API even if t
 ## Implementation selection
 
 ### X86
-The fastest implementation is selected at runtime using the `std::is_x86_feature_detected!` macro, unless the CPU
-targeted by the compiler supports the fastest available implementation.
-So if you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine, the AVX 2 implementation is selected at
-compile-time and runtime selection is disabled.
+The fastest implementation is usually selected at runtime using the `std::is_x86_feature_detected!` macro. The AVX 512
+implementation is, however, only selected if the CPU supports the VBMI2 feature, to avoid the throttling that occurs on CPUs
+before Intel's Ice Lake microarchitecture.
+
+If you compile with Rust 1.89 or later and `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine which supports AVX 512,
+the AVX 512 implementation is selected at compile-time and runtime selection is disabled.
 
 For no-std support (compiled with `--no-default-features`) the implementation is always selected at compile time based on
 the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"`
-for the SSE 4.2 implementation.
+for the SSE 4.2 implementation. For AVX 512 use `RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2"` with 
+Rust 1.89 or later.
 
 ### ARM64
 The SIMD implementation is used automatically since Rust 1.61.

diff --git a/build.rs b/build.rs
@@ -0,0 +1,17 @@
+fn main() {
+    println!("cargo::rustc-check-cfg=cfg(avx512_stable)");
+    // `if rustversion::cfg!(...)` is not supported in older Rust versions
+    if avx512_stable() {
+        println!("cargo:rustc-cfg=avx512_stable");
+    }
+}
+
+#[rustversion::since(1.89)]
+fn avx512_stable() -> bool {
+    true
+}
+
+#[rustversion::before(1.89)]
+fn avx512_stable() -> bool {
+    false
+}
diff --git a/inlining/expected-methods-x86-nostd-avx512.txt b/inlining/expected-methods-x86-nostd-avx512.txt
@@ -0,0 +1,5 @@
+simdutf8::implementation::helpers::get_compat_error
+simdutf8::implementation::x86::validate_utf8_basic
+simdutf8::implementation::x86::validate_utf8_basic_avx512
+simdutf8::implementation::x86::validate_utf8_compat
+simdutf8::implementation::x86::validate_utf8_compat_avx512
diff --git a/inlining/expected-methods-x86-std-avx512.txt b/inlining/expected-methods-x86-std-avx512.txt
@@ -0,0 +1,5 @@
+simdutf8::implementation::helpers::get_compat_error
+simdutf8::implementation::x86::validate_utf8_basic
+simdutf8::implementation::x86::validate_utf8_basic_avx512
+simdutf8::implementation::x86::validate_utf8_compat
+simdutf8::implementation::x86::validate_utf8_compat_avx512
diff --git a/inlining/expected-methods-x86-std-old.txt b/inlining/expected-methods-x86-std-old.txt
@@ -0,0 +1,9 @@
+simdutf8::implementation::helpers::get_compat_error
+simdutf8::implementation::validate_utf8_basic_fallback
+simdutf8::implementation::validate_utf8_compat_fallback
+simdutf8::implementation::x86::avx2::validate_utf8_basic
+simdutf8::implementation::x86::avx2::validate_utf8_compat
+simdutf8::implementation::x86::sse42::validate_utf8_basic
+simdutf8::implementation::x86::sse42::validate_utf8_compat
+simdutf8::implementation::x86::validate_utf8_basic::get_fastest
+simdutf8::implementation::x86::validate_utf8_compat::get_fastest
diff --git a/inlining/expected-methods-x86-std.txt b/inlining/expected-methods-x86-std.txt
@@ -3,6 +3,8 @@ simdutf8::implementation::validate_utf8_basic_fallback
 simdutf8::implementation::validate_utf8_compat_fallback
 simdutf8::implementation::x86::avx2::validate_utf8_basic
 simdutf8::implementation::x86::avx2::validate_utf8_compat
+simdutf8::implementation::x86::avx512::validate_utf8_basic
+simdutf8::implementation::x86::avx512::validate_utf8_compat
 simdutf8::implementation::x86::sse42::validate_utf8_basic
 simdutf8::implementation::x86::sse42::validate_utf8_compat
 simdutf8::implementation::x86::validate_utf8_basic::get_fastest

diff --git a/src/basic.rs b/src/basic.rs
@@ -197,6 +197,16 @@ pub mod imp {
     /// Includes the x86/x86-64 SIMD implementations.
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     pub mod x86 {
+        /// Includes the validation implementation for AVX 512-compatible CPUs.
+        ///
+        /// Using the provided functionality on CPUs which do not support AVX 512 is undefined
+        /// behavior and will very likely cause a crash.
+        #[cfg(avx512_stable)]
+        pub mod avx512 {
+            pub use crate::implementation::x86::avx512::validate_utf8_basic as validate_utf8;
+            pub use crate::implementation::x86::avx512::ChunkedUtf8ValidatorImp;
+            pub use crate::implementation::x86::avx512::Utf8ValidatorImp;
+        }
         /// Includes the validation implementation for AVX 2-compatible CPUs.
         ///
         /// Using the provided functionality on CPUs which do not support AVX 2 is undefined

diff --git a/src/compat.rs b/src/compat.rs
@@ -105,6 +105,11 @@ pub mod imp {
     /// Includes the x86/x86-64 SIMD implementations.
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     pub mod x86 {
+        /// Includes the validation implementation for AVX 512-compatible CPUs.
+        #[cfg(avx512_stable)]
+        pub mod avx512 {
+            pub use crate::implementation::x86::avx512::validate_utf8_compat as validate_utf8;
+        }
         /// Includes the validation implementation for AVX 2-compatible CPUs.
         pub mod avx2 {
             pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8;

diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs
@@ -182,7 +182,10 @@ macro_rules! algorithm_simd {
             unsafe fn check_block(&mut self, input: SimdInput) {
                 // WORKAROUND
                 // necessary because the for loop is not unrolled on ARM64
-                if input.vals.len() == 2 {
+                if input.vals.len() == 1 {
+                    self.check_bytes(*input.vals.as_ptr());
+                    self.incomplete = Self::is_incomplete(*input.vals.as_ptr());
+                } else if input.vals.len() == 2 {
                     self.check_bytes(*input.vals.as_ptr());
                     self.check_bytes(*input.vals.as_ptr().add(1));
                     self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1));
@@ -237,13 +240,7 @@ macro_rules! algorithm_simd {
             }
 
             if idx < len {
-                let mut tmpbuf = TempSimdChunk::new();
-                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
-                    input.as_ptr().add(idx),
-                    tmpbuf.0.as_mut_ptr(),
-                    len - idx,
-                );
-                let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
+                let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx);
                 algorithm.check_utf8(simd_input);
             }
             algorithm.check_incomplete_pending();
@@ -329,14 +326,7 @@ macro_rules! algorithm_simd {
                 break;
             }
             if idx < len {
-                let mut tmpbuf = TempSimdChunk::new();
-                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
-                    input.as_ptr().add(idx),
-                    tmpbuf.0.as_mut_ptr(),
-                    len - idx,
-                );
-                let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
-
+                let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx);
                 algorithm.check_utf8(simd_input);
             }
             algorithm.check_incomplete_pending();
@@ -534,6 +524,18 @@ macro_rules! simd_input_128_bit {
                 }
             }
 
+            $(#[$feat])*
+            #[inline]
+            unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                let mut tmpbuf = TempSimdChunk::new();
+                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
+                    ptr,
+                    tmpbuf.0.as_mut_ptr(),
+                    len,
+                );
+                Self::new(tmpbuf.0.as_ptr())
+            }
+
             $(#[$feat])*
             #[inline]
             unsafe fn is_ascii(&self) -> bool {
@@ -565,6 +567,18 @@ macro_rules! simd_input_256_bit {
                 }
             }
 
+            $(#[$feat])*
+            #[inline]
+            unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                let mut tmpbuf = TempSimdChunk::new();
+                crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
+                    ptr,
+                    tmpbuf.0.as_mut_ptr(),
+                    len,
+                );
+                Self::new(tmpbuf.0.as_ptr())
+            }
+
             $(#[$feat])*
             #[inline]
             unsafe fn is_ascii(&self) -> bool {
@@ -573,3 +587,41 @@ macro_rules! simd_input_256_bit {
         }
     };
 }
+
+macro_rules! simd_input_512_bit {
+    ($(#[$feat:meta])*) => {
+        #[repr(C)]
+        struct SimdInput {
+            vals: [SimdU8Value; 1],
+        }
+
+        impl SimdInput {
+            $(#[$feat])*
+            #[inline]
+            unsafe fn new(ptr: *const u8) -> Self {
+                Self {
+                    vals: [
+                        SimdU8Value::load_from(ptr),
+                    ],
+                }
+            }
+
+
+            $(#[$feat])*
+            #[inline]
+            unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
+                Self {
+                    vals: [
+                        SimdU8Value::load_from_partial(ptr, len),
+                    ],
+                }
+            }
+
+            $(#[$feat])*
+            #[inline]
+            unsafe fn is_ascii(&self) -> bool {
+                self.vals[0].is_ascii()
+            }
+        }
+    };
+}
diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs
@@ -139,6 +139,10 @@ impl TempSimdChunkA16 {
 #[allow(dead_code)] // only used if a 256-bit SIMD implementation is used
 pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]);
 
+#[repr(C, align(64))]
+#[allow(dead_code)] // 64-byte aligned variant; presumably only used if a 512-bit SIMD implementation is used
+pub(crate) struct TempSimdChunkA64(pub(crate) [u8; SIMD_CHUNK_SIZE]);
+
 #[allow(dead_code)] // only used if there is a SIMD implementation
 impl TempSimdChunkA32 {
     #[flexpect::e(clippy::inline_always)]
@@ -148,6 +152,15 @@ impl TempSimdChunkA32 {
     }
 }
 
+#[allow(dead_code)] // only used if there is a SIMD implementation
+impl TempSimdChunkA64 {
+    #[flexpect::e(clippy::inline_always)]
+    #[inline(always)] // needs to be forced because otherwise it is not inlined on armv7 neon
+    pub(crate) const fn new() -> Self {
+        Self([0; SIMD_CHUNK_SIZE])
+    }
+}
+
 #[derive(Clone, Copy)]
 #[allow(dead_code)] // only used if there is a SIMD implementation
 pub(crate) struct SimdU8Value<T>(pub(crate) T)