Skip to content

Commit a672463

Browse files
committed
avx512: use masked load for incomplete last block
1 parent eb704c0 commit a672463

File tree

2 files changed

+48
-18
lines changed

2 files changed

+48
-18
lines changed

src/implementation/algorithm.rs

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -240,13 +240,7 @@ macro_rules! algorithm_simd {
240240
}
241241

242242
if idx < len {
243-
let mut tmpbuf = TempSimdChunk::new();
244-
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
245-
input.as_ptr().add(idx),
246-
tmpbuf.0.as_mut_ptr(),
247-
len - idx,
248-
);
249-
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
243+
let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx);
250244
algorithm.check_utf8(simd_input);
251245
}
252246
algorithm.check_incomplete_pending();
@@ -332,14 +326,7 @@ macro_rules! algorithm_simd {
332326
break;
333327
}
334328
if idx < len {
335-
let mut tmpbuf = TempSimdChunk::new();
336-
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
337-
input.as_ptr().add(idx),
338-
tmpbuf.0.as_mut_ptr(),
339-
len - idx,
340-
);
341-
let simd_input = SimdInput::new(tmpbuf.0.as_ptr());
342-
329+
let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx);
343330
algorithm.check_utf8(simd_input);
344331
}
345332
algorithm.check_incomplete_pending();
@@ -537,6 +524,18 @@ macro_rules! simd_input_128_bit {
537524
}
538525
}
539526

527+
$(#[$feat])*
528+
#[inline]
529+
unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
530+
let mut tmpbuf = TempSimdChunk::new();
531+
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
532+
ptr,
533+
tmpbuf.0.as_mut_ptr(),
534+
len,
535+
);
536+
Self::new(tmpbuf.0.as_ptr())
537+
}
538+
540539
$(#[$feat])*
541540
#[inline]
542541
unsafe fn is_ascii(&self) -> bool {
@@ -568,6 +567,18 @@ macro_rules! simd_input_256_bit {
568567
}
569568
}
570569

570+
$(#[$feat])*
571+
#[inline]
572+
unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
573+
let mut tmpbuf = TempSimdChunk::new();
574+
crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64(
575+
ptr,
576+
tmpbuf.0.as_mut_ptr(),
577+
len,
578+
);
579+
Self::new(tmpbuf.0.as_ptr())
580+
}
581+
571582
$(#[$feat])*
572583
#[inline]
573584
unsafe fn is_ascii(&self) -> bool {
@@ -595,6 +606,17 @@ macro_rules! simd_input_512_bit {
595606
}
596607
}
597608

609+
610+
$(#[$feat])*
611+
#[inline]
612+
unsafe fn new_partial(ptr: *const u8, len: usize) -> Self {
613+
Self {
614+
vals: [
615+
SimdU8Value::load_from_partial(ptr, len),
616+
],
617+
}
618+
}
619+
598620
$(#[$feat])*
599621
#[inline]
600622
unsafe fn is_ascii(&self) -> bool {

src/implementation/x86/avx512.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
33
use core::arch::x86_64::{
44
__m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_cmpgt_epi8_mask, _mm512_loadu_si512,
5-
_mm512_maskz_abs_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8,
6-
_mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16,
7-
_mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0,
5+
_mm512_maskz_abs_epi8, _mm512_maskz_loadu_epi8, _mm512_or_si512, _mm512_permutex2var_epi64,
6+
_mm512_set1_epi8, _mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8,
7+
_mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch,
8+
_MM_HINT_T0,
89
};
910
use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8};
1011

@@ -106,6 +107,13 @@ impl SimdU8Value {
106107
Self::from(_mm512_loadu_si512(ptr.cast::<__m512i>()))
107108
}
108109

110+
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
111+
#[inline]
112+
unsafe fn load_from_partial(ptr: *const u8, len: usize) -> Self {
113+
let res = _mm512_maskz_loadu_epi8(u64::MAX >> (64 - len), ptr.cast::<i8>());
114+
Self::from(res)
115+
}
116+
109117
#[flexpect::e(clippy::too_many_arguments)]
110118
#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]
111119
#[inline]

0 commit comments

Comments
 (0)