
Commit 8e0efaf

Use SIMD intrinsics for vector shifts
1 parent e11af1c commit 8e0efaf
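
This commit replaces the variable-count shift intrinsics' calls into LLVM `#[link_name]` builtins with portable generic SIMD operations: `simd_shl`/`simd_shr` plus a `simd_ge`/`simd_select` guard for lanes whose per-lane shift count reaches the lane width, which a bare generic shift would not handle. The observable behaviour of the public intrinsics is meant to stay the same. As a user-level illustration of the semantics involved (a minimal sketch, not part of the commit; the helper name `sllv_demo` is made up, and it assumes std on an x86_64 target with runtime AVX2 detection):

#[cfg(target_arch = "x86_64")]
fn sllv_demo() {
    if is_x86_feature_detected!("avx2") {
        use core::arch::x86_64::*;
        unsafe {
            // _mm_set_epi32 lists lanes high-to-low, so a = [1, 2, 4, 8] and
            // count = [0, 1, 2, 32] when read from the low lane upwards.
            let a = _mm_set_epi32(8, 4, 2, 1);
            let count = _mm_set_epi32(32, 2, 1, 0);
            let r = _mm_sllv_epi32(a, count);
            let mut out = [0i32; 4];
            _mm_storeu_si128(out.as_mut_ptr().cast(), r);
            // Each lane is shifted by its own count; the lane shifted by 32 is zeroed.
            assert_eq!(out, [1, 4, 16, 0]);
        }
    }
}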


3 files changed: +135 −87 lines changed


crates/core_arch/src/x86/avx2.rs

Lines changed: 50 additions & 30 deletions
@@ -2778,7 +2778,11 @@ pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsllvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psllvd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe {
+        let count = count.as_u32x4();
+        let overflow: u32x4 = simd_ge(count, u32x4::splat(32));
+        simd_select(overflow, u32x4::ZERO, simd_shl(a.as_u32x4(), count)).as_m128i()
+    }
 }

 /// Shifts packed 32-bit integers in `a` left by the amount
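
For reference (not part of the commit): VPSLLVD zeroes any lane whose shift count is 32 or more, while the generic `simd_shl` on its own does not define over-wide shifts, which is why the new body masks those lanes with `simd_ge` + `simd_select` first. A hypothetical scalar model of the same per-lane logic:

// Scalar model of _mm_sllv_epi32's per-lane behaviour (illustrative only).
fn sllv_epi32_model(a: [u32; 4], count: [u32; 4]) -> [u32; 4] {
    let mut r = [0u32; 4];
    for i in 0..4 {
        // Guard first, like the simd_ge/simd_select pair above: a plain `<<` by
        // 32 or more would not be a valid shift, so those lanes become zero.
        r[i] = if count[i] >= 32 { 0 } else { a[i] << count[i] };
    }
    r
}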
@@ -2791,7 +2795,11 @@ pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsllvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe {
+        let count = count.as_u32x8();
+        let overflow: u32x8 = simd_ge(count, u32x8::splat(32));
+        simd_select(overflow, u32x8::ZERO, simd_shl(a.as_u32x8(), count)).as_m256i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` left by the amount
@@ -2804,7 +2812,11 @@ pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsllvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psllvq(a.as_i64x2(), count.as_i64x2())) }
+    unsafe {
+        let count = count.as_u64x2();
+        let overflow: u64x2 = simd_ge(count, u64x2::splat(64));
+        simd_select(overflow, u64x2::ZERO, simd_shl(a.as_u64x2(), count)).as_m128i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` left by the amount
@@ -2817,7 +2829,11 @@ pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsllvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe {
+        let count = count.as_u64x4();
+        let overflow: u64x4 = simd_ge(count, u64x4::splat(64));
+        simd_select(overflow, u64x4::ZERO, simd_shl(a.as_u64x4(), count)).as_m256i()
+    }
 }

 /// Shifts packed 16-bit integers in `a` right by `count` while
@@ -2881,7 +2897,11 @@ pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsravd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psravd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe {
+        let count = count.as_i32x4();
+        let overflow: i32x4 = simd_ge(count, i32x4::splat(32));
+        simd_select(overflow, i32x4::ZERO, simd_shr(a.as_i32x4(), count)).as_m128i()
+    }
 }

 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
@@ -2893,7 +2913,11 @@ pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsravd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psravd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe {
+        let count = count.as_i32x8();
+        let overflow: i32x8 = simd_ge(count, i32x8::splat(32));
+        simd_select(overflow, i32x8::ZERO, simd_shr(a.as_i32x8(), count)).as_m256i()
+    }
 }

 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
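
The `srav` rewrites above keep the count as a signed vector and apply `simd_shr` to a signed type, i.e. an arithmetic shift, so negative lanes are sign-filled rather than zero-filled. A usage sketch (not part of the commit; hypothetical helper name, same x86_64 + runtime AVX2 assumptions as before, with shift counts kept below 32):

#[cfg(target_arch = "x86_64")]
fn srav_demo() {
    if is_x86_feature_detected!("avx2") {
        use core::arch::x86_64::*;
        unsafe {
            // Lanes read low-to-high: a = [8, -1, 16, -16], count = [3, 4, 2, 2].
            let a = _mm_set_epi32(-16, 16, -1, 8);
            let count = _mm_set_epi32(2, 2, 4, 3);
            let r = _mm_srav_epi32(a, count);
            let mut out = [0i32; 4];
            _mm_storeu_si128(out.as_mut_ptr().cast(), r);
            // Arithmetic shift: negative lanes keep their sign.
            assert_eq!(out, [1, -1, 4, -4]);
        }
    }
}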
@@ -3076,7 +3100,11 @@ pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) }
+    unsafe {
+        let count = count.as_u32x4();
+        let overflow: u32x4 = simd_ge(count, u32x4::splat(32));
+        simd_select(overflow, u32x4::ZERO, simd_shr(a.as_u32x4(), count)).as_m128i()
+    }
 }

 /// Shifts packed 32-bit integers in `a` right by the amount specified by
@@ -3088,7 +3116,11 @@ pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsrlvd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) }
+    unsafe {
+        let count = count.as_u32x8();
+        let overflow: u32x8 = simd_ge(count, u32x8::splat(32));
+        simd_select(overflow, u32x8::ZERO, simd_shr(a.as_u32x8(), count)).as_m256i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3100,7 +3132,11 @@ pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) }
+    unsafe {
+        let count = count.as_u64x2();
+        let overflow: u64x2 = simd_ge(count, u64x2::splat(64));
+        simd_select(overflow, u64x2::ZERO, simd_shr(a.as_u64x2(), count)).as_m128i()
+    }
 }

 /// Shifts packed 64-bit integers in `a` right by the amount specified by
@@ -3112,7 +3148,11 @@ pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
 #[cfg_attr(test, assert_instr(vpsrlvq))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
 pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) }
+    unsafe {
+        let count = count.as_u64x4();
+        let overflow: u64x4 = simd_ge(count, u64x4::splat(64));
+        simd_select(overflow, u64x4::ZERO, simd_shr(a.as_u64x4(), count)).as_m256i()
+    }
 }

 /// Load 256-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
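
The `srlv` rewrites above use `simd_shr` on unsigned types, i.e. a logical shift, and like the left shifts they zero any lane whose count reaches the lane width. A sketch for the 64-bit form (not part of the commit; hypothetical helper name, same x86_64 + runtime AVX2 assumptions):

#[cfg(target_arch = "x86_64")]
fn srlv_demo() {
    if is_x86_feature_detected!("avx2") {
        use core::arch::x86_64::*;
        unsafe {
            // Lanes read low-to-high: a = [all-ones, 256], count = [1, 64].
            let a = _mm_set_epi64x(256, -1);
            let count = _mm_set_epi64x(64, 1);
            let r = _mm_srlv_epi64(a, count);
            let mut out = [0u64; 2];
            _mm_storeu_si128(out.as_mut_ptr().cast(), r);
            // Logical shift: the all-ones lane becomes u64::MAX >> 1; the lane
            // shifted by 64 is zeroed.
            assert_eq!(out, [u64::MAX >> 1, 0]);
        }
    }
}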
@@ -3687,36 +3727,16 @@ unsafe extern "C" {
     fn pslld(a: i32x8, count: i32x4) -> i32x8;
     #[link_name = "llvm.x86.avx2.psll.q"]
     fn psllq(a: i64x4, count: i64x2) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psllv.d"]
-    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psllv.d.256"]
-    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psllv.q"]
-    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.psllv.q.256"]
-    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
     #[link_name = "llvm.x86.avx2.psra.w"]
     fn psraw(a: i16x16, count: i16x8) -> i16x16;
     #[link_name = "llvm.x86.avx2.psra.d"]
     fn psrad(a: i32x8, count: i32x4) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psrav.d"]
-    fn psravd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psrav.d.256"]
-    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx2.psrl.w"]
     fn psrlw(a: i16x16, count: i16x8) -> i16x16;
     #[link_name = "llvm.x86.avx2.psrl.d"]
     fn psrld(a: i32x8, count: i32x4) -> i32x8;
     #[link_name = "llvm.x86.avx2.psrl.q"]
     fn psrlq(a: i64x4, count: i64x2) -> i64x4;
-    #[link_name = "llvm.x86.avx2.psrlv.d"]
-    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
-    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
-    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
-    #[link_name = "llvm.x86.avx2.psrlv.q"]
-    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
-    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
-    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
     #[link_name = "llvm.x86.avx2.pshuf.b"]
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 45 additions & 30 deletions
@@ -6852,7 +6852,11 @@ pub fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe {
+        let count = count.as_u16x32();
+        let overflow: u16x32 = simd_ge(count, u16x32::splat(16));
+        simd_select(overflow, u16x32::ZERO, simd_shl(a.as_u16x32(), count)).as_m512i()
+    }
 }

 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6891,7 +6895,11 @@ pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe {
+        let count = count.as_u16x16();
+        let overflow: u16x16 = simd_ge(count, u16x16::splat(16));
+        simd_select(overflow, u16x16::ZERO, simd_shl(a.as_u16x16(), count)).as_m256i()
+    }
 }

 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6930,7 +6938,11 @@ pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsllvw))]
 pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe {
+        let count = count.as_u16x8();
+        let overflow: u16x8 = simd_ge(count, u16x8::splat(16));
+        simd_select(overflow, u16x8::ZERO, simd_shl(a.as_u16x8(), count)).as_m128i()
+    }
 }

 /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
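
The 16-bit variants follow the same pattern with a lane width of 16, so any lane whose count is 16 or more is cleared. A usage sketch for `_mm_sllv_epi16` (not part of the commit; hypothetical helper name, assumes an x86_64 target with runtime AVX-512BW and AVX-512VL detection and a toolchain where these intrinsics are stable):

#[cfg(target_arch = "x86_64")]
fn sllv_epi16_demo() {
    if is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512vl") {
        use core::arch::x86_64::*;
        unsafe {
            // All lanes of `a` are 1; counts read low-to-high are [0, 1, 2, 3, 14, 15, 16, 17].
            let a = _mm_set1_epi16(1);
            let count = _mm_set_epi16(17, 16, 15, 14, 3, 2, 1, 0);
            let r = _mm_sllv_epi16(a, count);
            let mut out = [0u16; 8];
            _mm_storeu_si128(out.as_mut_ptr().cast(), r);
            // Counts of 16 or more zero the 16-bit lane.
            assert_eq!(out, [1, 2, 4, 8, 1 << 14, 1 << 15, 0, 0]);
        }
    }
}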
@@ -7188,7 +7200,11 @@ pub fn _mm_maskz_srli_epi16<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe {
+        let count = count.as_u16x32();
+        let overflow: u16x32 = simd_ge(count, u16x32::splat(16));
+        simd_select(overflow, u16x32::ZERO, simd_shr(a.as_u16x32(), count)).as_m512i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7227,7 +7243,11 @@ pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe {
+        let count = count.as_u16x16();
+        let overflow: u16x16 = simd_ge(count, u16x16::splat(16));
+        simd_select(overflow, u16x16::ZERO, simd_shr(a.as_u16x16(), count)).as_m256i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7266,7 +7286,11 @@ pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsrlvw))]
 pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe {
+        let count = count.as_u16x8();
+        let overflow: u16x8 = simd_ge(count, u16x8::splat(16));
+        simd_select(overflow, u16x8::ZERO, simd_shr(a.as_u16x8(), count)).as_m128i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7511,7 +7535,11 @@ pub fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i {
-    unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) }
+    unsafe {
+        let count = count.as_i16x32();
+        let overflow: i16x32 = simd_ge(count, i16x32::splat(16));
+        simd_select(overflow, i16x32::ZERO, simd_shr(a.as_i16x32(), count)).as_m512i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7550,7 +7578,11 @@ pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m5
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i {
-    unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) }
+    unsafe {
+        let count = count.as_i16x16();
+        let overflow: i16x16 = simd_ge(count, i16x16::splat(16));
+        simd_select(overflow, i16x16::ZERO, simd_shr(a.as_i16x16(), count)).as_m256i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -7589,7 +7621,11 @@ pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m2
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpsravw))]
 pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i {
-    unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) }
+    unsafe {
+        let count = count.as_i16x8();
+        let overflow: i16x8 = simd_ge(count, i16x8::splat(16));
+        simd_select(overflow, i16x8::ZERO, simd_shr(a.as_i16x8(), count)).as_m128i()
+    }
 }

 /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -11645,33 +11681,12 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.psll.w.512"]
     fn vpsllw(a: i16x32, count: i16x8) -> i16x32;

-    #[link_name = "llvm.x86.avx512.psllv.w.512"]
-    fn vpsllvw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psllv.w.256"]
-    fn vpsllvw256(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psllv.w.128"]
-    fn vpsllvw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.psrl.w.512"]
     fn vpsrlw(a: i16x32, count: i16x8) -> i16x32;

-    #[link_name = "llvm.x86.avx512.psrlv.w.512"]
-    fn vpsrlvw(a: i16x32, b: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psrlv.w.256"]
-    fn vpsrlvw256(a: i16x16, b: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psrlv.w.128"]
-    fn vpsrlvw128(a: i16x8, b: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.psra.w.512"]
     fn vpsraw(a: i16x32, count: i16x8) -> i16x32;

-    #[link_name = "llvm.x86.avx512.psrav.w.512"]
-    fn vpsravw(a: i16x32, count: i16x32) -> i16x32;
-    #[link_name = "llvm.x86.avx512.psrav.w.256"]
-    fn vpsravw256(a: i16x16, count: i16x16) -> i16x16;
-    #[link_name = "llvm.x86.avx512.psrav.w.128"]
-    fn vpsravw128(a: i16x8, count: i16x8) -> i16x8;
-
     #[link_name = "llvm.x86.avx512.vpermi2var.hi.512"]
     fn vpermi2w(a: i16x32, idx: i16x32, b: i16x32) -> i16x32;
     #[link_name = "llvm.x86.avx512.vpermi2var.hi.256"]
