@@ -897,10 +897,20 @@ pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
897
897
packusdw ( a, b)
898
898
}
899
899
900
+ /// Permutes packed 32-bit integers from `a` according to the content of `b`.
901
+ ///
902
+ /// The lowest 3 bits of each integer of `b` are used as an index into the 8
903
+ /// integers of `a`.
904
+ #[ inline( always) ]
905
+ #[ target_feature = "+avx2" ]
906
+ #[ cfg_attr( test, assert_instr( vpermd) ) ]
907
+ pub unsafe fn _mm256_permutevar8x32_epi32 ( a : u32x8 , b : u32x8 ) -> u32x8 {
908
+ permd ( a, b)
909
+ }
910
+
900
911
// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
901
912
// TODO _mm256_permute4x64_epi64 (__m256i a, const int imm8)
902
913
// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
903
- // TODO _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
904
914
// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
905
915
906
916
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
@@ -914,8 +924,43 @@ pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
914
924
psadbw ( a, b)
915
925
}
916
926
927
+ /// Shuffle bytes from `a` according to the content of `b`.
928
+ ///
929
+ /// The lowest 4 bits of each byte of `b` select one of the 16 bytes in the
930
+ /// corresponding 128-bit half of `a`.
931
+ ///
932
+ /// In addition, if the most significant bit of a byte of `b` is set, the
933
+ /// respective destination byte is set to 0.
934
+ ///
935
+ /// The low and high 128-bit halves of the vectors are shuffled separately.
936
+ ///
937
+ /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
938
+ /// equivalent to:
939
+ ///
940
+ /// ```
941
+ /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
942
+ ///     let mut r = [0; 32];
943
+ ///     for i in 0..16 {
944
+ ///         // if the most significant bit of b is set,
945
+ ///         // then the destination byte is set to 0.
946
+ ///         if b[i] & 0x80 == 0u8 {
947
+ ///             r[i] = a[(b[i] % 16) as usize];
948
+ ///         }
949
+ ///         if b[i + 16] & 0x80 == 0u8 {
950
+ ///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
951
+ ///         }
952
+ ///     }
953
+ ///     r
954
+ /// }
955
+ /// ```
956
+ #[ inline( always) ]
957
+ #[ target_feature = "+avx2" ]
958
+ #[ cfg_attr( test, assert_instr( vpshufb) ) ]
959
+ pub unsafe fn _mm256_shuffle_epi8 ( a : u8x32 , b : u8x32 ) -> u8x32 {
960
+ pshufb ( a, b)
961
+ }
962
+
917
963
// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
918
- // TODO _mm256_shuffle_epi8 (__m256i a, __m256i b)
919
964
// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
920
965
// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
921
966
@@ -1430,7 +1475,10 @@ extern "C" {
1430
1475
fn psubusb ( a : u8x32 , b : u8x32 ) -> u8x32 ;
1431
1476
#[ link_name = "llvm.x86.avx2.psubus.w" ]
1432
1477
fn psubusw ( a : u16x16 , b : u16x16 ) -> u16x16 ;
1433
-
1478
+ #[ link_name = "llvm.x86.avx2.pshuf.b" ]
1479
+ fn pshufb ( a : u8x32 , b : u8x32 ) -> u8x32 ;
1480
+ #[ link_name = "llvm.x86.avx2.permd" ]
1481
+ fn permd ( a : u32x8 , b : u32x8 ) -> u32x8 ;
1434
1482
}
1435
1483
1436
1484
#[ cfg( test) ]
@@ -2566,4 +2614,37 @@ mod tests {
2566
2614
let r = avx2:: _mm256_alignr_epi8 ( a, b, 0 ) ;
2567
2615
assert_eq ! ( r, b) ;
2568
2616
}
2617
+
2618
+ #[ simd_test = "avx2" ]
2619
+ unsafe fn _mm256_shuffle_epi8 ( ) {
2620
+ let a = u8x32:: new ( // a[i] == i + 1, so every selected byte identifies its source index
2621
+ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 ,
2622
+ 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 ,
2623
+ 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 ,
2624
+ 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32
2625
+ ) ;
2626
+ let b = u8x32:: new ( // the same 16 selectors are used for both halves
2627
+ 4 , 128 , 4 , 3 , 24 , 12 , 6 , 19 , // 128 has the top bit set (expects 0); 24 and 19 wrap to 8 and 3
2628
+ 12 , 5 , 5 , 10 , 4 , 1 , 8 , 0 ,
2629
+ 4 , 128 , 4 , 3 , 24 , 12 , 6 , 19 ,
2630
+ 12 , 5 , 5 , 10 , 4 , 1 , 8 , 0 ,
2631
+ ) ;
2632
+ let expected = u8x32:: new (
2633
+ 5 , 0 , 5 , 4 , 9 , 13 , 7 , 4 , // lower half: bytes taken from a[0..16]
2634
+ 13 , 6 , 6 , 11 , 5 , 2 , 9 , 1 ,
2635
+ 21 , 0 , 21 , 20 , 25 , 29 , 23 , 20 , // upper half: same selectors, bytes taken from a[16..32]
2636
+ 29 , 22 , 22 , 27 , 21 , 18 , 25 , 17 ,
2637
+ ) ;
2638
+ let r = avx2:: _mm256_shuffle_epi8 ( a, b) ;
2639
+ assert_eq ! ( r, expected) ;
2640
+ }
2641
+
2642
+ #[ simd_test = "avx2" ]
2643
+ unsafe fn _mm256_permutevar8x32_epi32 ( ) {
2644
+ let a = u32x8:: new ( 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 ) ;
2645
+ let b = u32x8:: new ( 5 , 0 , 5 , 1 , 7 , 6 , 3 , 4 ) ; // indices into `a`; 5 is used twice, 2 is never used
2646
+ let expected = u32x8:: new ( 600 , 100 , 600 , 200 , 800 , 700 , 400 , 500 ) ; // a[5], a[0], a[5], a[1], a[7], a[6], a[3], a[4]
2647
+ let r = avx2:: _mm256_permutevar8x32_epi32 ( a, b) ;
2648
+ assert_eq ! ( r, expected) ;
2649
+ }
2569
2650
}
0 commit comments