Skip to content

Commit f36ccde

Browse files
sfackler, alexcrichton
authored and committed
Add _mm256_shuffle_epi8 and _mm256_permutevar8x32_epi32 (rust-lang#133)
* Add _mm256_shuffle_epi8 * Add _mm256_permutevar8x32_epi32
1 parent 50aced8 commit f36ccde

File tree

1 file changed

+84
-3
lines changed

1 file changed

+84
-3
lines changed

src/x86/avx2.rs

Lines changed: 84 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -897,10 +897,20 @@ pub unsafe fn _mm256_packus_epi32(a: i32x8, b: i32x8) -> u16x16 {
897897
packusdw(a, b)
898898
}
899899

900+
/// Permutes packed 32-bit integers from `a` according to the content of `b`.
901+
///
902+
/// The lowest 3 bits of each integer of `b` are used as an index into the 8
903+
/// integers of `a`; element `i` of the result is `a[b[i] & 7]`.
904+
#[inline(always)]
905+
#[target_feature = "+avx2"]
906+
#[cfg_attr(test, assert_instr(vpermd))]
907+
pub unsafe fn _mm256_permutevar8x32_epi32(a: u32x8, b: u32x8) -> u32x8 {
908+
permd(a, b)
909+
}
910+
900911
// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
901912
// TODO _mm256_permute4x64_epi64 (__m256i a, const int imm8)
902913
// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
903-
// TODO _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
904914
// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
905915

906916
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
@@ -914,8 +924,43 @@ pub unsafe fn _mm256_sad_epu8 (a: u8x32, b: u8x32) -> u64x4 {
914924
psadbw(a, b)
915925
}
916926

927+
/// Shuffle bytes from `a` according to the content of `b`.
928+
///
929+
/// The lowest 4 bits of each byte of `b` are used as an index into the 16
930+
/// bytes of the corresponding half of `a`.
931+
///
932+
/// In addition, if the most significant bit of a byte of `b` is set, the
933+
/// respective destination byte is set to 0.
934+
///
935+
/// The low and high halves of the vectors are shuffled separately.
936+
///
937+
/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
938+
/// equivalent to:
939+
///
940+
/// ```
941+
/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
942+
///     let mut r = [0; 32];
943+
///     for i in 0..16 {
944+
///         // if the most significant bit of b is set,
945+
///         // then the destination byte is set to 0.
946+
///         if b[i] & 0x80 == 0u8 {
947+
///             r[i] = a[(b[i] % 16) as usize];
948+
///         }
949+
///         if b[i + 16] & 0x80 == 0u8 {
950+
///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
951+
///         }
952+
///     }
953+
///     r
954+
/// }
955+
/// ```
956+
#[inline(always)]
957+
#[target_feature = "+avx2"]
958+
#[cfg_attr(test, assert_instr(vpshufb))]
959+
pub unsafe fn _mm256_shuffle_epi8(a: u8x32, b: u8x32) -> u8x32 {
960+
pshufb(a, b)
961+
}
962+
917963
// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
918-
// TODO _mm256_shuffle_epi8 (__m256i a, __m256i b)
919964
// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
920965
// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
921966

@@ -1430,7 +1475,10 @@ extern "C" {
14301475
fn psubusb(a: u8x32, b: u8x32) -> u8x32;
14311476
#[link_name = "llvm.x86.avx2.psubus.w"]
14321477
fn psubusw(a: u16x16, b: u16x16) -> u16x16;
1433-
1478+
#[link_name = "llvm.x86.avx2.pshuf.b"]
1479+
fn pshufb(a: u8x32, b: u8x32) -> u8x32;
1480+
#[link_name = "llvm.x86.avx2.permd"]
1481+
fn permd(a: u32x8, b: u32x8) -> u32x8;
14341482
}
14351483

14361484
#[cfg(test)]
@@ -2566,4 +2614,37 @@ mod tests {
25662614
let r = avx2::_mm256_alignr_epi8(a, b, 0);
25672615
assert_eq!(r, b);
25682616
}
2617+
2618+
#[simd_test = "avx2"]
2619+
unsafe fn _mm256_shuffle_epi8() { // checks per-half selection, index wrap-around and zeroing
2620+
let a = u8x32::new( // a[i] == i + 1, so each result byte reveals which index was picked
2621+
1, 2, 3, 4, 5, 6, 7, 8,
2622+
9, 10, 11, 12, 13, 14, 15, 16,
2623+
17, 18, 19, 20, 21, 22, 23, 24,
2624+
25, 26, 27, 28, 29, 30, 31, 32
2625+
);
2626+
let b = u8x32::new( // control bytes; the same 16 values are used for both halves
2627+
4, 128, 4, 3, 24, 12, 6, 19, // 128 has the top bit set (result 0); 24 and 19 wrap to 8 and 3
2628+
12, 5, 5, 10, 4, 1, 8, 0,
2629+
4, 128, 4, 3, 24, 12, 6, 19, // identical controls, applied to the upper 16 bytes of a
2630+
12, 5, 5, 10, 4, 1, 8, 0,
2631+
);
2632+
let expected = u8x32::new(
2633+
5, 0, 5, 4, 9, 13, 7, 4, // lower half: values taken from a[0..16]
2634+
13, 6, 6, 11, 5, 2, 9, 1,
2635+
21, 0, 21, 20, 25, 29, 23, 20, // upper half: same picks, offset by 16
2636+
29, 22, 22, 27, 21, 18, 25, 17,
2637+
);
2638+
let r = avx2::_mm256_shuffle_epi8(a, b);
2639+
assert_eq!(r, expected);
2640+
}
2641+
2642+
#[simd_test = "avx2"]
2643+
unsafe fn _mm256_permutevar8x32_epi32() { // checks that each result lane is a[b[i]]
2644+
let a = u32x8::new(100, 200, 300, 400, 500, 600, 700, 800); // a[i] == (i + 1) * 100
2645+
let b = u32x8::new(5, 0, 5, 1, 7, 6, 3, 4); // indices into a; 5 is used twice, 2 is never used
2646+
let expected = u32x8::new(600, 100, 600, 200, 800, 700, 400, 500); // a[5], a[0], a[5], a[1], ...
2647+
let r = avx2::_mm256_permutevar8x32_epi32(a, b);
2648+
assert_eq!(r, expected);
2649+
}
25692650
}

0 commit comments

Comments
 (0)