Closed
Description
Here's my code:
#[bench]
fn simd_paeth_1(bench: &mut Bencher) {
    // Benchmarks a SIMD formulation of the Paeth predictor: 1000 evaluations
    // per `bench.iter` call, all on the same fixed inputs.
    //
    // SAFETY (assumed, not verified here): the `x86::*` intrinsics below are
    // only sound on a CPU that supports the instructions they map to —
    // TODO confirm how the build enables the required SSE extensions.
    unsafe {
        // Sample bytes: the leading hexadecimal digits of pi.
        let px_a = [0x32, 0x43, 0xf6, 0xa8];
        let px_b = [0x88, 0x5a, 0x30, 0x8d];
        let px_c = [0x31, 0x31, 0x98, 0xa2];

        // One value per 16-bit lane. `_mm_set_epi16` takes the highest lane
        // first, so `hi_b_lo_a` carries b in lanes 4..7 and a in lanes 0..3,
        // while `both_c` carries c in both halves.
        let hi_b_lo_a = x86::_mm_set_epi16(
            px_b[3], px_b[2], px_b[1], px_b[0], px_a[3], px_a[2], px_a[1], px_a[0],
        );
        let both_c = x86::_mm_set_epi16(
            px_c[3], px_c[2], px_c[1], px_c[0], px_c[3], px_c[2], px_c[1], px_c[0],
        );
        // Not referenced anywhere below; retained from the original snippet.
        let all_ones = x86::_mm_set1_epi16(-1);

        bench.iter(|| {
            test::black_box(px_a);
            test::black_box(px_b);
            test::black_box(px_c);

            for round in 0..1000 {
                test::black_box(round);

                // Signed distances. With p = a + b - c:
                //   low half  = a - c = p - b, high half = b - c = p - a.
                let signed_pa_pb = x86::_mm_sub_epi16(hi_b_lo_a, both_c);
                // Immediate 0b01001110 exchanges the low and high 64-bit halves.
                let signed_pb_pa = x86::_mm_shuffle_epi32(signed_pa_pb, 0b01001110);
                // (p - a) + (p - b) = p - c, replicated in both halves.
                let signed_pc_pc = x86::_mm_add_epi16(signed_pb_pa, signed_pa_pb);

                // Absolute distances.
                let dist_pa_pb = x86::_mm_abs_epi16(signed_pa_pb);
                let dist_pc_pc = x86::_mm_abs_epi16(signed_pc_pc);
                let dist_pb_pa = x86::_mm_shuffle_epi32(dist_pa_pb, 0b01001110);

                // Lane-wise minima over the distances.
                let smallest_bc = x86::_mm_min_epi16(dist_pa_pb, dist_pc_pc);
                let smallest_abc = x86::_mm_min_epi16(dist_pb_pa, smallest_bc);

                // Select between b and c.
                let take_b_or_c = x86::_mm_cmpeq_epi16(dist_pa_pb, smallest_bc);
                // Whole-register byte shift of the packed b/a vector by 4 bytes.
                let shifted_ba = x86::_mm_slli_si128(hi_b_lo_a, 4);
                let chosen_b_or_c = x86::_mm_blendv_epi8(shifted_ba, both_c, take_b_or_c);

                // Select a where required.
                let take_a = x86::_mm_cmpeq_epi16(dist_pb_pa, smallest_abc);
                let predicted = x86::_mm_blendv_epi8(hi_b_lo_a, chosen_b_or_c, take_a);

                // Keep the result observable so the computation is not discarded.
                test::black_box(predicted);
            }
        });
    }
}
Note that `_mm_blendv_epi8` fails to inline, ruining performance.
This happens a lot and it makes using SIMD intrinsics very annoying. I have to start using inline asm.