Closed
Description
#include <emmintrin.h>
#include <stdint.h>
/*
 * Reproducer, shifted variant.
 *
 * Doubles `in` (unsigned, so the multiply wraps modulo 2^32), places the
 * result in the low 32 bits of a vector whose remaining 96 bits are zero,
 * then copies 16-bit word 0 into each of the four low words. The upper
 * 64 bits are passed through by the shuffle unchanged.
 */
__m128i a(uint32_t in)
{
    const uint32_t doubled = in * 2;
    const __m128i lane0 = _mm_cvtsi32_si128(doubled);
    return _mm_shufflelo_epi16(lane0, _MM_SHUFFLE(0, 0, 0, 0));
}
/*
 * Reproducer, baseline variant (no arithmetic on the input).
 *
 * Places `in` in the low 32 bits of a vector whose remaining 96 bits are
 * zero, then copies 16-bit word 0 into each of the four low words. The
 * upper 64 bits are passed through by the shuffle unchanged.
 */
__m128i b(uint32_t in)
{
    const __m128i lane0 = _mm_cvtsi32_si128(in);
    return _mm_shufflelo_epi16(lane0, _MM_SHUFFLE(0, 0, 0, 0));
}
Expected:
a(unsigned int):
movd xmm0, edi
pslld xmm0, 1
pshuflw xmm0, xmm0, 0
ret
b(unsigned int):
movd xmm0, edi
pshuflw xmm0, xmm0, 0
ret
(Alternatively, do a scalar shift, as GCC does; I have no preference either way.)
Actual:
a(unsigned int):
movd xmm0, edi
pslld xmm0, 1
xorps xmm1, xmm1
movss xmm1, xmm0
pshuflw xmm0, xmm1, 0
ret
b(unsigned int):
movd xmm0, edi
pshuflw xmm0, xmm0, 0
ret
https://godbolt.org/z/W96qqGKc6
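The xorps+movss sequence copies the low 32 bits of xmm0 into xmm1 and zeroes the rest of xmm1. This is completely unnecessary: movd already zeroes the upper 96 bits of xmm0, and pslld shifts each 32-bit lane independently, so those bits are still zero when pshuflw reads the register.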