Closed
Description
#include <x86intrin.h>
// External callee; only declared here, its definition is not part of this reproducer.
void func();
// Reproducer (high-bit case): calls func() when bit 3 (0x8) of the
// movemask computed from `mask` is set.
void scatter_mask_vector_i32_epi32(__m128 mask) {
// 0x8 selects the top bit of the 4-bit movemask result.
if (_mm_movemask_ps(mask) & 0x8)
func();
}
This is optimized to the following IR, which is then lowered to the x86 assembly shown after it:
; Optimized IR for the 0x8 test: calls @func()() when the top bit of the
; i4 built from the lane sign bits of %mask is set.
define void @scatter_mask_vector_i32_epi32(<4 x float> noundef %mask) {
entry:
; Reinterpret the four float lanes as i32 so they can be compared as integers.
%0 = bitcast <4 x float> %mask to <4 x i32>
; Per-lane "less than zero" test, i.e. one i1 per lane sign bit.
%1 = icmp slt <4 x i32> %0, zeroinitializer
; Pack the four i1 results into a single i4.
%2 = bitcast <4 x i1> %1 to i4
; The `& 0x8` test has been turned into a signed compare on the i4:
; %2 > -1 holds exactly when the top bit of the i4 is clear.
%tobool.not = icmp sgt i4 %2, -1
; Top bit clear -> skip the call; top bit set -> if.then.
br i1 %tobool.not, label %if.end, label %if.then
if.then:
tail call void @func()()
br label %if.end
if.end:
ret void
}
declare void @func()() local_unnamed_addr
# Generated x86-64 code (AT&T syntax) for the 0x8 test. This is the
# sequence the report complains about: the i4 sign test is done by
# sign-extending the low 4 bits of al instead of testing bit 3 directly.
scatter_mask_vector_i32_epi32:
# eax = mask built from the sign bits of the four float lanes in xmm0.
vmovmskps %xmm0, %eax
# Shift left by 4: bit 3 of the mask becomes bit 7 (the sign bit) of al.
shlb $4, %al
# Arithmetic shift right by 4: al now holds the low 4 bits sign-extended;
# the flags written here feed the js below.
sarb $4, %al
# Sign flag set <=> bit 3 of the original mask was set -> tail-call func().
js func()@PLT # TAILCALL
retq
Changing the bit test to a lower bit in the mask creates much simpler code:
#include <x86intrin.h>
// External callee; only declared here, its definition is not part of this reproducer.
void func();
// Reproducer (low-bit case): identical to the first example except that it
// tests bit 1 (0x2) of the movemask instead of bit 3 (0x8).
void scatter_mask_vector_i32_epi32(__m128 mask) {
// 0x2 selects bit 1 of the movemask result, which is not its top bit.
if (_mm_movemask_ps(mask) & 0x2)
func();
}
; Optimized IR for the 0x2 test: calls @func()() when bit 1 of the i4
; built from the lane sign bits of %mask is set.
define void @scatter_mask_vector_i32_epi32(<4 x float> noundef %mask) {
entry:
; Reinterpret the four float lanes as i32 so they can be compared as integers.
%0 = bitcast <4 x float> %mask to <4 x i32>
; Per-lane "less than zero" test, i.e. one i1 per lane sign bit.
%1 = icmp slt <4 x i32> %0, zeroinitializer
; Pack the four i1 results into a single i4.
%2 = bitcast <4 x i1> %1 to i4
; Here the bit test stays an explicit and-with-constant (isolates bit 1) ...
%3 = and i4 %2, 2
; ... followed by a compare against zero, rather than a signed compare.
%tobool.not = icmp eq i4 %3, 0
; Bit clear -> skip the call; bit set -> if.then.
br i1 %tobool.not, label %if.end, label %if.then
if.then:
tail call void @func()()
br label %if.end
if.end:
ret void
}
declare void @func()() local_unnamed_addr #1
# Generated x86-64 code (AT&T syntax) for the 0x2 test: a single testb on
# the movemask result, with no shift pair.
scatter_mask_vector_i32_epi32:
# eax = mask built from the sign bits of the four float lanes in xmm0.
vmovmskps %xmm0, %eax
# Set flags from (al & 2) without modifying al.
testb $2, %al
# Bit 1 set (result non-zero) -> tail-call func().
jne func()@PLT # TAILCALL
retq
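With suitable value tracking we should be able to convert the i4 sext_inreg (the `shlb $4` / `sarb $4` pair in the first example) into a sign-bit test on the source mask value, i.e. test bit 3 of the `vmovmskps` result directly, analogous to the `testb $2, %al` generated for the 0x2 case.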