Closed
Description
InstCombine has x86 folds that can recognise when a _mm_blendv_epi8
intrinsic mask comes from a comparison and convert the blend to a select, even through all the bitcasts required by the __m128i
type:
#include <x86intrin.h>
// Reproducer (128-bit case): per-byte select between a and b, driven by a
// signed byte compare of c against d.
__m128i select_m128i(__m128i a, __m128i b, __m128i c, __m128i d)
{
// Vector compare on the 16 byte lanes; the result is cast back to __m128i to serve as the blend mask.
__m128i mask = (__m128i)((__v16qi)c < (__v16qi)d);
// Lanes where the compare was true take b; the remaining lanes take a.
return _mm_blendv_epi8( a, b, mask );
}
; Optimized IR for the 128-bit reproducer: no pblendvb call remains, the blend
; is expressed as a plain vector select on the compare result.
define <2 x i64> @select_m128i(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
entry:
; Reinterpret c and d as 16 x i8 and compare them as signed bytes.
%0 = bitcast <2 x i64> %c to <16 x i8>
%1 = bitcast <2 x i64> %d to <16 x i8>
%cmp = icmp slt <16 x i8> %0, %1
; Reinterpret a and b as 16 x i8; lanes where %cmp is true take b (%3), the others take a (%2).
%2 = bitcast <2 x i64> %a to <16 x i8>
%3 = bitcast <2 x i64> %b to <16 x i8>
%4 = select <16 x i1> %cmp, <16 x i8> %3, <16 x i8> %2
; Cast the selected bytes back to the <2 x i64> return type.
%5 = bitcast <16 x i8> %4 to <2 x i64>
ret <2 x i64> %5
}
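But this fails entirely for AVX1 code that emulates the 256-bit equivalent by splitting the operands into 128-bit halves and concatenating the results; the pblendvb calls are left in place instead of becoming a select: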
// Reproducer (256-bit case): the byte select is emulated with two 128-bit
// _mm_blendv_epi8 calls, one per half, and the halves are then rejoined.
__m256i select_m256i(__m256i a, __m256i b, __m256i c, __m256i d)
{
// 32-lane signed byte compare of c against d, kept as a single 256-bit mask.
__m256i mask = (__m256i)((__v32qi)c < (__v32qi)d);
// Blend the low 128-bit halves (extract index 0) of a and b using the low half of the mask.
__m128i lo = _mm_blendv_epi8( _mm256_extractf128_si256( a, 0 ), _mm256_extractf128_si256( b, 0 ), _mm256_extractf128_si256( mask, 0 ) );
// Same for the high 128-bit halves (extract index 1).
__m128i hi = _mm_blendv_epi8( _mm256_extractf128_si256( a, 1 ), _mm256_extractf128_si256( b, 1 ), _mm256_extractf128_si256( mask, 1 ) );
// Two alternative ways of concatenating lo and hi; defining ALT picks the setr form.
#ifdef ALT
return _mm256_setr_m128i( lo, hi );
#else
return _mm256_insertf128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
#endif
}
; Optimized IR for the 256-bit reproducer: both pblendvb intrinsic calls are
; still present, i.e. the compare-to-select fold did not fire.
define <4 x i64> @select_m256i(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
entry:
; Reinterpret c and d as 32 x i8 and compare them as signed bytes.
%0 = bitcast <4 x i64> %c to <32 x i8>
%1 = bitcast <4 x i64> %d to <32 x i8>
%cmp = icmp slt <32 x i8> %0, %1
; The i1 compare result is sign-extended to a byte mask because it feeds the intrinsic below.
%sext = sext <32 x i1> %cmp to <32 x i8>
; Low half: a, b and the mask are viewed as <8 x i32> and elements 0-3 are extracted.
%2 = bitcast <4 x i64> %a to <8 x i32>
%extract = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x i64> %b to <8 x i32>
%extract1 = shufflevector <8 x i32> %3, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <32 x i8> %sext to <8 x i32>
%extract2 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; The extracted low halves are cast to <16 x i8> and blended by the intrinsic (not a select).
%5 = bitcast <4 x i32> %extract to <16 x i8>
%6 = bitcast <4 x i32> %extract1 to <16 x i8>
%7 = bitcast <4 x i32> %extract2 to <16 x i8>
%8 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %5, <16 x i8> %6, <16 x i8> %7)
%9 = bitcast <16 x i8> %8 to <2 x i64>
; High half: elements 4-7 of the same three vectors, blended by a second intrinsic call.
%extract3 = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%extract4 = shufflevector <8 x i32> %3, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%extract5 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%10 = bitcast <4 x i32> %extract3 to <16 x i8>
%11 = bitcast <4 x i32> %extract4 to <16 x i8>
%12 = bitcast <4 x i32> %extract5 to <16 x i8>
%13 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12)
; Widen the low result to 256 bits; its upper elements are undef.
%shuffle.i = shufflevector <2 x i64> %9, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%14 = bitcast <4 x i64> %shuffle.i to <8 x i32>
; Widen the high result, then concatenate: elements 0-3 come from %14, indices 8-11 pick elements 0-3 of %widen.
%15 = bitcast <16 x i8> %13 to <4 x i32>
%widen = shufflevector <4 x i32> %15, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%insert = shufflevector <8 x i32> %14, <8 x i32> %widen, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; Cast the joined vector back to the <4 x i64> return type.
%16 = bitcast <8 x i32> %insert to <4 x i64>
ret <4 x i64> %16
}
; Declaration of the intrinsic called twice by @select_m256i; it takes three <16 x i8> operands and returns <16 x i8>.
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
Godbolt: https://godbolt.org/z/r61dhrqYM