Skip to content

[X86] InstCombine fails to replace shuffled _mm_blendv_epi8+icmp with select+icmp #58895

Closed
@RKSimon

Description

@RKSimon

InstCombine has x86 folds that can recognise when a _mm_blendv_epi8 intrinsic mask comes from a comparison and convert it to a select, even through all the bitcasts due to the __m128i type:

#include <x86intrin.h>

/* 128-bit reproducer (the case that folds correctly).
 * Builds a per-byte mask from the vector comparison c < d on 16 x i8 lanes,
 * then uses it as the selector of _mm_blendv_epi8(a, b, mask). */
__m128i select_m128i(__m128i a, __m128i b, __m128i c, __m128i d)
{
    /* Compare as __v16qi, then cast the result back to the generic __m128i type. */
    __m128i mask = (__m128i)((__v16qi)c < (__v16qi)d);
  return _mm_blendv_epi8( a, b, mask );
}
; IR for the 128-bit case: the pblendvb intrinsic has been replaced by a plain
; select whose condition is the icmp result directly.
define <2 x i64> @select_m128i(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
entry:
  %0 = bitcast <2 x i64> %c to <16 x i8>
  %1 = bitcast <2 x i64> %d to <16 x i8>
  ; signed per-byte compare c < d
  %cmp = icmp slt <16 x i8> %0, %1
  %2 = bitcast <2 x i64> %a to <16 x i8>
  %3 = bitcast <2 x i64> %b to <16 x i8>
  ; lanes where the compare is true take b, the others take a
  %4 = select <16 x i1> %cmp, <16 x i8> %3, <16 x i8> %2
  %5 = bitcast <16 x i8> %4 to <2 x i64>
  ret <2 x i64> %5
}

But this fails entirely for AVX1 targets that try to emulate the 256-bit equivalent by splitting/concatenation:

/* 256-bit reproducer (the case that is NOT folded).
 * Computes a 32 x i8 comparison mask, then emulates a 256-bit byte blend by
 * extracting the low (index 0) and high (index 1) 128-bit halves of a, b and
 * the mask, blending each half with _mm_blendv_epi8, and recombining. */
__m256i select_m256i(__m256i a, __m256i b, __m256i c, __m256i d)
{
    /* Compare as __v32qi, then cast the result back to the generic __m256i type. */
    __m256i mask = (__m256i)((__v32qi)c < (__v32qi)d);
  __m128i lo = _mm_blendv_epi8( _mm256_extractf128_si256( a, 0 ), _mm256_extractf128_si256( b, 0 ), _mm256_extractf128_si256( mask, 0 ) );
  __m128i hi = _mm_blendv_epi8( _mm256_extractf128_si256( a, 1 ), _mm256_extractf128_si256( b, 1 ), _mm256_extractf128_si256( mask, 1 ) );

/* ALT switches between two ways of rebuilding the 256-bit result from lo/hi. */
#ifdef ALT
    return _mm256_setr_m128i( lo, hi );
#else
  return _mm256_insertf128_si256( _mm256_castsi128_si256( lo ), hi, 1 );
#endif
}
; IR for the 256-bit case: both halves still call @llvm.x86.sse41.pblendvb.
; The mask reaches each call through sext + bitcast + shufflevector (half
; extraction) rather than directly from the icmp.
define <4 x i64> @select_m256i(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
entry:
  %0 = bitcast <4 x i64> %c to <32 x i8>
  %1 = bitcast <4 x i64> %d to <32 x i8>
  ; signed per-byte compare c < d, widened to a byte mask
  %cmp = icmp slt <32 x i8> %0, %1
  %sext = sext <32 x i1> %cmp to <32 x i8>
  ; low 128-bit halves (elements 0..3 of the <8 x i32> views) of a, b and the mask
  %2 = bitcast <4 x i64> %a to <8 x i32>
  %extract = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x i64> %b to <8 x i32>
  %extract1 = shufflevector <8 x i32> %3, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = bitcast <32 x i8> %sext to <8 x i32>
  %extract2 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = bitcast <4 x i32> %extract to <16 x i8>
  %6 = bitcast <4 x i32> %extract1 to <16 x i8>
  %7 = bitcast <4 x i32> %extract2 to <16 x i8>
  ; low-half blend, left as an intrinsic call
  %8 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %5, <16 x i8> %6, <16 x i8> %7)
  %9 = bitcast <16 x i8> %8 to <2 x i64>
  ; high 128-bit halves (elements 4..7) of a, b and the mask
  %extract3 = shufflevector <8 x i32> %2, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %extract4 = shufflevector <8 x i32> %3, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %extract5 = shufflevector <8 x i32> %4, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %10 = bitcast <4 x i32> %extract3 to <16 x i8>
  %11 = bitcast <4 x i32> %extract4 to <16 x i8>
  %12 = bitcast <4 x i32> %extract5 to <16 x i8>
  ; high-half blend, left as an intrinsic call
  %13 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12)
  ; widen both 128-bit results to 256 bits and concatenate: lo in elements 0..3, hi in elements 4..7
  %shuffle.i = shufflevector <2 x i64> %9, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %14 = bitcast <4 x i64> %shuffle.i to <8 x i32>
  %15 = bitcast <16 x i8> %13 to <4 x i32>
  %widen = shufflevector <4 x i32> %15, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %insert = shufflevector <8 x i32> %14, <8 x i32> %widen, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %16 = bitcast <8 x i32> %insert to <4 x i64>
  ret <4 x i64> %16
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)

Godbolt: https://godbolt.org/z/r61dhrqYM

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions