[X86] Use __builtin_ia32_vec_ext_v4hi and __builtin_ia32_vec_set_v4hi to implement pextrw/pinsertw MMX intrinsics instead of trying to use native IR.

topperc · topperc · commit 561ba720accd · 2016-07-09T05:30:41.000Z
Without this we end up generating code that doesn't use MMX registers and probably doesn't work well with other MMX intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@274968 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
@@ -161,6 +161,8 @@ TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "", "sse") // pextrw
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "", "sse") // pinsrw
 
 // MMX+SSE2
 TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "", "sse2")
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
@@ -2114,12 +2114,8 @@ _mm_sfence(void)
 ///    2: Bits [47:32] are copied to the destination.
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_extract_pi16(__m64 __a, int __n)
-{
-  __v4hi __b = (__v4hi)__a;
-  return (unsigned short)__b[__n & 3];
-}
+#define _mm_extract_pi16(a, n) __extension__ ({ \
+  (int)__builtin_ia32_vec_ext_v4hi((__m64)(a), (int)(n)); })
 
 /// \brief Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
@@ -2145,13 +2141,8 @@ _mm_extract_pi16(__m64 __a, int __n)
 ///    bits in operand __a.
 /// \returns A 64-bit integer vector containing the copied packed data from the
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_insert_pi16(__m64 __a, int __d, int __n)
-{
-   __v4hi __b = (__v4hi)__a;
-   __b[__n & 3] = __d;
-   return (__m64)__b;
-}
+#define _mm_insert_pi16(a, d, n) __extension__ ({ \
+  (__m64)__builtin_ia32_vec_set_v4hi((__m64)(a), (int)(d), (int)(n)); })
 
 /// \brief Compares each of the corresponding packed 16-bit integer values of
 ///    the 64-bit integer vectors, and writes the greater value to the
diff --git a/test/CodeGen/mmx-builtins.c b/test/CodeGen/mmx-builtins.c
@@ -217,6 +217,12 @@ __m64 test_mm_cvttps_pi32(__m128 a) {
   return _mm_cvttps_pi32(a);
 }
 
+int test_mm_extract_pi16(__m64 a) {
+  // CHECK-LABEL: test_mm_extract_pi16
+  // CHECK: call i32 @llvm.x86.mmx.pextr.w
+  return _mm_extract_pi16(a, 2); // selector 2 picks bits [47:32] of a
+}
+
 __m64 test_m_from_int(int a) {
   // CHECK-LABEL: test_m_from_int
   // CHECK: insertelement <2 x i32>
@@ -265,6 +271,12 @@ __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   return _mm_hsubs_pi16(a, b);
 }
 
+__m64 test_mm_insert_pi16(__m64 a, int d) {
+  // CHECK-LABEL: test_mm_insert_pi16
+  // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+  return _mm_insert_pi16(a, d, 2); // low 16 bits of d go to element 2
+}
+
 __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_madd_pi16
   // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd