[CIR] Add support for __builtin_ia32_psrldqi_byteshift (#1886)

wizardengineer · lanza · commit 870cad557913 · 2025-11-06T16:23:25.000-08:00
**Related Issue**: #1885
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -165,16 +165,23 @@ static mlir::Value emitX86PSLLDQIByteShift(CIRGenFunction &cgf,
   CIRGenBuilderTy &builder = cgf.getBuilder();
   unsigned shiftVal = getIntValueFromConstOp(Ops[1]) & 0xff;
   mlir::Location loc = cgf.getLoc(E->getExprLoc());
-  auto resultType = cast<cir::VectorType>(Ops[0].getType());
+  auto byteVecType = cast<cir::VectorType>(Ops[0].getType());
+
+  // Get the original return type from the expression
+  auto resultType = cast<cir::VectorType>(cgf.convertType(E->getType()));
 
   // If pslldq is shifting the vector more than 15 bytes, emit zero.
   // This matches the hardware behavior where shifting by 16+ bytes
   // clears the entire 128-bit lane.
-  if (shiftVal >= 16)
-    return builder.getZero(loc, resultType);
+  if (shiftVal >= 16) {
+    mlir::Value zero = builder.getZero(loc, byteVecType);
+    if (byteVecType != resultType)
+      return builder.createBitcast(zero, resultType);
+    return zero;
+  }
 
-  // Builtin type is vXi64 so multiply by 8 to get bytes.
-  unsigned numElts = resultType.getSize() * 8;
+  // Builtin type is vXi8 (already in bytes)
+  unsigned numElts = byteVecType.getSize();
   assert(numElts % 16 == 0 && "Vector size must be multiple of 16 bytes");
 
   llvm::SmallVector<int64_t, 64> indices;
@@ -189,17 +196,63 @@ static mlir::Value emitX86PSLLDQIByteShift(CIRGenFunction &cgf,
     }
   }
 
-  // Cast to byte vector for shuffle operation
-  auto byteVecTy = cir::VectorType::get(builder.getSInt8Ty(), numElts);
-  mlir::Value byteCast = builder.createBitcast(Ops[0], byteVecTy);
-  mlir::Value zero = builder.getZero(loc, byteVecTy);
+  mlir::Value zero = builder.getZero(loc, byteVecType);
 
   // Perform the shuffle (left shift by inserting zeros)
-  mlir::Value shuffleResult =
-      builder.createVecShuffle(loc, zero, byteCast, indices);
+  mlir::Value shuffleResult = builder.createVecShuffle(loc, zero, Ops[0], indices);
+
+  // Cast back to original type if necessary
+  if (byteVecType != resultType)
+    return builder.createBitcast(shuffleResult, resultType);
+  return shuffleResult;
+}
+
+static mlir::Value emitX86PSRLDQIByteShift(CIRGenFunction &cgf,
+                                           const CallExpr *E,
+                                           ArrayRef<mlir::Value> Ops) {
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  auto byteVecType = cast<cir::VectorType>(Ops[0].getType());
+  mlir::Location loc = cgf.getLoc(E->getExprLoc());
+  unsigned shiftVal = getIntValueFromConstOp(Ops[1]) & 0xff;
+
+  // Get the original return type from the expression
+  auto resultType = cast<cir::VectorType>(cgf.convertType(E->getType()));
+
+  // If psrldq shifts by 16 or more bytes, every byte is shifted out: emit zero.
+  if (shiftVal >= 16) {
+    mlir::Value zero = builder.getZero(loc, byteVecType);
+    if (byteVecType != resultType)
+      return builder.createBitcast(zero, resultType);
+    return zero;
+  }
+
+  // Builtin type is vXi8 (already in bytes)
+  uint64_t numElts = byteVecType.getSize();
+  assert(numElts % 16 == 0 && "Expected a multiple of 16");
+
+  llvm::SmallVector<int64_t, 64> indices;
+
+  // This mirrors the original (OG) CodeGen.
+  // As stated in the OG, 256/512-bit psrldq operates on 128-bit lanes,
+  // so the shuffle indices are built one 16-byte lane at a time.
+  for (unsigned l = 0; l < numElts; l += 16) {
+    for (unsigned i = 0; i < 16; ++i) {
+      unsigned idx = i + shiftVal;
+      if (idx >= 16)
+        idx += numElts - 16;
+      indices.push_back(idx + l);
+    }
+  }
+
+  mlir::Value zero = builder.getZero(loc, byteVecType);
+
+  // Perform the shuffle (right shift: zeros fill the high bytes of each lane)
+  mlir::Value shuffleResult = builder.createVecShuffle(loc, Ops[0], zero, indices);
 
-  // Cast back to original type
-  return builder.createBitcast(shuffleResult, resultType);
+  // Cast back to original type if necessary
+  if (byteVecType != resultType)
+    return builder.createBitcast(shuffleResult, resultType);
+  return shuffleResult;
 }
 
 static mlir::Value emitX86MaskedCompareResult(CIRGenFunction &cgf,
@@ -1366,7 +1419,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_psrldqi128_byteshift:
   case X86::BI__builtin_ia32_psrldqi256_byteshift:
   case X86::BI__builtin_ia32_psrldqi512_byteshift:
-    llvm_unreachable("psrldqi NYI");
+    return emitX86PSRLDQIByteShift(*this, E, Ops);
   case X86::BI__builtin_ia32_kshiftliqi:
   case X86::BI__builtin_ia32_kshiftlihi:
   case X86::BI__builtin_ia32_kshiftlisi:
diff --git a/clang/test/CIR/CodeGen/builtin-x86-pslldqi.cpp b/clang/test/CIR/CodeGen/builtin-x86-pslldqi.cpp
@@ -12,11 +12,6 @@ typedef long long __m128i __attribute__((__vector_size__(16)));
 typedef long long __m256i __attribute__((__vector_size__(32)));
 typedef long long __m512i __attribute__((__vector_size__(64)));
 
-// Declare the builtins directly
-extern __m128i __builtin_ia32_pslldqi128_byteshift(__m128i, int);
-extern __m256i __builtin_ia32_pslldqi256_byteshift(__m256i, int);
-extern __m512i __builtin_ia32_pslldqi512_byteshift(__m512i, int);
-
 // ============================================================================
 // Core Functionality Tests
 // ============================================================================
@@ -48,7 +43,8 @@ __m128i test_pslldqi128_shift0(__m128i a) {
 // OGCG-LABEL: @_Z23test_pslldqi128_shift16Dv2_x
 __m128i test_pslldqi128_shift16(__m128i a) {
     // Entire vector shifted out, should return zero
-    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2>
+    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s8i x 16>
+    // CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<!s8i x 16> -> !cir.vector<!s64i x 2>
     // LLVM: store <2 x i64> zeroinitializer, ptr %{{.*}}, align 16
     // OGCG: ret <2 x i64> zeroinitializer
     return __builtin_ia32_pslldqi128_byteshift(a, 16);
@@ -74,7 +70,8 @@ __m256i test_pslldqi256_shift4(__m256i a) {
 // OGCG-LABEL: @_Z23test_pslldqi256_shift16Dv4_x
 __m256i test_pslldqi256_shift16(__m256i a) {
     // Both lanes completely shifted out, returns zero
-    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 4>
+    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s8i x 32>
+    // CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<!s8i x 32> -> !cir.vector<!s64i x 4>
     // LLVM: store <4 x i64> zeroinitializer, ptr %{{.*}}, align 32
     // OGCG: ret <4 x i64> zeroinitializer
     return __builtin_ia32_pslldqi256_byteshift(a, 16);
@@ -100,7 +97,8 @@ __m512i test_pslldqi512_shift4(__m512i a) {
 // OGCG-LABEL: @_Z23test_pslldqi512_shift16Dv8_x
 __m512i test_pslldqi512_shift16(__m512i a) {
     // All 4 lanes completely cleared
-    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 8>
+    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s8i x 64>
+    // CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<!s8i x 64> -> !cir.vector<!s64i x 8>
     // LLVM: store <8 x i64> zeroinitializer, ptr %{{.*}}, align 64
     // OGCG: ret <8 x i64> zeroinitializer
     return __builtin_ia32_pslldqi512_byteshift(a, 16);
@@ -170,7 +168,8 @@ __m128i test_concrete_input_constant() {
 // OGCG-LABEL: @_Z22test_large_shift_valueDv2_x
 __m128i test_large_shift_value(__m128i a) {
     // 240 & 0xFF = 240, so this should return zero (240 > 16)
-    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s64i x 2>
+    // CIR: %{{.*}} = cir.const #cir.zero : !cir.vector<!s8i x 16>
+    // CIR: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.vector<!s8i x 16> -> !cir.vector<!s64i x 2>
     // LLVM: store <2 x i64> zeroinitializer, ptr %{{.*}}, align 16
     // OGCG: ret <2 x i64> zeroinitializer
     return __builtin_ia32_pslldqi128_byteshift(a, 240);
diff --git a/clang/test/CIR/CodeGen/builtin-x86-psrldqi.cpp b/clang/test/CIR/CodeGen/builtin-x86-psrldqi.cpp