Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4769,6 +4769,79 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}

// Approximately handle AVX Galois Field Affine Transformation
//
// e.g.,
//   <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
//   <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
//   <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
//                 Out                       A          x          b
// where A and x are packed matrices, b is a vector,
//   Out = A * x + b in GF(2)
//
// Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
// computation also includes a parity calculation.
//
// For the bitwise AND of bits V1 and V2, the exact shadow is:
//   Out_Shadow = (V1_Shadow & V2_Shadow)
//              | (V1        & V2_Shadow)
//              | (V1_Shadow & V2       )
//
// We approximate the shadow of gf2p8affineqb using:
//   Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0)
//              | gf2p8affineqb(x,        A_shadow, 0)
//              | gf2p8affineqb(x_Shadow, A,        0)
//              | set1_epi8(b_Shadow)
//
// This approximation has false negatives: if an intermediate dot-product
// contains an even number of 1's, the parity is 0.
// It has no false positives.
void handleAVXGF2P8Affine(IntrinsicInst &I) {
  IRBuilder<> IRB(&I);

  assert(I.arg_size() == 3);
  Value *A = I.getOperand(0);
  Value *X = I.getOperand(1);
  Value *B = I.getOperand(2);

  // A and X are packed vectors of i8 lanes; B is a scalar i8.
  assert(isFixedIntVector(A));
  assert(cast<VectorType>(A->getType())
             ->getElementType()
             ->getScalarSizeInBits() == 8);

  assert(A->getType() == X->getType());

  assert(B->getType()->isIntegerTy());
  assert(B->getType()->getScalarSizeInBits() == 8);

  assert(I.getType() == A->getType());

  Value *AShadow = getShadow(A);
  Value *XShadow = getShadow(X);
  // The "+ b" operand of each helper call is a clean (zero) shadow, so the
  // affine-add term contributes nothing to the propagated shadow here; b's
  // own shadow is OR'ed in separately below.
  Value *BZeroShadow = getCleanShadow(B);

  CallInst *AShadowXShadow = IRB.CreateIntrinsic(
      I.getType(), I.getIntrinsicID(), {XShadow, AShadow, BZeroShadow});
  CallInst *AShadowX = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
                                           {X, AShadow, BZeroShadow});
  CallInst *XShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
                                           {XShadow, A, BZeroShadow});

  unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements();
  Value *BShadow = getShadow(B);
  // getCleanShadow is merely used to obtain a value of the correct vector
  // type; every lane is overwritten by the insertelement chain below.
  Value *BBroadcastShadow = getCleanShadow(AShadow);
  // There is no LLVM IR intrinsic for _mm512_set1_epi8.
  // This loop generates a lot of LLVM IR, which we expect CodeGen to
  // lower appropriately (e.g., VPBROADCASTB).
  // Besides, b is often a constant, in which case it is fully initialized.
  for (unsigned i = 0; i < NumElements; i++)
    BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i);

  setShadow(&I, IRB.CreateOr(
                    {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow}));
  setOriginForNaryOp(I);
}

// Handle Arm NEON vector load intrinsics (vld*).
//
// The WithLane instructions (ld[234]lane) are similar to:
Expand Down Expand Up @@ -5604,6 +5677,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}

// AVX Galois Field New Instructions
case Intrinsic::x86_vgf2p8affineqb_128:
case Intrinsic::x86_vgf2p8affineqb_256:
case Intrinsic::x86_vgf2p8affineqb_512:
handleAVXGF2P8Affine(I);
break;

case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);
Expand Down
Loading