[msan] Approximately handle AVX Galois Field Affine Transformation #150794
Conversation
e.g.,
<16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
<32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
<64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
 Out                                    A          x          b
where A and x are packed matrices, b is a vector, and Out = A * x + b in GF(2)
Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix computation also includes a parity calculation.
For the bitwise AND of bits V1 and V2, the exact shadow is:
Out_Shadow = (V1_Shadow & V2_Shadow) | (V1 & V2_Shadow) | (V1_Shadow & V2)
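As a minimal sketch (not part of the patch), this rule can be written as a scalar helper; the intuition is that an uninitialized bit leaks into the result only where the other operand is 1 or is itself uninitialized:

#include <cstdint>

// Exact MSan shadow for a bitwise AND (bitwise over uint8_t lanes).
// A set shadow bit means "possibly uninitialized".
uint8_t and_shadow(uint8_t v1, uint8_t s1, uint8_t v2, uint8_t s2) {
  return (s1 & s2)   // both inputs uninitialized
       | (v1 & s2)   // v1 is 1: v2's uninitializedness shows through
       | (s1 & v2);  // v2 is 1: v1's uninitializedness shows through
}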
We approximate the shadow of gf2p8affine using:
Out_Shadow = _mm512_gf2p8affine_epi64_epi8(x_Shadow, A_shadow, 0)
| _mm512_gf2p8affine_epi64_epi8(x, A_shadow, 0)
| _mm512_gf2p8affine_epi64_epi8(x_Shadow, A, 0)
| _mm512_set1_epi8(b_Shadow)
This approximation has false negatives: each shadow term is itself computed with a parity reduction, so if an intermediate dot-product selects an even number of uninitialized bits, they cancel and the parity is 0, silently dropping the taint.
It has no false positives.
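To make the false negative concrete, here is a rough scalar model of one output byte (a sketch following the Intel SDM pseudocode; the row ordering 7-i is taken from the SDM and is incidental to the point):

#include <cstdint>

// Model of one byte of gf2p8affine: bit i of the output is the parity
// of (matrix row (7-i) of A AND the input byte x), XORed with bit i of b.
uint8_t gf2p8affine_byte(uint64_t A, uint8_t x, uint8_t b) {
  uint8_t out = 0;
  for (int i = 0; i < 8; i++) {
    uint8_t row = (A >> (8 * (7 - i))) & 0xFF;
    out |= (uint8_t)((__builtin_parity(row & x) ^ ((b >> i) & 1)) << i);
  }
  return out;
}

// False negative: suppose A and b are fully initialized, and two bits of
// x selected by the same row are uninitialized:
//   row      = 0b00000011
//   x_Shadow = 0b00000011
// The only nonzero term is gf2p8affine(x_Shadow, A, 0), which computes
// parity(row & x_Shadow) = parity(0b11) = 0, so the output bit is
// reported as initialized even though it depends on uninitialized data.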
Updates the test from llvm#149258
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-compiler-rt-sanitizer

Author: Thurston Dang (thurstond)

Patch is 24.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150794.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index df31f07097f82..324c7f22253fd 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4769,6 +4769,78 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Approximately handle AVX Galois Field Affine Transformation
+ //
+ // e.g.,
+ // <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+ // <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+ // <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+ //  Out                                    A          x          b
+ // where Out = A * x + b in GF(2) (but A and x are packed)
+ //
+ // Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
+ // computation also includes a parity calculation.
+ //
+ // For the bitwise AND of bits V1 and V2, the exact shadow is:
+ // Out_Shadow = (V1_Shadow & V2_Shadow)
+ // | (V1 & V2_Shadow)
+ // | (V1_Shadow & V2)
+ //
+ // We approximate the shadow of gf2p8affine using:
+ // Out_Shadow = _mm512_gf2p8affine_epi64_epi8(x_Shadow, A_shadow, 0)
+ // | _mm512_gf2p8affine_epi64_epi8(x, A_shadow, 0)
+ // | _mm512_gf2p8affine_epi64_epi8(x_Shadow, A, 0)
+ // | _mm512_set1_epi8(b_Shadow)
+ //
+ // This approximation has false negatives: if an intermediate dot-product
+ // contains an even number of 1's, the parity is 0.
+ // It has no false positives.
+ void handleAVXGF2P8Affine(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+
+ assert(I.arg_size() == 3);
+ Value *A = I.getOperand(0);
+ Value *x = I.getOperand(1);
+ Value *b = I.getOperand(2);
+
+ assert(isFixedIntVector(A));
+ assert(cast<VectorType>(A->getType())
+ ->getElementType()
+ ->getScalarSizeInBits() == 8);
+
+ assert(A->getType() == x->getType());
+
+ assert(b->getType()->isIntegerTy());
+ assert(b->getType()->getScalarSizeInBits() == 8);
+
+ assert(I.getType() == A->getType());
+
+ Value *AShadow = getShadow(A);
+ Value *xShadow = getShadow(x);
+ Value *bZeroShadow = getCleanShadow(b);
+
+ CallInst *xShadowAShadow = IRB.CreateIntrinsic(
+ I.getType(), I.getIntrinsicID(), {xShadow, AShadow, bZeroShadow});
+ CallInst *xAShadow = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {x, AShadow, bZeroShadow});
+ CallInst *xShadowA = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+ {xShadow, A, bZeroShadow});
+
+ unsigned NumElements = cast<FixedVectorType>(I.getType())->getNumElements();
+ Value *bShadow = getShadow(b);
+ Value *bBroadcastShadow = getCleanShadow(AShadow);
+ // There is no LLVM IR intrinsic for _mm512_set1_epi8.
+ // This loop generates a lot of LLVM IR, which we expect that CodeGen will
+ // lower appropriately (e.g., VPBROADCASTB).
+ // Besides, b is often a constant, in which case it is fully initialized.
+ for (unsigned i = 0; i < NumElements; i++)
+ bBroadcastShadow = IRB.CreateInsertElement(bBroadcastShadow, bShadow, i);
+
+ setShadow(&I, IRB.CreateOr(
+ {xShadowAShadow, xAShadow, xShadowA, bBroadcastShadow}));
+ setOriginForNaryOp(I);
+ }
+
// Handle Arm NEON vector load intrinsics (vld*).
//
// The WithLane instructions (ld[234]lane) are similar to:
@@ -5604,6 +5676,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}
+ // AVX Galois Field New Instructions
+ case Intrinsic::x86_vgf2p8affineqb_128:
+ case Intrinsic::x86_vgf2p8affineqb_256:
+ case Intrinsic::x86_vgf2p8affineqb_512: {
+ handleAVXGF2P8Affine(I);
+ break;
+ }
+
case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
index e5e4371c525b2..43da02d19693c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
@@ -7,9 +7,6 @@
; - llvm.x86.vgf2p8affineinvqb.128
; - llvm.x86.vgf2p8affineinvqb.256
; - llvm.x86.vgf2p8affineinvqb.512
-; - llvm.x86.vgf2p8affineqb.128
-; - llvm.x86.vgf2p8affineqb.256
-; - llvm.x86.vgf2p8affineqb.512
;
; Heuristically handled:
; - llvm.x86.vgf2p8mulb.128
@@ -254,53 +251,42 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3)
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2:%.*]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1:%.*]], i8 0)
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i8> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 3)
+; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP20:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i8> [[TMP17]], [[TMP20]]
+; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i8> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4)
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
-; CHECK: 19:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 20:
+; CHECK-NEXT: [[TMP37:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC2]], <16 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[TMP3]], <16 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i8> [[TMP37]], [[TMP39]]
+; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i8> [[TMP41]], [[TMP40]]
+; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5)
-; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP19]], <16 x i8> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], [[TMP19]]
; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP43]], <16 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], [[TMP43]]
; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]]
; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]]
+; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[TMP12]], 0
; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP38]], <16 x i8> [[_MSPROP_SELECT]], 1
; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1
; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2
; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2
@@ -329,53 +315,42 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3)
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
+; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2:%.*]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1:%.*]], i8 0)
+; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i8> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 3)
+; CHECK-NEXT: [[TMP14:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP20:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i8> [[TMP14]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i8> [[TMP17]], [[TMP20]]
+; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i8> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4)
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
-; CHECK: 19:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 20:
+; CHECK-NEXT: [[TMP37:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP39:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC2]], <32 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP40:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[TMP3]], <32 x i8> [[SRC1]], i8 0)
+; CHECK-NEXT: [[TMP41:%.*]] = or <32 x i8> [[TMP37]], [[TMP39]]
+; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i8> [[TMP41]], [[TMP40]]
+; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i8> [[TMP42]], zeroinitializer
; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 5)
-; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP19]], <32 x i8> zeroinitializer
; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], [[TMP19]]
; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer
; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]]
; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP43]], <32 x i8> [[TMP4]]
; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer
+; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], [[TMP43]]
; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]]
; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]]
; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]]
+; CHECK-NEXT: [[TMP38:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[TMP12]], 0
; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0
-; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1
+; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP38]], <32 x i8> [[_MSPROP_SELECT]], 1
; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1
; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2
; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2
@@ -404,53 +379,42 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3)
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
+; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2:%.*]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[SRC1:%.*]], i8 0)
+; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i8> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i8> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 3)
+; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[TMP3]], <64 x i8> [[TMP2]], i8 0)
+; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC2]], <64 x i8> [[TMP2]], ...
[truncated]
assert(I.getType() == A->getType());

Value *AShadow = getShadow(A);
Value *xShadow = getShadow(x);
Nit: variable naming
Capitalized to X (formerly known as Twitter).
I think your commit message has a mistake:

Fixed, thanks
// lower appropriately (e.g., VPBROADCASTB).
// Besides, b is often a constant, in which case it is fully initialized.
for (unsigned i = 0; i < NumElements; i++)
  BBroadcastShadow = IRB.CreateInsertElement(BBroadcastShadow, BShadow, i);
This just concats BShadow before getCleanShadow, right? Or am I missing something? Can we use shufflevector for this instead?
getCleanShadow is merely used to get the correct vector type; the shadow contents are entirely replaced by the insertelement calls.
shufflevector could be used instead. I chose insertelement because it is most likely to be the instruction sequence that the vectorizer will know how to convert into the AVX instruction, if not now then in the future. clang/test/CodeGen/X86/avx512f-builtins.c shows that:
__m512i test_mm512_set1_epi8(char d)
{
// CHECK-LABEL: test_mm512_set1_epi8
// CHECK: insertelement <64 x i8> {{.*}}, i32 0
// CHECK: insertelement <64 x i8> {{.*}}, i32 1
// CHECK: insertelement <64 x i8> {{.*}}, i32 2
// CHECK: insertelement <64 x i8> {{.*}}, i32 3
// CHECK: insertelement <64 x i8> {{.*}}, i32 4
// CHECK: insertelement <64 x i8> {{.*}}, i32 5
// CHECK: insertelement <64 x i8> {{.*}}, i32 6
// CHECK: insertelement <64 x i8> {{.*}}, i32 7
// CHECK: insertelement <64 x i8> {{.*}}, i32 63
return _mm512_set1_epi8(d);
}
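For what it's worth, IRBuilder also has a splat helper that emits exactly the insertelement-plus-shufflevector form; a one-line sketch against the patch's variables (not what the PR adopted):

// Sketch only: CreateVectorSplat emits an insertelement into lane 0
// followed by a zero-mask shufflevector, which codegen likewise
// recognizes as a broadcast (e.g., VPBROADCASTB).
Value *BBroadcastShadow = IRB.CreateVectorSplat(NumElements, BShadow);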
// AVX Galois Field New Instructions
case Intrinsic::x86_vgf2p8affineqb_128:
case Intrinsic::x86_vgf2p8affineqb_256:
case Intrinsic::x86_vgf2p8affineqb_512: {
nit: unnecessary braces
Removed