Skip to content

Commit aa23e49

Browse files
authored
[NVPTX] Fix generating permute bytes from register pair when the initial values are undefined (#74437)
When generating the permute bytes (the selector) for the prmt instruction, an undefined initial mask value, which is represented as -1, sets the 32-bit integer that holds the selector to all ones (0xFFFFFFFF). Because the selector is built by OR-ing each shifted mask value into it, the mask values for the following bytes can then no longer change any of its bits. Consequently, the final selector remains the constant -1, irrespective of the actual mask values that follow the initial one. This change skips undefined (-1) mask values when building the selector.
1 parent 7e909d5 commit aa23e49

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2387,8 +2387,10 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
23872387
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
23882388
SDValue V2 = Op.getOperand(1);
23892389
uint32_t Selector = 0;
2390-
for (auto I : llvm::enumerate(SVN->getMask()))
2391-
Selector |= (I.value() << (I.index() * 4));
2390+
for (auto I : llvm::enumerate(SVN->getMask())) {
2391+
if (I.value() != -1) // -1 is a placeholder for undef.
2392+
Selector |= (I.value() << (I.index() * 4));
2393+
}
23922394

23932395
SDLoc DL(Op);
23942396
return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
2+
; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-FOUND
3+
4+
define void @kernel_func(ptr %in.vec, ptr %out.vec0) nounwind {
5+
entry:
6+
%wide.vec = load <32 x i8>, ptr %in.vec, align 64
7+
%vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
8+
store <4 x i8> %vec0, ptr %out.vec0, align 64
9+
ret void
10+
11+
; CHECK-FOUND: prmt.b32 {{.*}} 16384;
12+
; CHECK-FOUND: prmt.b32 {{.*}} 64;
13+
; CHECK-FOUND: prmt.b32 {{.*}} 30224;
14+
15+
; CHECK: @kernel_func
16+
; CHECK-NOT: prmt.b32 {{.*}} -1;
17+
; CHECK: -- End function
18+
}

0 commit comments

Comments
 (0)