Skip to content

Commit 0a33db0

Browse files
committed
[X86][AMX] Combine constant zero vector and AMX cast to tilezero
Found this problem when investigating llvm#91207
1 parent 46bc54f commit 0a33db0

File tree

2 files changed

+49
-68
lines changed

2 files changed

+49
-68
lines changed

llvm/lib/Target/X86/X86LowerAMXType.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,7 @@ class X86LowerAMXCast {
709709
X86LowerAMXCast(Function &F) : Func(F), DT(nullptr) {}
710710
bool combineCastStore(IntrinsicInst *Cast, StoreInst *ST);
711711
bool combineLoadCast(IntrinsicInst *Cast, LoadInst *LD);
712+
bool combineTilezero(IntrinsicInst *Cast);
712713
bool combineLdSt(SmallVectorImpl<Instruction *> &Casts);
713714
bool combineAMXcast(TargetLibraryInfo *TLI);
714715
bool transformAMXCast(IntrinsicInst *AMXCast);
@@ -988,6 +989,27 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) {
988989
return EraseLoad;
989990
}
990991

992+
// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
993+
// -->
994+
// %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
995+
/// Replace an AMX vector-to-tile cast with a call to
/// llvm.x86.tilezero.internal, taking the tile shape (row, col) from the
/// first user of the cast.
///
/// \param Cast the cast intrinsic to replace; the caller only invokes this
///        when the cast's operand is a zero-initialized constant.
/// \returns true if all uses of \p Cast were rewritten to the new tilezero
///          call, false if the cast has no users or its first user is not an
///          AMX intrinsic (in which case nothing is changed).
///
/// NOTE(review): the shape is derived from the first use only and then
/// applied to every use via replaceAllUsesWith; this assumes all users agree
/// on the shape -- confirm. \p Cast itself is not erased here.
bool X86LowerAMXCast::combineTilezero(IntrinsicInst *Cast) {
  // Without any user there is nothing to derive a shape from, and
  // dereferencing use_begin() on an empty use list would be invalid.
  if (Cast->use_empty())
    return false;

  Use &U = *(Cast->use_begin());
  unsigned OpNo = U.getOperandNo();

  // The user is not guaranteed to be an intrinsic call, so test with
  // dyn_cast instead of asserting with cast<>.
  auto *II = dyn_cast<IntrinsicInst>(U.getUser());
  if (!II || !isAMXIntrinsic(II))
    return false;

  Value *Row = nullptr, *Col = nullptr;
  std::tie(Row, Col) = getShape(II, OpNo);
  std::array<Value *, 2> Args = {Row, Col};

  // Emit the tilezero right before the cast so it dominates every use that
  // the cast dominated.
  IRBuilder<> Builder(Cast);
  Value *NewInst = Builder.CreateIntrinsic(Intrinsic::x86_tilezero_internal,
                                           std::nullopt, Args);
  Cast->replaceAllUsesWith(NewInst);
  return true;
}
1012+
9911013
bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
9921014
bool Change = false;
9931015
for (auto *Cast : Casts) {
@@ -1011,6 +1033,14 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
10111033
for (auto *Store : DeadStores)
10121034
Store->eraseFromParent();
10131035
} else { // x86_cast_vector_to_tile
1036+
// %19 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer)
1037+
// -->
1038+
// %19 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
1039+
if (dyn_cast<ConstantAggregateZero>(Cast->getOperand(0))) {
1040+
Change |= combineTilezero(cast<IntrinsicInst>(Cast));
1041+
continue;
1042+
}
1043+
10141044
SmallVector<Instruction *, 2> DeadLoads;
10151045
auto *Load = dyn_cast<LoadInst>(Cast->getOperand(0));
10161046
if (!Load || !Load->hasOneUse())
@@ -1024,6 +1054,7 @@ bool X86LowerAMXCast::combineLdSt(SmallVectorImpl<Instruction *> &Casts) {
10241054
// Set the operand to null so that the load instruction can be erased.
10251055
Cast->setOperand(0, nullptr);
10261056
Load->eraseFromParent();
1057+
Change = true;
10271058
}
10281059
}
10291060
}

llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll

Lines changed: 18 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -52,26 +52,13 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
5252
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
5353
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
5454

55-
define void @PR90954(ptr %0, ptr %1, i32 %2) {
55+
define void @PR90954(ptr %0, ptr %1, i32 %2) nounwind {
5656
; CHECK-LABEL: PR90954:
5757
; CHECK: # %bb.0:
5858
; CHECK-NEXT: pushq %rbp
59-
; CHECK-NEXT: .cfi_def_cfa_offset 16
60-
; CHECK-NEXT: .cfi_offset %rbp, -16
61-
; CHECK-NEXT: movq %rsp, %rbp
62-
; CHECK-NEXT: .cfi_def_cfa_register %rbp
63-
; CHECK-NEXT: pushq %r15
6459
; CHECK-NEXT: pushq %r14
65-
; CHECK-NEXT: pushq %r13
66-
; CHECK-NEXT: pushq %r12
6760
; CHECK-NEXT: pushq %rbx
68-
; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
69-
; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
70-
; CHECK-NEXT: .cfi_offset %rbx, -56
71-
; CHECK-NEXT: .cfi_offset %r12, -48
72-
; CHECK-NEXT: .cfi_offset %r13, -40
73-
; CHECK-NEXT: .cfi_offset %r14, -32
74-
; CHECK-NEXT: .cfi_offset %r15, -24
61+
; CHECK-NEXT: subq $2912, %rsp # imm = 0xB60
7562
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
7663
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
7764
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -87,29 +74,26 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
8774
; CHECK-NEXT: movw $64, %cx
8875
; CHECK-NEXT: movw $16, %di
8976
; CHECK-NEXT: movb $1, %r8b
90-
; CHECK-NEXT: movl $64, %r9d
91-
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10
92-
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11
93-
; CHECK-NEXT: xorl %ebx, %ebx
94-
; CHECK-NEXT: xorl %r14d, %r14d
77+
; CHECK-NEXT: xorl %r9d, %r9d
78+
; CHECK-NEXT: xorl %r10d, %r10d
9579
; CHECK-NEXT: jmp .LBB1_1
9680
; CHECK-NEXT: .p2align 4, 0x90
9781
; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1
98-
; CHECK-NEXT: incq %r14
99-
; CHECK-NEXT: addl %edx, %ebx
82+
; CHECK-NEXT: incq %r10
83+
; CHECK-NEXT: addl %edx, %r9d
10084
; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1
10185
; CHECK-NEXT: # Child Loop BB1_2 Depth 2
102-
; CHECK-NEXT: movslq %ebx, %r15
103-
; CHECK-NEXT: leaq (%rsi,%r15,4), %r15
104-
; CHECK-NEXT: xorl %r12d, %r12d
105-
; CHECK-NEXT: xorl %r13d, %r13d
86+
; CHECK-NEXT: movslq %r9d, %r11
87+
; CHECK-NEXT: leaq (%rsi,%r11,4), %r11
88+
; CHECK-NEXT: xorl %ebx, %ebx
89+
; CHECK-NEXT: xorl %r14d, %r14d
10690
; CHECK-NEXT: jmp .LBB1_2
10791
; CHECK-NEXT: .p2align 4, 0x90
10892
; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2
109-
; CHECK-NEXT: tilestored %tmm1, (%r15,%rax)
110-
; CHECK-NEXT: incq %r13
111-
; CHECK-NEXT: addq $64, %r15
112-
; CHECK-NEXT: decq %r12
93+
; CHECK-NEXT: tilestored %tmm1, (%r11,%rax)
94+
; CHECK-NEXT: incq %r14
95+
; CHECK-NEXT: addq $64, %r11
96+
; CHECK-NEXT: decq %rbx
11397
; CHECK-NEXT: je .LBB1_5
11498
; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1
11599
; CHECK-NEXT: # => This Inner Loop Header: Depth=2
@@ -118,46 +102,12 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
118102
; CHECK-NEXT: testb %r8b, %r8b
119103
; CHECK-NEXT: jne .LBB1_4
120104
; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2
121-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
122-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
123-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
124-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
125-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
126-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
127-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
128-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
129-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
130-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
131-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
132-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
133-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
134-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
135-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
136-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
137-
; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1
138-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
139-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
140-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
141-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
142-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
143-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
144-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
145-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
146-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
147-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
148-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
149-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
150-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
151-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
152-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
153-
; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
154-
; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2
105+
; CHECK-NEXT: tilezero %tmm1
106+
; CHECK-NEXT: tilezero %tmm2
155107
; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
156-
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
157-
; CHECK-NEXT: movabsq $64, %rax
158-
; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
108+
; CHECK-NEXT: movabsq $64, %rbp
109+
; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
159110
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
160-
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
161111
; CHECK-NEXT: jmp .LBB1_4
162112
%4 = shl i32 %2, 4
163113
%5 = icmp eq i64 0, 0

0 commit comments

Comments
 (0)