@@ -52,26 +52,13 @@ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_
5252declare x86_amx @llvm.x86.tdpbf16ps.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
5353declare void @llvm.x86.tilestored64.internal (i16 , i16 , ptr , i64 , x86_amx)
5454
55- define void @PR90954 (ptr %0 , ptr %1 , i32 %2 ) {
55+ define void @PR90954 (ptr %0 , ptr %1 , i32 %2 ) nounwind {
5656; CHECK-LABEL: PR90954:
5757; CHECK: # %bb.0:
5858; CHECK-NEXT: pushq %rbp
59- ; CHECK-NEXT: .cfi_def_cfa_offset 16
60- ; CHECK-NEXT: .cfi_offset %rbp, -16
61- ; CHECK-NEXT: movq %rsp, %rbp
62- ; CHECK-NEXT: .cfi_def_cfa_register %rbp
63- ; CHECK-NEXT: pushq %r15
6459; CHECK-NEXT: pushq %r14
65- ; CHECK-NEXT: pushq %r13
66- ; CHECK-NEXT: pushq %r12
6760; CHECK-NEXT: pushq %rbx
68- ; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
69- ; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
70- ; CHECK-NEXT: .cfi_offset %rbx, -56
71- ; CHECK-NEXT: .cfi_offset %r12, -48
72- ; CHECK-NEXT: .cfi_offset %r13, -40
73- ; CHECK-NEXT: .cfi_offset %r14, -32
74- ; CHECK-NEXT: .cfi_offset %r15, -24
61+ ; CHECK-NEXT: subq $2912, %rsp # imm = 0xB60
7562; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
7663; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
7764; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
@@ -87,29 +74,26 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
8774; CHECK-NEXT: movw $64, %cx
8875; CHECK-NEXT: movw $16, %di
8976; CHECK-NEXT: movb $1, %r8b
90- ; CHECK-NEXT: movl $64, %r9d
91- ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10
92- ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11
93- ; CHECK-NEXT: xorl %ebx, %ebx
94- ; CHECK-NEXT: xorl %r14d, %r14d
77+ ; CHECK-NEXT: xorl %r9d, %r9d
78+ ; CHECK-NEXT: xorl %r10d, %r10d
9579; CHECK-NEXT: jmp .LBB1_1
9680; CHECK-NEXT: .p2align 4, 0x90
9781; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1
98- ; CHECK-NEXT: incq %r14
99- ; CHECK-NEXT: addl %edx, %ebx
82+ ; CHECK-NEXT: incq %r10
83+ ; CHECK-NEXT: addl %edx, %r9d
10084; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1
10185; CHECK-NEXT: # Child Loop BB1_2 Depth 2
102- ; CHECK-NEXT: movslq %ebx , %r15
103- ; CHECK-NEXT: leaq (%rsi,%r15 ,4), %r15
104- ; CHECK-NEXT: xorl %r12d , %r12d
105- ; CHECK-NEXT: xorl %r13d , %r13d
86+ ; CHECK-NEXT: movslq %r9d , %r11
87+ ; CHECK-NEXT: leaq (%rsi,%r11 ,4), %r11
88+ ; CHECK-NEXT: xorl %ebx , %ebx
89+ ; CHECK-NEXT: xorl %r14d , %r14d
10690; CHECK-NEXT: jmp .LBB1_2
10791; CHECK-NEXT: .p2align 4, 0x90
10892; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2
109- ; CHECK-NEXT: tilestored %tmm1, (%r15 ,%rax)
110- ; CHECK-NEXT: incq %r13
111- ; CHECK-NEXT: addq $64, %r15
112- ; CHECK-NEXT: decq %r12
93+ ; CHECK-NEXT: tilestored %tmm1, (%r11 ,%rax)
94+ ; CHECK-NEXT: incq %r14
95+ ; CHECK-NEXT: addq $64, %r11
96+ ; CHECK-NEXT: decq %rbx
11397; CHECK-NEXT: je .LBB1_5
11498; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1
11599; CHECK-NEXT: # => This Inner Loop Header: Depth=2
@@ -118,46 +102,12 @@ define void @PR90954(ptr %0, ptr %1, i32 %2) {
118102; CHECK-NEXT: testb %r8b, %r8b
119103; CHECK-NEXT: jne .LBB1_4
120104; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2
121- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
122- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
123- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
124- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
125- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
126- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
127- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
128- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
129- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
130- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
131- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
132- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
133- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
134- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
135- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
136- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
137- ; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1
138- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
139- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
140- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
141- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
142- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
143- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
144- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
145- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
146- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
147- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
148- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
149- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
150- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
151- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
152- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
153- ; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp)
154- ; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2
105+ ; CHECK-NEXT: tilezero %tmm1
106+ ; CHECK-NEXT: tilezero %tmm2
155107; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
156- ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
157- ; CHECK-NEXT: movabsq $64, %rax
158- ; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill
108+ ; CHECK-NEXT: movabsq $64, %rbp
109+ ; CHECK-NEXT: tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
159110; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload
160- ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
161111; CHECK-NEXT: jmp .LBB1_4
162112 %4 = shl i32 %2 , 4
163113 %5 = icmp eq i64 0 , 0
0 commit comments