Skip to content

Commit f0dd12e

Browse files
committed
[x86] use zero-extending load of a byte outside of loops too (2nd try)
The first attempt missed changing test files for tools (update_llc_test_checks.py). Original commit message: This implements the main suggested change from issue #56498. Using the shorter (non-extending) instruction with only -Oz ("minsize") rather than -Os ("optsize") is left as a possible follow-up. As noted in the bug report, the zero-extending load may have shorter latency/better throughput across a wide range of x86 micro-arches, and it avoids a potential false dependency. The cost is an extra instruction byte. This could cause perf ups and downs from secondary effects, but I don't think it is possible to account for those in advance, and that will likely also depend on exact micro-arch. This does bring LLVM x86 codegen more in line with existing gcc codegen, so if problems are exposed they are more likely to occur for both compilers. Differential Revision: https://reviews.llvm.org/D129775
1 parent 2d889a8 commit f0dd12e

File tree

211 files changed

+3834
-3292
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

211 files changed

+3834
-3292
lines changed

llvm/lib/Target/X86/X86FixupBWInsts.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
393393
switch (MI->getOpcode()) {
394394

395395
case X86::MOV8rm:
396-
// Only replace 8 bit loads with the zero extending versions if
397-
// in an inner most loop and not optimizing for size. This takes
398-
// an extra byte to encode, and provides limited performance upside.
399-
if (MachineLoop *ML = MLI->getLoopFor(&MBB))
400-
if (ML->begin() == ML->end() && !OptForSize)
401-
return tryReplaceLoad(X86::MOVZX32rm8, MI);
396+
// Replace 8-bit loads with the zero-extending version if not optimizing
397+
// for size. The extending op is cheaper across a wide range of uarch and
398+
// it avoids a potentially expensive partial register stall. It takes an
399+
// extra byte to encode, however, so don't do this when optimizing for size.
400+
if (!OptForSize)
401+
return tryReplaceLoad(X86::MOVZX32rm8, MI);
402402
break;
403403

404404
case X86::MOV16rm:

llvm/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ target triple = "i686-unknown-unknown"
1111
define i32 @test5(i32 %B, i8 %C) {
1212
; CHECK-LABEL: test5:
1313
; CHECK: # %bb.0: # %entry
14-
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
14+
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
1515
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
1616
; CHECK-NEXT: movl A, %eax
1717
; CHECK-NEXT: shldl %cl, %edx, %eax

llvm/test/CodeGen/X86/2006-05-08-InstrSched.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ define void @test() {
1010
; CHECK: # %bb.0:
1111
; CHECK-NEXT: movl A, %eax
1212
; CHECK-NEXT: movzwl 2(%eax), %eax
13-
; CHECK-NEXT: movb B, %cl
13+
; CHECK-NEXT: movzbl B, %ecx
1414
; CHECK-NEXT: movl C, %edx
1515
; CHECK-NEXT: andb $16, %cl
1616
; CHECK-NEXT: shll %cl, %edx

llvm/test/CodeGen/X86/2006-11-17-IllegalMove.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ define void @handle_vector_size_attribute() nounwind {
99
; CHECK-NEXT: cmpl $1, %eax
1010
; CHECK-NEXT: ja .LBB0_2
1111
; CHECK-NEXT: # %bb.1: # %bb77
12-
; CHECK-NEXT: movb 0, %al
13-
; CHECK-NEXT: movb 0, %al
12+
; CHECK-NEXT: movzbl 0, %eax
13+
; CHECK-NEXT: movzbl 0, %eax
1414
; CHECK-NEXT: xorl %eax, %eax
1515
; CHECK-NEXT: testb %al, %al
1616
; CHECK-NEXT: .LBB0_2: # %bb84

llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,11 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
6969
; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %rbp
7070
; CHECK-NEXT: movq (%rbp), %rax
7171
; CHECK-NEXT: callq *216(%rax)
72-
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %dl
72+
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
7373
; CHECK-NEXT: testb %dl, %dl
7474
; CHECK-NEXT: je LBB0_11
7575
; CHECK-NEXT: ## %bb.7: ## %cond_false.i
76-
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %bl
76+
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
7777
; CHECK-NEXT: movzbl %bl, %ecx
7878
; CHECK-NEXT: movl %ecx, %eax
7979
; CHECK-NEXT: divb %dl
@@ -98,8 +98,8 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
9898
; CHECK-NEXT: LBB0_11: ## %cond_true.i
9999
; CHECK-NEXT: movl $4, %edi
100100
; CHECK-NEXT: callq _feraiseexcept
101-
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %dl
102-
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %bl
101+
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
102+
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
103103
; CHECK-NEXT: xorl %r14d, %r14d
104104
; CHECK-NEXT: testb %bl, %bl
105105
; CHECK-NEXT: je LBB0_14

llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
3131
; CHECK-NEXT: .cfi_offset %ebx, -12
3232
; CHECK-NEXT: .cfi_offset %ebp, -8
3333
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
34-
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %bl
34+
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
3535
; CHECK-NEXT: testb $1, %bl
3636
; CHECK-NEXT: je LBB0_25
3737
; CHECK-NEXT: ## %bb.1: ## %bb116.i

llvm/test/CodeGen/X86/2008-04-24-MemCpyBug.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ define void @testit63_entry_2E_ce() nounwind {
1717
; CHECK-NEXT: movl %esp, %edi
1818
; CHECK-NEXT: movl $g1s63, %esi
1919
; CHECK-NEXT: rep;movsl (%esi), %es:(%edi)
20-
; CHECK-NEXT: movb g1s63+62, %al
20+
; CHECK-NEXT: movzbl g1s63+62, %eax
2121
; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
2222
; CHECK-NEXT: movzwl g1s63+60, %eax
2323
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)

llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ define i32 @func_44(i16 signext %p_46) nounwind {
1515
; SOURCE-SCHED-NEXT: xorl %ecx, %ecx
1616
; SOURCE-SCHED-NEXT: cmpl $2, %eax
1717
; SOURCE-SCHED-NEXT: setge %cl
18-
; SOURCE-SCHED-NEXT: movb g_73, %dl
18+
; SOURCE-SCHED-NEXT: movzbl g_73, %edx
1919
; SOURCE-SCHED-NEXT: xorl %eax, %eax
2020
; SOURCE-SCHED-NEXT: subb {{[0-9]+}}(%esp), %al
2121
; SOURCE-SCHED-NEXT: testb %dl, %dl

llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,44 @@
1-
; RUN: llc < %s -mcpu=core2 | FileCheck %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 | FileCheck %s
23

34
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
4-
target triple = "x86_64-apple-darwin10.4"
55
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
66

77
define fastcc i32 @cli_magic_scandesc(ptr %in) nounwind ssp {
8+
; CHECK-LABEL: cli_magic_scandesc:
9+
; CHECK: # %bb.0: # %entry
10+
; CHECK-NEXT: subq $72, %rsp
11+
; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax
12+
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
13+
; CHECK-NEXT: movzbl (%rsp), %eax
14+
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
15+
; CHECK-NEXT: movq (%rdi), %rdx
16+
; CHECK-NEXT: movq 8(%rdi), %rsi
17+
; CHECK-NEXT: movq %rdx, (%rsp)
18+
; CHECK-NEXT: movq 24(%rdi), %rdx
19+
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
20+
; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
21+
; CHECK-NEXT: movq 16(%rdi), %rdx
22+
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
23+
; CHECK-NEXT: movq 32(%rdi), %rdx
24+
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
25+
; CHECK-NEXT: movq 40(%rdi), %rdx
26+
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
27+
; CHECK-NEXT: movq 48(%rdi), %rdx
28+
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
29+
; CHECK-NEXT: movq 56(%rdi), %rdx
30+
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
31+
; CHECK-NEXT: movb %al, (%rsp)
32+
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
33+
; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax
34+
; CHECK-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
35+
; CHECK-NEXT: jne .LBB0_2
36+
; CHECK-NEXT: # %bb.1: # %entry
37+
; CHECK-NEXT: xorl %eax, %eax
38+
; CHECK-NEXT: addq $72, %rsp
39+
; CHECK-NEXT: retq
40+
; CHECK-NEXT: .LBB0_2: # %entry
41+
; CHECK-NEXT: callq __stack_chk_fail@PLT
842
entry:
943
%a = alloca [64 x i8]
1044
%c = getelementptr inbounds [64 x i8], ptr %a, i64 0, i32 30
@@ -15,10 +49,3 @@ entry:
1549
store i8 %e, ptr %c, align 8
1650
ret i32 0
1751
}
18-
19-
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
20-
; CHECK: movb (%rsp), [[R1:%.+]]
21-
; CHECK: movb 30(%rsp), [[R0:%.+]]
22-
; CHECK: movb [[R1]], (%rsp)
23-
; CHECK: movb [[R0]], 30(%rsp)
24-
; CHECK: callq ___stack_chk_fail

llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
233233
define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounwind {
234234
; I386-NOCMOV-LABEL: negative_CopyFromReg:
235235
; I386-NOCMOV: # %bb.0:
236-
; I386-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
236+
; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
237237
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
238238
; I386-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx
239239
; I386-NOCMOV-NEXT: cmpb %cl, %al
@@ -255,7 +255,7 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
255255
;
256256
; I686-NOCMOV-LABEL: negative_CopyFromReg:
257257
; I686-NOCMOV: # %bb.0:
258-
; I686-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
258+
; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
259259
; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
260260
; I686-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx
261261
; I686-NOCMOV-NEXT: cmpb %cl, %al
@@ -297,8 +297,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
297297
define i8 @negative_CopyFromRegs(i32 %a1_wide, i32 %a2_wide) nounwind {
298298
; I386-NOCMOV-LABEL: negative_CopyFromRegs:
299299
; I386-NOCMOV: # %bb.0:
300-
; I386-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %cl
301-
; I386-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
300+
; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
301+
; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
302302
; I386-NOCMOV-NEXT: cmpb %cl, %al
303303
; I386-NOCMOV-NEXT: jg .LBB4_2
304304
; I386-NOCMOV-NEXT: # %bb.1:
@@ -317,8 +317,8 @@ define i8 @negative_CopyFromRegs(i32 %a1_wide, i32 %a2_wide) nounwind {
317317
;
318318
; I686-NOCMOV-LABEL: negative_CopyFromRegs:
319319
; I686-NOCMOV: # %bb.0:
320-
; I686-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %cl
321-
; I686-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
320+
; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
321+
; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
322322
; I686-NOCMOV-NEXT: cmpb %cl, %al
323323
; I686-NOCMOV-NEXT: jg .LBB4_2
324324
; I686-NOCMOV-NEXT: # %bb.1:

0 commit comments

Comments
 (0)