[PowerPC] hoist xxspltiw instruction out of the loop with FMA mutation pass. (#111696)

diggerlin · web-flow · commit a91b0d278062 · 2025-06-05T09:41:51.000-04:00
Summary: The patch fixes the issue [[PowerPC] missing VSX FMA Mutation optimize in some case for option -schedule-ppc-vsx-fma-mutation-early #111906](#111906) In certain cases, the Register Coalescer pass—which eliminates COPY instructions—can interfere with the PowerPC VSX FMA Mutation pass. Specifically, it can prevent the mutation of a COPY adjacent to an XSMADDADP into a single XSMADDMDP instruction. As a result, the xxspltiw instruction is not hoisted out of the loop as expected, leading to missed optimization opportunities. To address this, the patch ensures that the `VSX FMA Mutation` pass runs before the `Register Coalescer` pass when the -schedule-ppc-vsx-fma-mutation-early option is enabled.
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -559,7 +559,8 @@ void PPCPassConfig::addMachineSSAOptimization() {
 
 void PPCPassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOptLevel::None) {
-    insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+    insertPass(VSXFMAMutateEarly ? &TwoAddressInstructionPassID
+                                 : &MachineSchedulerID,
                &PPCVSXFMAMutateID);
   }
 
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-m-early.ll
@@ -69,14 +69,14 @@ declare <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float>, <4 x float>)
 ; CHECK64-NEXT:         bltlr   cr0
 ; CHECK64-NEXT: # %bb.1:                                # %for.body.preheader
 ; CHECK64-NEXT:         xxspltiw vs0, 1069066811
+; CHECK64-NEXT:         xxspltiw vs1, 1170469888
 ; CHECK64-NEXT:         mtctr r5
 ; CHECK64-NEXT:         li r5, 0
 ; CHECK64-NEXT:         {{.*}}align  5
 ; CHECK64-NEXT: [[L2_bar:.*]]:                               # %for.body
 ; CHECK64-NEXT:                                         # =>This Inner Loop Header: Depth=1
-; CHECK64-NEXT:         lxvx vs1, r4, r5
-; CHECK64-NEXT:         xxspltiw vs2, 1170469888
-; CHECK64-NEXT:         xvmaddasp vs2, vs1, vs0
+; CHECK64-NEXT:         lxvx vs2, r4, r5
+; CHECK64-NEXT:         xvmaddmsp vs2, vs0, vs1
 ; CHECK64-NEXT:         stxvx vs2, r3, r5
 ; CHECK64-NEXT:         addi r5, r5, 16
 ; CHECK64-NEXT:         bdnz [[L2_bar]]
@@ -139,17 +139,17 @@ declare <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float>, <4 x float>)
 ; CHECK32-NEXT:       blelr cr0
 ; CHECK32-NEXT: # %bb.1:                                # %for.body.preheader
 ; CHECK32-NEXT:       xxspltiw vs0, 1069066811
+; CHECK32-NEXT:       xxspltiw vs1, 1170469888
 ; CHECK32-NEXT:       li r6, 0
 ; CHECK32-NEXT:       li r7, 0
 ; CHECK32-NEXT:       .align  4
 ; CHECK32-NEXT: [[L2_foo:.*]]:                               # %for.body
 ; CHECK32-NEXT:                                         # =>This Inner Loop Header: Depth=1
 ; CHECK32-NEXT:       slwi r8, r7, 4
-; CHECK32-NEXT:       xxspltiw vs2, 1170469888
 ; CHECK32-NEXT:       addic r7, r7, 1
 ; CHECK32-NEXT:       addze r6, r6
-; CHECK32-NEXT:       lxvx vs1, r4, r8
-; CHECK32-NEXT:       xvmaddasp vs2, vs1, vs0
+; CHECK32-NEXT:       lxvx vs2, r4, r8
+; CHECK32-NEXT:       xvmaddmsp vs2, vs0, vs1
 ; CHECK32-NEXT:       stxvx vs2, r3, r8
 ; CHECK32-NEXT:       xor r8, r7, r5
 ; CHECK32-NEXT:       or. r8, r8, r6

Original file line number	Diff line number	Diff line change
`@@ -559,7 +559,8 @@ void PPCPassConfig::addMachineSSAOptimization() {`
`559`	`559`
`560`	`560`	`void PPCPassConfig::addPreRegAlloc() {`
`561`	`561`	`if (getOptLevel() != CodeGenOptLevel::None) {`
`562`		`- insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,`
	`562`	`+ insertPass(VSXFMAMutateEarly ? &TwoAddressInstructionPassID`
	`563`	`+ : &MachineSchedulerID,`
`563`	`564`	`&PPCVSXFMAMutateID);`
`564`	`565`	`}`
`565`	`566`