|
| 1 | +;; Tests that the ppc-vsx-fma-mutate pass, when run with the -schedule-ppc-vsx-fma-mutation-early option, does not hoist xxspltiw out of loops. |
| 2 | +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \ |
| 3 | +; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \ |
| 4 | +; RUN: -mtriple powerpc64-ibm-aix < %s | FileCheck --check-prefixes=CHECK64,AIX64 %s |
| 5 | + |
| 6 | +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \ |
| 7 | +; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \ |
| 8 | +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck --check-prefixes=CHECK64,LINUX64 %s |
| 9 | + |
| 10 | +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -disable-ppc-vsx-fma-mutation=false \ |
| 11 | +; RUN: -ppc-asm-full-reg-names -schedule-ppc-vsx-fma-mutation-early \ |
| 12 | +; RUN: -mtriple powerpc-ibm-aix < %s | FileCheck --check-prefix=CHECK32 %s |
| 13 | + |
| 14 | +define void @bar(ptr noalias nocapture noundef writeonly %__output_a, ptr noalias nocapture noundef readonly %var1321In_a, ptr noalias nocapture noundef readonly %n) { ; Counted loop: each iteration loads a <4 x float> from %var1321In_a, applies llvm.fma with two constant splats, and stores the result to %__output_a. |
| 15 | +entry: |
| 16 | + %0 = load i32, ptr %n, align 4 ; trip count is read through %n |
| 17 | + %cmp11 = icmp sgt i32 %0, 0 |
| 18 | + br i1 %cmp11, label %for.body.preheader, label %for.end ; skip the loop entirely when the count is not positive |
| 19 | + |
| 20 | +for.body.preheader: |
| 21 | + %wide.trip.count = zext i32 %0 to i64 |
| 22 | + br label %for.body |
| 23 | + |
| 24 | +for.body: |
| 25 | + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] |
| 26 | + %1 = shl nsw i64 %indvars.iv, 2 ; float index = iv * 4, i.e. one <4 x float> per iteration |
| 27 | + %add.ptr = getelementptr inbounds float, ptr %var1321In_a, i64 %1 |
| 28 | + %add.ptr.val = load <4 x float>, ptr %add.ptr, align 1 |
| 29 | + %2 = tail call contract <4 x float> @llvm.fma.v4f32(<4 x float> %add.ptr.val, <4 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, <4 x float> <float 6.270500e+03, float 6.270500e+03, float 6.270500e+03, float 6.270500e+03>) ; second and third operands are both loop-invariant constant splats |
| 30 | + %add.ptr6 = getelementptr inbounds float, ptr %__output_a, i64 %1 |
| 31 | + store <4 x float> %2, ptr %add.ptr6, align 1 |
| 32 | + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 |
| 33 | + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count |
| 34 | + br i1 %exitcond.not, label %for.end, label %for.body |
| 35 | + |
| 36 | +for.end: |
| 37 | + ret void |
| 38 | +} |
| 39 | + |
| 40 | +define void @foo(i1 %cmp97) { ; Loop whose carried float feeds a vector fma with a constant-splat addend; the fma result is masked by a vector compare. |
| 41 | +entry: |
| 42 | + br i1 %cmp97, label %for.body, label %for.end |
| 43 | + |
| 44 | +for.body: ; preds = %for.body, %entry |
| 45 | + %0 = phi float [ %vecext.i, %for.body ], [ 0.000000e+00, %entry ] ; loop-carried scalar, starts at 0.0 |
| 46 | + %splat.splatinsert.i = insertelement <4 x float> zeroinitializer, float %0, i64 0 ; %0 in lane 0, remaining lanes zero |
| 47 | + %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %splat.splatinsert.i, <4 x float> zeroinitializer, <4 x float> splat (float 6.270500e+03)) ; addend is the constant splat 6270.5 |
| 48 | + %2 = tail call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> zeroinitializer, <4 x float> %splat.splatinsert.i) |
| 49 | + %3 = bitcast <4 x float> %1 to <4 x i32> |
| 50 | + %and1.i8896 = and <4 x i32> %2, %3 ; mask the fma result with the compare result |
| 51 | + %4 = bitcast <4 x i32> %and1.i8896 to <4 x float> |
| 52 | + %vecext.i = extractelement <4 x float> %4, i64 0 ; lane 0 becomes the next iteration's %0 |
| 53 | + br label %for.body ; unconditional back-edge: this loop has no exit |
| 54 | + |
| 55 | +for.end: ; preds = %entry |
| 56 | + ret void |
| 57 | +} |
| 58 | + |
| 59 | +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) |
| 60 | +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) ; called by both @bar and @foo |
| 61 | + |
| 62 | +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) |
| 63 | +declare <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float>, <4 x float>) ; called by @foo only |
| 64 | + |
| 65 | +; CHECK64: bar: |
| 66 | +; CHECK64: # %bb.0: # %entry |
| 67 | +; CHECK64-NEXT: lwz r5, 0(r5) |
| 68 | +; CHECK64-NEXT: cmpwi r5, 1 |
| 69 | +; CHECK64-NEXT: bltlr cr0 |
| 70 | +; CHECK64-NEXT: # %bb.1: # %for.body.preheader |
| 71 | +; CHECK64-NEXT: xxspltiw vs0, 1069066811 |
| 72 | +; CHECK64-NEXT: mtctr r5 |
| 73 | +; CHECK64-NEXT: li r5, 0 |
| 74 | +; CHECK64-NEXT: {{.*}}align 5 |
| 75 | +; CHECK64-NEXT: [[L2_bar:.*]]: # %for.body |
| 76 | +; CHECK64-NEXT: # =>This Inner Loop Header: Depth=1 |
| 77 | +; CHECK64-NEXT: lxvx vs1, r4, r5 |
| 78 | +; CHECK64-NEXT: xxspltiw vs2, 1170469888 |
| 79 | +; CHECK64-NEXT: xvmaddasp vs2, vs1, vs0 |
| 80 | +; CHECK64-NEXT: stxvx vs2, r3, r5 |
| 81 | +; CHECK64-NEXT: addi r5, r5, 16 |
| 82 | +; CHECK64-NEXT: bdnz [[L2_bar]] |
| 83 | +; CHECK64-NEXT: # %bb.3: # %for.end |
| 84 | +; CHECK64-NEXT: blr |
| 85 | + |
| 86 | +; AIX64: .foo: |
| 87 | +; AIX64-NEXT: # %bb.0: # %entry |
| 88 | +; AIX64-NEXT: andi. r3, r3, 1 |
| 89 | +; AIX64-NEXT: bclr 4, gt, 0 |
| 90 | +; AIX64-NEXT: # %bb.1: # %for.body.preheader |
| 91 | +; AIX64-NEXT: xxlxor f0, f0, f0 |
| 92 | +; AIX64-NEXT: xxlxor vs1, vs1, vs1 |
| 93 | +; AIX64-NEXT: xxlxor f2, f2, f2 |
| 94 | +; AIX64-NEXT: .align 4 |
| 95 | +; AIX64-NEXT: L..BB1_2: # %for.body |
| 96 | +; AIX64-NEXT: # =>This Inner Loop Header: Depth=1 |
| 97 | +; AIX64-NEXT: xxmrghd vs2, vs2, vs0 |
| 98 | +; AIX64-NEXT: xvcvdpsp vs34, vs2 |
| 99 | +; AIX64-NEXT: xxmrghd vs2, vs0, vs0 |
| 100 | +; AIX64-NEXT: xvcvdpsp vs35, vs2 |
| 101 | +; AIX64-NEXT: xxspltiw vs2, 1170469888 |
| 102 | +; AIX64-NEXT: vmrgew v2, v2, v3 |
| 103 | +; AIX64-NEXT: xvcmpgtsp vs3, vs1, vs34 |
| 104 | +; AIX64-NEXT: xvmaddasp vs2, vs34, vs1 |
| 105 | +; AIX64-NEXT: xxland vs2, vs3, vs2 |
| 106 | +; AIX64-NEXT: xscvspdpn f2, vs2 |
| 107 | +; AIX64-NEXT: b L..BB1_2 |
| 108 | + |
| 109 | +; LINUX64: foo: # @foo |
| 110 | +; LINUX64-NEXT: .Lfunc_begin1: |
| 111 | +; LINUX64-NEXT: .cfi_startproc |
| 112 | +; LINUX64-NEXT: # %bb.0: # %entry |
| 113 | +; LINUX64-NEXT: andi. r3, r3, 1 |
| 114 | +; LINUX64-NEXT: bclr 4, gt, 0 |
| 115 | +; LINUX64-NEXT: # %bb.1: # %for.body.preheader |
| 116 | +; LINUX64-NEXT: xxlxor f0, f0, f0 |
| 117 | +; LINUX64-NEXT: xxlxor vs1, vs1, vs1 |
| 118 | +; LINUX64-NEXT: xxlxor f2, f2, f2 |
| 119 | +; LINUX64-NEXT: .p2align 4 |
| 120 | +; LINUX64-NEXT: .LBB1_2: # %for.body |
| 121 | +; LINUX64-NEXT: # =>This Inner Loop Header: Depth=1 |
| 122 | +; LINUX64-NEXT: xxmrghd vs2, vs0, vs2 |
| 123 | +; LINUX64-NEXT: xvcvdpsp vs34, vs2 |
| 124 | +; LINUX64-NEXT: xxspltd vs2, vs0, 0 |
| 125 | +; LINUX64-NEXT: xvcvdpsp vs35, vs2 |
| 126 | +; LINUX64-NEXT: xxspltiw vs2, 1170469888 |
| 127 | +; LINUX64-NEXT: vmrgew v2, v3, v2 |
| 128 | +; LINUX64-NEXT: xvcmpgtsp vs3, vs1, vs34 |
| 129 | +; LINUX64-NEXT: xvmaddasp vs2, vs34, vs1 |
| 130 | +; LINUX64-NEXT: xxland vs2, vs3, vs2 |
| 131 | +; LINUX64-NEXT: xxsldwi vs2, vs2, vs2, 3 |
| 132 | +; LINUX64-NEXT: xscvspdpn f2, vs2 |
| 133 | +; LINUX64-NEXT: b .LBB1_2 |
| 134 | + |
| 135 | +; CHECK32: .bar: |
| 136 | +; CHECK32-NEXT: # %bb.0: # %entry |
| 137 | +; CHECK32-NEXT: lwz r5, 0(r5) |
| 138 | +; CHECK32-NEXT: cmpwi r5, 0 |
| 139 | +; CHECK32-NEXT: blelr cr0 |
| 140 | +; CHECK32-NEXT: # %bb.1: # %for.body.preheader |
| 141 | +; CHECK32-NEXT: xxspltiw vs0, 1069066811 |
| 142 | +; CHECK32-NEXT: li r6, 0 |
| 143 | +; CHECK32-NEXT: li r7, 0 |
| 144 | +; CHECK32-NEXT: .align 4 |
| 145 | +; CHECK32-NEXT: [[L2_bar:.*]]: # %for.body |
| 146 | +; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1 |
| 147 | +; CHECK32-NEXT: slwi r8, r7, 4 |
| 148 | +; CHECK32-NEXT: xxspltiw vs2, 1170469888 |
| 149 | +; CHECK32-NEXT: addic r7, r7, 1 |
| 150 | +; CHECK32-NEXT: addze r6, r6 |
| 151 | +; CHECK32-NEXT: lxvx vs1, r4, r8 |
| 152 | +; CHECK32-NEXT: xvmaddasp vs2, vs1, vs0 |
| 153 | +; CHECK32-NEXT: stxvx vs2, r3, r8 |
| 154 | +; CHECK32-NEXT: xor r8, r7, r5 |
| 155 | +; CHECK32-NEXT: or. r8, r8, r6 |
| 156 | +; CHECK32-NEXT: bne cr0, [[L2_bar]] |
| 157 | + |
| 158 | +; CHECK32: .foo: |
| 159 | +; CHECK32-NEXT: # %bb.0: # %entry |
| 160 | +; CHECK32-NEXT: andi. r3, r3, 1 |
| 161 | +; CHECK32-NEXT: bclr 4, gt, 0 |
| 162 | +; CHECK32-NEXT: # %bb.1: # %for.body.preheader |
| 163 | +; CHECK32-NEXT: lwz r3, L..C0(r2) # %const.0 |
| 164 | +; CHECK32-NEXT: xxlxor f1, f1, f1 |
| 165 | +; CHECK32-NEXT: xxlxor vs0, vs0, vs0 |
| 166 | +; CHECK32-NEXT: xscvdpspn vs35, f1 |
| 167 | +; CHECK32-NEXT: lxv vs34, 0(r3) |
| 168 | +; CHECK32-NEXT: .align 4 |
| 169 | +; CHECK32-NEXT: L..BB1_2: # %for.body |
| 170 | +; CHECK32-NEXT: # =>This Inner Loop Header: Depth=1 |
| 171 | +; CHECK32-NEXT: xscvdpspn vs36, f1 |
| 172 | +; CHECK32-NEXT: xxspltiw vs1, 1170469888 |
| 173 | +; CHECK32-NEXT: vperm v4, v4, v3, v2 |
| 174 | +; CHECK32-NEXT: xvcmpgtsp vs2, vs0, vs36 |
| 175 | +; CHECK32-NEXT: xvmaddasp vs1, vs36, vs0 |
| 176 | +; CHECK32-NEXT: xxland vs1, vs2, vs1 |
| 177 | +; CHECK32-NEXT: xscvspdpn f1, vs1 |
| 178 | +; CHECK32-NEXT: b L..BB1_2 |
0 commit comments