[LegalizeTypes][RISCV] Use SPLAT_VECTOR_PARTS to legalize splat BUILD_VECTOR #107290
Conversation
…_VECTOR

If the element type needs to be expanded, we can use SPLAT_VECTOR_PARTS if the target supports it.

There's already a DAGCombine to turn BUILD_VECTOR into SPLAT_VECTOR if the target makes SPLAT_VECTOR legal, but it doesn't fire for vectors that need to be split.

Alternatively, we could tweak the existing DAGCombine to form SPLAT_VECTOR if the type will be split to a legal vector type that supports SPLAT_VECTOR. That requires iterating through getTypeToTransformTo until we reach a legal type.
@llvm/pr-subscribers-llvm-selectiondag

Author: Craig Topper (topperc)

Changes

If the element type needs to be expanded, we can use SPLAT_VECTOR_PARTS if the target supports it.

There's already a DAGCombine to turn BUILD_VECTOR into SPLAT_VECTOR if the target makes SPLAT_VECTOR legal, but it doesn't fire for vectors that need to be split.

Alternatively, we could tweak the existing DAGCombine to form SPLAT_VECTOR if the type will be split to a legal vector type that supports SPLAT_VECTOR. That requires iterating through getTypeToTransformTo until we reach a legal type.

Patch is 69.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107290.diff

15 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index b402e823762764..2655e8428309da 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -376,6 +376,15 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
assert(OldVT == VecVT.getVectorElementType() &&
"BUILD_VECTOR operand type doesn't match vector element type!");
+ if (VecVT.isInteger() && TLI.isOperationLegal(ISD::SPLAT_VECTOR, VecVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR_PARTS, VecVT)) {
+ if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
+ SDValue Lo, Hi;
+ GetExpandedOp(V, Lo, Hi);
+ return DAG.getNode(ISD::SPLAT_VECTOR_PARTS, dl, VecVT, Lo, Hi);
+ }
+ }
+
// Build a vector of twice the length out of the expanded elements.
// For example <3 x i64> -> <6 x i32>.
SmallVector<SDValue, 16> NewElts;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index 901be442c0012a..d52cbb54c4b2da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -14,9 +14,11 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: call llrintf
-; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a0, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -669,9 +671,11 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: call llrint
-; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
+; RV32-NEXT: sw a1, 4(sp)
+; RV32-NEXT: sw a0, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index ad075e4b4e198c..2f20caa6eb1894 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -397,43 +397,22 @@ define void @masked_load_v32i32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>)
define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; RV32-LABEL: masked_load_v32i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi a3, a1, 128
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle64.v v0, (a1)
-; RV32-NEXT: vle64.v v24, (a3)
-; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vmseq.vv v8, v0, v16
-; RV32-NEXT: vmseq.vv v0, v24, v16
-; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: vle64.v v16, (a1), v0.t
-; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vle64.v v8, (a0), v0.t
-; RV32-NEXT: vse64.v v8, (a2)
-; RV32-NEXT: addi a0, a2, 128
-; RV32-NEXT: vse64.v v16, (a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: masked_load_v32i64:
-; RV64: # %bb.0:
-; RV64-NEXT: addi a3, a1, 128
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v16, (a1)
-; RV64-NEXT: vle64.v v24, (a3)
-; RV64-NEXT: vmseq.vi v8, v16, 0
-; RV64-NEXT: vmseq.vi v0, v24, 0
-; RV64-NEXT: addi a1, a0, 128
-; RV64-NEXT: vle64.v v16, (a1), v0.t
-; RV64-NEXT: vmv1r.v v0, v8
-; RV64-NEXT: vle64.v v8, (a0), v0.t
-; RV64-NEXT: vse64.v v8, (a2)
-; RV64-NEXT: addi a0, a2, 128
-; RV64-NEXT: vse64.v v16, (a0)
-; RV64-NEXT: ret
+; CHECK-LABEL: masked_load_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a3, a1, 128
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v16, (a1)
+; CHECK-NEXT: vle64.v v24, (a3)
+; CHECK-NEXT: vmseq.vi v8, v16, 0
+; CHECK-NEXT: vmseq.vi v0, v24, 0
+; CHECK-NEXT: addi a1, a0, 128
+; CHECK-NEXT: vle64.v v16, (a1), v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vle64.v v8, (a0), v0.t
+; CHECK-NEXT: vse64.v v8, (a2)
+; CHECK-NEXT: addi a0, a2, 128
+; CHECK-NEXT: vse64.v v16, (a0)
+; CHECK-NEXT: ret
%m = load <32 x i64>, ptr %m_ptr
%mask = icmp eq <32 x i64> %m, zeroinitializer
%load = call <32 x i64> @llvm.masked.load.v32i64(ptr %a, i32 8, <32 x i1> %mask, <32 x i64> undef)
@@ -547,3 +526,6 @@ define void @masked_load_v256i8(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
ret void
}
declare <256 x i8> @llvm.masked.load.v256i8(ptr, i32, <256 x i1>, <256 x i8>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
index 86c28247e97ef1..90690bbc8e2085 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -397,87 +397,44 @@ define void @masked_store_v32i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
declare void @llvm.masked.store.v32i32.p0(<32 x i32>, ptr, i32, <32 x i1>)
define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
-; RV32-LABEL: masked_store_v32i64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: slli a3, a3, 4
-; RV32-NEXT: sub sp, sp, a3
-; RV32-NEXT: addi a3, a2, 128
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vle64.v v24, (a2)
-; RV32-NEXT: vle64.v v8, (a3)
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v8, 0
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vmseq.vv v7, v24, v8
-; RV32-NEXT: addi a2, a0, 128
-; RV32-NEXT: vle64.v v24, (a2)
-; RV32-NEXT: vle64.v v16, (a0)
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmseq.vv v0, v16, v8
-; RV32-NEXT: addi a0, a1, 128
-; RV32-NEXT: vse64.v v24, (a0), v0.t
-; RV32-NEXT: vmv1r.v v0, v7
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vse64.v v8, (a1), v0.t
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: masked_store_v32i64:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a3, a3, 4
-; RV64-NEXT: sub sp, sp, a3
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v8, (a2)
-; RV64-NEXT: addi a2, a2, 128
-; RV64-NEXT: vle64.v v16, (a2)
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT: vmseq.vi v0, v8, 0
-; RV64-NEXT: vle64.v v24, (a0)
-; RV64-NEXT: addi a0, a0, 128
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vmseq.vi v8, v16, 0
-; RV64-NEXT: vse64.v v24, (a1), v0.t
-; RV64-NEXT: addi a0, a1, 128
-; RV64-NEXT: vmv1r.v v0, v8
-; RV64-NEXT: addi a1, sp, 16
-; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT: vse64.v v8, (a0), v0.t
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: ret
+; CHECK-LABEL: masked_store_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: slli a3, a3, 4
+; CHECK-NEXT: sub sp, sp, a3
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vle64.v v8, (a2)
+; CHECK-NEXT: addi a2, a2, 128
+; CHECK-NEXT: vle64.v v16, (a2)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, sp, a2
+; CHECK-NEXT: addi a2, a2, 16
+; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT: vmseq.vi v0, v8, 0
+; CHECK-NEXT: vle64.v v24, (a0)
+; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmseq.vi v8, v16, 0
+; CHECK-NEXT: vse64.v v24, (a1), v0.t
+; CHECK-NEXT: addi a0, a1, 128
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT: vse64.v v8, (a0), v0.t
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%m = load <32 x i64>, ptr %m_ptr
%mask = icmp eq <32 x i64> %m, zeroinitializer
%val = load <32 x i64>, ptr %val_ptr
@@ -683,3 +640,6 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
ret void
}
declare void @llvm.masked.store.v256i8.p0(<256 x i8>, ptr, i32, <256 x i1>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
index 5601bd5ee7a3ae..805a3c640957bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll
@@ -1346,93 +1346,48 @@ define <16 x i64> @vadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
declare <32 x i64> @llvm.vp.add.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
define <32 x i64> @vadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: vadd_vx_v32i64:
-; RV32: # %bb.0:
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a2, .LBB108_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB108_2:
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: addi a1, a0, -16
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a1
-; RV32-NEXT: vmv1r.v v0, v7
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vadd_vx_v32i64:
-; RV64: # %bb.0:
-; RV64-NEXT: li a2, 16
-; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v0, 2
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB108_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a1, 16
-; RV64-NEXT: .LBB108_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v8, v8, -1, v0.t
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a1
-; RV64-NEXT: vmv1r.v v0, v24
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v16, v16, -1, v0.t
-; RV64-NEXT: ret
+; CHECK-LABEL: vadd_vx_v32i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v24, v0, 2
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB108_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: .LBB108_2:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: addi a1, a0, -16
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: ret
%v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 %evl)
ret <32 x i64> %v
}
define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
-; RV32-LABEL: vadd_vi_v32i64_unmasked:
-; RV32: # %bb.0:
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: mv a1, a0
-; RV32-NEXT: bltu a0, a2, .LBB109_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB109_2:
-; RV32-NEXT: li a2, 32
-; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v8, v24
-; RV32-NEXT: addi a1, a0, -16
-; RV32-NEXT: sltu a0, a0, a1
-; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: and a0, a0, a1
-; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v16, v24
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vadd_vi_v32i64_unmasked:
-; RV64: # %bb.0:
-; RV64-NEXT: li a2, 16
-; RV64-NEXT: mv a1, a0
-; RV64-NEXT: bltu a0, a2, .LBB109_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a1, 16
-; RV64-NEXT: .LBB109_2:
-; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v8, v8, -1
-; RV64-NEXT: addi a1, a0, -16
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: addi a0, a0, -1
-; RV64-NEXT: and a0, a0, a1
-; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v16, v16, -1
-; RV64-NEXT: ret
+; CHECK-LABEL: vadd_vi_v32i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB109_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 16
+; CHECK-NEXT: .LBB109_2:
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v8, v8, -1
+; CHECK-NEXT: addi a1, a0, -16
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v16, -1
+; CHECK-NEXT: ret
%v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> splat (i1 true), i32 %evl)
ret <32 x i64> %v
}
@@ -1440,49 +1395,26 @@ define <32 x i64> @vadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
; FIXME: We don't match vadd.vi on RV32.
define <32 x i64> @vadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
-; RV32-LABEL: vadd_vx_v32i64_evl12:
-; RV32: # %bb.0:
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v16, -1
-; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vadd_vx_v32i64_evl12:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v8, v8, -1, v0.t
-; RV64-NEXT: ret
+; CHECK-LABEL: vadd_vx_v32i64_evl12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
%v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12)
ret <32 x i64> %v
}
define <32 x i64> @vadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
-; RV32-LABEL: vadd_vx_v32i64_evl27:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v7, v0, 2
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.i v24, -1
-; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT: vmv1r.v v0, v7
-; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma
-; RV32-NEXT: vadd.vv v16, v16, v24, v0.t
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vadd_vx_v32i64_evl27:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v24, v0, 2
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v8, v8, -1, v0.t
-; RV64-NEXT: vmv1r.v v0, v24
-; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma
-; RV64-NEXT: vadd.vi v16, v16, -1, v0.t
-; RV64-NEXT: ret
+; CHECK-LABEL: vadd_vx_v32i64_evl27:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v24, v0, 2
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; CHECK-NEXT: vadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: ret
%v = call <32 x i64> @llvm.vp.add.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 27)
ret <32 x i64> %v
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
index d414be76672ab0..c413dd86f37128 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll
@@ -1139,18 +1139,16 @@ define <11 x i64> @vand_vv_v11i64_unmasked(<11 x i64> %va, <11 x i64> %b, i32 ze
define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vand_vx_v11i64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v16, v0
-; RV32-NEXT: lui a3, 341
-; RV32-NEXT: addi a3, a3, 1365
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vmv.s.x v0, a3
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v24, a1
-; RV32-NEXT: vmerge.vxm v24, v24, a0, v0
-; RV32-NEXT: vmv1r.v v0, v16
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vand.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vand_vx_v11i64:
@@ -1167,16 +1165,16 @@ define <11 x i64> @vand_vx_v11i64(<11 x i64> %va, i64 %b, <11 x i1> %m, i32 zero
define <11 x i64> @vand_vx_v11i64_unmasked(<11 x i64> %va, i64 %b, i32 zeroext %evl) {
; RV32-LABEL: vand_vx_v11i64_unmasked:
; RV32: # %bb.0:
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: lui a4, 341
-; RV32-NEXT: addi a4, a4, 1365
-; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT: vmv.s.x v0, a4
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vmv.v.x v16, a1
-; RV32-NEXT: vmerge.vxm v16, v16, a0, v0
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16,...
[truncated]
LGTM
@@ -376,6 +376,15 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
   assert(OldVT == VecVT.getVectorElementType() &&
          "BUILD_VECTOR operand type doesn't match vector element type!");

+  if (VecVT.isInteger() && TLI.isOperationLegal(ISD::SPLAT_VECTOR, VecVT) &&
Why do we need the SPLAT_VECTOR legality? Isn't SPLAT_VECTOR_PARTS enough?
I think we don't default SPLAT_VECTOR_PARTS to Expand so it started crashing other targets.
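Restating the guard added by the patch with that reasoning spelled out as comments (the comments are my reading of this thread, not part of the change):

// VecVT is an integer vector whose elements are being expanded into
// Lo/Hi halves of a smaller legal type.
if (VecVT.isInteger() &&
    // SPLAT_VECTOR being explicitly Legal serves as the opt-in signal:
    // SPLAT_VECTOR_PARTS is not defaulted to Expand, so checking it alone
    // would let this path fire on targets that never select the node.
    TLI.isOperationLegal(ISD::SPLAT_VECTOR, VecVT) &&
    // The target must also accept the two-operand parts form.
    TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR_PARTS, VecVT)) {
  if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
    SDValue Lo, Hi;
    GetExpandedOp(V, Lo, Hi);
    return DAG.getNode(ISD::SPLAT_VECTOR_PARTS, dl, VecVT, Lo, Hi);
  }
}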
; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: mv a0, sp
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
Given the results on short VL, we'd probably be better off going through the buildvector lowering, or adapting some of that lowering to work on short splat_vector_parts. Using the strided load may not always be our best fallback.

(Optional follow up only)
Agree. The strided load is not good given what we know now about microarchitectures.
This continues the line of work started in llvm#97520, and gives a 2.5% reduction in the number of spills on SPEC CPU 2017.

Program              regalloc.NumSpills           regalloc.NumReloads          regalloc.NumReMaterialization
                     lhs        rhs       diff    lhs        rhs       diff    lhs        rhs       diff
605.mcf_s            141.00     141.00     0.0%   372.00     372.00     0.0%   123.00     123.00     0.0%
505.mcf_r            141.00     141.00     0.0%   372.00     372.00     0.0%   123.00     123.00     0.0%
519.lbm_r            73.00      73.00      0.0%   75.00      75.00      0.0%   18.00      18.00      0.0%
619.lbm_s            68.00      68.00      0.0%   70.00      70.00      0.0%   20.00      20.00      0.0%
631.deepsjeng_s      354.00     353.00    -0.3%   683.00     682.00    -0.1%   529.00     530.00     0.2%
531.deepsjeng_r      354.00     353.00    -0.3%   683.00     682.00    -0.1%   529.00     530.00     0.2%
625.x264_s           1896.00    1886.00   -0.5%   4583.00    4561.00   -0.5%   2086.00    2108.00    1.1%
525.x264_r           1896.00    1886.00   -0.5%   4583.00    4561.00   -0.5%   2086.00    2108.00    1.1%
508.namd_r           6665.00    6598.00   -1.0%   15649.00   15509.00  -0.9%   3014.00    3164.00    5.0%
644.nab_s            761.00     753.00    -1.1%   1199.00    1183.00   -1.3%   1542.00    1559.00    1.1%
544.nab_r            761.00     753.00    -1.1%   1199.00    1183.00   -1.3%   1542.00    1559.00    1.1%
638.imagick_s        4287.00    4181.00   -2.5%   11624.00   11342.00  -2.4%   10551.00   10884.00   3.2%
538.imagick_r        4287.00    4181.00   -2.5%   11624.00   11342.00  -2.4%   10551.00   10884.00   3.2%
602.gcc_s            12771.00   12450.00  -2.5%   28117.00   27328.00  -2.8%   49757.00   50526.00   1.5%
502.gcc_r            12771.00   12450.00  -2.5%   28117.00   27328.00  -2.8%   49757.00   50526.00   1.5%
Geomean difference                        -2.5%                        -2.6%                         1.8%

I initially held off submitting this patch because it surprisingly introduced a lot of spills in the test diffs, but after llvm#107290 the vmv.v.is that caused them are now gone. The gist is that marking vmv.v.i as spillable decreased its spill weight, which actually resulted in more m8 registers getting evicted and spilled during register allocation. The SPEC results show this isn't an issue in practice though, and I plan on posting a separate patch to explain this in more detail.
This continues the line of work started in #97520, and gives a 2.5% reduction in the number of spills on SPEC CPU 2017.

Program              regalloc.NumSpills
                     lhs        rhs       diff
605.mcf_s            141.00     141.00     0.0%
505.mcf_r            141.00     141.00     0.0%
519.lbm_r            73.00      73.00      0.0%
619.lbm_s            68.00      68.00      0.0%
631.deepsjeng_s      354.00     353.00    -0.3%
531.deepsjeng_r      354.00     353.00    -0.3%
625.x264_s           1896.00    1886.00   -0.5%
525.x264_r           1896.00    1886.00   -0.5%
508.namd_r           6665.00    6598.00   -1.0%
644.nab_s            761.00     753.00    -1.1%
544.nab_r            761.00     753.00    -1.1%
638.imagick_s        4287.00    4181.00   -2.5%
538.imagick_r        4287.00    4181.00   -2.5%
602.gcc_s            12771.00   12450.00  -2.5%
502.gcc_r            12771.00   12450.00  -2.5%
510.parest_r         43876.00   42740.00  -2.6%
500.perlbench_r      4297.00    4179.00   -2.7%
600.perlbench_s      4297.00    4179.00   -2.7%
526.blender_r        13503.00   13103.00  -3.0%
511.povray_r         2006.00    1937.00   -3.4%
620.omnetpp_s        984.00     946.00    -3.9%
520.omnetpp_r        984.00     946.00    -3.9%
657.xz_s             302.00     289.00    -4.3%
557.xz_r             302.00     289.00    -4.3%
541.leela_r          378.00     356.00    -5.8%
641.leela_s          378.00     356.00    -5.8%
623.xalancbmk_s      1646.00    1548.00   -6.0%
523.xalancbmk_r      1646.00    1548.00   -6.0%
Geomean difference                        -2.5%

I initially held off submitting this patch because it surprisingly introduced a lot of spills in the test diffs, but after #107290 the vmv.v.is that caused them are now gone. The gist is that marking vmv.v.i as spillable decreased its spill weight, which actually resulted in more m8 registers getting evicted and spilled during register allocation. The SPEC results show this isn't an issue in practice though, and I plan on posting a separate patch to explain this in more detail.
If the element type needs to be expanded, we can use SPLAT_VECTOR_PARTS if the target supports it.
There's already a DAGCombine to turn BUILD_VECTOR into SPLAT_VECTOR if the target makes SPLAT_VECTOR legal, but it doesn't fire for vectors that need to be split.
Alternatively, we could tweak the existing DAGCombine to form SPLAT_VECTOR if the type will be split to a legal vector type that supports SPLAT_VECTOR. That requires iterating through getTypeToTransformTo until we reach a legal type (sketched below).
This is an alternative to #107205
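A minimal sketch of the legality walk that alternative would need, assuming a small helper were added on the DAGCombine side (the function name and placement are hypothetical; isTypeLegal, getTypeToTransformTo, and isOperationLegal are the existing TargetLowering APIs being leaned on):

// Hypothetical helper: follow the type-legalization chain for VT until a
// legal type is reached, then check whether SPLAT_VECTOR is legal there.
static bool splatVectorLegalAfterLegalization(const TargetLowering &TLI,
                                              LLVMContext &Ctx, EVT VT) {
  while (!TLI.isTypeLegal(VT)) {
    EVT NextVT = TLI.getTypeToTransformTo(Ctx, VT);
    if (NextVT == VT)
      return false; // No progress; bail out conservatively.
    VT = NextVT;
  }
  return VT.isVector() && TLI.isOperationLegal(ISD::SPLAT_VECTOR, VT);
}

The patch instead keeps the change local to ExpandOp_BUILD_VECTOR, which sidesteps that iteration entirely.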