
[LegalizeTypes][RISCV] Use SPLAT_VECTOR_PARTS to legalize splat BUILD_VECTOR #107290


Merged 1 commit on Sep 5, 2024
9 changes: 9 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -376,6 +376,15 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
  assert(OldVT == VecVT.getVectorElementType() &&
         "BUILD_VECTOR operand type doesn't match vector element type!");

  if (VecVT.isInteger() && TLI.isOperationLegal(ISD::SPLAT_VECTOR, VecVT) &&
      TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR_PARTS, VecVT)) {
    if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
      SDValue Lo, Hi;
      GetExpandedOp(V, Lo, Hi);
      return DAG.getNode(ISD::SPLAT_VECTOR_PARTS, dl, VecVT, Lo, Hi);
    }
  }

  // Build a vector of twice the length out of the expanded elements.
  // For example <3 x i64> -> <6 x i32>.
  SmallVector<SDValue, 16> NewElts;

Review comments on this hunk:

Collaborator: Why do we need the SPLAT_VECTOR legality check? Isn't SPLAT_VECTOR_PARTS enough?

Author: I think we don't default SPLAT_VECTOR_PARTS to Expand, so checking only that started crashing other targets.
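For context, here is a minimal IR sketch of the pattern the new hunk targets (illustrative only: the function name is made up, and riscv32 with the V extension is assumed, not part of the patch). The i64 operand is an illegal scalar type on rv32, so the splat BUILD_VECTOR reaches ExpandOp_BUILD_VECTOR; with this change the expanded (Lo, Hi) halves feed a single SPLAT_VECTOR_PARTS instead of being rebuilt as a <4 x i32> BUILD_VECTOR.

; Hypothetical example, not taken from the patch's tests.
; llc -mtriple=riscv32 -mattr=+v would typically exercise the new path here.
define <2 x i64> @splat_v2i64(i64 %x) {
  ; Both lanes receive the same illegal-typed i64 value, so the resulting
  ; BUILD_VECTOR is a splat whose operands need expansion.
  %v0 = insertelement <2 x i64> poison, i64 %x, i32 0
  %v1 = insertelement <2 x i64> %v0, i64 %x, i32 1
  ret <2 x i64> %v1
}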
16 changes: 10 additions & 6 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -14,9 +14,11 @@ define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: call llrintf
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
; RV32-NEXT: vslide1down.vx v8, v8, a1
; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: mv a0, sp
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret

Review comments on this hunk:

Collaborator: Given the results on short VL, we'd probably be better off going through the buildvector lowering, or adapting some of that lowering to work on short splat_vector_parts. Using the strided load may not always be our best fallback. (Optional follow-up only.)

Author: Agreed. The strided load is not good given what we know now about microarchitectures.
@@ -669,9 +671,11 @@ define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) {
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vfmv.f.s fa0, v8
; RV32-NEXT: call llrint
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
; RV32-NEXT: vslide1down.vx v8, v8, a1
; RV32-NEXT: sw a1, 4(sp)
; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: mv a0, sp
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vlse64.v v8, (a0), zero
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
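To make the discussion above concrete, the first test presumably boils down to IR like the following (paraphrased, not copied from the .ll file; the comments are mine). On rv32 the llrintf libcall returns the i64 in a0/a1, and with this patch the splat of that value is legalized as SPLAT_VECTOR_PARTS, which the backend currently lowers through a stack slot plus a zero-stride vlse64.v broadcast, exactly the fallback the reviewers would like to improve.

; Presumed shape of the test input.
declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)

define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) {
  ; The scalar llrintf result comes back in a0/a1 and is rebroadcast into a
  ; one-element vector; that splat is what now goes through SPLAT_VECTOR_PARTS.
  %a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
  ret <1 x i64> %a
}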
56 changes: 19 additions & 37 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -397,43 +397,22 @@ define void @masked_load_v32i32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>)

define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
; RV32-LABEL: masked_load_v32i64:
; RV32: # %bb.0:
; RV32-NEXT: addi a3, a1, 128
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v0, (a1)
; RV32-NEXT: vle64.v v24, (a3)
; RV32-NEXT: li a1, 32
; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v16, 0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vmseq.vv v8, v0, v16
; RV32-NEXT: vmseq.vv v0, v24, v16
; RV32-NEXT: addi a1, a0, 128
; RV32-NEXT: vle64.v v16, (a1), v0.t
; RV32-NEXT: vmv1r.v v0, v8
; RV32-NEXT: vle64.v v8, (a0), v0.t
; RV32-NEXT: vse64.v v8, (a2)
; RV32-NEXT: addi a0, a2, 128
; RV32-NEXT: vse64.v v16, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: masked_load_v32i64:
; RV64: # %bb.0:
; RV64-NEXT: addi a3, a1, 128
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v16, (a1)
; RV64-NEXT: vle64.v v24, (a3)
; RV64-NEXT: vmseq.vi v8, v16, 0
; RV64-NEXT: vmseq.vi v0, v24, 0
; RV64-NEXT: addi a1, a0, 128
; RV64-NEXT: vle64.v v16, (a1), v0.t
; RV64-NEXT: vmv1r.v v0, v8
; RV64-NEXT: vle64.v v8, (a0), v0.t
; RV64-NEXT: vse64.v v8, (a2)
; RV64-NEXT: addi a0, a2, 128
; RV64-NEXT: vse64.v v16, (a0)
; RV64-NEXT: ret
; CHECK-LABEL: masked_load_v32i64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi a3, a1, 128
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a1)
; CHECK-NEXT: vle64.v v24, (a3)
; CHECK-NEXT: vmseq.vi v8, v16, 0
; CHECK-NEXT: vmseq.vi v0, v24, 0
; CHECK-NEXT: addi a1, a0, 128
; CHECK-NEXT: vle64.v v16, (a1), v0.t
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vle64.v v8, (a0), v0.t
; CHECK-NEXT: vse64.v v8, (a2)
; CHECK-NEXT: addi a0, a2, 128
; CHECK-NEXT: vse64.v v16, (a0)
; CHECK-NEXT: ret
%m = load <32 x i64>, ptr %m_ptr
%mask = icmp eq <32 x i64> %m, zeroinitializer
%load = call <32 x i64> @llvm.masked.load.v32i64(ptr %a, i32 8, <32 x i1> %mask, <32 x i64> undef)
@@ -547,3 +526,6 @@ define void @masked_load_v256i8(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
ret void
}
declare <256 x i8> @llvm.masked.load.v256i8(ptr, i32, <256 x i1>, <256 x i8>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
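The masked-load and masked-store files only fold their separate RV32 and RV64 blocks into one shared CHECK prefix. A hedged reading of why, using the mask computation that already appears in the test above, extracted into a stand-alone function for illustration (the function name and explanatory comments are mine, not part of the test):

; The mask is a compare against a zero splat of <32 x i64>.
define <32 x i1> @mask_from_zero_compare(ptr %m_ptr) {
  %m = load <32 x i64>, ptr %m_ptr
  ; Before this patch, rv32 rebuilt the expanded zeroinitializer as a wider
  ; i32 BUILD_VECTOR (the removed vmv.v.i / vmseq.vv sequence). With the zero
  ; splat now kept as SPLAT_VECTOR_PARTS, rv32 matches vmseq.vi just like
  ; rv64, so the two run lines produce identical output.
  %mask = icmp eq <32 x i64> %m, zeroinitializer
  ret <32 x i1> %mask
}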
122 changes: 41 additions & 81 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -397,87 +397,44 @@ define void @masked_store_v32i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
declare void @llvm.masked.store.v32i32.p0(<32 x i32>, ptr, i32, <32 x i1>)

define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
; RV32-LABEL: masked_store_v32i64:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 4
; RV32-NEXT: sub sp, sp, a3
; RV32-NEXT: addi a3, a2, 128
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vle64.v v24, (a2)
; RV32-NEXT: vle64.v v8, (a3)
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 16
; RV32-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT: vmseq.vv v7, v24, v8
; RV32-NEXT: addi a2, a0, 128
; RV32-NEXT: vle64.v v24, (a2)
; RV32-NEXT: vle64.v v16, (a0)
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 16
; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vmseq.vv v0, v16, v8
; RV32-NEXT: addi a0, a1, 128
; RV32-NEXT: vse64.v v24, (a0), v0.t
; RV32-NEXT: vmv1r.v v0, v7
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; RV32-NEXT: vse64.v v8, (a1), v0.t
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add sp, sp, a0
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: masked_store_v32i64:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 4
; RV64-NEXT: sub sp, sp, a3
; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT: vle64.v v8, (a2)
; RV64-NEXT: addi a2, a2, 128
; RV64-NEXT: vle64.v v16, (a2)
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 16
; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV64-NEXT: vmseq.vi v0, v8, 0
; RV64-NEXT: vle64.v v24, (a0)
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT: vmseq.vi v8, v16, 0
; RV64-NEXT: vse64.v v24, (a1), v0.t
; RV64-NEXT: addi a0, a1, 128
; RV64-NEXT: vmv1r.v v0, v8
; RV64-NEXT: addi a1, sp, 16
; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vse64.v v8, (a0), v0.t
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add sp, sp, a0
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
; CHECK-LABEL: masked_store_v32i64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: slli a3, a3, 4
; CHECK-NEXT: sub sp, sp, a3
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a2)
; CHECK-NEXT: addi a2, a2, 128
; CHECK-NEXT: vle64.v v16, (a2)
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: add a2, sp, a2
; CHECK-NEXT: addi a2, a2, 16
; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
; CHECK-NEXT: vmseq.vi v0, v8, 0
; CHECK-NEXT: vle64.v v24, (a0)
; CHECK-NEXT: addi a0, a0, 128
; CHECK-NEXT: vle64.v v8, (a0)
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmseq.vi v8, v16, 0
; CHECK-NEXT: vse64.v v24, (a1), v0.t
; CHECK-NEXT: addi a0, a1, 128
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vse64.v v8, (a0), v0.t
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
%m = load <32 x i64>, ptr %m_ptr
%mask = icmp eq <32 x i64> %m, zeroinitializer
%val = load <32 x i64>, ptr %val_ptr
@@ -683,3 +640,6 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind {
ret void
}
declare void @llvm.masked.store.v256i8.p0(<256 x i8>, ptr, i32, <256 x i1>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}