Skip to content

Commit 505d574

Browse files
joshherr-quic authored and Krzysztof Parzyszek committed
[Hexagon] Improve BUILD_VECTOR codegen
For vectors with repeating values, the old codegen would rotate and insert every duplicate element. This patch replaces that behavior with a splat of the most common element; vinsert/vror are emitted only when needed.
1 parent 180455a commit 505d574

File tree

4 files changed

+201
-13
lines changed

4 files changed

+201
-13
lines changed

llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp

100644 → 100755
Lines changed: 57 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -656,22 +656,66 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
656656
}
657657
}
658658

659-
// Construct two halves in parallel, then or them together.
659+
// Find most common element to initialize vector with. This is to avoid
660+
// unnecessary vinsert/valign for cases where the same value is present
661+
// many times. Creates a histogram of the vector's elements to find the
662+
// most common element; n is the index of its first occurrence in Words.
660663
assert(4*Words.size() == Subtarget.getVectorLength());
661-
SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
662-
SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
663-
SDValue S = DAG.getConstant(4, dl, MVT::i32);
664-
for (unsigned i = 0; i != NumWords/2; ++i) {
665-
SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
666-
{HalfV0, Words[i]});
667-
SDValue M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
668-
{HalfV1, Words[i+NumWords/2]});
669-
HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, S});
670-
HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, S});
664+
int VecHist[32];
665+
int n = 0;
666+
for (unsigned i = 0; i != NumWords; ++i) {
667+
VecHist[i] = 0;
668+
if (Words[i].isUndef())
669+
continue;
670+
for (unsigned j = i; j != NumWords; ++j)
671+
if (Words[i] == Words[j])
672+
VecHist[i]++;
673+
674+
if (VecHist[i] > VecHist[n])
675+
n = i;
671676
}
672677

673-
HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy,
674-
{HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)});
678+
SDValue HalfV = getZero(dl, VecTy, DAG);
679+
if (VecHist[n] > 1) {
680+
SDValue SplatV = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[n]);
681+
HalfV = DAG.getNode(HexagonISD::VALIGN, dl, VecTy,
682+
{HalfV, SplatV, DAG.getConstant(HwLen/2, dl, MVT::i32)});
683+
}
684+
SDValue HalfV0 = HalfV;
685+
SDValue HalfV1 = HalfV;
686+
687+
// Construct two halves in parallel, then or them together. Rn and Rm
688+
// accumulate the rotation amount (4 per word) owed since the last insertion.
689+
// One last rotation is performed post-loop to position the last element.
690+
int Rn = 0, Rm = 0;
691+
SDValue Sn, Sm;
692+
SDValue N = HalfV0;
693+
SDValue M = HalfV1;
694+
for (unsigned i = 0; i != NumWords/2; ++i) {
695+
696+
// Rotate by element count since last insertion.
697+
if (Words[i] != Words[n] || VecHist[n] <= 1) {
698+
Sn = DAG.getConstant(Rn, dl, MVT::i32);
699+
HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn});
700+
N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
701+
{HalfV0, Words[i]});
702+
Rn = 0;
703+
}
704+
if (Words[i+NumWords/2] != Words[n] || VecHist[n] <= 1) {
705+
Sm = DAG.getConstant(Rm, dl, MVT::i32);
706+
HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm});
707+
M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
708+
{HalfV1, Words[i+NumWords/2]});
709+
Rm = 0;
710+
}
711+
Rn += 4;
712+
Rm += 4;
713+
}
714+
// Perform last rotation.
715+
Sn = DAG.getConstant(Rn+HwLen/2, dl, MVT::i32);
716+
Sm = DAG.getConstant(Rm, dl, MVT::i32);
717+
HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn});
718+
HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm});
675719

676720
SDValue T0 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV0);
677721
SDValue T1 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV1);
Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,34 @@
1+
; RUN: llc -march=hexagon < %s | FileCheck %s
2+
3+
; Check that vector is produced with vxor
4+
; CHECK: v{{[0-9]*}} = vxor
5+
define <32 x i32> @f0(i32 %x) #0 {
6+
%vect = insertelement <32 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %x, i32 0
7+
ret <32 x i32> %vect
8+
}
9+
10+
; Check that vector is produced with vsplat
11+
; CHECK: v{{[0-9]*}} = vsplat
12+
define <32 x i32> @f1(i32 %x) #0 {
13+
%vect = insertelement <32 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 0
14+
ret <32 x i32> %vect
15+
}
16+
17+
; Check that the correct vror is generated
18+
; CHECK: [[REG0:r([0-9]+)]] = #120
19+
; CHECK: vror(v{{[0-9]+}},[[REG0]])
20+
define <32 x i32> @f2(i32 %x) #0 {
21+
%vect = insertelement <32 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 2
22+
ret <32 x i32> %vect
23+
}
24+
25+
; Check that the correct vror is generated
26+
; CHECK: [[REG0:r([0-9]+)]] = #12
27+
; CHECK: vror(v{{[0-9]+}},[[REG0]])
28+
define <32 x i32> @f3(i32 %x) #0 {
29+
%vect = insertelement <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 undef, i32 1, i32 1>, i32 %x, i32 29
30+
ret <32 x i32> %vect
31+
}
32+
33+
attributes #0 = { readnone nounwind "target-cpu"="hexagonv62" "target-features"="+hvx,+hvx-length128b" }
34+
Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,34 @@
1+
; RUN: llc -march=hexagon < %s | FileCheck %s
2+
3+
; Check that vector is produced with vxor
4+
; CHECK: v{{[0-9]*}} = vxor
5+
define <16 x i32> @f0(i32 %x) #0 {
6+
%vect = insertelement <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %x, i32 0
7+
ret <16 x i32> %vect
8+
}
9+
10+
; Check that vector is produced with vsplat
11+
; CHECK: v{{[0-9]*}} = vsplat
12+
define <16 x i32> @f1(i32 %x) #0 {
13+
%vect = insertelement <16 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 0
14+
ret <16 x i32> %vect
15+
}
16+
17+
; Check that the correct vror is generated
18+
; CHECK: [[REG0:r([0-9]+)]] = #56
19+
; CHECK: vror(v{{[0-9]+}},[[REG0]])
20+
define <16 x i32> @f2(i32 %x) #0 {
21+
%vect = insertelement <16 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 2
22+
ret <16 x i32> %vect
23+
}
24+
25+
; Check that the correct vror is generated
26+
; CHECK: [[REG0:r([0-9]+)]] = #12
27+
; CHECK: vror(v{{[0-9]+}},[[REG0]])
28+
define <16 x i32> @f3(i32 %x) #0 {
29+
%vect = insertelement <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 undef, i32 1, i32 1>, i32 %x, i32 13
30+
ret <16 x i32> %vect
31+
}
32+
33+
attributes #0 = { readnone nounwind "target-cpu"="hexagonv62" "target-features"="+hvx,+hvx-length64b" }
34+
Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,76 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=hexagon < %s | FileCheck %s
3+
4+
define <32 x i32> @fred(i32 %a0) #0 {
5+
; CHECK-LABEL: fred:
6+
; CHECK: .cfi_startproc
7+
; CHECK-NEXT: // %bb.0:
8+
; CHECK-NEXT: {
9+
; CHECK-NEXT: r3:2 = combine(#20,#9)
10+
; CHECK-NEXT: v0 = vxor(v0,v0)
11+
; CHECK-NEXT: r1 = #24
12+
; CHECK-NEXT: r4 = #12
13+
; CHECK-NEXT: }
14+
; CHECK-NEXT: {
15+
; CHECK-NEXT: v1 = vror(v0,r1)
16+
; CHECK-NEXT: }
17+
; CHECK-NEXT: {
18+
; CHECK-NEXT: v1.w = vinsert(r2)
19+
; CHECK-NEXT: r4 = #7
20+
; CHECK-NEXT: r2 = #116
21+
; CHECK-NEXT: v0 = vror(v0,r4)
22+
; CHECK-NEXT: }
23+
; CHECK-NEXT: {
24+
; CHECK-NEXT: v0.w = vinsert(r4)
25+
; CHECK-NEXT: }
26+
; CHECK-NEXT: {
27+
; CHECK-NEXT: v1 = vror(v1,r3)
28+
; CHECK-NEXT: }
29+
; CHECK-NEXT: {
30+
; CHECK-NEXT: v1.w = vinsert(r0)
31+
; CHECK-NEXT: v0 = vror(v0,r2)
32+
; CHECK-NEXT: }
33+
; CHECK-NEXT: {
34+
; CHECK-NEXT: v1 = vror(v1,r3)
35+
; CHECK-NEXT: }
36+
; CHECK-NEXT: {
37+
; CHECK-NEXT: v0 = vor(v0,v1)
38+
; CHECK-NEXT: jumpr r31
39+
; CHECK-NEXT: }
40+
%v0 = insertelement <32 x i32> undef, i32 undef, i32 0
41+
%v1 = insertelement <32 x i32> %v0, i32 undef, i32 1
42+
%v2 = insertelement <32 x i32> %v1, i32 undef, i32 2
43+
%v3 = insertelement <32 x i32> %v2, i32 7, i32 3
44+
%v4 = insertelement <32 x i32> %v3, i32 undef, i32 4
45+
%v5 = insertelement <32 x i32> %v4, i32 undef, i32 5
46+
%v6 = insertelement <32 x i32> %v5, i32 undef, i32 6
47+
%v7 = insertelement <32 x i32> %v6, i32 undef, i32 7
48+
%v8 = insertelement <32 x i32> %v7, i32 undef, i32 8
49+
%v9 = insertelement <32 x i32> %v8, i32 undef, i32 9
50+
%v10 = insertelement <32 x i32> %v9, i32 undef, i32 10
51+
%v11 = insertelement <32 x i32> %v10, i32 undef, i32 11
52+
%v12 = insertelement <32 x i32> %v11, i32 undef, i32 12
53+
%v13 = insertelement <32 x i32> %v12, i32 undef, i32 13
54+
%v14 = insertelement <32 x i32> %v13, i32 undef, i32 14
55+
%v15 = insertelement <32 x i32> %v14, i32 undef, i32 15
56+
%v16 = insertelement <32 x i32> %v15, i32 undef, i32 16
57+
%v17 = insertelement <32 x i32> %v16, i32 undef, i32 17
58+
%v18 = insertelement <32 x i32> %v17, i32 undef, i32 18
59+
%v19 = insertelement <32 x i32> %v18, i32 undef, i32 19
60+
%v20 = insertelement <32 x i32> %v19, i32 undef, i32 20
61+
%v21 = insertelement <32 x i32> %v20, i32 undef, i32 21
62+
%v22 = insertelement <32 x i32> %v21, i32 9, i32 22
63+
%v23 = insertelement <32 x i32> %v22, i32 undef, i32 23
64+
%v24 = insertelement <32 x i32> %v23, i32 undef, i32 24
65+
%v25 = insertelement <32 x i32> %v24, i32 undef, i32 25
66+
%v26 = insertelement <32 x i32> %v25, i32 undef, i32 26
67+
%v27 = insertelement <32 x i32> %v26, i32 %a0, i32 27
68+
%v28 = insertelement <32 x i32> %v27, i32 undef, i32 28
69+
%v29 = insertelement <32 x i32> %v28, i32 undef, i32 29
70+
%v30 = insertelement <32 x i32> %v29, i32 undef, i32 30
71+
%v31 = insertelement <32 x i32> %v30, i32 undef, i32 31
72+
ret <32 x i32> %v31
73+
}
74+
75+
attributes #0 = { "target-cpu"="hexagonv66" "target-features"="+hvx,+hvx-length128b" }
76+

0 commit comments

Comments
 (0)