
Commit 6b9afa3

NickGuy-Arm authored and kbluck committed
[IR][LangRef] Add partial reduction add intrinsic (llvm#94499)
Adds the llvm.experimental.vector.partial.reduce.add.* overloaded intrinsic, which represents add reductions that result in a narrower vector.
1 parent 3c92ad9 commit 6b9afa3

File tree: 5 files changed, +166 −0 lines


llvm/docs/LangRef.rst (+31)

@@ -19441,6 +19441,37 @@ will be on any later loop iteration.
 This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+      declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics reduce the
+concatenation of the two vector operands down to the number of elements dictated
+by the result type. The result type is a vector type that matches the type of the
+first operand vector.
+
+Arguments:
+""""""""""
+
+Both arguments must be vectors of matching element types. The first argument type
+must match the result type, while the second argument must have a vector length
+that is a positive integer multiple of that of the first argument/result type.
+The arguments must be both fixed-length vectors or both scalable vectors.
+
 '``llvm.experimental.vector.histogram.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
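A scalar model may help when reading the LangRef text above: each result lane accumulates every input lane whose index is congruent to it modulo the result length, which is what the chunked EXTRACT_SUBVECTOR + ADD lowering added to SelectionDAGBuilder.cpp below produces. A minimal sketch, assuming C++17; the names are illustrative and not part of the patch::

      #include <array>
      #include <cassert>
      #include <cstddef>
      #include <cstdint>

      // Split the wide input into accumulator-sized chunks and add them into
      // the accumulator lane-wise, mirroring the generic lowering in this patch.
      template <std::size_t AccN, std::size_t VecN>
      std::array<int32_t, AccN>
      partial_reduce_add(std::array<int32_t, AccN> Acc,
                         const std::array<int32_t, VecN> &Vec) {
        static_assert(VecN % AccN == 0,
                      "input length must be a multiple of the accumulator length");
        for (std::size_t I = 0; I < VecN; ++I)
          Acc[I % AccN] += Vec[I]; // lane K sums Vec[K], Vec[K + AccN], ...
        return Acc;
      }

      int main() {
        std::array<int32_t, 4> Acc{0, 0, 0, 0};
        std::array<int32_t, 8> Vec{1, 2, 3, 4, 5, 6, 7, 8};
        auto R = partial_reduce_add(Acc, Vec); // {1+5, 2+6, 3+7, 4+8}
        assert(R[0] == 6 && R[1] == 8 && R[2] == 10 && R[3] == 12);
      }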

llvm/include/llvm/IR/Intrinsics.td (+6)

@@ -2640,6 +2640,12 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
                                                       [llvm_anyvector_ty],
                                                       [IntrNoMem]>;
 
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
+                                                                       [llvm_anyvector_ty, llvm_anyvector_ty],
+                                                                       [IntrNoMem]>;
+
 //===----------------- Pointer Authentication Intrinsics ------------------===//
 //

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (+32)

@@ -104,6 +104,7 @@
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cstddef>
+#include <deque>
 #include <iterator>
 #include <limits>
 #include <optional>

@@ -7976,6 +7977,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, Trunc);
     return;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    SDValue OpNode = getValue(I.getOperand(1));
+    EVT ReducedTy = EVT::getEVT(I.getType());
+    EVT FullTy = OpNode.getValueType();
+
+    unsigned Stride = ReducedTy.getVectorMinNumElements();
+    unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
+
+    // Collect all of the subvectors
+    std::deque<SDValue> Subvectors;
+    Subvectors.push_back(getValue(I.getOperand(0)));
+    for (unsigned i = 0; i < ScaleFactor; i++) {
+      auto SourceIndex = DAG.getVectorIdxConstant(i * Stride, sdl);
+      Subvectors.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ReducedTy,
+                                       {OpNode, SourceIndex}));
+    }
+
+    // Flatten the subvector tree
+    while (Subvectors.size() > 1) {
+      Subvectors.push_back(DAG.getNode(ISD::ADD, sdl, ReducedTy,
+                                       {Subvectors[0], Subvectors[1]}));
+      Subvectors.pop_front();
+      Subvectors.pop_front();
+    }
+
+    assert(Subvectors.size() == 1 &&
+           "There should only be one subvector after tree flattening");
+
+    setValue(&I, Subvectors[0]);
+    return;
+  }
   case Intrinsic::experimental_cttz_elts: {
     auto DL = getCurSDLoc();
     SDValue Op = getValue(I.getOperand(0));
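The "flatten the subvector tree" loop above repeatedly takes the two values at the front of the deque and pushes their sum onto the back, yielding a roughly balanced tree of ADD nodes (visible in the @partial_reduce_add_quart CHECK lines of the new test). The same mechanism over plain integers, as a standalone sketch that is illustrative only and not part of the patch::

      #include <cassert>
      #include <deque>

      int main() {
        // Accumulator plus four extracted subvectors -> five partial values.
        std::deque<int> Parts{1, 2, 3, 4, 5};
        while (Parts.size() > 1) {
          // Combines 1+2, then 3+4, then 5+3, then 7+8.
          Parts.push_back(Parts[0] + Parts[1]);
          Parts.pop_front();
          Parts.pop_front();
        }
        assert(Parts.size() == 1 && Parts[0] == 15);
      }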

llvm/lib/IR/Verifier.cpp (+14)

@@ -6143,6 +6143,20 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     }
     break;
   }
+  case Intrinsic::experimental_vector_partial_reduce_add: {
+    VectorType *AccTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+    VectorType *VecTy = cast<VectorType>(Call.getArgOperand(1)->getType());
+
+    unsigned VecWidth = VecTy->getElementCount().getKnownMinValue();
+    unsigned AccWidth = AccTy->getElementCount().getKnownMinValue();
+
+    Check((VecWidth % AccWidth) == 0,
+          "Invalid vector widths for partial "
+          "reduction. The width of the input vector "
+          "must be a positive integer multiple of "
+          "the width of the accumulator vector.");
+    break;
+  }
   case Intrinsic::experimental_noalias_scope_decl: {
     NoAliasScopeDecls.push_back(cast<IntrinsicInst>(&Call));
     break;
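The new Check constrains only the known-minimum element counts of the two vector types. Restated as a hypothetical standalone predicate for clarity (not part of the patch)::

      #include <cassert>

      // The input's minimum element count must be a positive integer
      // multiple of the accumulator's, per the Verifier message above.
      static bool isValidPartialReduceWidths(unsigned AccWidth, unsigned VecWidth) {
        return AccWidth != 0 && VecWidth != 0 && VecWidth % AccWidth == 0;
      }

      int main() {
        assert(isValidPartialReduceWidths(4, 16)); // e.g. nxv4i32 acc, nxv16i32 input
        assert(isValidPartialReduceWidths(4, 4));  // equal widths: multiple of one
        assert(!isValidPartialReduceWidths(4, 6)); // 6 is not a multiple of 4
      }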
New test file (+83; path not shown in this view)

@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -force-vector-interleave=1 -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define <4 x i32> @partial_reduce_add_fixed(<4 x i32> %accumulator, <4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v4i32(<4 x i32> %accumulator, <4 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_fixed_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
+  ret <4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32> %accumulator, <vscale x 4 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_quart:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z2.s, z2.s, z3.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z0.s, z2.s, z0.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
+; CHECK-LABEL: partial_reduce_add_half_8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z1.s, z1.s, z3.s
+; CHECK-NEXT:    add z0.s, z4.s, z0.s
+; CHECK-NEXT:    add z1.s, z5.s, z1.s
+; CHECK-NEXT:    ret
+entry:
+  %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
+  ret <vscale x 8 x i32> %partial.reduce
+}
+
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32>, <vscale x 8 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32>, <vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
+
+attributes #0 = { "target-features"="+sve2" }
