[AArch64] Add custom lowering for v4i8 trunc store
This patch adds a custom truncating-store lowering for v4i8 vector types.
Since there is no v.4b register, v4i8 is promoted to v4i16 (v.4h), and the
default action for a v4i8 store is to extract each element and issue four
byte stores.

A better strategy is to extend the promoted v4i16 to v8i16 (with undef
upper elements), truncate to v8i8, and extract and store the word lane
that represents the v4i8 subvector. The following IR:

  define void @foo(<4 x i16> %x, i8* nocapture %p) {
    %0 = trunc <4 x i16> %x to <4 x i8>
    %1 = bitcast i8* %p to <4 x i8>*
    store <4 x i8> %0, <4 x i8>* %1, align 4, !tbaa !2
    ret void
  }

can be optimized from:

  umov    w8, v0.h[3]
  umov    w9, v0.h[2]
  umov    w10, v0.h[1]
  umov    w11, v0.h[0]
  strb    w8, [x0, #3]
  strb    w9, [x0, #2]
  strb    w10, [x0, #1]
  strb    w11, [x0]
  ret

To:

  xtn     v0.8b, v0.8h
  str     s0, [x0]
  ret
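
A scalar C model makes it clear why a single 32-bit store suffices (a
minimal sketch for illustration only; the function name is hypothetical,
and the byte-per-lane layout matches AArch64's little-endian lane order):

  #include <stdint.h>
  #include <string.h>

  /* Model of "xtn v0.8b, v0.8h; str s0, [x0]": truncate each 16-bit
     lane to a byte, then store the four bytes as one 32-bit word. */
  void store_v4i8(const uint16_t x[4], uint8_t *p) {
    uint8_t lanes[4];
    for (int i = 0; i < 4; i++)
      lanes[i] = (uint8_t)x[i]; /* per-lane truncation (the xtn) */
    memcpy(p, lanes, 4);        /* one word store (the str s0)   */
  }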

The patch also adjusts the memory-op cost for autovectorization, so the C
code:

  void foo (const int *src, int width, unsigned char *dst)
  {
    for (int i = 0; i < width; i++)
       *dst++ = *src++;
  }

can be vectorized to:

  .LBB0_4:                                // %vector.body
                                          // =>This Inner Loop Header: Depth=1
        ldr     q0, [x0], #16
        subs    x12, x12, #4            // =4
        xtn     v0.4h, v0.4s
        xtn     v0.8b, v0.8h
        st1     { v0.s }[0], [x2], #4
        b.ne    .LBB0_4

instead of scalar byte loads and stores.
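
For reference, a scalar C model of the vectorized loop above (a hedged
sketch, not compiler output; the function name and the scalar remainder
loop are illustrative assumptions):

  #include <stdint.h>
  #include <string.h>

  void foo_model(const int32_t *src, int width, uint8_t *dst) {
    int i = 0;
    for (; i + 4 <= width; i += 4) {  /* ldr q0, [x0], #16 */
      uint8_t lanes[4];
      for (int j = 0; j < 4; j++)     /* xtn v0.4h, v0.4s ; xtn v0.8b, v0.8h */
        lanes[j] = (uint8_t)src[i + j];
      memcpy(dst + i, lanes, 4);      /* st1 { v0.s }[0], [x2], #4 */
    }
    for (; i < width; i++)            /* scalar remainder loop */
      dst[i] = (uint8_t)src[i];
  }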


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@335735 91177308-0d34-0410-b5e6-96231b3b80d8
zatrazz committed Jun 27, 2018
1 parent 212054e commit 751c17b
Showing 6 changed files with 95 additions and 10 deletions.
66 changes: 66 additions & 0 deletions lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -742,6 +742,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2673,6 +2675,68 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 }
 
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // Extend the promoted v4i16 to v8i16 (with undef upper lanes), truncate
+  // to v8i8, and extract the word lane that represents the v4i8 subvector.
+  // This optimizes the store to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar, default or truncating.
+// Currently we only custom lower truncating stores from vector v4i16 to
+// v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert(StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert(VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2784,6 +2848,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
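As an aside, the decision LowerSTORE makes can be summarized with a scalar
C sketch (a standalone model, not the LLVM API; the threshold 4 is
MemVT.getStoreSize() for v4i8, and the function name is hypothetical):

  #include <stdint.h>
  #include <string.h>

  void lower_store_v4i8(const uint16_t x[4], uint8_t *p,
                        unsigned align, int misaligned_ok) {
    if (align < 4 && !misaligned_ok) {
      /* scalarizeVectorStore path: one strb per lane. */
      for (int i = 0; i < 4; i++)
        p[i] = (uint8_t)x[i];
      return;
    }
    /* LowerTruncateVectorStore path: xtn v0.8b, v0.8h ; str s0, [x0]. */
    uint8_t lanes[4];
    for (int i = 0; i < 4; i++)
      lanes[i] = (uint8_t)x[i];
    memcpy(p, lanes, 4);
  }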
2 changes: 2 additions & 0 deletions lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@ class AArch64TargetLowering : public TargetLowering {
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
24 changes: 16 additions & 8 deletions lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Expand Up @@ -634,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}

if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
Ty->getVectorNumElements() < 8) {
// We scalarize the loads/stores because there is not v.4b register and we
// have to promote the elements to v.4h.
unsigned NumVecElts = Ty->getVectorNumElements();
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
// We generate 2 instructions per vector element.
return NumVectorizableInstsToAmortize * NumVecElts * 2;
if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
unsigned ProfitableNumElements;
if (Opcode == Instruction::Store)
// We use a custom trunc store lowering so v.4b should be profitable.
ProfitableNumElements = 4;
else
// We scalarize the loads because there is not v.4b register and we
// have to promote the elements to v.2.
ProfitableNumElements = 8;

if (Ty->getVectorNumElements() < ProfitableNumElements) {
unsigned NumVecElts = Ty->getVectorNumElements();
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
// We generate 2 instructions per vector element.
return NumVectorizableInstsToAmortize * NumVecElts * 2;
}
}

return LT.first;
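To make the cost change concrete, the scalarization penalty above works
out as follows (a hedged standalone C model of the arithmetic, not the
LLVM API; the function names are illustrative):

  #include <stdio.h>

  /* Penalty from the scalarized path: (num_elts * 2) amortized
     instructions, times num_elts elements, times 2 instructions each. */
  static unsigned small_i8_vec_mem_cost(unsigned num_elts) {
    unsigned insts_to_amortize = num_elts * 2;
    return insts_to_amortize * num_elts * 2;
  }

  int main(void) {
    printf("%u\n", small_i8_vec_mem_cost(2)); /* <2 x i8>: 16 */
    printf("%u\n", small_i8_vec_mem_cost(4)); /* <4 x i8>: 64 (old store cost) */
    /* With this patch a <4 x i8> store skips the penalty and costs
       LT.first (1), matching the updated CostModel test below. */
    return 0;
  }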
2 changes: 1 addition & 1 deletion test/Analysis/CostModel/AArch64/store.ll
@@ -59,7 +59,7 @@ define void @getMemoryOpCost() {
 ; these types (they get extended to v.4h/v.2s).
 ; CHECK: cost of 16 {{.*}} store
   store <2 x i8> undef, <2 x i8> * undef
-; CHECK: cost of 64 {{.*}} store
+; CHECK: cost of 1 {{.*}} store
   store <4 x i8> undef, <4 x i8> * undef
 ; CHECK: cost of 16 {{.*}} load
   load <2 x i8> , <2 x i8> * undef
10 changes: 10 additions & 0 deletions test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -20,6 +20,16 @@ define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
   ret void
 }
 
+define void @truncStore.v4i8(<4 x i32> %a, <4 x i8>* %result) {
+; CHECK-LABEL: truncStore.v4i8:
+; CHECK: xtn [[TMP:(v[0-9]+)]].4h, v{{[0-9]+}}.4s
+; CHECK-NEXT: xtn [[TMP2:(v[0-9]+)]].8b, [[TMP]].8h
+; CHECK-NEXT: str s{{[0-9]+}}, [x{{[0-9]+}}]
+  %b = trunc <4 x i32> %a to <4 x i8>
+  store <4 x i8> %b, <4 x i8>* %result
+  ret void
+}
+
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
 ; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
1 change: 0 additions & 1 deletion
@@ -15,7 +15,6 @@ target triple = "aarch64--linux-gnu"
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
 ; CHECK: load i8
-; CHECK: load i8
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {
