diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index e2dd4976f39065..2a419560be3030 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/VFABIDemangler.h"
+#include "llvm/IR/VectorUtils.h"
 #include "llvm/Support/CheckedArithmetic.h"
 
 namespace llvm {
@@ -127,18 +128,8 @@ namespace Intrinsic {
 typedef unsigned ID;
 }
 
-/// A helper function for converting Scalar types to vector types. If
-/// the incoming type is void, we return void. If the EC represents a
-/// scalar, we return the scalar type.
-inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
-  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
-    return Scalar;
-  return VectorType::get(Scalar, EC);
-}
-
-inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
-  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
-}
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool canWidenType(Type *Ty);
 
 /// Identify if the intrinsic is trivially vectorizable.
 /// This method returns true if the intrinsic's argument types are all scalars
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 7198e134a2d262..4a04946f00ca76 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1561,8 +1561,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     Type *RetTy = ICA.getReturnType();
 
     ElementCount RetVF =
-        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
-                             : ElementCount::getFixed(1));
+        isWideTy(RetTy) ? getWideTypeVF(RetTy) : ElementCount::getFixed(1);
+
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
     FastMathFlags FMF = ICA.getFlags();
@@ -1883,10 +1883,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
     if (RetVF.isVector() && !RetVF.isScalable()) {
       ScalarizationCost = 0;
-      if (!RetTy->isVoidTy())
-        ScalarizationCost += getScalarizationOverhead(
-            cast<VectorType>(RetTy),
-            /*Insert*/ true, /*Extract*/ false, CostKind);
+      if (!RetTy->isVoidTy()) {
+        for (Type *VectorTy : getContainedTypes(RetTy)) {
+          ScalarizationCost += getScalarizationOverhead(
+              cast<VectorType>(VectorTy),
+              /*Insert*/ true, /*Extract*/ false, CostKind);
+        }
+      }
       ScalarizationCost +=
           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
     }
@@ -2477,27 +2480,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
-    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
+    if (isWideTy(RetTy)) {
+      const SmallVector<Type *, 2> RetVTys = getContainedTypes(RetTy);
+
       // Scalable vectors cannot be scalarized, so return Invalid.
-      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
-            return isa<ScalableVectorType>(Ty);
-          }))
+      if (any_of(concat<Type *const>(RetVTys, Tys),
+                 [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
         return InstructionCost::getInvalid();
 
-      InstructionCost ScalarizationCost =
-          SkipScalarizationCost
-              ? ScalarizationCostPassed
-              : getScalarizationOverhead(RetVTy, /*Insert*/ true,
-                                         /*Extract*/ false, CostKind);
+      InstructionCost ScalarizationCost = ScalarizationCostPassed;
+      if (!SkipScalarizationCost) {
+        ScalarizationCost = 0;
+        for (Type *RetVTy : RetVTys) {
+          ScalarizationCost += getScalarizationOverhead(
+              cast<VectorType>(RetVTy), /*Insert*/ true,
+              /*Extract*/ false, CostKind);
+        }
+      }
 
-      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
+      unsigned ScalarCalls = getWideTypeVF(RetTy).getFixedValue();
       SmallVector<Type *, 4> ScalarTys;
       for (Type *Ty : Tys) {
         if (Ty->isVectorTy())
           Ty = Ty->getScalarType();
         ScalarTys.push_back(Ty);
       }
-      IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
+      IntrinsicCostAttributes Attrs(IID, ToNarrowTy(RetTy), ScalarTys, FMF);
       InstructionCost ScalarCost =
           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       for (Type *Ty : Tys) {
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 975c142f1a4572..a24801d8bdf834 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -301,6 +301,10 @@ class StructType : public Type {
   ///  {, }}
   bool containsHomogeneousScalableVectorTypes() const;
 
+  /// Return true if this struct is non-empty and all element types are the
+  /// same.
+  bool containsHomogeneousTypes() const;
+
   /// Return true if this is a named struct that has a non-empty name.
   bool hasName() const { return SymbolTableEntry != nullptr; }
 
diff --git a/llvm/include/llvm/IR/VectorUtils.h b/llvm/include/llvm/IR/VectorUtils.h
new file mode 100644
index 00000000000000..e8e838d8287c42
--- /dev/null
+++ b/llvm/include/llvm/IR/VectorUtils.h
@@ -0,0 +1,53 @@
+//===----------- VectorUtils.h - Vector type utility functions -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DerivedTypes.h"
+
+namespace llvm {
+
+/// A helper function for converting Scalar types to vector types. If
+/// the incoming type is void, we return void. If the EC represents a
+/// scalar, we return the scalar type.
+inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
+  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
+    return Scalar;
+  return VectorType::get(Scalar, EC);
+}
+
+inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
+  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
+}
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note: Only
+/// unpacked literal struct types are supported.
+Type *ToWideTy(Type *Ty, ElementCount EC);
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *ToNarrowTy(Type *Ty);
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> getContainedTypes(Type *Ty);
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+bool isWideTy(Type *Ty);
+
+/// Returns the vectorization factor for a widened type.
+inline ElementCount getWideTypeVF(Type *Ty) {
+  assert(isWideTy(Ty) && "expected widened type!");
+  return cast<VectorType>(getContainedTypes(Ty).front())->getElementCount();
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index dbffbb8a5f81d9..38b9da69ae2b76 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -39,6 +39,20 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
     cl::init(8));
 
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool llvm::canWidenType(Type *Ty) {
+  Type *ElTy = Ty;
+  // For now, only allow widening non-packed literal structs where all
+  // element types are the same. This simplifies the cost model and
+  // conversion between scalar and wide types.
+  if (auto *StructTy = dyn_cast<StructType>(Ty);
+      StructTy && !StructTy->isPacked() && StructTy->isLiteral() &&
+      StructTy->containsHomogeneousTypes()) {
+    ElTy = StructTy->elements().front();
+  }
+  return VectorType::isValidElementType(ElTy);
+}
+
 /// Return true if all of the intrinsic's arguments and return type are scalars
 /// for the scalar form of the intrinsic, and vectors for the vector form of the
 /// intrinsic (except operands that are marked as always being scalar by
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 91e0e0cc65f36b..01c73bcd68d2db 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_component_library(LLVMCore
   Value.cpp
   ValueSymbolTable.cpp
   VectorBuilder.cpp
+  VectorUtils.cpp
   Verifier.cpp
   VFABIDemangler.cpp
   RuntimeLibcalls.cpp
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 3784ad28d7219d..055959dbb3575b 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -433,10 +433,12 @@ bool StructType::containsHomogeneousScalableVectorTypes() const {
   Type *FirstTy = getNumElements() > 0 ? elements()[0] : nullptr;
   if (!FirstTy || !isa<ScalableVectorType>(FirstTy))
     return false;
-  for (Type *Ty : elements())
-    if (Ty != FirstTy)
-      return false;
-  return true;
+  return containsHomogeneousTypes();
+}
+
+bool StructType::containsHomogeneousTypes() const {
+  ArrayRef<Type *> ElementTys = elements();
+  return !ElementTys.empty() && all_equal(ElementTys);
 }
 
 void StructType::setBody(ArrayRef<Type *> Elements, bool isPacked) {
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index cdfb9fbfaa084d..6ccd77fd23793a 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/VectorUtils.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include 
@@ -346,12 +347,15 @@ getScalableECFromSignature(const FunctionType *Signature, const VFISAKind ISA,
   // Also check the return type if not void.
   Type *RetTy = Signature->getReturnType();
   if (!RetTy->isVoidTy()) {
-    std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
-    // If we have an unknown scalar element type we can't find a reasonable VF.
-    if (!ReturnEC)
-      return std::nullopt;
-    if (ElementCount::isKnownLT(*ReturnEC, MinEC))
-      MinEC = *ReturnEC;
+    for (Type *RetTy : getContainedTypes(RetTy)) {
+      std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
+      // If we have an unknown scalar element type we can't find a reasonable
+      // VF.
+      if (!ReturnEC)
+        return std::nullopt;
+      if (ElementCount::isKnownLT(*ReturnEC, MinEC))
+        MinEC = *ReturnEC;
+    }
   }
 
   // The SVE Vector function call ABI bases the VF on the widest element types
@@ -566,7 +570,7 @@ FunctionType *VFABI::createFunctionType(const VFInfo &Info,
 
   auto *RetTy = ScalarFTy->getReturnType();
   if (!RetTy->isVoidTy())
-    RetTy = VectorType::get(RetTy, VF);
+    RetTy = ToWideTy(RetTy, VF);
   return FunctionType::get(RetTy, VecTypes, false);
 }
 
diff --git a/llvm/lib/IR/VectorUtils.cpp b/llvm/lib/IR/VectorUtils.cpp
new file mode 100644
index 00000000000000..c89a8eaf2ad1e0
--- /dev/null
+++ b/llvm/lib/IR/VectorUtils.cpp
@@ -0,0 +1,69 @@
+//===----------- VectorUtils.cpp - Vector type utility functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/VectorUtils.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+
+using namespace llvm;
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note: Only
+/// unpacked literal struct types are supported.
+Type *llvm::ToWideTy(Type *Ty, ElementCount EC) {
+  if (EC.isScalar())
+    return Ty;
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return ToVectorTy(Ty, EC);
+  assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      Ty->getContext(),
+      map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+        return VectorType::get(ElTy, EC);
+      }));
+}
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *llvm::ToNarrowTy(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return Ty->getScalarType();
+  assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      Ty->getContext(),
+      map_to_vector(StructTy->elements(), [](Type *ElTy) -> Type * {
+        return ElTy->getScalarType();
+      }));
+}
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> llvm::getContainedTypes(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (StructTy)
+    return to_vector<2>(StructTy->elements());
+  return {Ty};
+}
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+bool llvm::isWideTy(Type *Ty) {
+  auto ContainedTys = getContainedTypes(Ty);
+  if (ContainedTys.empty() || !ContainedTys.front()->isVectorTy())
+    return false;
+  ElementCount VF = cast<VectorType>(ContainedTys.front())->getElementCount();
+  return all_of(ContainedTys, [&](Type *Ty) {
+    return Ty->isVectorTy() && cast<VectorType>(Ty)->getElementCount() == VF;
+  });
+}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e695902c9d72ad..00ebd7d3c96f9a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -945,8 +945,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Check that the instruction return type is vectorizable.
       // We can't vectorize casts from vector type to scalar type.
      // Also, we can't vectorize extractelement instructions.
-      if ((!VectorType::isValidElementType(I.getType()) &&
-           !I.getType()->isVoidTy()) ||
+      Type *InstTy = I.getType();
+      if (!(InstTy->isVoidTy() || canWidenType(InstTy)) ||
          (isa<CastInst>(I) &&
           !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
          isa<ExtractElementInst>(I)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09e4d0fcd31f3c..1b19c43b04ea9d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2324,7 +2324,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                                VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
                                                VPTransformState &State) {
-  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  assert((!Instr->getType()->isAggregateType() ||
+          canWidenType(Instr->getType())) &&
+         "widenable type or non-aggregate type!");
 
   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
   // the first lane and part.
@@ -2887,10 +2889,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   return ScalarCallCost;
 }
 
-static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
-  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-    return Elt;
-  return VectorType::get(Elt, VF);
+static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
+  if (VF.isScalar() || !canWidenType(Ty))
+    return Ty;
+  return ToWideTy(Ty, VF);
 }
 
 InstructionCost
@@ -3655,9 +3657,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
       // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
-      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
-        assert(IsOutOfScope(EVI->getAggregateOperand()) &&
-               "Expected aggregate value to be loop invariant");
+      if (auto *EVI = dyn_cast<ExtractValueInst>(&I);
+          EVI && IsOutOfScope(EVI->getAggregateOperand())) {
         AddToWorklistIfAllowed(EVI);
         continue;
       }
@@ -4487,8 +4488,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
      llvm_unreachable("unhandled recipe");
     }
 
-    auto WillWiden = [&TTI, VF](Type *ScalarTy) {
-      Type *VectorTy = ToVectorTy(ScalarTy, VF);
+    auto WillWiden = [&TTI, VF](Type *VectorTy) {
       unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
       if (!NumLegalParts)
         return false;
@@ -4519,7 +4519,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
       Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
       if (!Visited.insert({ScalarTy}).second)
         continue;
-      if (WillWiden(ScalarTy))
+      Type *WideTy = ToWideTy(ScalarTy, VF);
+      if (any_of(getContainedTypes(WideTy), WillWiden))
         return true;
     }
   }
@@ -5468,10 +5469,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // and phi nodes.
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
-      ScalarCost += TTI.getScalarizationOverhead(
-          cast<VectorType>(ToVectorTy(I->getType(), VF)),
-          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
-          /*Extract*/ false, CostKind);
+      Type *WideTy = ToWideTy(I->getType(), VF);
+      for (Type *VectorTy : getContainedTypes(WideTy)) {
+        ScalarCost += TTI.getScalarizationOverhead(
+            cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
+            /*Insert*/ true,
+            /*Extract*/ false, CostKind);
+      }
       ScalarCost += VF.getFixedValue() *
                     TTI.getCFInstrCost(Instruction::PHI, CostKind);
     }
@@ -5960,13 +5964,17 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
     return 0;
 
   InstructionCost Cost = 0;
-  Type *RetTy = ToVectorTy(I->getType(), VF);
+  Type *RetTy = ToWideTy(I->getType(), VF);
   if (!RetTy->isVoidTy() &&
-      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
-    Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
-        /*Insert*/ true,
-        /*Extract*/ false, CostKind);
+      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
+
+    for (Type *VectorTy : getContainedTypes(RetTy)) {
+      Cost += TTI.getScalarizationOverhead(
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+          /*Insert*/ true,
+          /*Extract*/ false, CostKind);
+    }
+  }
 
   // Some targets keep addresses scalar.
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6225,9 +6233,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
       bool MaskRequired = Legal->isMaskRequired(CI);
       // Compute corresponding vector type for return value and arguments.
-      Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+      Type *RetTy = ToWideTy(ScalarRetTy, VF);
       for (Type *ScalarTy : ScalarTys)
-        Tys.push_back(ToVectorTy(ScalarTy, VF));
+        Tys.push_back(ToWideTy(ScalarTy, VF));
 
       // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
@@ -6404,7 +6412,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
            HasSingleCopyAfterVectorization(I, VF));
     VectorTy = RetTy;
   } else
-    VectorTy = ToVectorTy(RetTy, VF);
+    VectorTy = ToWideTy(RetTy, VF);
 
   if (VF.isVector() && VectorTy->isVectorTy() &&
       !TTI.getNumberOfParts(VectorTy))
@@ -8408,6 +8416,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
   case Instruction::Sub:
   case Instruction::Xor:
   case Instruction::Freeze:
+  case Instruction::ExtractValue:
     if (I->getOpcode() == Instruction::Mul) {
       // Simplify operands of multiplications using SCEV. This is needed at the
       // moment to match the behavior of the legacy cost-model.
@@ -9443,7 +9452,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
           VectorType::get(UI->getType(), State.VF));
       State.set(this, Poison);
     }
-    State.packScalarIntoVectorValue(this, *State.Instance);
+    State.packScalarIntoWideValue(this, *State.Instance);
   }
   return;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 5e4d487261c6f0..449e9a3402b9e5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -344,10 +344,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
   } else {
     // Initialize packing with insertelements to start from undef.
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
-    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
+    Value *Undef = PoisonValue::get(ToWideTy(LastInst->getType(), VF));
     set(Def, Undef);
     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
-      packScalarIntoVectorValue(Def, {0, Lane});
+      packScalarIntoWideValue(Def, {0, Lane});
     VectorValue = get(Def);
   }
   Builder.restoreIP(OldIP);
@@ -400,13 +400,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
     Builder.SetCurrentDebugLocation(DIL);
 }
 
-void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
-                                                 const VPIteration &Instance) {
+void VPTransformState::packScalarIntoWideValue(VPValue *Def,
+                                               const VPIteration &Instance) {
   Value *ScalarInst = get(Def, Instance);
-  Value *VectorValue = get(Def);
-  VectorValue = Builder.CreateInsertElement(
-      VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF));
-  set(Def, VectorValue);
+  Value *WideValue = get(Def);
+  Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
+  if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
+    // We must handle each element of a widened struct type.
+    for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
+      Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I);
+      Value *VectorValue = Builder.CreateExtractValue(WideValue, I);
+      VectorValue = Builder.CreateInsertElement(VectorValue, ScalarValue, Lane);
+      WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I);
+    }
+  } else {
+    assert(WideValue->getType()->isVectorTy() && "expected vector type!");
+    WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, Lane);
+  }
+  set(Def, WideValue);
 }
 
 BasicBlock *
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c886a39aec76e5..c4d7781adb904a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -294,7 +294,7 @@ struct VPTransformState {
       set(Def, V, VPIteration(0, 0));
       return;
     }
-    assert((VF.isScalar() || V->getType()->isVectorTy()) &&
+    assert((VF.isScalar() || isWideTy(V->getType())) &&
           "scalar values must be stored as (0, 0)");
     Data.VPV2Vector[Def] = V;
   }
@@ -344,8 +344,8 @@ struct VPTransformState {
   /// Set the debug location in the builder using the debug location \p DL.
   void setDebugLocFrom(DebugLoc DL);
 
-  /// Construct the vector value of a scalarized value \p V one lane at a time.
-  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance);
+  /// Construct the wide value of a scalarized value \p V one lane at a time.
+  void packScalarIntoWideValue(VPValue *Def, const VPIteration &Instance);
 
   /// Hold state information used when constructing the CFG of the output IR,
   /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 277df0637372d8..d70bdc13c72132 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -122,6 +122,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
   case Instruction::FNeg:
   case Instruction::Freeze:
     return inferScalarType(R->getOperand(0));
+  case Instruction::ExtractValue:
+    return R->getUnderlyingInstr()->getType();
   default:
     break;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f33293e65010f9..3935194fa5e045 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -989,7 +989,7 @@ InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
   }
 
   Type *RetTy =
-      ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
+      ToWideTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
   SmallVector<Type *, 4> ParamTys;
   for (unsigned I = 0; I != getNumOperands(); ++I)
     ParamTys.push_back(
@@ -1157,6 +1157,13 @@ void VPWidenRecipe::execute(VPTransformState &State) {
       State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
       break;
     }
+    case Instruction::ExtractValue: {
+      Value *Op = State.get(getOperand(0));
+      Value *Extract = Builder.CreateExtractValue(
+          Op, cast<ExtractValueInst>(getUnderlyingValue())->getIndices());
+      State.set(this, Extract);
+      break;
+    }
     case Instruction::Freeze: {
       Value *Op = State.get(getOperand(0));
 
@@ -1213,6 +1220,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
   }
+  case Instruction::ExtractValue:
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll
new file mode 100644
index 00000000000000..cc434c2a6de965
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll
@@ -0,0 +1,251 @@
+; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON
+; RUN: opt < %s -mattr=+sve -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests basic vectorization of homogeneous struct literal returns.
+
+define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; NEON-LABEL: define void @struct_return_f32_widen
+; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; NEON: vector.body:
+; NEON: [[WIDE_CALL:%.*]] = call { <4 x float>, <4 x float> } @fixed_vec_foo(<4 x float> [[WIDE_LOAD:%.*]])
+; NEON: [[WIDE_A:%.*]] = extractvalue { <4 x float>, <4 x float> } [[WIDE_CALL]], 0
+; NEON: [[WIDE_B:%.*]] = extractvalue { <4 x float>, <4 x float> } [[WIDE_CALL]], 1
+; NEON: store <4 x float> [[WIDE_A]], ptr {{%.*}}, align 4
+; NEON: store <4 x float> [[WIDE_B]], ptr {{%.*}}, align 4
+;
+; SVE_TF-LABEL: define void @struct_return_f32_widen
+; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; SVE_TF: vector.body:
+; SVE_TF: [[WIDE_CALL:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[WIDE_A:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 0
+; SVE_TF: [[WIDE_B:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 1
+; SVE_TF: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_A]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; SVE_TF: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_B]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %0) #0
+  %1 = extractvalue { float, float } %call, 0
+  %2 = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv
+  store float %1, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %indvars.iv
+  store float %2, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; NEON-LABEL: define void @struct_return_f64_widen
+; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; NEON: vector.body:
+; NEON: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]])
+; NEON: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0
+; NEON: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1
+; NEON: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8
+; NEON: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8
+;
+; SVE_TF-LABEL: define void @struct_return_f64_widen
+; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; SVE_TF: vector.body:
+; SVE_TF: [[WIDE_CALL:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[WIDE_A:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 0
+; SVE_TF: [[WIDE_B:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 1
+; SVE_TF: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_A]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; SVE_TF: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_B]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %indvars.iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call { double, double } @bar(double %0) #1
+  %1 = extractvalue { double, double } %call, 0
+  %2 = extractvalue { double, double } %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %indvars.iv
+  store double %1, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %indvars.iv
+  store double %2, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; NEON-LABEL: define void @struct_return_f32_replicate
+; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
+; NEON: vector.body:
+; NEON: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}})
+; NEON: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}})
+; NEON: [[LANE_0_A:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0
+; NEON: [[TMP_A:%.*]] = insertelement <2 x float> poison, float [[LANE_0_A]], i64 0
+; NEON: [[LANE_0_B:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1
+; NEON: [[TMP_B:%.*]] = insertelement <2 x float> poison, float [[LANE_0_B]], i64 0
+; NEON: [[LANE_1_A:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0
+; NEON: [[WIDE_A:%.*]] = insertelement <2 x float> [[TMP_A]], float [[LANE_1_A]], i64 1
+; NEON: [[LANE_1_B:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1
+; NEON: [[WIDE_B:%.*]] = insertelement <2 x float> [[TMP_B]], float [[LANE_1_B]], i64 1
+; NEON: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4
+; NEON: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  ; #3 does not have a fixed-size vector mapping (so replication is used)
+  %call = tail call { float, float } @foo(float %0) #3
+  %1 = extractvalue { float, float } %call, 0
+  %2 = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv
+  store float %1, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %indvars.iv
+  store float %2, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
+; NEON-LABEL: define void @struct_return_f32_widen_rt_checks
+; NEON-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
+; NEON: entry:
+; NEON: br i1 false, label %scalar.ph, label %vector.memcheck
+; NEON: vector.memcheck:
+; NEON: vector.body:
+; NEON: call { <4 x float>, <4 x float> } @fixed_vec_foo(<4 x float> [[WIDE_LOAD:%.*]])
+; NEON: for.body:
+; NEON: call { float, float } @foo(float [[LOAD:%.*]])
+;
+; SVE_TF-LABEL: define void @struct_return_f32_widen_rt_checks
+; SVE_TF-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
+; SVE_TF: entry:
+; SVE_TF: br i1 false, label %scalar.ph, label %vector.memcheck
+; SVE_TF: vector.memcheck:
+; SVE_TF: vector.body:
+; SVE_TF: call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: for.body:
+; SVE_TF: call { float, float } @foo(float [[LOAD:%.*]])
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call { float, float } @foo(float %0) #0
+  %1 = extractvalue { float, float } %call, 0
+  %2 = extractvalue { float, float } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv
+  store float %1, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %indvars.iv
+  store float %2, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+; Negative test. Widening structs with mixed element types is not supported.
+define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; NEON-LABEL: define void @negative_mixed_element_type_struct_return
+; NEON-NOT: vector.body:
+; NEON-NOT: call {{.*}} @fixed_vec_baz
+;
+; SVE_TF-LABEL: define void @negative_mixed_element_type_struct_return
+; SVE_TF-NOT: vector.body:
+; SVE_TF-NOT: call {{.*}} @scalable_vec_masked_baz
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %call = tail call { float, i32 } @baz(float %0) #2
+  %1 = extractvalue { float, i32 } %call, 0
+  %2 = extractvalue { float, i32 } %call, 1
+  %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv
+  store float %1, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %out_b, i64 %indvars.iv
+  store i32 %2, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+%named_struct = type { double, double }
+
+; Negative test. Widening non-literal structs is not supported.
+define void @test_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
+; NEON-LABEL: define void @test_named_struct_return
+; NEON-NOT: vector.body:
+; NEON-NOT: call {{.*}} @fixed_vec_bar
+;
+; SVE_TF-LABEL: define void @test_named_struct_return
+; SVE_TF-NOT: vector.body:
+; SVE_TF-NOT: call {{.*}} @scalable_vec_masked_bar
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, ptr %in, i64 %indvars.iv
+  %0 = load double, ptr %arrayidx, align 8
+  %call = tail call %named_struct @bar_named(double %0) #4
+  %1 = extractvalue %named_struct %call, 0
+  %2 = extractvalue %named_struct %call, 1
+  %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %indvars.iv
+  store double %1, ptr %arrayidx2, align 8
+  %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %indvars.iv
+  store double %2, ptr %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare { float, float } @foo(float)
+declare { double, double } @bar(double)
+declare { float, i32 } @baz(float)
+declare %named_struct @bar_named(double)
+
+declare { <4 x float>, <4 x float> } @fixed_vec_foo(<4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double>, <vscale x 2 x i1>)
+
+declare { <4 x float>, <4 x i32> } @fixed_vec_baz(<4 x float>)
+declare { <vscale x 4 x float>, <vscale x 4 x i32> } @scalable_vec_masked_baz(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN4v_foo(fixed_vec_foo),_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar),_ZGVsMxv_bar(scalable_vec_masked_bar)" }
+attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN4v_baz(fixed_vec_baz),_ZGVsMxv_foo(scalable_vec_masked_baz)" }
+attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" }
+attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN4v_bar_named(fixed_vec_bar),_ZGVsMxv_bar_named(scalable_vec_masked_bar)" }
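
A minimal usage sketch of the new llvm/IR/VectorUtils.h helpers introduced by this patch (illustrative only, not part of the patch; the standalone harness around the calls is an assumption):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/VectorUtils.h"
#include <cassert>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *F32 = Type::getFloatTy(Ctx);
  // A homogeneous, unpacked literal struct, e.g. the { float, float }
  // returned by @foo in the test above.
  StructType *Narrow = StructType::get(Ctx, {F32, F32});

  // Widening by VF=4 produces { <4 x float>, <4 x float> }.
  Type *Wide = ToWideTy(Narrow, ElementCount::getFixed(4));
  assert(isWideTy(Wide) && "struct of same-VF vectors is a wide type");
  assert(getWideTypeVF(Wide) == ElementCount::getFixed(4));
  assert(getContainedTypes(Wide).size() == 2);

  // ToNarrowTy is the element-wise inverse of ToWideTy.
  assert(ToNarrowTy(Wide) == Narrow);

  // Plain scalars behave like ToVectorTy: float -> <4 x float>.
  assert(ToWideTy(F32, ElementCount::getFixed(4))->isVectorTy());
  return 0;
}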