From 59b3fa9c65610bb1bf6d599e44aef93e80027069 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Fri, 20 Sep 2024 13:52:48 +0000
Subject: [PATCH] [LV] Add initial support for vectorizing literal struct return values

This patch adds initial support for vectorizing literal struct return
values. Currently, this is limited to the case where the struct is
homogeneous (all elements have the same type) and not packed.

The intended use case for this is vectorizing intrinsics such as:

```
declare { float, float } @llvm.sincos.f32(float %x)
```

Mapping them to structure-returning library calls such as:

```
declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>)
```

It could also be possible to vectorize the intrinsic (without a libcall)
and then later lower the intrinsic to a library call. This may be
desired if the only library calls available take output pointers rather
than return multiple values.

Implementing this required two main changes:

1. Supporting widening `extractvalue`
2. Adding support for "wide" types (in LV and parts of the cost model)

The first change is relatively straightforward; the second is larger, as
it requires changing assumptions that types are always scalars or
vectors.

In this patch, a "wide" type is defined as a vector, or a struct literal
where all elements are vectors (of the same element count).

To help with the second change, some helpers for wide types have been
added (these work similarly to the existing vector helpers). They have
been used along the paths needed to support vectorizing calls; however,
I expect there are many places that still only expect vector types.
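
For illustration, the sketch below (not part of the diff; the driver function
and the concrete types are hypothetical) shows how the new wide-type helpers
are intended to compose for a homogeneous struct return such as the one
produced by `llvm.sincos.f32`:

```
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/VectorUtils.h" // New header added by this patch.
#include <cassert>
using namespace llvm;

void wideTypeHelpersSketch() {
  LLVMContext Ctx;
  Type *F32 = Type::getFloatTy(Ctx);
  // Unpacked literal struct { float, float } -- the only struct shape the
  // patch currently treats as widenable.
  Type *ScalarRetTy = StructType::get(Ctx, {F32, F32});
  ElementCount VF = ElementCount::getFixed(4);

  Type *WideRetTy = ToWideTy(ScalarRetTy, VF); // { <4 x float>, <4 x float> }
  assert(isWideTy(WideRetTy) && getWideTypeVF(WideRetTy) == VF);
  // Literal structs are uniqued, so narrowing round-trips to the scalar type.
  assert(ToNarrowTy(WideRetTy) == ScalarRetTy);
  for (Type *EltTy : getContainedTypes(WideRetTy))
    assert(cast<VectorType>(EltTy)->getElementCount() == VF);
}
```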
---
 llvm/include/llvm/Analysis/VectorUtils.h      | 15 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 42 +--
 llvm/include/llvm/IR/DerivedTypes.h           |  4 +
 llvm/include/llvm/IR/VectorUtils.h            | 53 ++++
 llvm/lib/Analysis/VectorUtils.cpp             | 14 +
 llvm/lib/IR/CMakeLists.txt                    |  1 +
 llvm/lib/IR/Type.cpp                          | 10 +-
 llvm/lib/IR/VFABIDemangler.cpp                | 18 +-
 llvm/lib/IR/VectorUtils.cpp                   | 69 +++++
 .../Vectorize/LoopVectorizationLegality.cpp   |  4 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 59 ++--
 llvm/lib/Transforms/Vectorize/VPlan.cpp       | 27 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  6 +-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  2 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +-
 .../LoopVectorize/AArch64/struct-return.ll    | 251 ++++++++++++++++++
 16 files changed, 506 insertions(+), 79 deletions(-)
 create mode 100644 llvm/include/llvm/IR/VectorUtils.h
 create mode 100644 llvm/lib/IR/VectorUtils.cpp
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index e2dd4976f39065..2a419560be3030 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/VFABIDemangler.h"
+#include "llvm/IR/VectorUtils.h"
 #include "llvm/Support/CheckedArithmetic.h"
 
 namespace llvm {
@@ -127,18 +128,8 @@ namespace Intrinsic {
 typedef unsigned ID;
 }
 
-/// A helper function for converting Scalar types to vector types. If
-/// the incoming type is void, we return void. If the EC represents a
-/// scalar, we return the scalar type.
-inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
-  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
-    return Scalar;
-  return VectorType::get(Scalar, EC);
-}
-
-inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
-  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
-}
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool canWidenType(Type *Ty);
 
 /// Identify if the intrinsic is trivially vectorizable.
 /// This method returns true if the intrinsic's argument types are all scalars
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 7198e134a2d262..4a04946f00ca76 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1561,8 +1561,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     Type *RetTy = ICA.getReturnType();
 
     ElementCount RetVF =
-        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
-                             : ElementCount::getFixed(1));
+        isWideTy(RetTy) ? getWideTypeVF(RetTy) : ElementCount::getFixed(1);
+
     const IntrinsicInst *I = ICA.getInst();
     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
     FastMathFlags FMF = ICA.getFlags();
@@ -1883,10 +1883,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
     if (RetVF.isVector() && !RetVF.isScalable()) {
       ScalarizationCost = 0;
-      if (!RetTy->isVoidTy())
-        ScalarizationCost += getScalarizationOverhead(
-            cast<VectorType>(RetTy),
-            /*Insert*/ true, /*Extract*/ false, CostKind);
+      if (!RetTy->isVoidTy()) {
+        for (Type *VectorTy : getContainedTypes(RetTy)) {
+          ScalarizationCost += getScalarizationOverhead(
+              cast<VectorType>(VectorTy),
+              /*Insert*/ true, /*Extract*/ false, CostKind);
+        }
+      }
       ScalarizationCost +=
           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
     }
@@ -2477,27 +2480,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     // Else, assume that we need to scalarize this intrinsic. For math builtins
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
-    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
+    if (isWideTy(RetTy)) {
+      const SmallVector<Type *, 2> RetVTys = getContainedTypes(RetTy);
+
       // Scalable vectors cannot be scalarized, so return Invalid.
-      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
-            return isa<ScalableVectorType>(Ty);
-          }))
+      if (any_of(concat<Type *const>(RetVTys, Tys),
+                 [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
         return InstructionCost::getInvalid();
 
-      InstructionCost ScalarizationCost =
-          SkipScalarizationCost
-              ? ScalarizationCostPassed
-              : getScalarizationOverhead(RetVTy, /*Insert*/ true,
-                                         /*Extract*/ false, CostKind);
+      InstructionCost ScalarizationCost = ScalarizationCostPassed;
+      if (!SkipScalarizationCost) {
+        ScalarizationCost = 0;
+        for (Type *RetVTy : RetVTys) {
+          ScalarizationCost += getScalarizationOverhead(
+              cast<VectorType>(RetVTy), /*Insert*/ true,
+              /*Extract*/ false, CostKind);
+        }
+      }
 
-      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
+      unsigned ScalarCalls = getWideTypeVF(RetTy).getFixedValue();
       SmallVector<Type *, 4> ScalarTys;
       for (Type *Ty : Tys) {
         if (Ty->isVectorTy())
           Ty = Ty->getScalarType();
         ScalarTys.push_back(Ty);
       }
-      IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
+      IntrinsicCostAttributes Attrs(IID, ToNarrowTy(RetTy), ScalarTys, FMF);
       InstructionCost ScalarCost =
           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
       for (Type *Ty : Tys) {
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 975c142f1a4572..a24801d8bdf834 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -301,6 +301,10 @@ class StructType : public Type {
   ///     {<vscale x 2 x i32>, <vscale x 2 x i32>}}
   bool containsHomogeneousScalableVectorTypes() const;
 
+  /// Return true if this struct is non-empty and all element types are the
+  /// same.
+  bool containsHomogeneousTypes() const;
+
   /// Return true if this is a named struct that has a non-empty name.
   bool hasName() const { return SymbolTableEntry != nullptr; }
 
diff --git a/llvm/include/llvm/IR/VectorUtils.h b/llvm/include/llvm/IR/VectorUtils.h
new file mode 100644
index 00000000000000..e8e838d8287c42
--- /dev/null
+++ b/llvm/include/llvm/IR/VectorUtils.h
@@ -0,0 +1,53 @@
+//===----------- VectorUtils.h - Vector type utility functions -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DerivedTypes.h"
+
+namespace llvm {
+
+/// A helper function for converting Scalar types to vector types. If
+/// the incoming type is void, we return void. If the EC represents a
+/// scalar, we return the scalar type.
+inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
+  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
+    return Scalar;
+  return VectorType::get(Scalar, EC);
+}
+
+inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
+  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
+}
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note: Only
+/// unpacked literal struct types are supported.
+Type *ToWideTy(Type *Ty, ElementCount EC);
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *ToNarrowTy(Type *Ty);
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> getContainedTypes(Type *Ty);
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+bool isWideTy(Type *Ty);
+
+/// Returns the vectorization factor for a widened type.
+inline ElementCount getWideTypeVF(Type *Ty) {
+  assert(isWideTy(Ty) && "expected widened type!");
+  return cast<VectorType>(getContainedTypes(Ty).front())->getElementCount();
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index dbffbb8a5f81d9..38b9da69ae2b76 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -39,6 +39,20 @@ static cl::opt<unsigned> MaxInterleaveGroupFactor(
     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
     cl::init(8));
 
+/// Returns true if `Ty` can be widened by the loop vectorizer.
+bool llvm::canWidenType(Type *Ty) {
+  Type *ElTy = Ty;
+  // For now, only allow widening non-packed literal structs where all
+  // element types are the same. This simplifies the cost model and
+  // conversion between scalar and wide types.
+  if (auto *StructTy = dyn_cast<StructType>(Ty);
+      StructTy && !StructTy->isPacked() && StructTy->isLiteral() &&
+      StructTy->containsHomogeneousTypes()) {
+    ElTy = StructTy->elements().front();
+  }
+  return VectorType::isValidElementType(ElTy);
+}
+
 /// Return true if all of the intrinsic's arguments and return type are scalars
 /// for the scalar form of the intrinsic, and vectors for the vector form of the
 /// intrinsic (except operands that are marked as always being scalar by
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 91e0e0cc65f36b..01c73bcd68d2db 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_component_library(LLVMCore
   Value.cpp
   ValueSymbolTable.cpp
   VectorBuilder.cpp
+  VectorUtils.cpp
   Verifier.cpp
   VFABIDemangler.cpp
   RuntimeLibcalls.cpp
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 3784ad28d7219d..055959dbb3575b 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -433,10 +433,12 @@ bool StructType::containsHomogeneousScalableVectorTypes() const {
   Type *FirstTy = getNumElements() > 0 ? elements()[0] : nullptr;
   if (!FirstTy || !isa<ScalableVectorType>(FirstTy))
     return false;
-  for (Type *Ty : elements())
-    if (Ty != FirstTy)
-      return false;
-  return true;
+  return containsHomogeneousTypes();
+}
+
+bool StructType::containsHomogeneousTypes() const {
+  ArrayRef<Type *> ElementTys = elements();
+  return !ElementTys.empty() && all_equal(ElementTys);
 }
 
 void StructType::setBody(ArrayRef<Type *> Elements, bool isPacked) {
diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index cdfb9fbfaa084d..6ccd77fd23793a 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/VectorUtils.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include <limits>
@@ -346,12 +347,15 @@ getScalableECFromSignature(const FunctionType *Signature, const VFISAKind ISA,
   // Also check the return type if not void.
   Type *RetTy = Signature->getReturnType();
   if (!RetTy->isVoidTy()) {
-    std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
-    // If we have an unknown scalar element type we can't find a reasonable VF.
-    if (!ReturnEC)
-      return std::nullopt;
-    if (ElementCount::isKnownLT(*ReturnEC, MinEC))
-      MinEC = *ReturnEC;
+    for (Type *RetTy : getContainedTypes(RetTy)) {
+      std::optional<ElementCount> ReturnEC = getElementCountForTy(ISA, RetTy);
+      // If we have an unknown scalar element type we can't find a reasonable
+      // VF.
+      if (!ReturnEC)
+        return std::nullopt;
+      if (ElementCount::isKnownLT(*ReturnEC, MinEC))
+        MinEC = *ReturnEC;
+    }
   }
 
   // The SVE Vector function call ABI bases the VF on the widest element types
@@ -566,7 +570,7 @@ FunctionType *VFABI::createFunctionType(const VFInfo &Info,
   auto *RetTy = ScalarFTy->getReturnType();
   if (!RetTy->isVoidTy())
-    RetTy = VectorType::get(RetTy, VF);
+    RetTy = ToWideTy(RetTy, VF);
 
   return FunctionType::get(RetTy, VecTypes, false);
 }
diff --git a/llvm/lib/IR/VectorUtils.cpp b/llvm/lib/IR/VectorUtils.cpp
new file mode 100644
index 00000000000000..c89a8eaf2ad1e0
--- /dev/null
+++ b/llvm/lib/IR/VectorUtils.cpp
@@ -0,0 +1,69 @@
+//===----------- VectorUtils.cpp - Vector type utility functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/VectorUtils.h"
+#include "llvm/ADT/SmallVectorExtras.h"
+
+using namespace llvm;
+
+/// A helper for converting to wider (vector) types. For scalar types, this is
+/// equivalent to calling `ToVectorTy`. For struct types, this returns a new
+/// struct where each element type has been widened to a vector type. Note: Only
+/// unpacked literal struct types are supported.
+Type *llvm::ToWideTy(Type *Ty, ElementCount EC) {
+  if (EC.isScalar())
+    return Ty;
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return ToVectorTy(Ty, EC);
+  assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      Ty->getContext(),
+      map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
+        return VectorType::get(ElTy, EC);
+      }));
+}
+
+/// A helper for converting wide types to narrow (non-vector) types. For vector
+/// types, this is equivalent to calling .getScalarType(). For struct types,
+/// this returns a new struct where each element type has been converted to a
+/// scalar type. Note: Only unpacked literal struct types are supported.
+Type *llvm::ToNarrowTy(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (!StructTy)
+    return Ty->getScalarType();
+  assert(StructTy->isLiteral() && !StructTy->isPacked() &&
+         "expected unpacked struct literal");
+  return StructType::get(
+      Ty->getContext(),
+      map_to_vector(StructTy->elements(), [](Type *ElTy) -> Type * {
+        return ElTy->getScalarType();
+      }));
+}
+
+/// Returns the types contained in `Ty`. For struct types, it returns the
+/// elements, all other types are returned directly.
+SmallVector<Type *, 2> llvm::getContainedTypes(Type *Ty) {
+  auto *StructTy = dyn_cast<StructType>(Ty);
+  if (StructTy)
+    return to_vector<2>(StructTy->elements());
+  return {Ty};
+}
+
+/// Returns true if `Ty` is a vector type or a struct of vector types where all
+/// vector types share the same VF.
+bool llvm::isWideTy(Type *Ty) {
+  auto ContainedTys = getContainedTypes(Ty);
+  if (ContainedTys.empty() || !ContainedTys.front()->isVectorTy())
+    return false;
+  ElementCount VF = cast<VectorType>(ContainedTys.front())->getElementCount();
+  return all_of(ContainedTys, [&](Type *Ty) {
+    return Ty->isVectorTy() && cast<VectorType>(Ty)->getElementCount() == VF;
+  });
+}
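
Illustrative note (not part of the diff): the legality change below keys off
`canWidenType` from `llvm/lib/Analysis/VectorUtils.cpp` above. The sketch that
follows is a hypothetical driver (the function name and type setup are
assumptions) showing the classification it is expected to give:

```
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void canWidenTypeSketch() {
  LLVMContext Ctx;
  Type *F32 = Type::getFloatTy(Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);

  canWidenType(F32);                              // true: plain scalar
  canWidenType(StructType::get(Ctx, {F32, F32})); // true: homogeneous literal struct
  canWidenType(StructType::get(Ctx, {F32, I32})); // false: mixed element types
  canWidenType(StructType::get(Ctx, {F32, F32},
                               /*isPacked=*/true)); // false: packed struct
  canWidenType(StructType::create(Ctx, {F32, F32},
                                  "named_struct"));  // false: not a literal struct
}
```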
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index e695902c9d72ad..00ebd7d3c96f9a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -945,8 +945,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       // Check that the instruction return type is vectorizable.
       // We can't vectorize casts from vector type to scalar type.
       // Also, we can't vectorize extractelement instructions.
-      if ((!VectorType::isValidElementType(I.getType()) &&
-           !I.getType()->isVoidTy()) ||
+      Type *InstTy = I.getType();
+      if (!(InstTy->isVoidTy() || canWidenType(InstTy)) ||
          (isa<CastInst>(I) &&
            !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
           isa<ExtractElementInst>(I)) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09e4d0fcd31f3c..1b19c43b04ea9d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2324,7 +2324,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                                VPReplicateRecipe *RepRecipe,
                                                const VPIteration &Instance,
                                                VPTransformState &State) {
-  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  assert((!Instr->getType()->isAggregateType() ||
+          canWidenType(Instr->getType())) &&
+         "widenable type or non-aggregate type!");
 
   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
   // the first lane and part.
@@ -2887,10 +2889,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
   return ScalarCallCost;
 }
 
-static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
-  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-    return Elt;
-  return VectorType::get(Elt, VF);
+static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
+  if (VF.isScalar() || !canWidenType(Ty))
+    return Ty;
+  return ToWideTy(Ty, VF);
 }
 
 InstructionCost
@@ -3655,9 +3657,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
     // ExtractValue instructions must be uniform, because the operands are
    // known to be loop-invariant.
- if (auto *EVI = dyn_cast(&I)) { - assert(IsOutOfScope(EVI->getAggregateOperand()) && - "Expected aggregate value to be loop invariant"); + if (auto *EVI = dyn_cast(&I); + EVI && IsOutOfScope(EVI->getAggregateOperand())) { AddToWorklistIfAllowed(EVI); continue; } @@ -4487,8 +4488,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, llvm_unreachable("unhandled recipe"); } - auto WillWiden = [&TTI, VF](Type *ScalarTy) { - Type *VectorTy = ToVectorTy(ScalarTy, VF); + auto WillWiden = [&TTI, VF](Type *VectorTy) { unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy); if (!NumLegalParts) return false; @@ -4519,7 +4519,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, Type *ScalarTy = TypeInfo.inferScalarType(ToCheck); if (!Visited.insert({ScalarTy}).second) continue; - if (WillWiden(ScalarTy)) + Type *WideTy = ToWideTy(ScalarTy, VF); + if (any_of(getContainedTypes(WideTy), WillWiden)) return true; } } @@ -5468,10 +5469,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( // and phi nodes. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { - ScalarCost += TTI.getScalarizationOverhead( - cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, - /*Extract*/ false, CostKind); + Type *WideTy = ToWideTy(I->getType(), VF); + for (Type *VectorTy : getContainedTypes(WideTy)) { + ScalarCost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getFixedValue()), + /*Insert*/ true, + /*Extract*/ false, CostKind); + } ScalarCost += VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind); } @@ -5960,13 +5964,17 @@ InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( return 0; InstructionCost Cost = 0; - Type *RetTy = ToVectorTy(I->getType(), VF); + Type *RetTy = ToWideTy(I->getType(), VF); if (!RetTy->isVoidTy() && - (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), - /*Insert*/ true, - /*Extract*/ false, CostKind); + (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) { + + for (Type *VectorTy : getContainedTypes(RetTy)) { + Cost += TTI.getScalarizationOverhead( + cast(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()), + /*Insert*/ true, + /*Extract*/ false, CostKind); + } + } // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6225,9 +6233,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { bool MaskRequired = Legal->isMaskRequired(CI); // Compute corresponding vector type for return value and arguments. - Type *RetTy = ToVectorTy(ScalarRetTy, VF); + Type *RetTy = ToWideTy(ScalarRetTy, VF); for (Type *ScalarTy : ScalarTys) - Tys.push_back(ToVectorTy(ScalarTy, VF)); + Tys.push_back(ToWideTy(ScalarTy, VF)); // An in-loop reduction using an fmuladd intrinsic is a special case; // we don't want the normal cost for that intrinsic. 
@@ -6404,7 +6412,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, HasSingleCopyAfterVectorization(I, VF)); VectorTy = RetTy; } else - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = ToWideTy(RetTy, VF); if (VF.isVector() && VectorTy->isVectorTy() && !TTI.getNumberOfParts(VectorTy)) @@ -8408,6 +8416,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, case Instruction::Sub: case Instruction::Xor: case Instruction::Freeze: + case Instruction::ExtractValue: if (I->getOpcode() == Instruction::Mul) { // Simplify operands of multiplications using SCEV. This is needed at the // moment to match the behavior of the legacy cost-model. @@ -9443,7 +9452,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { VectorType::get(UI->getType(), State.VF)); State.set(this, Poison); } - State.packScalarIntoVectorValue(this, *State.Instance); + State.packScalarIntoWideValue(this, *State.Instance); } return; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e4d487261c6f0..449e9a3402b9e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -344,10 +344,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { } else { // Initialize packing with insertelements to start from undef. assert(!VF.isScalable() && "VF is assumed to be non scalable."); - Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF)); + Value *Undef = PoisonValue::get(ToWideTy(LastInst->getType(), VF)); set(Def, Undef); for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) - packScalarIntoVectorValue(Def, {0, Lane}); + packScalarIntoWideValue(Def, {0, Lane}); VectorValue = get(Def); } Builder.restoreIP(OldIP); @@ -400,13 +400,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) { Builder.SetCurrentDebugLocation(DIL); } -void VPTransformState::packScalarIntoVectorValue(VPValue *Def, - const VPIteration &Instance) { +void VPTransformState::packScalarIntoWideValue(VPValue *Def, + const VPIteration &Instance) { Value *ScalarInst = get(Def, Instance); - Value *VectorValue = get(Def); - VectorValue = Builder.CreateInsertElement( - VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF)); - set(Def, VectorValue); + Value *WideValue = get(Def); + Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF); + if (auto *StructTy = dyn_cast(WideValue->getType())) { + // We must handle each element of a widened struct type. 
+ for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) { + Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I); + Value *VectorValue = Builder.CreateExtractValue(WideValue, I); + VectorValue = Builder.CreateInsertElement(VectorValue, ScalarValue, Lane); + WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I); + } + } else { + assert(WideValue->getType()->isVectorTy() && "expected vector type!"); + WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, Lane); + } + set(Def, WideValue); } BasicBlock * diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c886a39aec76e5..c4d7781adb904a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -294,7 +294,7 @@ struct VPTransformState { set(Def, V, VPIteration(0, 0)); return; } - assert((VF.isScalar() || V->getType()->isVectorTy()) && + assert((VF.isScalar() || isWideTy(V->getType())) && "scalar values must be stored as (0, 0)"); Data.VPV2Vector[Def] = V; } @@ -344,8 +344,8 @@ struct VPTransformState { /// Set the debug location in the builder using the debug location \p DL. void setDebugLocFrom(DebugLoc DL); - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance); + /// Construct the wide value of a scalarized value \p V one lane at a time. + void packScalarIntoWideValue(VPValue *Def, const VPIteration &Instance); /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 277df0637372d8..d70bdc13c72132 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -122,6 +122,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) { case Instruction::FNeg: case Instruction::Freeze: return inferScalarType(R->getOperand(0)); + case Instruction::ExtractValue: + return R->getUnderlyingInstr()->getType(); default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f33293e65010f9..3935194fa5e045 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -989,7 +989,7 @@ InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF, } Type *RetTy = - ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); + ToWideTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF); SmallVector ParamTys; for (unsigned I = 0; I != getNumOperands(); ++I) ParamTys.push_back( @@ -1157,6 +1157,13 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.addMetadata(V, dyn_cast_or_null(getUnderlyingValue())); break; } + case Instruction::ExtractValue: { + Value *Op = State.get(getOperand(0)); + Value *Extract = Builder.CreateExtractValue( + Op, cast(getUnderlyingValue())->getIndices()); + State.set(this, Extract); + break; + } case Instruction::Freeze: { Value *Op = State.get(getOperand(0)); @@ -1213,6 +1220,7 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}); } + case Instruction::ExtractValue: case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll new file mode 100644 index 00000000000000..cc434c2a6de965 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return.ll @@ -0,0 +1,251 @@ +; RUN: opt < %s -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON +; RUN: opt < %s -mattr=+sve -passes=loop-vectorize,dce,instcombine -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF + +target triple = "aarch64-unknown-linux-gnu" + +; Tests basic vectorization of homogeneous struct literal returns. + +define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; NEON-LABEL: define void @struct_return_f32_widen +; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; NEON: vector.body: +; NEON: [[WIDE_CALL:%.*]] = call { <4 x float>, <4 x float> } @fixed_vec_foo(<4 x float> [[WIDE_LOAD:%.*]]) +; NEON: [[WIDE_A:%.*]] = extractvalue { <4 x float>, <4 x float> } [[WIDE_CALL]], 0 +; NEON: [[WIDE_B:%.*]] = extractvalue { <4 x float>, <4 x float> } [[WIDE_CALL]], 1 +; NEON: store <4 x float> [[WIDE_A]], ptr {{%.*}}, align 4 +; NEON: store <4 x float> [[WIDE_B]], ptr {{%.*}}, align 4 +; +; SVE_TF-LABEL: define void @struct_return_f32_widen +; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; SVE_TF: vector.body: +; SVE_TF: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 +; SVE_TF: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 +; SVE_TF: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_A]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) +; SVE_TF: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_B]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @foo(float %0) #0 + %1 = extractvalue { float, float } %call, 0 + %2 = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv + store float %1, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %indvars.iv + store float %2, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; NEON-LABEL: define void @struct_return_f64_widen +; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; NEON: vector.body: +; NEON: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]]) +; NEON: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0 +; NEON: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1 +; NEON: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8 +; NEON: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8 +; +; SVE_TF-LABEL: 
define void @struct_return_f64_widen +; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; SVE_TF: vector.body: +; SVE_TF: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 +; SVE_TF: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 +; SVE_TF: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_A]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) +; SVE_TF: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_B]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %in, i64 %indvars.iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call { double, double } @bar(double %0) #1 + %1 = extractvalue { double, double } %call, 0 + %2 = extractvalue { double, double } %call, 1 + %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %indvars.iv + store double %1, ptr %arrayidx2, align 8 + %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %indvars.iv + store double %2, ptr %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +define void @struct_return_f32_replicate(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; NEON-LABEL: define void @struct_return_f32_replicate +; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) +; NEON: vector.body: +; NEON: [[CALL_LANE_0:%.*]] = tail call { float, float } @foo(float {{%.*}}) +; NEON: [[CALL_LANE_1:%.*]] = tail call { float, float } @foo(float {{%.*}}) +; NEON: [[LANE_0_A:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 0 +; NEON: [[TMP_A:%.*]] = insertelement <2 x float> poison, float [[LANE_0_A]], i64 0 +; NEON: [[LANE_0_B:%.*]] = extractvalue { float, float } [[CALL_LANE_0]], 1 +; NEON: [[TMP_B:%.*]] = insertelement <2 x float> poison, float [[LANE_0_B]], i64 0 +; NEON: [[LANE_1_A:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 0 +; NEON: [[WIDE_A:%.*]] = insertelement <2 x float> [[TMP_A]], float [[LANE_1_A]], i64 1 +; NEON: [[LANE_1_B:%.*]] = extractvalue { float, float } [[CALL_LANE_1]], 1 +; NEON: [[WIDE_B:%.*]] = insertelement <2 x float> [[TMP_B]], float [[LANE_1_B]], i64 1 +; NEON: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4 +; NEON: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4 +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + ; #3 does not have a fixed-size vector mapping (so replication is used) + %call = tail call { float, float } @foo(float %0) #3 + %1 = extractvalue { float, float } %call, 0 + %2 = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv + store float %1, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %indvars.iv + store float %2, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + 
+for.cond.cleanup: + ret void +} + +define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { +; NEON-LABEL: define void @struct_return_f32_widen_rt_checks +; NEON-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; NEON: entry: +; NEON: br i1 false, label %scalar.ph, label %vector.memcheck +; NEON: vector.memcheck: +; NEON: vector.body: +; NEON: call { <4 x float>, <4 x float> } @fixed_vec_foo(<4 x float> [[WIDE_LOAD:%.*]]) +; NEON: for.body: +; NEON call { float, float } @foo(float [[LOAD:%.*]]) +; +; SVE_TF-LABEL: define void @struct_return_f32_widen_rt_checks +; SVE_TF-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) +; SVE_TF: entry: +; SVE_TF: br i1 false, label %scalar.ph, label %vector.memcheck +; SVE_TF: vector.memcheck: +; SVE_TF: vector.body: +; SVE_TF: call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: for.body: +; SVE_TF: call { float, float } @foo(float [[LOAD:%.*]]) +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call { float, float } @foo(float %0) #0 + %1 = extractvalue { float, float } %call, 0 + %2 = extractvalue { float, float } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv + store float %1, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds float, ptr %out_b, i64 %indvars.iv + store float %2, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +; Negative test. Widening structs with mixed element types is not supported. +define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; NEON-LABEL: define void @negative_mixed_element_type_struct_return +; NEON-NOT: vector.body: +; NEON-NOT: call {{.*}} @fixed_vec_baz +; +; SVE_TF-LABEL: define void @negative_mixed_element_type_struct_return +; SVE_TF-NOT: vector.body: +; SVE_TF-NOT: call {{.*}} @scalable_vec_masked_baz +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, ptr %in, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call { float, i32 } @baz(float %0) #2 + %1 = extractvalue { float, i32 } %call, 0 + %2 = extractvalue { float, i32 } %call, 1 + %arrayidx2 = getelementptr inbounds float, ptr %out_a, i64 %indvars.iv + store float %1, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds i32, ptr %out_b, i64 %indvars.iv + store i32 %2, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +%named_struct = type { double, double } + +; Negative test. Widening non-literal structs is not supported. 
+define void @test_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { +; NEON-LABEL: define void @test_named_struct_return +; NEON-NOT: vector.body: +; NEON-NOT: call {{.*}} @fixed_vec_bar +; +; SVE_TF-LABEL: define void @test_named_struct_return +; SVE_TF-NOT: vector.body: +; SVE_TF-NOT: call {{.*}} @scalable_vec_masked_bar +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %in, i64 %indvars.iv + %0 = load double, ptr %arrayidx, align 8 + %call = tail call %named_struct @bar_named(double %0) #4 + %1 = extractvalue %named_struct %call, 0 + %2 = extractvalue %named_struct %call, 1 + %arrayidx2 = getelementptr inbounds double, ptr %out_a, i64 %indvars.iv + store double %1, ptr %arrayidx2, align 8 + %arrayidx4 = getelementptr inbounds double, ptr %out_b, i64 %indvars.iv + store double %2, ptr %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +declare { float, float } @foo(float) +declare { double, double } @bar(double) +declare { float, i32 } @baz(float) +declare %named_struct @bar_named(double) + +declare { <4 x float>, <4 x float> } @fixed_vec_foo(<4 x float>) +declare { , } @scalable_vec_masked_foo(, ) + +declare { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double>) +declare { , } @scalable_vec_masked_bar(, ) + +declare { <4 x float>, <4 x i32> } @fixed_vec_baz(<4 x float>) +declare { , } @scalable_vec_masked_baz(, ) + +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN4v_foo(fixed_vec_foo),_ZGVsMxv_foo(scalable_vec_masked_foo)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN2v_bar(fixed_vec_bar),_ZGVsMxv_bar(scalable_vec_masked_bar)" } +attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN4v_baz(fixed_vec_baz),_ZGVsMxv_foo(scalable_vec_masked_baz)" } +attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVsMxv_foo(scalable_vec_masked_foo)" } +attributes #4 = { nounwind "vector-function-abi-variant"="_ZGVnN4v_bar_named(fixed_vec_bar),_ZGVsMxv_bar_named(scalable_vec_masked_bar)" }
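
Illustrative note (not part of the diff): the sketch below is hypothetical
driver code showing how the `vector-function-abi-variant` mapping for `@foo`
above combines with the `VFABI::createFunctionType` change in this patch to
produce a struct-of-vectors signature. It assumes the current two-argument
`VFABI::tryDemangleForVFABI(StringRef, const FunctionType *)` interface; treat
the exact calls as an assumption rather than part of the patch.

```
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/VFABIDemangler.h"
#include "llvm/IR/VectorUtils.h"
#include <cassert>
#include <optional>
using namespace llvm;

void vfabiStructReturnSketch() {
  LLVMContext Ctx;
  Type *F32 = Type::getFloatTy(Ctx);
  // Scalar signature of @foo: { float, float } (float).
  auto *ScalarRetTy = StructType::get(Ctx, {F32, F32});
  auto *ScalarFTy = FunctionType::get(ScalarRetTy, {F32}, /*isVarArg=*/false);

  // Mangled name taken from attribute #0 in the test above.
  std::optional<VFInfo> Info =
      VFABI::tryDemangleForVFABI("_ZGVnN4v_foo(fixed_vec_foo)", ScalarFTy);
  assert(Info && "expected the AdvSIMD mapping to demangle");

  FunctionType *VecFTy = VFABI::createFunctionType(*Info, ScalarFTy);
  // With this patch, VecFTy is expected to be:
  //   { <4 x float>, <4 x float> } (<4 x float>)
  assert(isWideTy(VecFTy->getReturnType()));
}
```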