@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+                  cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa<VPWidenGEPRecipe>(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
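+  // For example, a GEP whose indices are all VPScalarIVStepsRecipes matches;
+  // the base pointer (the first operand, skipped by drop_begin) is not
+  // inspected.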
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet<const VPUser *, 4> Seen;
+  SmallVector<const VPUser *> WorkList = {V};
+
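+  // Walk V and its transitive users; the traversal only continues through
+  // VPSingleDefRecipes, anything else ends that branch of the search.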
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
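+    // Queue the users of Cur as well, so addresses reached through a chain of
+    // recipes are also detected.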
+    append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
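+    // Replicated accesses with scalable VFs cannot be costed below (the
+    // per-lane cost is multiplied by a fixed VF), so return an invalid cost.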
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
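+    // The address is operand 0 for a load and operand 1 for a store.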
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
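+    // For replicated accesses, the address computation is costed with a
+    // vector-of-pointers type, i.e. one pointer per lane.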
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector<const VPValue *> OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize if scalarization overhead is needed.
+    // Currently no scalarization overhead is assigned when the target prefers
+    // scalar (non-vectorized) addressing and the loaded value is used as part
+    // of the address of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
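+      // Operand extraction costs are skipped for loads when the target
+      // prefers scalar addressing, and for stores when the target supports
+      // efficient vector element load/store; otherwise all operands are
+      // scalarized.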
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
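+    // Total cost: the scalar memory op plus its address computation, per
+    // lane, plus any insert/extract overhead computed above.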
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
 