[regression][AArch64] cannot build sparta (with -flto) for A64FX after PR #93300

As in the title, after PR #93300 (commit 43100766f287185642a3ccbf1a629915f85575e2) I cannot build sparta (https://github.com/sparta/sparta.git) with `-flto` for A64FX:

```
mpicxx -O3 -fno-math-errno -mcpu=a64fx -ffp-contract=fast -flto -fdelayed-template-parsing -Wno-error=missing-template-arg-list-after-template-kw     adapt_grid.o balance_grid.o collide.o collide_vss.o collide_vss_kokkos.o comm.o comm_kokkos.o compute.o compute_boundary.o compute_bo
undary_kokkos.o compute_count.o compute_count_kokkos.o compute_distsurf_grid.o compute_distsurf_grid_kokkos.o compute_dt_grid.o compute_dt_grid_kokkos.o compute_eflux_grid.o compute_eflux_grid_kokkos.o compute_grid.o compute_grid_kokkos.o compute_isurf_grid.o compute_ke_particle.o compute_ke_particle_kokkos.o compu
te_lambda_grid.o compute_lambda_grid_kokkos.o compute_pflux_grid.o compute_pflux_grid_kokkos.o compute_property_grid.o compute_property_grid_kokkos.o compute_property_surf.o compute_react_boundary.o compute_react_isurf_grid.o compute_react_surf.o compute_reduce.o compute_sonine_grid.o compute_sonine_grid_kokkos.o c
ompute_surf.o compute_surf_kokkos.o compute_temp.o compute_temp_kokkos.o compute_thermal_grid.o compute_thermal_grid_kokkos.o compute_tvib_grid.o compute_tvib_grid_kokkos.o create_box.o create_grid.o create_isurf.o create_particles.o create_particles_kokkos.o custom.o cut2d.o cut3d.o domain.o domain_kokkos.o dump.o
 dump_grid.o dump_image.o dump_movie.o dump_particle.o dump_surf.o error.o finish.o fix.o fix_ablate.o fix_adapt.o fix_adapt_kokkos.o fix_ambipolar.o fix_ambipolar_kokkos.o fix_ave_grid.o fix_ave_grid_kokkos.o fix_ave_histo.o fix_ave_histo_kokkos.o fix_ave_histo_weight.o fix_ave_histo_weight_kokkos.o fix_ave_surf.o
 fix_ave_time.o fix_balance.o fix_balance_kokkos.o fix_dt_reset.o fix_dt_reset_kokkos.o fix_emit.o fix_emit_face.o fix_emit_face_file.o fix_emit_face_kokkos.o fix_emit_surf.o fix_field_grid.o fix_field_particle.o fix_grid_check.o fix_grid_check_kokkos.o fix_move_surf.o fix_move_surf_kokkos.o fix_print.o fix_surf_te
mp.o fix_surf_temp_kokkos.o fix_temp_global_rescale.o fix_temp_rescale.o fix_temp_rescale_kokkos.o fix_vibmode.o fix_vibmode_kokkos.o geometry.o grid.o grid_adapt.o grid_collate.o grid_comm.o grid_custom.o grid_custom_kokkos.o grid_id.o grid_id_kokkos.o grid_kokkos.o grid_surf.o hashlittle.o image.o input.o irregul
ar.o irregular_kokkos.o kokkos.o kokkos_scan.o library.o main.o marching_cubes.o marching_squares.o math_extra.o memory.o mixture.o modify.o modify_kokkos.o move_surf.o output.o particle.o particle_custom.o particle_custom_kokkos.o particle_kokkos.o rand_pool_wrap.o random_knuth.o random_mars.o rcb.o react.o react_
bird.o react_bird_kokkos.o react_qk.o react_tce.o react_tce_kokkos.o react_tce_qk.o read_grid.o read_isurf.o read_particles.o read_restart.o read_surf.o read_surf_kokkos.o region.o region_block.o region_cylinder.o region_intersect.o region_plane.o region_sphere.o region_union.o remove_surf.o run.o scale_particles.o
 sparta.o stats.o surf.o surf_collate.o surf_collide.o surf_collide_adiabatic.o surf_collide_cll.o surf_collide_diffuse.o surf_collide_diffuse_kokkos.o surf_collide_impulsive.o surf_collide_piston.o surf_collide_piston_kokkos.o surf_collide_specular.o surf_collide_specular_kokkos.o surf_collide_td.o surf_collide_tr
ansparent.o surf_collide_transparent_kokkos.o surf_collide_vanish.o surf_collide_vanish_kokkos.o surf_comm.o surf_custom.o surf_custom_kokkos.o surf_kokkos.o surf_react.o surf_react_adsorb.o surf_react_global.o surf_react_global_kokkos.o surf_react_prob.o surf_react_prob_kokkos.o timer.o universe.o update.o update_
kokkos.o variable.o write_grid.o write_isurf.o write_restart.o write_surf.o -lkokkos -ldl    -mtune=a64fx -mcpu=a64fx -fopenmp=libomp -L../Obj_astra   -o ../spa_astra
LLVM ERROR: Don't know how to widen the operands for INSERT_SUBVECTOR
clang++: error: unable to execute command: Aborted (core dumped)
clang++: error: linker command failed due to signal (use -v to see invocation)
make[1]: *** [Makefile:79: ../spa_astra] Error 1
```

Commit 43100766f287185642a3ccbf1a629915f85575e2 does not revert cleanly against today's top of `main`, but the problem goes away with the following reversion attempt (after conflict resolution):

```
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4644,6 +4644,28 @@ bool LoopVectorizationPlanner::isMoreProfitable(

   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);

+  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
+    // If the trip count is a known (possibly small) constant, the trip count
+    // will be rounded up to an integer number of iterations under
+    // FoldTailByMasking. The total cost in that case will be
+    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+    // some extra overheads, but for the purpose of comparing the costs of
+    // different VFs we can use this to compare the total loop-body cost
+    // expected after vectorization.
+    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+                                             InstructionCost VectorCost,
+                                             InstructionCost ScalarCost) {
+      return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
+                                    : VectorCost * (MaxTripCount / VF) +
+                                          ScalarCost * (MaxTripCount % VF);
+    };
+    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
+    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
+
+    return RTCostA < RTCostB;
+  }
+
   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4657,39 +4679,14 @@ bool LoopVectorizationPlanner::isMoreProfitable(
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
-  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
-                        A.Width.isScalable() && !B.Width.isScalable();
-
-  auto CmpFn = [PreferScalable](const InstructionCost &LHS,
-                                const InstructionCost &RHS) {
-    return PreferScalable ? LHS <= RHS : LHS < RHS;
-  };
+  if (!TTI.preferFixedOverScalableIfEqualCost() &&
+      A.Width.isScalable() && !B.Width.isScalable())
+    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

   // To avoid the need for FP division:
-  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
-  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
-  if (!MaxTripCount)
-    return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
-
-  auto GetCostForTC = [MaxTripCount, this](unsigned VF,
-                                           InstructionCost VectorCost,
-                                           InstructionCost ScalarCost) {
-    // If the trip count is a known (possibly small) constant, the trip count
-    // will be rounded up to an integer number of iterations under
-    // FoldTailByMasking. The total cost in that case will be
-    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
-    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
-    // some extra overheads, but for the purpose of comparing the costs of
-    // different VFs we can use this to compare the total loop-body cost
-    // expected after vectorization.
-    if (CM.foldTailByMasking())
-      return VectorCost * divideCeil(MaxTripCount, VF);
-    return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
-  };
-
-  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
-  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
-  return CmpFn(RTCostA, RTCostB);
+  //      (CostA / A.Width) < (CostB / B.Width)
+  // <=>  (CostA * B.Width) < (CostB * A.Width)
+  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
 }

 static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[regression][AArch64] cannot build sparta (with -flto) for A64FX after PR #93300 #99760

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[regression][AArch64] cannot build sparta (with -flto) for A64FX after PR #93300 #99760

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions