Closed
Description
As the title says, since PR #93300 (commit 4310076) I can no longer build sparta (https://github.com/sparta/sparta.git) with -flto
for A64FX; the link step fails:
mpicxx -O3 -fno-math-errno -mcpu=a64fx -ffp-contract=fast -flto -fdelayed-template-parsing -Wno-error=missing-template-arg-list-after-template-kw adapt_grid.o balance_grid.o collide.o collide_vss.o collide_vss_kokkos.o comm.o comm_kokkos.o compute.o compute_boundary.o compute_boundary_kokkos.o compute_count.o compute_count_kokkos.o compute_distsurf_grid.o compute_distsurf_grid_kokkos.o compute_dt_grid.o compute_dt_grid_kokkos.o compute_eflux_grid.o compute_eflux_grid_kokkos.o compute_grid.o compute_grid_kokkos.o compute_isurf_grid.o compute_ke_particle.o compute_ke_particle_kokkos.o compute_lambda_grid.o compute_lambda_grid_kokkos.o compute_pflux_grid.o compute_pflux_grid_kokkos.o compute_property_grid.o compute_property_grid_kokkos.o compute_property_surf.o compute_react_boundary.o compute_react_isurf_grid.o compute_react_surf.o compute_reduce.o compute_sonine_grid.o compute_sonine_grid_kokkos.o compute_surf.o compute_surf_kokkos.o compute_temp.o compute_temp_kokkos.o compute_thermal_grid.o compute_thermal_grid_kokkos.o compute_tvib_grid.o compute_tvib_grid_kokkos.o create_box.o create_grid.o create_isurf.o create_particles.o create_particles_kokkos.o custom.o cut2d.o cut3d.o domain.o domain_kokkos.o dump.o dump_grid.o dump_image.o dump_movie.o dump_particle.o dump_surf.o error.o finish.o fix.o fix_ablate.o fix_adapt.o fix_adapt_kokkos.o fix_ambipolar.o fix_ambipolar_kokkos.o fix_ave_grid.o fix_ave_grid_kokkos.o fix_ave_histo.o fix_ave_histo_kokkos.o fix_ave_histo_weight.o fix_ave_histo_weight_kokkos.o fix_ave_surf.o fix_ave_time.o fix_balance.o fix_balance_kokkos.o fix_dt_reset.o fix_dt_reset_kokkos.o fix_emit.o fix_emit_face.o fix_emit_face_file.o fix_emit_face_kokkos.o fix_emit_surf.o fix_field_grid.o fix_field_particle.o fix_grid_check.o fix_grid_check_kokkos.o fix_move_surf.o fix_move_surf_kokkos.o fix_print.o fix_surf_temp.o fix_surf_temp_kokkos.o fix_temp_global_rescale.o fix_temp_rescale.o fix_temp_rescale_kokkos.o fix_vibmode.o fix_vibmode_kokkos.o geometry.o grid.o grid_adapt.o grid_collate.o grid_comm.o grid_custom.o grid_custom_kokkos.o grid_id.o grid_id_kokkos.o grid_kokkos.o grid_surf.o hashlittle.o image.o input.o irregular.o irregular_kokkos.o kokkos.o kokkos_scan.o library.o main.o marching_cubes.o marching_squares.o math_extra.o memory.o mixture.o modify.o modify_kokkos.o move_surf.o output.o particle.o particle_custom.o particle_custom_kokkos.o particle_kokkos.o rand_pool_wrap.o random_knuth.o random_mars.o rcb.o react.o react_bird.o react_bird_kokkos.o react_qk.o react_tce.o react_tce_kokkos.o react_tce_qk.o read_grid.o read_isurf.o read_particles.o read_restart.o read_surf.o read_surf_kokkos.o region.o region_block.o region_cylinder.o region_intersect.o region_plane.o region_sphere.o region_union.o remove_surf.o run.o scale_particles.o sparta.o stats.o surf.o surf_collate.o surf_collide.o surf_collide_adiabatic.o surf_collide_cll.o surf_collide_diffuse.o surf_collide_diffuse_kokkos.o surf_collide_impulsive.o surf_collide_piston.o surf_collide_piston_kokkos.o surf_collide_specular.o surf_collide_specular_kokkos.o surf_collide_td.o surf_collide_transparent.o surf_collide_transparent_kokkos.o surf_collide_vanish.o surf_collide_vanish_kokkos.o surf_comm.o surf_custom.o surf_custom_kokkos.o surf_kokkos.o surf_react.o surf_react_adsorb.o surf_react_global.o surf_react_global_kokkos.o surf_react_prob.o surf_react_prob_kokkos.o timer.o universe.o update.o update_kokkos.o variable.o write_grid.o write_isurf.o write_restart.o write_surf.o -lkokkos -ldl -mtune=a64fx -mcpu=a64fx -fopenmp=libomp -L../Obj_astra -o ../spa_astra
LLVM ERROR: Don't know how to widen the operands for INSERT_SUBVECTOR
clang++: error: unable to execute command: Aborted (core dumped)
clang++: error: linker command failed due to signal (use -v to see invocation)
make[1]: *** [Makefile:79: ../spa_astra] Error 1
Commit 4310076 does not revert cleanly against today's top of main,
but the problem goes away with the following reversion attempt (after conflict resolution):
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4644,6 +4644,28 @@ bool LoopVectorizationPlanner::isMoreProfitable(
unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
+ if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
+ // If the trip count is a known (possibly small) constant, the trip count
+ // will be rounded up to an integer number of iterations under
+ // FoldTailByMasking. The total cost in that case will be
+ // VecCost*ceil(TripCount/VF). When not folding the tail, the total
+ // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
+ // some extra overheads, but for the purpose of comparing the costs of
+ // different VFs we can use this to compare the total loop-body cost
+ // expected after vectorization.
+ auto GetCostForTC = [MaxTripCount, this](unsigned VF,
+ InstructionCost VectorCost,
+ InstructionCost ScalarCost) {
+ return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
+ : VectorCost * (MaxTripCount / VF) +
+ ScalarCost * (MaxTripCount % VF);
+ };
+ auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
+ auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
+
+ return RTCostA < RTCostB;
+ }
+
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4657,39 +4679,14 @@ bool LoopVectorizationPlanner::isMoreProfitable(
// Assume vscale may be larger than 1 (or the value being tuned for),
// so that scalable vectorization is slightly favorable over fixed-width
// vectorization.
- bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
- A.Width.isScalable() && !B.Width.isScalable();
-
- auto CmpFn = [PreferScalable](const InstructionCost &LHS,
- const InstructionCost &RHS) {
- return PreferScalable ? LHS <= RHS : LHS < RHS;
- };
+ if (!TTI.preferFixedOverScalableIfEqualCost() &&
+ A.Width.isScalable() && !B.Width.isScalable())
+ return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
// To avoid the need for FP division:
- // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
- // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
- if (!MaxTripCount)
- return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
-
- auto GetCostForTC = [MaxTripCount, this](unsigned VF,
- InstructionCost VectorCost,
- InstructionCost ScalarCost) {
- // If the trip count is a known (possibly small) constant, the trip count
- // will be rounded up to an integer number of iterations under
- // FoldTailByMasking. The total cost in that case will be
- // VecCost*ceil(TripCount/VF). When not folding the tail, the total
- // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
- // some extra overheads, but for the purpose of comparing the costs of
- // different VFs we can use this to compare the total loop-body cost
- // expected after vectorization.
- if (CM.foldTailByMasking())
- return VectorCost * divideCeil(MaxTripCount, VF);
- return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
- };
-
- auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
- auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
- return CmpFn(RTCostA, RTCostB);
+ // (CostA / A.Width) < (CostB / B.Width)
+ // <=> (CostA * B.Width) < (CostB * A.Width)
+ return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,