Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

More targeted fix for gather instructions being slow on intel processors #7945

Merged
merged 1 commit into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/CodeGen_LLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1122,7 +1122,7 @@ void CodeGen_LLVM::optimize_module() {
PipelineTuningOptions pto;
pto.LoopInterleaving = do_loop_opt;
pto.LoopVectorization = do_loop_opt;
pto.SLPVectorization = use_slp_vectorization();
pto.SLPVectorization = true;
pto.LoopUnrolling = do_loop_opt;
// Clear ScEv info for all loops. Certain Halide applications spend a very
// long time compiling in forgetLoop, and prefer to forget everything
Expand Down
7 changes: 0 additions & 7 deletions src/CodeGen_LLVM.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,6 @@ class CodeGen_LLVM : public IRVisitor {
virtual bool use_pic() const;
// @}

/** Whether LLVM's SLP vectorizer should run on the generated module.
 * SLP vectorization has no counterpart in the Halide scheduling model,
 * so each backend decides heuristically based on its target; the base
 * implementation enables it. */
virtual bool use_slp_vectorization() const {
    return true;
}

/** Should indexing math be promoted to 64-bit on platforms with
* 64-bit pointers? */
virtual bool promote_indices() const {
Expand Down
49 changes: 33 additions & 16 deletions src/CodeGen_X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,6 @@ class CodeGen_X86 : public CodeGen_Posix {
bool use_soft_float_abi() const override;
int native_vector_bits() const override;

bool use_slp_vectorization() const override;

int vector_lanes_for_slice(const Type &t) const;

using CodeGen_Posix::visit;
Expand Down Expand Up @@ -918,6 +916,34 @@ string CodeGen_X86::mcpu_target() const {
}
}

namespace {
// Whether gather instructions may carry a severe penalty on the processor
// this target is tuned for. Intel x86 processors between Broadwell and
// Tiger Lake have a microcode mitigation that makes gather instructions
// very slow. A known-AMD tuning is safe to use gather on, and the AVX512
// extensions introduced with Zen4 (or above) also rule out the affected
// processors.
bool gather_might_be_slow(Target target) {
    bool tuned_for_amd;
    switch (target.processor_tune) {
    case Target::Processor::AMDFam10:
    case Target::Processor::BdVer1:
    case Target::Processor::BdVer2:
    case Target::Processor::BdVer3:
    case Target::Processor::BdVer4:
    case Target::Processor::BtVer1:
    case Target::Processor::BtVer2:
    case Target::Processor::K8:
    case Target::Processor::K8_SSE3:
    case Target::Processor::ZnVer1:
    case Target::Processor::ZnVer2:
    case Target::Processor::ZnVer3:
    case Target::Processor::ZnVer4:
        tuned_for_amd = true;
        break;
    default:
        tuned_for_amd = false;
        break;
    }
    if (tuned_for_amd) {
        return false;
    }
    // The Zen4-level AVX512 feature set implies hardware newer than the
    // affected Broadwell..Tiger Lake range.
    return !target.has_feature(Target::AVX512_Zen4);
}
}  // namespace

string CodeGen_X86::mcpu_tune() const {
// Check if any explicit request for tuning exists.
switch (target.processor_tune) { // Please keep sorted.
Expand Down Expand Up @@ -995,6 +1021,11 @@ string CodeGen_X86::mattrs() const {
features += ",+avxvnni,+amx-int8,+amx-bf16";
}
}
#if LLVM_VERSION >= 180
if (gather_might_be_slow(target)) {
features += ",+prefer-no-gather";
}
#endif
return features;
}

Expand Down Expand Up @@ -1030,20 +1061,6 @@ int CodeGen_X86::vector_lanes_for_slice(const Type &t) const {
return slice_bits / t.bits();
}

bool CodeGen_X86::use_slp_vectorization() const {
    // Disable SLP vectorization on AVX512 targets: LLVM's SLP vectorizer
    // emits avx512 gather intrinsics for LUTs and boundary conditions,
    // even though they're slower than just scalarizing.
    // See https://github.com/llvm/llvm-project/issues/70259
    //
    // TODO: Once that issue is fixed, we should conditionalize this based
    // on the LLVM version.
    return !target.has_feature(Target::AVX512);
}

} // namespace

std::unique_ptr<CodeGen_Posix> new_CodeGen_X86(const Target &target) {
Expand Down
Loading