Turn off SLP vectorization for avx512 only (halide#7918)

Fixes halide#7917
ardier · Mar 3, 2024 · commit 6ba7ec3 (1 parent: e6e2f8f)
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 1 deletion.
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -1122,7 +1122,7 @@ void CodeGen_LLVM::optimize_module() {
     PipelineTuningOptions pto;
     pto.LoopInterleaving = do_loop_opt;
     pto.LoopVectorization = do_loop_opt;
-    pto.SLPVectorization = true;  // Note: SLP vectorization has no analogue in the Halide scheduling model
+    pto.SLPVectorization = use_slp_vectorization();
     pto.LoopUnrolling = do_loop_opt;
     // Clear ScEv info for all loops. Certain Halide applications spend a very
     // long time compiling in forgetLoop, and prefer to forget everything

diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
@@ -127,6 +127,13 @@ class CodeGen_LLVM : public IRVisitor {
     virtual bool use_pic() const;
     // @}
 
+    /** Should LLVM's SLP vectorizer be enabled? SLP vectorization has no
+     * analogue in the Halide scheduling model, so this is decided heuristically
+     * per target. This default returns true; backends may override it. */
+    virtual bool use_slp_vectorization() const {
+        return true;
+    }
+
     /** Should indexing math be promoted to 64-bit on platforms with
      * 64-bit pointers? */
     virtual bool promote_indices() const {

diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
@@ -68,6 +68,8 @@ class CodeGen_X86 : public CodeGen_Posix {
     bool use_soft_float_abi() const override;
     int native_vector_bits() const override;
 
+    bool use_slp_vectorization() const override;
+
     int vector_lanes_for_slice(const Type &t) const;
 
     using CodeGen_Posix::visit;
@@ -1028,6 +1030,20 @@ int CodeGen_X86::vector_lanes_for_slice(const Type &t) const {
     return slice_bits / t.bits();
 }
 
+bool CodeGen_X86::use_slp_vectorization() const {
+    if (target.has_feature(Target::AVX512)) {
+        // LLVM's SLP vectorizer emits AVX-512 gather intrinsics for LUTs and
+        // boundary conditions, even though they're slower than just
+        // scalarizing. See https://github.com/llvm/llvm-project/issues/70259
+        //
+        // TODO: Once that issue is fixed, make this conditional on the LLVM
+        // version rather than disabling SLP for every AVX512 target.
+        return false;
+    } else {
+        return true;
+    }
+}
+
 }  // namespace
 
 std::unique_ptr<CodeGen_Posix> new_CodeGen_X86(const Target &target) {