intel · bader · Sep 2, 2020 · Sep 10, 2020
@@ -411,46 +411,50 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   MPM.add(createCFGSimplificationPass());      // Merge & remove BBs
   MPM.add(createReassociatePass());           // Reassociate expressions
 
-  // Begin the loop pass pipeline.
-  if (EnableSimpleLoopUnswitch) {
-    // The simple loop unswitch pass relies on separate cleanup passes. Schedule
-    // them first so when we re-process a loop they run before other loop
-    // passes.
-    MPM.add(createLoopInstSimplifyPass());
-    MPM.add(createLoopSimplifyCFGPass());
+  // Do not run the loop pass pipeline in "SYCL Optimization Mode". Loop
+  // optimizations rely on TTI, which is not accurate for the SPIR target.
+  if (!SYCLOptimizationMode) {
+    // Begin the loop pass pipeline.
+    if (EnableSimpleLoopUnswitch) {
+      // The simple loop unswitch pass relies on separate cleanup passes.
+      // Schedule them first so when we re-process a loop they run before other
+      // loop passes.
+      MPM.add(createLoopInstSimplifyPass());
+      MPM.add(createLoopSimplifyCFGPass());
+    }
+    // Rotate Loop - disable header duplication at -Oz
+    MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+    // TODO: Investigate promotion cap for O1.
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    if (EnableSimpleLoopUnswitch)
+      MPM.add(createSimpleLoopUnswitchLegacyPass());
+    else
+      MPM.add(
+          createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    // FIXME: We break the loop pass pipeline here in order to do full
+    // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace
+    // the need for this.
+    MPM.add(createCFGSimplificationPass());
+    MPM.add(createInstructionCombiningPass());
+    // We resume loop passes creating a second loop pipeline here.
+    // TODO: this pass hurts performance due to promotions of induction
+    // variables from 32-bit value to 64-bit values. I assume it's because SPIR
+    // is a virtual target with unlimited # of registers and pass doesn't take
+    // into account that on real HW this promotion is not beneficial.
+    MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+    MPM.add(createLoopIdiomPass());      // Recognize idioms like memset.
+    addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+    MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+    if (EnableLoopInterchange)
+      MPM.add(createLoopInterchangePass()); // Interchange loops
+
+    // Unroll small loops
+    MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                       ForgetAllSCEVInLoopUnroll));
+    addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+    // This ends the loop pass pipelines.
   }
-  // Rotate Loop - disable header duplication at -Oz
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
-  // TODO: Investigate promotion cap for O1.
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  if (EnableSimpleLoopUnswitch)
-    MPM.add(createSimpleLoopUnswitchLegacyPass());
-  else
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-  // FIXME: We break the loop pass pipeline here in order to do full
-  // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
-  // need for this.
-  MPM.add(createCFGSimplificationPass());
-  MPM.add(createInstructionCombiningPass());
-  // We resume loop passes creating a second loop pipeline here.
-  // TODO: this pass hurts performance due to promotions of induction variables
-  // from 32-bit value to 64-bit values. I assume it's because SPIR is a virtual
-  // target with unlimited # of registers and pass doesn't take into account
-  // that on real HW this promotion is not beneficial.
-  if (!SYCLOptimizationMode)
-    MPM.add(createIndVarSimplifyPass());      // Canonicalize indvars
-  MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
-  addExtensionsToPM(EP_LateLoopOptimizations, MPM);
-  MPM.add(createLoopDeletionPass());          // Delete dead loops
-
-  if (EnableLoopInterchange)
-    MPM.add(createLoopInterchangePass()); // Interchange loops
-
-  // Unroll small loops
-  MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                     ForgetAllSCEVInLoopUnroll));
-  addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
-  // This ends the loop pass pipelines.
 
   if (OptLevel > 1) {
     MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
@@ -819,19 +823,21 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createLoopUnrollAndJamPass(OptLevel));
   }
 
-  // Unroll small loops
-  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                               ForgetAllSCEVInLoopUnroll));
+  if (!SYCLOptimizationMode) {
+    // Unroll small loops
+    MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                 ForgetAllSCEVInLoopUnroll));
 
-  if (!DisableUnrollLoops) {
-    // LoopUnroll may generate some redundency to cleanup.
-    MPM.add(createInstructionCombiningPass());
+    if (!DisableUnrollLoops) {
+      // LoopUnroll may generate some redundancy to clean up.
+      MPM.add(createInstructionCombiningPass());
 
-    // Runtime unrolling will introduce runtime check in loop prologue. If the
-    // unrolled loop is a inner loop, then the prologue will be inside the
-    // outer loop. LICM pass can help to promote the runtime check out if the
-    // checked value is loop invariant.
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+      // Runtime unrolling will introduce a runtime check in the loop prologue.
+      // If the unrolled loop is an inner loop, then the prologue will be
+      // inside the outer loop. The LICM pass can help to promote the runtime
+      // check out if the checked value is loop invariant.
+      MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    }
   }
 
   MPM.add(createWarnMissedTransformationsPass());
@@ -1034,13 +1040,16 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   if (EnableLoopInterchange)
     PM.add(createLoopInterchangePass());
 
-  // Unroll small loops
-  PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                                    ForgetAllSCEVInLoopUnroll));
-  PM.add(createLoopVectorizePass(true, !LoopVectorize));
-  // The vectorizer may have significantly shortened a loop body; unroll again.
-  PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                              ForgetAllSCEVInLoopUnroll));
+  if (!SYCLOptimizationMode) {
+    // Unroll small loops
+    PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                      ForgetAllSCEVInLoopUnroll));
+    PM.add(createLoopVectorizePass(true, !LoopVectorize));
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again.
+    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                ForgetAllSCEVInLoopUnroll));
+  }
 
   PM.add(createWarnMissedTransformationsPass());
 

@@ -1,4 +1,3 @@
-// XFAIL: cpu
 // UNSUPPORTED: cuda
 // CUDA compilation and runtime do not yet support sub-groups.
 

@@ -1,4 +1,3 @@
-// XFAIL: cpu
 // UNSUPPORTED: cuda
 // CUDA compilation and runtime do not yet support sub-groups.