[torch-mlir] bump to llvm/llvm-project@9b78ddf3b2abfb3e (#3491)
This bump triggered an upstream assert. Includes a workaround (WAR) for #3506.

Also includes several things I needed to do to reproduce the issue:

* When TORCH_MLIR_TEST_CONCURRENCY=1, test runs will be printed.
* Added TORCH_MLIR_TEST_VERBOSE=1 handling to enable verbose mode
(useful on CI).

---------

Co-authored-by: Stella Laurenzo <stellaraccident@gmail.com>
aartbik and stellaraccident authored Jun 28, 2024
1 parent 6d0ca49 commit 1f73895
Showing 7 changed files with 46 additions and 10 deletions.
14 changes: 14 additions & 0 deletions docs/development.md
@@ -429,6 +429,20 @@ cd projects/pt1
python -m e2e_testing.main -f 'AtenEmbeddingBag'
```

By default, tests run under the multiprocessing framework, which does not
tolerate certain kinds of errors well. If you encounter native crashes or
hangs, set these debug variables to run sequentially, in-process, and with
more verbosity:

```
export TORCH_MLIR_TEST_CONCURRENCY=1
export TORCH_MLIR_TEST_VERBOSE=1
```

This way, you can run under `gdb` and similar tools and get useful results.
Environment variables like these are also easy to set in GitHub Actions
workflow files. Note that the verbose flags are very verbose; basic sequential
progress reports are printed regardless whenever tests are not run in parallel.

## Running unit tests.

To run all of the unit tests, run:
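To make the workflow above concrete, here is a minimal, self-contained sketch (not part of this commit) that drives the e2e runner with both variables set. It reuses the `AtenEmbeddingBag` filter example from the docs; everything else is illustrative only:

```
# Minimal sketch: run the e2e suite sequentially and verbosely so that a
# native crash or hang surfaces directly in the driving process.
import os
import subprocess

env = dict(os.environ)
env["TORCH_MLIR_TEST_CONCURRENCY"] = "1"  # sequential, in-process test runs
env["TORCH_MLIR_TEST_VERBOSE"] = "1"  # verbose reporting (useful on CI)

# Assumes a torch-mlir checkout layout; the filter reuses the example test
# name from docs/development.md.
subprocess.run(
    ["python", "-m", "e2e_testing.main", "-f", "AtenEmbeddingBag"],
    cwd="projects/pt1",
    env=env,
    check=False,
)
```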
2 changes: 1 addition & 1 deletion externals/llvm-project
Submodule llvm-project updated 1769 files
13 changes: 7 additions & 6 deletions lib/Dialect/TMTensor/IR/TMTensorOps.cpp
@@ -46,16 +46,17 @@ using namespace mlir::torch::TMTensor;
 static void getEffectsImpl(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects,
-    ValueRange results, ValueRange inputBuffers, ValueRange outputBuffers) {
-  for (Value value : results) {
+    ResultRange results, ArrayRef<OpOperand *> inputBuffers,
+    ArrayRef<OpOperand *> outputBuffers) {
+  for (OpResult value : results) {
     effects.emplace_back(MemoryEffects::Allocate::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : inputBuffers) {
+  for (OpOperand *value : inputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : outputBuffers) {
+  for (OpOperand *value : outputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
     effects.emplace_back(MemoryEffects::Write::get(), value,
@@ -1121,8 +1122,8 @@ bool TopkOp::payloadUsesValueFromOperand(OpOperand *opOperand) {
   void OP_NAME::getEffects(                                                   \
       SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>     \
           &effects) {                                                         \
-    SmallVector<Value> inputBuffers = getInputBufferOperands();               \
-    SmallVector<Value> outputBuffers = getOutputBufferOperands();             \
+    OpOperandVector inputBuffers = getInputBufferOperands();                  \
+    OpOperandVector outputBuffers = getOutputBufferOperands();                \
     getEffectsImpl(effects, getOperation()->getResults(), inputBuffers,      \
                    outputBuffers);                                           \
   }
6 changes: 4 additions & 2 deletions lib/Dialect/Torch/IR/TorchOps.cpp
@@ -2810,7 +2810,8 @@ LogicalResult CopyToNonValueTensorOp::inferReturnTypes(
 void CopyToNonValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Allocate::get(), getResult());
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       getOperation()->getOpResult(0));
 }

//===----------------------------------------------------------------------===//
@@ -2837,7 +2838,8 @@ LogicalResult CopyToValueTensorOp::inferReturnTypes(
 void CopyToValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Read::get(), getOperand());
+  effects.emplace_back(MemoryEffects::Read::get(),
+                       &getOperation()->getOpOperand(0));
 }

//===----------------------------------------------------------------------===//
4 changes: 4 additions & 0 deletions projects/pt1/e2e_testing/main.py
@@ -7,6 +7,10 @@
 import re
 import sys
 
+import torch
+
+torch.device("cpu")
+
 from torch_mlir_e2e_test.framework import run_tests
 from torch_mlir_e2e_test.reporting import report_results
 from torch_mlir_e2e_test.registry import GLOBAL_TEST_REGISTRY
14 changes: 13 additions & 1 deletion projects/pt1/python/torch_mlir_e2e_test/framework.py
@@ -358,6 +358,15 @@ def run_tests(
     if env_concurrency > 0:
         num_processes = min(num_processes, env_concurrency)
 
+    try:
+        env_verbose = os.getenv("TORCH_MLIR_TEST_VERBOSE", "0")
+        if env_verbose is not None:
+            verbose = bool(int(env_verbose))
+    except ValueError as e:
+        raise ValueError(
+            "Bad value for TORCH_MLIR_TEST_VERBOSE env var: " "Expected integer."
+        ) from e
+
     # TODO: We've noticed that on certain 2 core machine parallelizing the tests
     # makes the llvm backend legacy pass manager 20x slower than using a
     # single process. Need to investigate the root cause eventually. This is a
@@ -375,7 +384,10 @@
     # seems to cause a cascade of failures resulting in undecipherable error
     # messages.
     if num_processes == 1 or sequential:
-        return [compile_and_run_test(test, config, verbose) for test in tests]
+        print("Running tests sequentially with progress status")
+        for test in tests:
+            print(f"*** RUNNING TEST: {test.unique_name} ***")
+            compile_and_run_test(test, config, verbose)
 
     # This is needed because autograd does not support crossing process
     # boundaries.
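For readers skimming the hunks above, here is a self-contained sketch of the same env-var parsing pattern; the helper name `verbose_from_env` is illustrative and not part of torch-mlir:

```
# Sketch of the TORCH_MLIR_TEST_VERBOSE handling added above: any integer
# value toggles verbosity via bool(int(...)); non-integers raise ValueError.
import os


def verbose_from_env() -> bool:
    raw = os.getenv("TORCH_MLIR_TEST_VERBOSE", "0")
    try:
        return bool(int(raw))
    except ValueError as e:
        raise ValueError(
            "Bad value for TORCH_MLIR_TEST_VERBOSE env var: Expected integer."
        ) from e


os.environ["TORCH_MLIR_TEST_VERBOSE"] = "1"
assert verbose_from_env()
```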
3 changes: 3 additions & 0 deletions python/torch_mlir/compiler_utils.py
@@ -40,6 +40,9 @@ def run_pipeline_with_repro_report(
     )
     # Lower module in place to make it ready for compiler backends.
     with module.context as ctx:
+        # TODO(#3506): Passes can emit errors but not signal failure,
+        # which causes a native assert.
+        ctx.emit_error_diagnostics = True
         pm = PassManager.parse(pipeline)
         if enable_ir_printing:
             ctx.enable_multithreading(False)
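A brief sketch of the workaround in isolation, assuming the MLIR Python bindings are importable as `torch_mlir.ir` and `torch_mlir.passmanager`; the `canonicalize` pipeline is an arbitrary example:

```
# Sketch: enable error diagnostics before running a pipeline. Per the TODO
# above, a pass may emit an error without signaling failure (#3506); this
# setting routes such diagnostics to handlers instead of dropping them.
from torch_mlir import ir
from torch_mlir.passmanager import PassManager

with ir.Context() as ctx:
    ctx.emit_error_diagnostics = True
    module = ir.Module.parse("module {}")
    pm = PassManager.parse("builtin.module(canonicalize)")
    pm.run(module.operation)
```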
