[torch-mlir] bump to llvm/llvm-project@9b78ddf3b2abfb3e (#3491)
This bump triggered an upstream assert. Includes a workaround (WAR) for #3506.

Also includes several things I needed to do to reproduce the issue:

* When TORCH_MLIR_TEST_CONCURRENCY=1, test runs will be printed.
* Added TORCH_MLIR_TEST_VERBOSE=1 handling to enable verbose mode
(useful on CI).

---------

Co-authored-by: Stella Laurenzo <stellaraccident@gmail.com>
aartbik and stellaraccident authored Jun 28, 2024
1 parent 6d0ca49 commit 1f73895
Showing 7 changed files with 46 additions and 10 deletions.
14 changes: 14 additions & 0 deletions docs/development.md
@@ -429,6 +429,20 @@ cd projects/pt1
python -m e2e_testing.main -f 'AtenEmbeddingBag'
```

By default, tests run under the multiprocessing framework, which does not
tolerate certain kinds of errors well. If you encounter native crashes or
hangs, set these debug variables to run sequentially, in-process, and with
more verbosity:

```
export TORCH_MLIR_TEST_CONCURRENCY=1
export TORCH_MLIR_TEST_VERBOSE=1
```

This way, you can run under `gdb` and similar tools and get useful results.
Environment variables like these are also easy to set in GitHub Actions
workflow files. Note that the verbose flags are very verbose; basic sequential
progress reports are printed regardless whenever tests are not run in parallel.

## Running unit tests.

To run all of the unit tests, run:
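To make the workflow above concrete, here is a minimal, self-contained sketch (not part of this commit) that drives the e2e runner with both variables set. It reuses the `AtenEmbeddingBag` filter example from the docs; everything else is illustrative only:

```
# Minimal sketch: run the e2e suite sequentially and verbosely so that a
# native crash or hang surfaces directly in the driving process.
import os
import subprocess

env = dict(os.environ)
env["TORCH_MLIR_TEST_CONCURRENCY"] = "1"  # sequential, in-process test runs
env["TORCH_MLIR_TEST_VERBOSE"] = "1"  # verbose reporting (useful on CI)

# Assumes a torch-mlir checkout layout; the filter reuses the example test
# name from docs/development.md.
subprocess.run(
    ["python", "-m", "e2e_testing.main", "-f", "AtenEmbeddingBag"],
    cwd="projects/pt1",
    env=env,
    check=False,
)
```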
2 changes: 1 addition & 1 deletion externals/llvm-project
Submodule llvm-project updated 1769 files
13 changes: 7 additions & 6 deletions lib/Dialect/TMTensor/IR/TMTensorOps.cpp
@@ -46,16 +46,17 @@ using namespace mlir::torch::TMTensor;
 static void getEffectsImpl(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects,
-    ValueRange results, ValueRange inputBuffers, ValueRange outputBuffers) {
-  for (Value value : results) {
+    ResultRange results, ArrayRef<OpOperand *> inputBuffers,
+    ArrayRef<OpOperand *> outputBuffers) {
+  for (OpResult value : results) {
     effects.emplace_back(MemoryEffects::Allocate::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : inputBuffers) {
+  for (OpOperand *value : inputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
   }
-  for (Value value : outputBuffers) {
+  for (OpOperand *value : outputBuffers) {
     effects.emplace_back(MemoryEffects::Read::get(), value,
                          SideEffects::DefaultResource::get());
     effects.emplace_back(MemoryEffects::Write::get(), value,
@@ -1121,8 +1122,8 @@ bool TopkOp::payloadUsesValueFromOperand(OpOperand *opOperand) {
   void OP_NAME::getEffects(                                                   \
       SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>     \
           &effects) {                                                         \
-    SmallVector<Value> inputBuffers = getInputBufferOperands();               \
-    SmallVector<Value> outputBuffers = getOutputBufferOperands();             \
+    OpOperandVector inputBuffers = getInputBufferOperands();                  \
+    OpOperandVector outputBuffers = getOutputBufferOperands();                \
     getEffectsImpl(effects, getOperation()->getResults(), inputBuffers,      \
                    outputBuffers);                                           \
   }
6 changes: 4 additions & 2 deletions lib/Dialect/Torch/IR/TorchOps.cpp
@@ -2810,7 +2810,8 @@ LogicalResult CopyToNonValueTensorOp::inferReturnTypes(
 void CopyToNonValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Allocate::get(), getResult());
+  effects.emplace_back(MemoryEffects::Allocate::get(),
+                       getOperation()->getOpResult(0));
 }

//===----------------------------------------------------------------------===//
@@ -2837,7 +2838,8 @@ LogicalResult CopyToValueTensorOp::inferReturnTypes(
 void CopyToValueTensorOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.emplace_back(MemoryEffects::Read::get(), getOperand());
+  effects.emplace_back(MemoryEffects::Read::get(),
+                       &getOperation()->getOpOperand(0));
 }

//===----------------------------------------------------------------------===//
4 changes: 4 additions & 0 deletions projects/pt1/e2e_testing/main.py
@@ -7,6 +7,10 @@
 import re
 import sys
 
+import torch
+
+torch.device("cpu")
+
 from torch_mlir_e2e_test.framework import run_tests
 from torch_mlir_e2e_test.reporting import report_results
 from torch_mlir_e2e_test.registry import GLOBAL_TEST_REGISTRY
14 changes: 13 additions & 1 deletion projects/pt1/python/torch_mlir_e2e_test/framework.py
@@ -358,6 +358,15 @@ def run_tests(
     if env_concurrency > 0:
         num_processes = min(num_processes, env_concurrency)
 
+    try:
+        env_verbose = os.getenv("TORCH_MLIR_TEST_VERBOSE", "0")
+        if env_verbose is not None:
+            verbose = bool(int(env_verbose))
+    except ValueError as e:
+        raise ValueError(
+            "Bad value for TORCH_MLIR_TEST_VERBOSE env var: " "Expected integer."
+        ) from e
+
     # TODO: We've noticed that on certain 2 core machine parallelizing the tests
     # makes the llvm backend legacy pass manager 20x slower than using a
     # single process. Need to investigate the root cause eventually. This is a
@@ -375,7 +384,10 @@
     # seems to cause a cascade of failures resulting in undecipherable error
     # messages.
     if num_processes == 1 or sequential:
-        return [compile_and_run_test(test, config, verbose) for test in tests]
+        print("Running tests sequentially with progress status")
+        for test in tests:
+            print(f"*** RUNNING TEST: {test.unique_name} ***")
+            compile_and_run_test(test, config, verbose)
 
     # This is needed because autograd does not support crossing process
     # boundaries.
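For readers skimming the hunks above, here is a self-contained sketch of the same env-var parsing pattern; the helper name `verbose_from_env` is illustrative and not part of torch-mlir:

```
# Sketch of the TORCH_MLIR_TEST_VERBOSE handling added above: any integer
# value toggles verbosity via bool(int(...)); non-integers raise ValueError.
import os


def verbose_from_env() -> bool:
    raw = os.getenv("TORCH_MLIR_TEST_VERBOSE", "0")
    try:
        return bool(int(raw))
    except ValueError as e:
        raise ValueError(
            "Bad value for TORCH_MLIR_TEST_VERBOSE env var: Expected integer."
        ) from e


os.environ["TORCH_MLIR_TEST_VERBOSE"] = "1"
assert verbose_from_env()
```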
3 changes: 3 additions & 0 deletions python/torch_mlir/compiler_utils.py
@@ -40,6 +40,9 @@ def run_pipeline_with_repro_report(
     )
     # Lower module in place to make it ready for compiler backends.
     with module.context as ctx:
+        # TODO(#3506): Passes can emit errors but not signal failure,
+        # which causes a native assert.
+        ctx.emit_error_diagnostics = True
         pm = PassManager.parse(pipeline)
         if enable_ir_printing:
             ctx.enable_multithreading(False)
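A brief sketch of the workaround in isolation, assuming the MLIR Python bindings are importable as `torch_mlir.ir` and `torch_mlir.passmanager`; the `canonicalize` pipeline is an arbitrary example:

```
# Sketch: enable error diagnostics before running a pipeline. Per the TODO
# above, a pass may emit an error without signaling failure (#3506); this
# setting routes such diagnostics to handlers instead of dropping them.
from torch_mlir import ir
from torch_mlir.passmanager import PassManager

with ir.Context() as ctx:
    ctx.emit_error_diagnostics = True
    module = ir.Module.parse("module {}")
    pm = PassManager.parse("builtin.module(canonicalize)")
    pm.run(module.operation)
```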
