[lit] add --max-retries-per-test execution option (llvm#141851)

kwk · web-flow · commit 84fd907aa7de · 2025-05-31T03:46:12.000+02:00
When packaging LLVM we've seen arbitrary tests fail. It happened sporadically and most of the times the test do work if they are run a second time on the next day. The tests themselves were always different and we didn't know ahead of time which ones we wanted to re-run. That's we filter-out a lot of `libomp` and `libarcher` tests [1]. This change allows us to set `LIT_OPTS="--max-retries-per-test=12"` when running any "check-XXX" build target. Then any lit test will at most be re-run 12 times, unless there's an `ALLOW_RETRIES:` in one of the test scripts that's specifying a different value than `12`. `12` is just an example here, any positive integer will work. Please note, that this only adds the possibility to re-run lit tests. It does not actually do it until the caller specifies `--max-retries-per-test=<POSITIVE_INT>` either on a call to `lit` or in `LIT_OPTS`. Also note, that one can still use `ALLOW_RETRIES:` in test scripts and it will always rule over `--max-retries-per-test`. When `--max-retries-per-test` is set too low, but the `config.test_retry_attempts` is high enough, it works as well. Any option in the list below overrules its predecessor: * `--max-retries-per-test` * `config.test_retry_attempts` * `ALLOW_RETRIES` keyword From the above options to re-run tests, `--max-retries-per-test` is the only one that doesn't require a change in the test scripts or the test config. [1]: https://src.fedoraproject.org/rpms/llvm/blob/rawhide/f/llvm.spec#_2326 Downstream PR to make use of the `--max-retries-per-test` option: https://src.fedoraproject.org/rpms/llvm/pull-request/434 Downstream ticket: https://issues.redhat.com/browse/LLVM-145
diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst
@@ -218,6 +218,19 @@ EXECUTION OPTIONS
 
  Stop execution after the given number of failures.
 
+.. option:: --max-retries-per-test N
+
+ Retry running failed tests at most ``N`` times.
+ Out of the following options to rerun failed tests the
+ :option:`--max-retries-per-test` is the only one that doesn't
+ require a change in the test scripts or the test config:
+
+  * :option:`--max-retries-per-test` lit option
+  * ``config.test_retry_attempts`` test suite option
+  * ``ALLOW_RETRIES:`` annotation in test script
+
+ Any option in the list above overrules its predecessor.
+
 .. option:: --allow-empty-runs
 
  Do not fail the run if all tests are filtered out.
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
@@ -35,6 +35,7 @@ def __init__(
         params,
         config_prefix=None,
         maxIndividualTestTime=0,
+        maxRetriesPerTest=None,
         parallelism_groups={},
         per_test_coverage=False,
         gtest_sharding=True,
@@ -86,6 +87,7 @@ def __init__(
             self.valgrindArgs.extend(self.valgrindUserArgs)
 
         self.maxIndividualTestTime = maxIndividualTestTime
+        self.maxRetriesPerTest = maxRetriesPerTest
         self.parallelism_groups = parallelism_groups
         self.per_test_coverage = per_test_coverage
         self.gtest_sharding = bool(gtest_sharding)
diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py
@@ -235,6 +235,11 @@ def finish(self, litConfig):
             # files. Should we distinguish them?
             self.test_source_root = str(self.test_source_root)
         self.excludes = set(self.excludes)
+        if (
+            litConfig.maxRetriesPerTest is not None
+            and getattr(self, "test_retry_attempts", None) is None
+        ):
+            self.test_retry_attempts = litConfig.maxRetriesPerTest
 
     @property
     def root(self):
diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py
@@ -199,6 +199,15 @@ def parse_args():
         "0 means no time limit. [Default: 0]",
         type=_non_negative_int,
     )
+    execution_group.add_argument(
+        "--max-retries-per-test",
+        dest="maxRetriesPerTest",
+        metavar="N",
+        help="Maximum number of allowed retry attempts per test "
+        "(NOTE: The config.test_retry_attempts test suite option and "
+        "ALLOWED_RETRIES keyword always take precedence)",
+        type=_positive_int,
+    )
     execution_group.add_argument(
         "--max-failures",
         help="Stop execution after the given number of failures.",
diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py
@@ -42,6 +42,7 @@ def main(builtin_params={}):
         config_prefix=opts.configPrefix,
         per_test_coverage=opts.per_test_coverage,
         gtest_sharding=opts.gtest_sharding,
+        maxRetriesPerTest=opts.maxRetriesPerTest,
     )
 
     discovered_tests = lit.discovery.find_tests_for_inputs(
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-no-test_retry_attempts/lit.cfg b/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-no-test_retry_attempts/lit.cfg
@@ -0,0 +1,10 @@
+import lit.formats
+
+config.name = "allow-retries-no-test_retry_attempts"
+config.suffixes = [".py"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+
+config.substitutions.append(("%python", lit_config.params.get("python", "")))
+config.substitutions.append(("%counter", lit_config.params.get("counter", "")))
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-no-test_retry_attempts/test.py b/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-no-test_retry_attempts/test.py
@@ -0,0 +1,23 @@
+# ALLOW_RETRIES: 3
+# RUN: "%python" "%s" "%counter"
+
+import sys
+import os
+
+counter_file = sys.argv[1]
+
+# The first time the test is run, initialize the counter to 1.
+if not os.path.exists(counter_file):
+    with open(counter_file, "w") as counter:
+        counter.write("1")
+
+# Succeed if this is the fourth time we're being run.
+with open(counter_file, "r") as counter:
+    num = int(counter.read())
+    if num == 4:
+        sys.exit(0)
+
+# Otherwise, increment the counter and fail
+with open(counter_file, "w") as counter:
+    counter.write(str(num + 1))
+    sys.exit(1)
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-test_retry_attempts/lit.cfg b/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-test_retry_attempts/lit.cfg
@@ -0,0 +1,12 @@
+import lit.formats
+
+config.name = "allow-retries-test_retry_attempts"
+config.suffixes = [".py"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+
+config.substitutions.append(("%python", lit_config.params.get("python", "")))
+config.substitutions.append(("%counter", lit_config.params.get("counter", "")))
+
+config.test_retry_attempts = 2
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-test_retry_attempts/test.py b/llvm/utils/lit/tests/Inputs/max-retries-per-test/allow-retries-test_retry_attempts/test.py
@@ -0,0 +1,23 @@
+# ALLOW_RETRIES: 3
+# RUN: "%python" "%s" "%counter"
+
+import sys
+import os
+
+counter_file = sys.argv[1]
+
+# The first time the test is run, initialize the counter to 1.
+if not os.path.exists(counter_file):
+    with open(counter_file, "w") as counter:
+        counter.write("1")
+
+# Succeed if this is the fourth time we're being run.
+with open(counter_file, "r") as counter:
+    num = int(counter.read())
+    if num == 4:
+        sys.exit(0)
+
+# Otherwise, increment the counter and fail
+with open(counter_file, "w") as counter:
+    counter.write(str(num + 1))
+    sys.exit(1)
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-no-test_retry_attempts/lit.cfg b/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-no-test_retry_attempts/lit.cfg
@@ -0,0 +1,10 @@
+import lit.formats
+
+config.name = "no-allow-retries-no-test_retry_attempts"
+config.suffixes = [".py"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+
+config.substitutions.append(("%python", lit_config.params.get("python", "")))
+config.substitutions.append(("%counter", lit_config.params.get("counter", "")))
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-no-test_retry_attempts/test.py b/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-no-test_retry_attempts/test.py
@@ -0,0 +1,22 @@
+# RUN: "%python" "%s" "%counter"
+
+import sys
+import os
+
+counter_file = sys.argv[1]
+
+# The first time the test is run, initialize the counter to 1.
+if not os.path.exists(counter_file):
+    with open(counter_file, "w") as counter:
+        counter.write("1")
+
+# Succeed if this is the fourth time we're being run.
+with open(counter_file, "r") as counter:
+    num = int(counter.read())
+    if num == 4:
+        sys.exit(0)
+
+# Otherwise, increment the counter and fail
+with open(counter_file, "w") as counter:
+    counter.write(str(num + 1))
+    sys.exit(1)
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-test_retry_attempts/lit.cfg b/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-test_retry_attempts/lit.cfg
@@ -0,0 +1,12 @@
+import lit.formats
+
+config.name = "no-allow-retries-test_retry_attempts"
+config.suffixes = [".py"]
+config.test_format = lit.formats.ShTest()
+config.test_source_root = None
+config.test_exec_root = None
+
+config.substitutions.append(("%python", lit_config.params.get("python", "")))
+config.substitutions.append(("%counter", lit_config.params.get("counter", "")))
+
+config.test_retry_attempts = 3
diff --git a/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-test_retry_attempts/test.py b/llvm/utils/lit/tests/Inputs/max-retries-per-test/no-allow-retries-test_retry_attempts/test.py
@@ -0,0 +1,22 @@
+# RUN: "%python" "%s" "%counter"
+
+import sys
+import os
+
+counter_file = sys.argv[1]
+
+# The first time the test is run, initialize the counter to 1.
+if not os.path.exists(counter_file):
+    with open(counter_file, "w") as counter:
+        counter.write("1")
+
+# Succeed if this is the fourth time we're being run.
+with open(counter_file, "r") as counter:
+    num = int(counter.read())
+    if num == 4:
+        sys.exit(0)
+
+# Otherwise, increment the counter and fail
+with open(counter_file, "w") as counter:
+    counter.write(str(num + 1))
+    sys.exit(1)
diff --git a/llvm/utils/lit/tests/allow-retries.py b/llvm/utils/lit/tests/allow-retries.py
@@ -70,3 +70,51 @@
 #     CHECK-TEST7: # executed command: export LLVM_PROFILE_FILE=
 # CHECK-TEST7-NOT: # executed command: export LLVM_PROFILE_FILE=
 #     CHECK-TEST7: Passed With Retry: 1
+
+# This test only passes on the 4th try. Here we check that a test can be re-run when:
+#  * The "--max-retries-per-test" is specified high enough (3).
+#  * No ALLOW_RETRIES keyword is used in the test script.
+#  * No config.test_retry_attempts is adjusted in the test suite config file.
+# RUN: rm -f %t.counter
+# RUN: %{lit} %{inputs}/max-retries-per-test/no-allow-retries-no-test_retry_attempts/test.py \
+# RUN:   --max-retries-per-test=3 \
+# RUN:   -Dcounter=%t.counter \
+# RUN:   -Dpython=%{python} \
+# RUN: | FileCheck --check-prefix=CHECK-TEST8 %s
+# CHECK-TEST8: Passed With Retry: 1
+
+# This test only passes on the 4th try. Here we check that a test can be re-run when:
+#  * The "--max-retries-per-test" is specified too low (2).
+#  * ALLOW_RETRIES is specified high enough (3)
+#  * No config.test_retry_attempts is adjusted in the test suite config file.
+# RUN: rm -f %t.counter
+# RUN: %{lit} %{inputs}/max-retries-per-test/allow-retries-no-test_retry_attempts/test.py \
+# RUN:   --max-retries-per-test=2 \
+# RUN:   -Dcounter=%t.counter \
+# RUN:   -Dpython=%{python} \
+# RUN: | FileCheck --check-prefix=CHECK-TEST9 %s
+# CHECK-TEST9: Passed With Retry: 1
+
+# This test only passes on the 4th try. Here we check that a test can be re-run when:
+#  * The "--max-retries-per-test" is specified too low (2).
+#  * No ALLOW_RETRIES keyword is used in the test script.
+#  * config.test_retry_attempts is set high enough (3).
+# RUN: rm -f %t.counter
+# RUN: %{lit} %{inputs}/max-retries-per-test/no-allow-retries-test_retry_attempts/test.py \
+# RUN:   --max-retries-per-test=2 \
+# RUN:   -Dcounter=%t.counter \
+# RUN:   -Dpython=%{python} \
+# RUN: | FileCheck --check-prefix=CHECK-TEST10 %s
+# CHECK-TEST10: Passed With Retry: 1
+
+# This test only passes on the 4th try. Here we check that a test can be re-run when:
+#  * The "--max-retries-per-test" is specified too low (1).
+#  * ALLOW_RETRIES keyword set high enough (3).
+#  * config.test_retry_attempts is set too low enough (2).
+# RUN: rm -f %t.counter
+# RUN: %{lit} %{inputs}/max-retries-per-test/no-allow-retries-test_retry_attempts/test.py \
+# RUN:   --max-retries-per-test=1 \
+# RUN:   -Dcounter=%t.counter \
+# RUN:   -Dpython=%{python} \
+# RUN: | FileCheck --check-prefix=CHECK-TEST11 %s
+# CHECK-TEST11: Passed With Retry: 1

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@ def main(builtin_params={}):`
`42`	`42`	`config_prefix=opts.configPrefix,`
`43`	`43`	`per_test_coverage=opts.per_test_coverage,`
`44`	`44`	`gtest_sharding=opts.gtest_sharding,`
	`45`	`+ maxRetriesPerTest=opts.maxRetriesPerTest,`
`45`	`46`	`)`
`46`	`47`
`47`	`48`	`discovered_tests = lit.discovery.find_tests_for_inputs(`