[CI] Create zip of ray session_latest/logs dir on test failure and …

…upload to buildkite via `/artifact-mount` (#23783) Creates a zip of the `session_latest/logs` dir, named with the test name and a timestamp, when a Python test fails. Writes to the dir specified by the env var `RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`. No-op if the env var is not set. A downstream consumer (e.g. CI) can upload all created artifacts in this dir. This lets PR submitters debug their CI failures more easily, especially if they can't repro locally. Limitations: - A conftest.py file importing the hook from the main ray conftest.py needs to be present in the same dir as the test. This presents a challenge for e.g. dashboard tests, which are highly scattered.
ray-project · Apr 22, 2022 · e6a458a · e6a458a
1 parent 1807cff
commit e6a458a
Show file tree

Hide file tree

Showing 13 changed files with 69 additions and 3 deletions.
diff --git a/ci/README.md b/ci/README.md
@@ -43,7 +43,7 @@ The following practices can avoid such pitfalls while maintaining intuitive cont
   (The sheer length of the script is a secondary concern and can be mitigated by keeping functions modular.)
 
 - Avoid adding new scripts if possible. If it's necessary that you do so, call them instead of sourcing them.
-  Note that thies implies new scripts should not modify the environment, or the caller will not see such changes!
+  Note that this implies new scripts should not modify the environment, or the caller will not see such changes!
 
 - Always add code inside a function, not at global scope. Use `local` for variables where it makes sense.
   However, be careful and know the shell rules: for example, e.g. `local x=$(false)` succeeds even under `set -e`.

diff --git a/ci/ci.sh b/ci/ci.sh
@@ -191,7 +191,8 @@ test_python() {
 
 # For running large Python tests on Linux and MacOS.
 test_large() {
-  bazel test --config=ci "$(./ci/run/bazel_export_options)" --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
+  # shellcheck disable=SC2046
+  bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
       --test_env=CONDA_SHLVL --test_env=CONDA_PREFIX --test_env=CONDA_DEFAULT_ENV --test_env=CONDA_PROMPT_MODIFIER \
       --test_env=CI --test_tag_filters="large_size_python_tests_shard_${BUILDKITE_PARALLEL_JOB}" \
       -- python/ray/tests/...

diff --git a/ci/run/bazel_export_options b/ci/run/bazel_export_options
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
 
 mkdir -p /tmp/bazel_event_logs
-echo "--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)"
+event_json_flag=--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)
+logs_archive_flag=--test_env=RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR=/artifact-mount/.failed_test_logs
+
+echo "${event_json_flag} ${logs_archive_flag}"
diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py
@@ -9,6 +9,9 @@
 from ray.data.tests.mock_server import *  # noqa
 from ray.data.datasource.file_based_datasource import BlockWritePathProvider
 
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+
 
 @pytest.fixture(scope="function")
 def aws_credentials():

diff --git a/python/ray/experimental/dag/tests/conftest.py b/python/ray/experimental/dag/tests/conftest.py
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
diff --git a/python/ray/ml/tests/conftest.py b/python/ray/ml/tests/conftest.py
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py
@@ -8,6 +8,8 @@
 import ray
 from ray import serve
 
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+
 # https://tools.ietf.org/html/rfc6335#section-6
 MIN_DYNAMIC_PORT = 49152
 MAX_DYNAMIC_PORT = 65535

diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py
@@ -11,6 +11,9 @@
 import time
 from pathlib import Path
 from unittest import mock
+import shutil
+import platform
+from tempfile import gettempdir
 
 import ray
 import ray.ray_constants as ray_constants
@@ -679,3 +682,40 @@ def set_bad_runtime_env_cache_ttl_seconds(request):
     os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"] = ttl
     yield ttl
     del os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"]
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    # Hookwrapper: yield so the remaining hook implementations run, then read their report
+    outcome = yield
+    rep = outcome.get_result()
+
+    # Archiving is temporarily restricted to Linux until artifact dirs exist
+    # for Windows and Mac
+    if platform.system() != "Linux":
+        return
+
+    # Archive only failures reported for the "call" phase; other phases are skipped
+    if rep.when != "call" or not rep.failed:
+        return
+
+    # Destination dir is read from the environment; no-op when it is unset or empty
+    archive_dir = os.environ.get("RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR")
+
+    if not archive_dir:
+        return
+
+    if not os.path.exists(archive_dir):
+        os.makedirs(archive_dir)
+
+    # Logs dir of the latest ray session, looked up under the system temp dir
+    tmp_dir = gettempdir()
+    logs_dir = os.path.join(tmp_dir, "ray", "session_latest", "logs")
+
+    if not os.path.exists(logs_dir):
+        return
+
+    # Zip logs_dir into archive_dir as "<nodeid with os.sep -> '::'>_<time.time()>" + ".zip"
+    test_name = rep.nodeid.replace(os.sep, "::")
+    output_file = os.path.join(archive_dir, f"{test_name}_{time.time():.4f}")
+    shutil.make_archive(output_file, "zip", logs_dir)
diff --git a/python/ray/train/tests/conftest.py b/python/ray/train/tests/conftest.py
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
diff --git a/python/ray/train/tests/test_backend.py b/python/ray/train/tests/test_backend.py
@@ -23,6 +23,9 @@
 from ray.train.worker_group import WorkerGroup
 from ray.util.placement_group import get_current_placement_group
 
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+
 
 @pytest.fixture
 def ray_start_2_cpus():

diff --git a/python/ray/tune/tests/conftest.py b/python/ray/tune/tests/conftest.py
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
diff --git a/python/ray/workflow/tests/conftest.py b/python/ray/workflow/tests/conftest.py
@@ -7,6 +7,9 @@
 
 import tempfile
 from ray.tests.conftest import get_default_fixture_ray_kwargs
+
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
 import os
 import uuid
 from ray.workflow.tests import utils

diff --git a/rllib/tests/conftest.py b/rllib/tests/conftest.py
@@ -1 +1,4 @@
 from ray.tests.conftest import ray_start_regular_shared  # noqa: F401
+
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
		from ray.tests.conftest import pytest_runtest_makereport # noqa