ray-project · krfricke · Apr 22, 2022 · Apr 7, 2022 · Apr 7, 2022 · Apr 8, 2022
diff --git a/ci/README.md b/ci/README.md
@@ -43,7 +43,7 @@ The following practices can avoid such pitfalls while maintaining intuitive cont
   (The sheer length of the script is a secondary concern and can be mitigated by keeping functions modular.)
 
 - Avoid adding new scripts if possible. If it's necessary that you do so, call them instead of sourcing them.
-  Note that thies implies new scripts should not modify the environment, or the caller will not see such changes!
+  Note that this implies new scripts should not modify the environment, or the caller will not see such changes!
 
 - Always add code inside a function, not at global scope. Use `local` for variables where it makes sense.
   However, be careful and know the shell rules: for example, `local x=$(false)` succeeds even under `set -e`.

diff --git a/ci/ci.sh b/ci/ci.sh
@@ -191,7 +191,8 @@ test_python() {
 
 # For running large Python tests on Linux and MacOS.
 test_large() {
-  bazel test --config=ci "$(./ci/run/bazel_export_options)" --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
+  # shellcheck disable=SC2046
+  bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
       --test_env=CONDA_SHLVL --test_env=CONDA_PREFIX --test_env=CONDA_DEFAULT_ENV --test_env=CONDA_PROMPT_MODIFIER \
       --test_env=CI --test_tag_filters="large_size_python_tests_shard_${BUILDKITE_PARALLEL_JOB}" \
       -- python/ray/tests/...

diff --git a/ci/run/bazel_export_options b/ci/run/bazel_export_options
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
 
 mkdir -p /tmp/bazel_event_logs
-echo "--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)"
+event_json_flag=--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)
+logs_archive_flag=--test_env=RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR=/artifact-mount/.failed_test_logs
+
+echo "${event_json_flag} ${logs_archive_flag}"
@@ -9,6 +9,9 @@
 from ray.data.tests.mock_server import *  # noqa
 from ray.data.datasource.file_based_datasource import BlockWritePathProvider
 
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+
 
 @pytest.fixture(scope="function")
 def aws_credentials():

diff --git a/python/ray/experimental/dag/tests/conftest.py b/python/ray/experimental/dag/tests/conftest.py
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
diff --git a/python/ray/ml/tests/conftest.py b/python/ray/ml/tests/conftest.py
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py
@@ -8,6 +8,7 @@
 import ray
 from ray import serve
 from ray.serve.pipeline.generate import DeploymentNameGenerator
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
 
 # https://tools.ietf.org/html/rfc6335#section-6
 MIN_DYNAMIC_PORT = 49152

diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py
@@ -11,6 +11,9 @@
 import time
 from pathlib import Path
 from unittest import mock
+import shutil
+import platform
+from tempfile import gettempdir
 
 import ray
 import ray.ray_constants as ray_constants
@@ -679,3 +682,40 @@ def set_bad_runtime_env_cache_ttl_seconds(request):
     os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"] = ttl
     yield ttl
     del os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"]
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    """Zip the latest Ray session logs into an archive dir when a test fails.
+
+    Implemented as a pytest hook wrapper: the remaining
+    ``pytest_runtest_makereport`` hooks run at the ``yield`` and produce the
+    report, which is then inspected. When the report is a failure of the
+    "call" phase, ``<tempdir>/ray/session_latest/logs`` is zipped into the
+    directory named by the ``RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`` environment
+    variable.
+
+    Nothing is archived when not running on Linux, when the environment
+    variable is unset or empty, or when the logs directory does not exist.
+
+    Args:
+        item: The test item being reported on (not used here).
+        call: The call info for the current test phase (not used here).
+    """
+    # execute all other hooks to obtain the report object
+    outcome = yield
+    rep = outcome.get_result()
+
+    # We temporarily restrict to Linux until we have artifact dirs
+    # for Windows and Mac
+    if platform.system() != "Linux":
+        return
+
+    # Only archive failed tests after the "call" phase of the test
+    if rep.when != "call" or not rep.failed:
+        return
+
+    # Get dir to write zipped logs to
+    archive_dir = os.environ.get("RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR")
+
+    if not archive_dir:
+        return
+
+    # The archive dir is one fixed path, so another test process may create
+    # it between a separate existence check and the creation call;
+    # exist_ok=True makes creation safe against that race.
+    os.makedirs(archive_dir, exist_ok=True)
+
+    # Get logs dir from the latest ray session
+    tmp_dir = gettempdir()
+    logs_dir = os.path.join(tmp_dir, "ray", "session_latest", "logs")
+
+    if not os.path.exists(logs_dir):
+        return
+
+    # Write zipped logs to logs archive dir.
+    # Path separators in the node id are replaced so that the archive name
+    # is a single file name directly inside archive_dir; the timestamp keeps
+    # repeated failures of the same test from overwriting each other.
+    test_name = rep.nodeid.replace(os.sep, "::")
+    output_file = os.path.join(archive_dir, f"{test_name}_{time.time():.4f}")
+    shutil.make_archive(output_file, "zip", logs_dir)
@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
@@ -23,6 +23,9 @@
 from ray.train.worker_group import WorkerGroup
 from ray.util.placement_group import get_current_placement_group
 
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
+
 
 @pytest.fixture
 def ray_start_2_cpus():

@@ -0,0 +1,2 @@
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
@@ -7,6 +7,9 @@
 
 import tempfile
 from ray.tests.conftest import get_default_fixture_ray_kwargs
+
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
 import os
 import uuid
 from ray.workflow.tests import utils

@@ -1 +1,4 @@
 from ray.tests.conftest import ray_start_regular_shared  # noqa: F401
+
+# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
+from ray.tests.conftest import pytest_runtest_makereport  # noqa
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
		from ray.tests.conftest import pytest_runtest_makereport # noqa