Skip to content

Commit

Permalink
[CI] Create zip of ray session_latest/logs dir on test failure and …
Browse files Browse the repository at this point in the history
…upload to buildkite via `/artifact-mount` (#23783)

Creates a zip of the `session_latest/logs` dir, named with the test name and a timestamp, upon Python test failure. Writes to the dir specified by env var `RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`. No-op if the env var is not set.

Downstream consumer (e.g. CI) can upload all created artifacts in this dir. Thereby, PR submitters can more easily debug their CI failures, especially if they can't repro locally.

Limitations:
- a conftest.py file importing the main ray conftest.py needs to be present in same dir as test. This presents a challenge for e.g. dashboard tests which are highly scattered
  • Loading branch information
jon-chuang authored Apr 22, 2022
1 parent 1807cff commit e6a458a
Show file tree
Hide file tree
Showing 13 changed files with 69 additions and 3 deletions.
2 changes: 1 addition & 1 deletion ci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ The following practices can avoid such pitfalls while maintaining intuitive cont
(The sheer length of the script is a secondary concern and can be mitigated by keeping functions modular.)

- Avoid adding new scripts if possible. If it's necessary that you do so, call them instead of sourcing them.
Note that thies implies new scripts should not modify the environment, or the caller will not see such changes!
Note that this implies new scripts should not modify the environment, or the caller will not see such changes!

- Always add code inside a function, not at global scope. Use `local` for variables where it makes sense.
However, be careful and know the shell rules: for example, e.g. `local x=$(false)` succeeds even under `set -e`.
Expand Down
3 changes: 2 additions & 1 deletion ci/ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,8 @@ test_python() {

# For running large Python tests on Linux and MacOS.
test_large() {
bazel test --config=ci "$(./ci/run/bazel_export_options)" --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
# shellcheck disable=SC2046
bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=CONDA_EXE --test_env=CONDA_PYTHON_EXE \
--test_env=CONDA_SHLVL --test_env=CONDA_PREFIX --test_env=CONDA_DEFAULT_ENV --test_env=CONDA_PROMPT_MODIFIER \
--test_env=CI --test_tag_filters="large_size_python_tests_shard_${BUILDKITE_PARALLEL_JOB}" \
-- python/ray/tests/...
Expand Down
5 changes: 4 additions & 1 deletion ci/run/bazel_export_options
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/env bash

mkdir -p /tmp/bazel_event_logs
echo "--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)"
event_json_flag=--build_event_json_file=$(mktemp /tmp/bazel_event_logs/bazel_log.XXXXX)
logs_archive_flag=--test_env=RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR=/artifact-mount/.failed_test_logs

echo "${event_json_flag} ${logs_archive_flag}"
3 changes: 3 additions & 0 deletions python/ray/data/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from ray.data.tests.mock_server import * # noqa
from ray.data.datasource.file_based_datasource import BlockWritePathProvider

# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa


@pytest.fixture(scope="function")
def aws_credentials():
Expand Down
2 changes: 2 additions & 0 deletions python/ray/experimental/dag/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
2 changes: 2 additions & 0 deletions python/ray/ml/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
2 changes: 2 additions & 0 deletions python/ray/serve/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import ray
from ray import serve

from ray.tests.conftest import pytest_runtest_makereport # noqa

# https://tools.ietf.org/html/rfc6335#section-6
MIN_DYNAMIC_PORT = 49152
MAX_DYNAMIC_PORT = 65535
Expand Down
40 changes: 40 additions & 0 deletions python/ray/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import time
from pathlib import Path
from unittest import mock
import shutil
import platform
from tempfile import gettempdir

import ray
import ray.ray_constants as ray_constants
Expand Down Expand Up @@ -679,3 +682,40 @@ def set_bad_runtime_env_cache_ttl_seconds(request):
os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"] = ttl
yield ttl
del os.environ["BAD_RUNTIME_ENV_CACHE_TTL_SECONDS"]


@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Archive the latest Ray session logs when a test fails.

    Hook wrapper around pytest's report creation. After the other hooks
    have produced the report, the ``session_latest/logs`` directory under
    the system temp dir is zipped into the directory named by the
    ``RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR`` environment variable.

    Nothing is written when any of the following holds:
      * the platform is not Linux,
      * the report is not for a failed "call" phase,
      * the environment variable is unset or empty,
      * the session logs directory does not exist.
    """
    # execute all other hooks to obtain the report object
    outcome = yield
    rep = outcome.get_result()

    # We temporarily restrict to Linux until we have artifact dirs
    # for Windows and Mac
    if platform.system() != "Linux":
        return

    # Only archive failed tests after the "call" phase of the test
    if rep.when != "call" or not rep.failed:
        return

    # Get dir to write zipped logs to
    archive_dir = os.environ.get("RAY_TEST_FAILURE_LOGS_ARCHIVE_DIR")

    if not archive_dir:
        return

    # exist_ok avoids a check-then-create race when several test processes
    # try to create the same archive dir at once.
    os.makedirs(archive_dir, exist_ok=True)

    # Get logs dir from the latest ray session
    tmp_dir = gettempdir()
    logs_dir = os.path.join(tmp_dir, "ray", "session_latest", "logs")

    if not os.path.exists(logs_dir):
        return

    # Write zipped logs to logs archive dir. Path separators in the node id
    # are replaced so the archive name is a single file name component.
    test_name = rep.nodeid.replace(os.sep, "::")
    output_file = os.path.join(archive_dir, f"{test_name}_{time.time():.4f}")
    shutil.make_archive(output_file, "zip", logs_dir)
2 changes: 2 additions & 0 deletions python/ray/train/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
3 changes: 3 additions & 0 deletions python/ray/train/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from ray.train.worker_group import WorkerGroup
from ray.util.placement_group import get_current_placement_group

# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa


@pytest.fixture
def ray_start_2_cpus():
Expand Down
2 changes: 2 additions & 0 deletions python/ray/tune/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
3 changes: 3 additions & 0 deletions python/ray/workflow/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@

import tempfile
from ray.tests.conftest import get_default_fixture_ray_kwargs

# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa
import os
import uuid
from ray.workflow.tests import utils
Expand Down
3 changes: 3 additions & 0 deletions rllib/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
from ray.tests.conftest import ray_start_regular_shared # noqa: F401

# Trigger pytest hook to automatically zip test cluster logs to archive dir on failure
from ray.tests.conftest import pytest_runtest_makereport # noqa

0 comments on commit e6a458a

Please sign in to comment.