Skip to content

Commit

Permalink
[CI] [cluster launcher] Fix switched "latest" and "nightly" gcp tests (#40453)
Browse files Browse the repository at this point in the history

The GCP "latest" and "nightly" tests had their names switched. This PR unswitches them.

The PR also adds more debug logging to help diagnose issue #40241.

---------

Signed-off-by: Archit Kulkarni <architkulkarni@users.noreply.github.com>
  • Loading branch information
architkulkarni authored Oct 20, 2023
1 parent cde6e88 commit e615eaf
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 7 deletions.
12 changes: 8 additions & 4 deletions python/ray/autoscaler/_private/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import subprocess
import time
import traceback
from threading import Thread

import click
Expand Down Expand Up @@ -164,16 +165,19 @@ def run(self):

cli_logger.error("!!!")
if hasattr(e, "cmd"):
stderr_output = getattr(e, "stderr", "No stderr available")
cli_logger.error(
"Setup command `{}` failed with exit code {}. stderr:",
"Setup command `{}` failed with exit code {}. stderr: {}",
cf.bold(e.cmd),
e.returncode,
stderr_output,
)
else:
cli_logger.verbose_error("{}", str(vars(e)))
cli_logger.verbose_error("Exception details: {}", str(vars(e)))
full_traceback = traceback.format_exc()
cli_logger.error("Full traceback: {}", full_traceback)
# todo: handle this better somehow?
cli_logger.error("{}", str(e))
# todo: print stderr here
cli_logger.error("Error message: {}", str(e))
cli_logger.error("!!!")
cli_logger.newline()

Expand Down
14 changes: 13 additions & 1 deletion python/ray/autoscaler/launch_and_verify_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import sys
import tempfile
import time
import traceback
from pathlib import Path

import boto3
Expand Down Expand Up @@ -187,7 +188,15 @@ def cleanup_cluster(cluster_config):
return
except subprocess.CalledProcessError as e:
print(f"ray down fails[{i+1}/{num_tries}]: ")
print(e.output)
print(e.output.decode("utf-8"))

# Print full traceback
traceback.print_exc()

# Print stdout and stderr from ray down
print(f"stdout:\n{e.stdout.decode('utf-8')}")
print(f"stderr:\n{e.stderr.decode('utf-8')}")

last_error = e

raise last_error
Expand Down Expand Up @@ -220,6 +229,9 @@ def run_ray_commands(cluster_config, retries, no_config_cache, num_expected_node
subprocess.run(cmd, check=True, capture_output=True)
except subprocess.CalledProcessError as e:
print(e.output)
# print stdout and stderr
print(f"stdout:\n{e.stdout.decode('utf-8')}")
print(f"stderr:\n{e.stderr.decode('utf-8')}")
raise e

print("======================================")
Expand Down
4 changes: 2 additions & 2 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6568,7 +6568,7 @@

run:
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest

- name: gcp_cluster_launcher_nightly_image
group: cluster-launcher-test
Expand All @@ -6586,7 +6586,7 @@

run:
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override latest
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override nightly


- name: gcp_cluster_launcher_release_image
Expand Down

0 comments on commit e615eaf

Please sign in to comment.