Skip to content

Commit

Permalink
Merge branch 'master' into unskip_1
Browse files Browse the repository at this point in the history
  • Loading branch information
czgdp1807 committed Jan 20, 2022
2 parents 40d17d8 + b6d3e01 commit 3eb8fde
Show file tree
Hide file tree
Showing 531 changed files with 2,807 additions and 1,372 deletions.
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,7 @@ scripts/nodes.txt
**/MNIST_data/
**/cifar-10-batches-bin/

# Generated documentation files
/doc/_build
/doc/source/_static/thumbs
/doc/source/tune/generated_guides/


# User-specific stuff:
.idea/**/workspace.xml
Expand Down
65 changes: 13 additions & 52 deletions benchmarks/benchmark_tests.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
- name: single_node
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: single_node.yaml
Expand All @@ -13,10 +10,7 @@
script: python single_node/test_single_node.py

- name: object_store
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: object_store.yaml
Expand All @@ -27,10 +21,7 @@
script: python object_store/test_object_store.py

- name: many_actors
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
Expand All @@ -41,10 +32,7 @@
script: python distributed/test_many_actors.py

- name: many_actors_smoke_test
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
Expand All @@ -55,10 +43,7 @@
script: SMOKE_TEST=1 python distributed/test_many_actors.py

- name: many_tasks
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
Expand All @@ -69,10 +54,7 @@
script: python distributed/test_many_tasks.py --num-tasks=10000

- name: many_tasks_smoke_test
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
Expand All @@ -83,10 +65,7 @@
script: python distributed/test_many_tasks.py --num-tasks=100

- name: many_pgs
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
Expand All @@ -97,10 +76,7 @@
script: python distributed/test_many_pgs.py

- name: many_pgs_smoke_test
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed_smoke_test.yaml
Expand All @@ -112,10 +88,7 @@

# NOTE: No smoke test since this shares a script with the many_tasks_smoke_test
- name: many_nodes
owner:
mail: "core@anyscale.com"
slack: "@Alex Wu"

team: core
cluster:
app_config: app_config.yaml
compute_template: many_nodes.yaml
Expand All @@ -126,10 +99,7 @@
script: python distributed/test_many_tasks.py --num-tasks=1000

- name: many_tasks_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
Expand All @@ -146,10 +116,7 @@
stable: false

- name: many_actors_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
Expand All @@ -166,10 +133,7 @@
stable: false

- name: many_nodes_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"

team: core
cluster:
app_config: app_config.yaml
compute_template: many_nodes.yaml
Expand All @@ -186,10 +150,7 @@
stable: false

- name: many_pgs_redis_ha
owner:
mail: "core@anyscale.com"
slack: "@Yi Cheng"

team: core
cluster:
app_config: app_config.yaml
compute_template: distributed.yaml
Expand Down
12 changes: 8 additions & 4 deletions dashboard/modules/node/node_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,11 @@ def process_log_batch(log_batch):

if self._dashboard_head.gcs_log_subscriber:
while True:
log_batch = await \
self._dashboard_head.gcs_log_subscriber.poll()
try:
log_batch = await \
self._dashboard_head.gcs_log_subscriber.poll()
if log_batch is None:
continue
process_log_batch(log_batch)
except Exception:
logger.exception("Error receiving log from GCS.")
Expand Down Expand Up @@ -304,9 +306,11 @@ def process_error(error_data):

if self._dashboard_head.gcs_error_subscriber:
while True:
_, error_data = await \
self._dashboard_head.gcs_error_subscriber.poll()
try:
_, error_data = await \
self._dashboard_head.gcs_error_subscriber.poll()
if error_data is None:
continue
process_error(error_data)
except Exception:
logger.exception("Error receiving error info from GCS.")
Expand Down
16 changes: 15 additions & 1 deletion dashboard/modules/reporter/reporter_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

logger = logging.getLogger(__name__)

enable_gpu_usage_check = True

# Are we in a K8s pod?
IN_KUBERNETES_POD = "KUBERNETES_SERVICE_HOST" in os.environ

Expand Down Expand Up @@ -202,14 +204,26 @@ def _get_cpu_percent():

@staticmethod
def _get_gpu_usage():
if gpustat is None:
global enable_gpu_usage_check
if gpustat is None or not enable_gpu_usage_check:
return []
gpu_utilizations = []
gpus = []
try:
gpus = gpustat.new_query().gpus
except Exception as e:
logger.debug(f"gpustat failed to retrieve GPU information: {e}")

# gpustat calls pynvml.nvmlInit(). On machines without GPUs, this can
# run subprocesses that spew to stderr. With log_to_driver=True, we
# then get log spew from every single raylet. To avoid this, disable
# the GPU usage check once NVML reports that the driver is not loaded
# (NVMLError_DriverNotLoaded).
# https://github.com/ray-project/ray/issues/14305
# https://github.com/ray-project/ray/pull/21686
if type(e).__name__ == "NVMLError_DriverNotLoaded":
enable_gpu_usage_check = False

for gpu in gpus:
# Note the keys in this dict have periods which throws
# off javascript so we change .s to _s
Expand Down
9 changes: 8 additions & 1 deletion doc/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
auto_examples/
# Generated documentation files
_build
source/_static/thumbs

source/ray-core/examples/
source/ray-tune/tutorials/
source/ray-tune/generated_guides/
source/ray-data/examples/
46 changes: 23 additions & 23 deletions doc/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
py_test(
name = "dask_xgboost",
size = "medium",
main = "examples/dask_xgboost/dask_xgboost.py",
srcs = ["examples/dask_xgboost/dask_xgboost.py"],
main = "source/ray-core/_examples/dask_xgboost/dask_xgboost.py",
srcs = ["source/ray-core/_examples/dask_xgboost/dask_xgboost.py"],
tags = ["exclusive", "team:ml", "py37"],
args = ["--smoke-test", "--address ''", "--num-actors 4",
"--cpus-per-actor 1", "--num-actors-inference 4",
Expand All @@ -20,8 +20,8 @@ py_test(
py_test(
name = "modin_xgboost",
size = "medium",
main = "examples/modin_xgboost/modin_xgboost.py",
srcs = ["examples/modin_xgboost/modin_xgboost.py"],
main = "source/ray-core/_examples/modin_xgboost/modin_xgboost.py",
srcs = ["source/ray-core/_examples/modin_xgboost/modin_xgboost.py"],
tags = ["exclusive", "team:ml", "py37"],
args = ["--smoke-test", "--address ''", "--num-actors 4",
"--cpus-per-actor 1", "--num-actors-inference 4",
Expand All @@ -31,77 +31,77 @@ py_test(
py_test(
name = "big_data_ingestion",
size = "small",
srcs = ["source/data/_examples/big_data_ingestion.py"],
srcs = ["source/ray-data/_examples/big_data_ingestion.py"],
tags = ["exclusive", "team:core", "py37"]
)

py_test(
name = "datasets_train",
size = "medium",
srcs = ["examples/datasets_train/datasets_train.py"],
srcs = ["source/ray-core/_examples/datasets_train/datasets_train.py"],
tags = ["exclusive", "team:ml", "py37", "datasets_train"],
args = ["--smoke-test", "--num-workers=2", "--use-gpu"]
)

py_test(
name = "plot_hyperparameter",
size = "small",
srcs = ["examples/plot_hyperparameter.py"],
srcs = ["source/ray-core/_examples/plot_hyperparameter.py"],
tags = ["exclusive", "team:ml"]
)

py_test(
name = "plot_parameter_server",
size = "medium",
srcs = ["examples/plot_parameter_server.py"],
srcs = ["source/ray-core/_examples/plot_parameter_server.py"],
tags = ["exclusive", "team:ml"]
)

py_test(
name = "plot_pong_example",
size = "large",
srcs = ["examples/plot_pong_example.py"],
srcs = ["source/ray-core/_examples/plot_pong_example.py"],
tags = ["exclusive", "team:ml"]
)

py_test(
name = "progress_bar",
size = "small",
srcs = ["examples/progress_bar.py"],
srcs = ["source/ray-core/_examples/progress_bar.py"],
tags = ["exclusive", "team:ml"]
)


# Directory: examples/doc_code
# Directory: source/ray-core/_examples/doc_code
py_test(
name = "doc_code_tf_example",
size = "small",
main = "examples/doc_code/tf_example.py",
srcs = ["examples/doc_code/tf_example.py"],
main = "source/ray-core/_examples/doc_code/tf_example.py",
srcs = ["source/ray-core/_examples/doc_code/tf_example.py"],
tags = ["exclusive", "tf", "team:ml"]
)

py_test(
name = "doc_code_torch_example",
size = "small",
main = "examples/doc_code/torch_example.py",
srcs = ["examples/doc_code/torch_example.py"],
main = "source/ray-core/_examples/doc_code/torch_example.py",
srcs = ["source/ray-core/_examples/doc_code/torch_example.py"],
tags = ["exclusive", "pytorch", "team:ml"]
)

py_test(
name = "doc_code_metrics_example",
size = "small",
main = "examples/doc_code/metrics_example.py",
srcs = ["examples/doc_code/metrics_example.py"],
main = "source/ray-core/_examples/doc_code/metrics_example.py",
srcs = ["source/ray-core/_examples/doc_code/metrics_example.py"],
tags = ["exclusive", "team:serve"]
)

py_test(
name = "doc_code_runtime_env_example",
size = "small",
main = "examples/doc_code/runtime_env_example.py",
srcs = ["examples/doc_code/runtime_env_example.py"],
main = "source/ray-core/_examples/doc_code/runtime_env_example.py",
srcs = ["source/ray-core/_examples/doc_code/runtime_env_example.py"],
tags = ["exclusive", "post_wheel_build", "team:serve"]
)

Expand All @@ -113,17 +113,17 @@ py_test(
py_test(
name = "tune_sklearn",
size = "medium",
main = "source/tune/_tutorials/tune-sklearn.py",
srcs = ["source/tune/_tutorials/tune-sklearn.py"],
main = "source/ray-tune/_tutorials/tune-sklearn.py",
srcs = ["source/ray-tune/_tutorials/tune-sklearn.py"],
tags = ["exclusive", "example", "team:ml"],
args = ["--smoke-test"]
)

py_test(
name = "tune_serve_integration_mnist",
size = "medium",
main = "source/tune/_tutorials/tune-serve-integration-mnist.py",
srcs = ["source/tune/_tutorials/tune-serve-integration-mnist.py"],
main = "source/ray-tune/_tutorials/tune-serve-integration-mnist.py",
srcs = ["source/ray-tune/_tutorials/tune-serve-integration-mnist.py"],
tags = ["exclusive", "example", "team:ml"],
args = ["--smoke-test", "--from_scratch", "--day 0"]
)
Loading

0 comments on commit 3eb8fde

Please sign in to comment.