Skip to content

Commit

Permalink
Add serve stress test (#7076)
Browse files Browse the repository at this point in the history
  • Loading branch information
simon-mo authored Feb 10, 2020
1 parent 247a4d0 commit f6c09ff
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 3 deletions.
27 changes: 24 additions & 3 deletions ci/long_running_tests/ray-project/project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,27 @@ commands:
help: "Start a long running stress test."
command: |
# Install nightly Ray wheels.
source activate tensorflow_p36 && pip install -U {{wheel}}
source activate tensorflow_p36 && pip install ray[dashboard,debug,rllib,tune] gym[atari]
source activate tensorflow_p36 && pip install -q -U {{wheel}} Click
source activate tensorflow_p36 && pip install -q ray[all] gym[atari]
source activate tensorflow_p36 && python workloads/{{workload}}.py
params:
- name: wheel
help: "URL to the ray wheel to test (defaults to latest)."
default: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
- name: workload
help: "Name of the workload to run."
choices: ["actor_deaths", "apex", "impala", "many_actor_tasks", "many_drivers", "many_tasks", "node_failures", "pbt"]
choices:
[
"actor_deaths",
"apex",
"impala",
"many_actor_tasks",
"many_drivers",
"many_tasks",
"node_failures",
"pbt",
"serve",
]
config:
tmux: true

Expand All @@ -29,3 +40,13 @@ commands:
- name: show-output
command: tmux capture-pane -p
help: "Show tail of the workload output."

# Pathnames for files and directories that should be saved
# in a snapshot but that should not be synced with a
# session. Pathnames can be relative to the project
# directory or absolute. Generally, this should be files
# that were created by an active session, such as
# application checkpoints and logs.
output_files: []
# For example, uncomment this to save the logs from the
# last ray job.
# "/tmp/ray/session_latest",
71 changes: 71 additions & 0 deletions ci/long_running_tests/workloads/serve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import time
import subprocess
from subprocess import PIPE

import requests

import ray
from ray.experimental import serve
from ray.cluster_utils import Cluster
from ray.experimental.serve.kv_store_service import RayInternalKVStore

# Sizing parameters for the local multi-node cluster built below.
num_redis_shards = 1
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 5

cluster = Cluster()
for node_index in range(num_nodes):
    # Only the first node is given an explicit Redis port and shard count;
    # every other node passes None for both.
    is_first_node = node_index == 0
    cluster.add_node(
        redis_port=6379 if is_first_node else None,
        num_redis_shards=num_redis_shards if is_first_node else None,
        num_cpus=8,
        num_gpus=0,
        # Each node advertises a custom resource named after its index.
        resources={str(node_index): 2},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory,
        webui_host="0.0.0.0")

print("Downloading load testing tool")
# Fetch the `hey` binary used later to generate load.
# - Removing a stale copy is best-effort (`|| true`), so a missing file is
#   not an error.
# - wget and chmod are chained with `&&` so the shell's exit status reflects
#   a failed download instead of only the final chmod.
# - check_call raises CalledProcessError on a non-zero status, so the
#   workload stops here rather than failing later when the binary is launched.
subprocess.check_call([
    "bash", "-c", "rm hey_linux_amd64 || true;"
    "wget https://storage.googleapis.com/hey-release/hey_linux_amd64 && "
    "chmod +x hey_linux_amd64"
])

# Attach this driver to the cluster created above, with the web UI enabled
# and bound to all interfaces.
ray.init(
    address=cluster.address,
    include_webui=True,
    webui_host='0.0.0.0')

# Start serve, handing it a factory that wraps each requested key in a
# RayInternalKVStore.
serve.init(
    blocking=True,
    kv_store_connector=lambda ns: RayInternalKVStore(ns))


@serve.route('/echo')
@serve.accept_batch
def echo(_):
    """Batched endpoint that the load test hits.

    Sleeps for 10ms, passes the current batch size to ``ray.show_in_webui``,
    and returns one ``'hi <index>'`` string per item in the batch.
    """
    time.sleep(0.01)  # Sleep for 10ms
    batch_size = serve.context.batch_size
    ray.show_in_webui(str(batch_size), key="Current batch size")
    replies = []
    for position in range(batch_size):
        replies.append('hi {}'.format(position))
    return replies


print("Scaling to 30 replicas")
# Read the current config of the echo backend, raise its replica count and
# batch-size cap, then write it back.
backend_tag = "echo:v0"
config = serve.get_backend_config(backend_tag)
config.num_replicas = 30
config.max_batch_size = 16
serve.set_backend_config(backend_tag, config)

print("Warming up")
# Issue a handful of spaced-out requests and print each JSON reply before the
# real load starts.
warmup_rounds = 5
for _ in range(warmup_rounds):
    resp = requests.get("http://127.0.0.1:8000/echo").json()
    print(resp)
    time.sleep(0.5)

# Drive the endpoint at 75% of the total batch capacity
# (replicas * max batch size).
connections = int(config.num_replicas * config.max_batch_size * 0.75)
# hey flags: -c is the number of concurrent workers, -z the run duration.
proc = subprocess.Popen(
    [
        "./hey_linux_amd64", "-c",
        str(connections), "-z", "360m", "http://127.0.0.1:8000/echo"
    ],
    stdout=PIPE,
    stderr=PIPE)
print("started load testing")
# communicate() reads both pipes until EOF and then waits for the process to
# exit. Calling wait() first with stdout/stderr set to PIPE can deadlock once
# the child fills an OS pipe buffer, so it is deliberately not used here.
out, err = proc.communicate()
print(out.decode())
print(err.decode())

0 comments on commit f6c09ff

Please sign in to comment.