Skip to content

Commit 8a9a9a6

Browse files
authored
CaaS - E2E fixes (#2212)
1 parent d4d7095 commit 8a9a9a6

File tree

4 files changed

+47
-19
lines changed

4 files changed

+47
-19
lines changed

.circleci/config.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ jobs:
159159
node_groups:
160160
- name: spot
161161
instance_type: t3.medium
162-
min_instances: 10
163-
max_instances: 10
162+
min_instances: 16
163+
max_instances: 16
164164
spot: true
165165
- name: cpu
166166
instance_type: c5.xlarge

test/e2e/e2e/tests.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,7 @@ def test_load_realtime(
542542
# controls the flow of requests
543543
request_stopper = td.Event()
544544
latencies: List[float] = []
545+
failed = False
545546
try:
546547
printer(f"getting {desired_replicas} replicas ready")
547548
assert apis_ready(
@@ -623,6 +624,7 @@ def test_load_realtime(
623624

624625
except:
625626
# best effort
627+
failed = True
626628
try:
627629
api_info = client.get_api(api_name)
628630
printer(json.dumps(api_info, indent=2))
@@ -632,6 +634,8 @@ def test_load_realtime(
632634
finally:
633635
request_stopper.set()
634636
delete_apis(client, [api_name])
637+
if failed:
638+
time.sleep(30)
635639

636640

637641
def test_load_async(
@@ -665,6 +669,7 @@ def test_load_async(
665669
request_stopper = td.Event()
666670
map_stopper = td.Event()
667671
responses: List[Dict[str, Any]] = []
672+
failed = False
668673
try:
669674
printer(f"getting {desired_replicas} replicas ready")
670675
assert apis_ready(
@@ -738,6 +743,7 @@ def test_load_async(
738743

739744
except:
740745
# best effort
746+
failed = True
741747
try:
742748
api_info = client.get_api(api_name)
743749
printer(json.dumps(api_info, indent=2))
@@ -749,6 +755,8 @@ def test_load_async(
749755
printer(f"{len(results)}/{total_requests} have been successfully retrieved")
750756
map_stopper.set()
751757
delete_apis(client, [api_name])
758+
if failed:
759+
time.sleep(30)
752760

753761

754762
def test_load_batch(
@@ -786,7 +794,7 @@ def test_load_batch(
786794
api_name = api_specs[0]["name"]
787795
client.deploy(api_spec=api_specs[0])
788796
api_endpoint = client.get_api(api_name)["endpoint"]
789-
797+
failed = False
790798
try:
791799
assert endpoint_ready(
792800
client=client, api_name=api_name, timeout=deploy_timeout
@@ -840,6 +848,7 @@ def test_load_batch(
840848

841849
except:
842850
# best effort
851+
failed = True
843852
try:
844853
api_info = client.get_api(api_name)
845854

@@ -853,6 +862,8 @@ def test_load_batch(
853862

854863
finally:
855864
delete_apis(client, [api_name])
865+
if failed:
866+
time.sleep(30)
856867

857868

858869
def test_load_task(
@@ -881,6 +892,7 @@ def test_load_task(
881892

882893
request_stopper = td.Event()
883894
map_stopper = td.Event()
895+
failed = False
884896
try:
885897
assert endpoint_ready(
886898
client=client, api_name=api_name, timeout=deploy_timeout
@@ -902,6 +914,9 @@ def test_load_task(
902914
check_futures_healthy(threads_futures)
903915
wait_on_futures(threads_futures)
904916

917+
# give it a bit of a delay to avoid overloading
918+
time.sleep(1)
919+
905920
printer("waiting on the jobs")
906921
job_ids = [job_spec.json()["job_id"] for job_spec in job_specs]
907922
retrieve_results_concurrently(
@@ -916,6 +931,7 @@ def test_load_task(
916931

917932
except:
918933
# best effort
934+
failed = True
919935
try:
920936
api_info = client.get_api(api_name)
921937

@@ -930,6 +946,8 @@ def test_load_task(
930946
finally:
931947
map_stopper.set()
932948
delete_apis(client, [api_name])
949+
if failed:
950+
time.sleep(30)
933951

934952

935953
def test_long_running_realtime(

test/e2e/e2e/utils.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -374,19 +374,29 @@ def _retriever(request_id: str):
374374
continue
375375

376376
result_response_json = result_response.json()
377-
if (
378-
async_kind
379-
and "status" in result_response_json
380-
and result_response_json["status"] == "completed"
381-
):
382-
break
377+
if async_kind and "status" in result_response_json:
378+
if result_response_json["status"] == "completed":
379+
break
380+
if result_response_json["status"] not in ["in_progress", "in_queue"]:
381+
raise RuntimeError(
382+
f"status for request ID {request_id} got set to {result_response_json['status']}"
383+
)
384+
383385
if (
384386
task_kind
385387
and "job_status" in result_response_json
386388
and "status" in result_response_json["job_status"]
387-
and result_response_json["job_status"]["status"] == "status_succeeded"
388389
):
389-
break
390+
if result_response_json["job_status"]["status"] == "succeeded":
391+
break
392+
if result_response_json["job_status"]["status"] not in [
393+
"pending",
394+
"enqueuing",
395+
"running",
396+
]:
397+
raise RuntimeError(
398+
f"status for job ID {request_id} got set to {result_response_json['job_status']['status']}"
399+
)
390400

391401
if event_stopper.is_set():
392402
return

test/e2e/tests/conftest.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,13 @@ def pytest_configure(config):
8686
"realtime_deploy_timeout": int(
8787
os.environ.get("CORTEX_TEST_REALTIME_DEPLOY_TIMEOUT", 200)
8888
),
89-
"batch_deploy_timeout": int(os.environ.get("CORTEX_TEST_BATCH_DEPLOY_TIMEOUT", 30)),
89+
"batch_deploy_timeout": int(os.environ.get("CORTEX_TEST_BATCH_DEPLOY_TIMEOUT", 150)),
9090
"batch_job_timeout": int(os.environ.get("CORTEX_TEST_BATCH_JOB_TIMEOUT", 200)),
91-
"async_deploy_timeout": int(os.environ.get("CORTEX_TEST_ASYNC_DEPLOY_TIMEOUT", 120)),
91+
"async_deploy_timeout": int(os.environ.get("CORTEX_TEST_ASYNC_DEPLOY_TIMEOUT", 150)),
9292
"async_workload_timeout": int(
9393
os.environ.get("CORTEX_TEST_ASYNC_WORKLOAD_TIMEOUT", 200)
9494
),
95-
"task_deploy_timeout": int(os.environ.get("CORTEX_TEST_TASK_DEPLOY_TIMEOUT", 30)),
95+
"task_deploy_timeout": int(os.environ.get("CORTEX_TEST_TASK_DEPLOY_TIMEOUT", 75)),
9696
"task_job_timeout": int(os.environ.get("CORTEX_TEST_TASK_JOB_TIMEOUT", 200)),
9797
"skip_gpus": config.getoption("--skip-gpus"),
9898
"skip_infs": config.getoption("--skip-infs"),
@@ -104,7 +104,7 @@ def pytest_configure(config):
104104
},
105105
"load_test_config": {
106106
"realtime": {
107-
"total_requests": 10 ** 6,
107+
"total_requests": 10 ** 5,
108108
"desired_replicas": 50,
109109
"concurrency": 50,
110110
"min_rtt": 0.004, # measured in seconds
@@ -115,7 +115,7 @@ def pytest_configure(config):
115115
},
116116
"async": {
117117
"total_requests": 10 ** 3,
118-
"desired_replicas": 50,
118+
"desired_replicas": 20,
119119
"concurrency": 10,
120120
"submit_timeout": 120, # measured in seconds
121121
"workload_timeout": 120, # measured in seconds
@@ -125,13 +125,13 @@ def pytest_configure(config):
125125
"workers_per_job": 10,
126126
"items_per_job": 10 ** 5,
127127
"batch_size": 10 * 2,
128-
"workload_timeout": 210, # measured in seconds
128+
"workload_timeout": 200, # measured in seconds
129129
},
130130
"task": {
131131
"jobs": 10 ** 2,
132132
"concurrency": 4,
133-
"submit_timeout": 240, # measured in seconds
134-
"workload_timeout": 180, # measured in seconds
133+
"submit_timeout": 200, # measured in seconds
134+
"workload_timeout": 400, # measured in seconds
135135
},
136136
},
137137
"long_running_test_config": {

0 commit comments

Comments
 (0)