Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the busy/idle execution state tracking for kernels. #1429

Merged
merged 22 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
67255a9
Fix the activity and execution-state tracking for kernels.
ojarjur Nov 17, 2023
30b45db
Improve kernel status tracking by matching idle status messages again…
ojarjur Nov 17, 2023
d2a952d
Merge branch 'main' of https://github.com/jupyter-server/jupyter_serv…
ojarjur Jun 4, 2024
4ce5539
Test fixes
ojarjur Jun 5, 2024
fadf5e8
Reduce the diff against the current code
ojarjur Jun 5, 2024
bdb9814
Add a test for execute_state
ojarjur Jun 5, 2024
f37557c
Fix lint warnings
ojarjur Jun 5, 2024
f1eb5a6
Fix race conditions and deadlocks in the test_execution_state tests
ojarjur Jun 6, 2024
c870f7e
Remove sleep that caused test failures on some jobs
ojarjur Jun 6, 2024
50c5f9c
Revert unexpected behavior change that affected tests on Windows
ojarjur Jun 6, 2024
982cb1e
Restore accidentally deleted pydoc
ojarjur Jun 6, 2024
4921682
Reduce the diff against the main branch and drop the unneeded, in-mem…
ojarjur Jun 6, 2024
6257ca1
Respect status messages that explicitly report a status of "starting"
ojarjur Jun 6, 2024
170cde0
Fix flakiness in the kernel test_execution_state test
ojarjur Jun 6, 2024
e583410
Make the kernel test_execution_state test more reliable by increasing…
ojarjur Jun 6, 2024
3ecc26d
Make kernel execution state test more reliable
ojarjur Jun 6, 2024
bcc2f1e
Make the test/test_utils.py test pass on Windows
ojarjur Jun 6, 2024
300008e
Fix a race condition in setting the initial kernel execution state
ojarjur Jun 7, 2024
101ebed
Simplify the retry logic for the kernel execution state test... inste…
ojarjur Jun 7, 2024
0e8bd97
Merge branch 'main' into ojarjur/fix-kernel-status
ojarjur Jul 10, 2024
54ea903
Switch from having a list of tracked message types for user activity …
ojarjur Jul 12, 2024
1b3ea06
Re-introduce retries in the execution status test to further reduce f…
ojarjur Jul 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix race conditions and deadlocks in the test_execution_state tests
  • Loading branch information
ojarjur committed Jun 6, 2024
commit f1eb5a63558adc1d45a7e3d3e00db8fc6404b520
2 changes: 1 addition & 1 deletion tests/services/kernels/test_cull.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ async def test_cull_connected(jp_fetch, jp_ws_fetch):
{
"channel": "shell",
"header": {
"date": datetime.datetime.now(tz=datetime.UTC).isoformat(),
"date": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(),
"session": session_id,
"msg_id": message_id,
"msg_type": "execute_request",
Expand Down
53 changes: 31 additions & 22 deletions tests/services/kernels/test_execution_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import os
import platform
import time
import uuid
import warnings

Expand All @@ -11,27 +12,25 @@
from tornado.httpclient import HTTPClientError
from traitlets.config import Config

POLL_TIMEOUT = 10
POLL_INTERVAL = 1


async def test_execution_state(jp_fetch, jp_ws_fetch):
r = await jp_fetch("api", "kernels", method="POST", allow_nonstandard_methods=True)
kernel = json.loads(r.body.decode())
kid = kernel["id"]
await poll_for_execution_state(kid, "idle", jp_fetch)

# Open a websocket connection.
ws = await jp_ws_fetch("api", "kernels", kid, "channels")
await poll_for_execution_state(kid, "idle", jp_fetch)

session_id = uuid.uuid1().hex
message_id = uuid.uuid1().hex
await ws.write_message(
json.dumps(
{
"channel": "shell",
"header": {
"date": datetime.datetime.now(tz=datetime.UTC).isoformat(),
"date": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(),
"session": session_id,
"msg_id": message_id,
"msg_type": "execute_request",
Expand All @@ -41,7 +40,7 @@ async def test_execution_state(jp_fetch, jp_ws_fetch):
"parent_header": {},
"metadata": {},
"content": {
"code": f"import time\ntime.sleep({POLL_TIMEOUT-1})",
"code": "while True:\n\tpass",
"silent": False,
"allow_stdin": False,
"stop_on_error": True,
Expand All @@ -50,41 +49,52 @@ async def test_execution_state(jp_fetch, jp_ws_fetch):
}
)
)
await poll_for_execution_state(kid, "busy", jp_fetch)
await poll_for_parent_message_status(kid, message_id, "busy", ws)
es = await get_execution_state(kid, jp_fetch)
assert es == "busy"

message_id_2 = uuid.uuid1().hex
await ws.write_message(
json.dumps(
{
"channel": "shell",
"channel": "control",
"header": {
"date": datetime.datetime.now(tz=datetime.UTC).isoformat(),
"date": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(),
"session": session_id,
"msg_id": message_id_2,
"msg_type": "execute_request",
"msg_type": "debug_request",
"username": "",
"version": "5.2",
},
"parent_header": {},
"metadata": {},
"content": {
"code": "pass",
"silent": False,
"allow_stdin": False,
"stop_on_error": True,
"type": "request",
"command": "debugInfo",
},
"buffers": [],
}
)
)
await get_idle_reply(kid, message_id_2, ws)
await poll_for_parent_message_status(kid, message_id_2, "idle", ws)
es = await get_execution_state(kid, jp_fetch)

# Verify that the overall kernel status is still "busy" even though one
# "idle" response was already seen for the second request.
assert es == "busy"

await poll_for_execution_state(kid, "idle", jp_fetch)
await jp_fetch(
"api",
"kernels",
kid,
"interrupt",
method="POST",
allow_nonstandard_methods=True,
)

await poll_for_parent_message_status(kid, message_id, "idle", ws)
es = await get_execution_state(kid, jp_fetch)
assert es == "idle"
ws.close()


Expand All @@ -95,19 +105,18 @@ async def get_execution_state(kid, jp_fetch):


async def poll_for_execution_state(kid, target_state, jp_fetch):
for _ in range(int(POLL_TIMEOUT / POLL_INTERVAL)):
while True:
es = await get_execution_state(kid, jp_fetch)
if es == target_state:
return True
else:
await asyncio.sleep(POLL_INTERVAL)
raise AssertionError(f"Timed out waiting for kernel execution state {target_state}")
return
time.sleep(POLL_INTERVAL)


async def get_idle_reply(kid, parent_message_id, ws):
async def poll_for_parent_message_status(kid, parent_message_id, target_status, ws):
while True:
resp = await ws.read_message()
resp_json = json.loads(resp)
print(resp_json)
parent_message = resp_json.get("parent_header", {}).get("msg_id", None)
if parent_message != parent_message_id:
continue
Expand All @@ -117,5 +126,5 @@ async def get_idle_reply(kid, parent_message_id, ws):
continue

execution_state = resp_json.get("content", {}).get("execution_state", "")
if execution_state == "idle":
if execution_state == target_status:
return
Loading