You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Haven't seen this fail before, but it looks like an instance of #5186.
The key part is just that a worker fails to come up and we see `OSError: [Errno 98] Address already in use`.
The fact that the nanny reports the same error 3 times would be fixed by #6427.
____________ test_contact_listen_address[tcp://127.0.0.2:---nanny] _____________
args = (), kwds = {'listen_address': 'tcp://127.0.0.2:', 'nanny': '--nanny'}
@wraps(func)
definner(*args, **kwds):
withself._recreate_cm():
> return func(*args, **kwds)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/contextlib.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/utils_test.py:1199: in test_func
return _run_and_close_tornado(async_fn_outer)
distributed/utils_test.py:480: in _run_and_close_tornado
return asyncio.run(inner_fn())
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py:44: in run
return loop.run_until_complete(main)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py:646: in run_until_complete
return future.result()
distributed/utils_test.py:477: in inner_fn
returnawait async_fn(*args, **kwargs)
distributed/utils_test.py:1196: in async_fn_outer
returnawait asyncio.wait_for(async_fn(), timeout=timeout * 2)
/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py:445: in wait_for
return fut.result()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
asyncdefasync_fn():
result = None
with dask.config.set(config):
workers = []
s = False
for _ inrange(60):
try:
s, ws = await start_cluster(
nthreads,
scheduler,
security=security,
Worker=Worker,
scheduler_kwargs=scheduler_kwargs,
worker_kwargs=worker_kwargs,
)
exceptExceptionas e:
logger.error(
"Failed to start gen_cluster: "
f"{e.__class__.__name__}: {e}; retrying",
exc_info=True,
)
await asyncio.sleep(1)
else:
workers[:] = ws
args = [s] + workers
break
if s isFalse:
raiseException("Could not start cluster")
if client:
c = await Client(
s.address,
security=security,
asynchronous=True,
**client_kwargs,
)
args = [c] + args
try:
coro = func(*args, *outer_args, **kwargs)
task = asyncio.create_task(coro)
coro2 = asyncio.wait_for(asyncio.shield(task), timeout)
result = await coro2
validate_state(s, *workers)
except asyncio.TimeoutError:
assert task
buffer = io.StringIO()
# This stack indicates where the coro/test is suspended
task.print_stack(file=buffer)
if cluster_dump_directory:
await dump_cluster_state(
s,
ws,
output_dir=cluster_dump_directory,
func_name=func.__name__,
)
task.cancel()
whilenot task.cancelled():
await asyncio.sleep(0.01)
# Hopefully, the hang has been caused by inconsistent
# state, which should be much more meaningful than the
# timeout
validate_state(s, *workers)
# Remove as much of the traceback as possible; it's
# uninteresting boilerplate from utils_test and asyncio
# and not from the code being tested.
> raise asyncio.TimeoutError(
f"Test timeout after {timeout}s.\n"
"========== Test stack trace starts here ==========\n"
f"{buffer.getvalue()}"
) fromNone
E asyncio.exceptions.TimeoutError: Test timeout after 30s.
E ========== Test stack trace starts here ==========
E Stack for <Task pending name='Task-1433' coro=<test_contact_listen_address() running at /home/runner/work/distributed/distributed/distributed/cli/tests/test_dask_worker.py:443> wait_for=<Future pending cb=[Task.task_wakeup()]>> (most recent call last):
E File "/home/runner/work/distributed/distributed/distributed/cli/tests/test_dask_worker.py", line 443, in test_contact_listen_address
E await c.wait_for_workers(1)
distributed/utils_test.py:1127: TimeoutError
----------------------------- Captured stdout call -----------------------------
Dumped cluster state to test_cluster_dump/test_contact_listen_address.yaml
----------------------------- Captured stderr call -----------------------------
2022-07-19 15:24:10,010 - distributed.scheduler - INFO - State start
2022-07-19 15:24:10,012 - distributed.scheduler - INFO - Clear task state
2022-07-19 15:24:10,012 - distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:35175
2022-07-19 15:24:10,012 - distributed.scheduler - INFO - dashboard at: 127.0.0.1:42855
2022-07-19 15:24:10,020 - distributed.scheduler - INFO - Receive client connection: Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:10,021 - distributed.core - INFO - Starting established connection
2022-07-19 15:24:10,659 - distributed.nanny - INFO - Start Nanny at: 'tcp://127.0.0.2:38265'
2022-07-19 15:24:11,358 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.2:38265
2022-07-19 15:24:11,358 - distributed.worker - INFO - Closed worker has not yet started: Status.init
2022-07-19 15:24:11,360 - distributed.nanny - ERROR - Failed to start worker
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
2022-07-19 15:24:11,379 - distributed.nanny - ERROR - Failed while trying to start worker process: Worker failed to start.
2022-07-19 15:24:11,379 - distributed.nanny - ERROR - Failed to connect to process
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 686, in start
msg = await self._wait_until_connected(uid)
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 806, in _wait_until_connected
raise msg["exception"]
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
[2022](https://github.com/dask/distributed/runs/7412362356?check_suite_focus=true#step:11:2023)-07-19 15:24:11,381 - distributed.nanny - ERROR - Failed to start process
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 436, in instantiate
result = await self.process.start()
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 686, in start
msg = await self._wait_until_connected(uid)
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 806, in _wait_until_connected
raise msg["exception"]
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
2022-07-19 15:24:11,382 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.2:38265'.
2022-07-19 15:24:11,384 - distributed.dask_worker - INFO - End worker
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/worker.py", line 1316, in start_unsafe
await self.listen(start_address, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/core.py", line 658, in listen
listener = await listen(
File "/home/runner/work/distributed/distributed/distributed/comm/core.py", line 212, in _
await self.start()
File "/home/runner/work/distributed/distributed/distributed/comm/tcp.py", line 528, in start
sockets = netutil.bind_sockets(
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/netutil.py", line 161, in bind_sockets
sock.bind(sockaddr)
OSError: [Errno 98] Address already in use
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/core.py", line 480, in start
await asyncio.wait_for(self.start_unsafe(), timeout=timeout)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 408, in wait_for
return await fut
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 363, in start_unsafe
response = await self.instantiate()
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 436, in instantiate
result = await self.process.start()
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 686, in start
msg = await self._wait_until_connected(uid)
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 806, in _wait_until_connected
raise msg["exception"]
File "/home/runner/work/distributed/distributed/distributed/nanny.py", line 878, in run
await worker
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Worker failed to start.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/share/miniconda3/envs/dask-distributed/bin/dask-worker", line 33, in <module>
sys.exit(load_entry_point('distributed', 'console_scripts', 'dask-worker')())
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 500, in main
asyncio.run(run())
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/runners.py", line 44, in run
return loop.run_until_complete(main)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/base_events.py", line 646, in run_until_complete
return future.result()
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 497, in run
[task.result() for task in done]
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 497, in <listcomp>
[task.result() for task in done]
File "/home/runner/work/distributed/distributed/distributed/cli/dask_worker.py", line 472, in wait_for_nannies_to_finish
await asyncio.gather(*nannies)
File "/usr/share/miniconda3/envs/dask-distributed/lib/python3.10/asyncio/tasks.py", line 650, in _wrap_awaitable
return (yield from awaitable.__await__())
File "/home/runner/work/distributed/distributed/distributed/core.py", line 488, in start
raise RuntimeError(f"{type(self).__name__} failed to start.") from exc
RuntimeError: Nanny failed to start.
2022-07-19 15:24:40,060 - distributed.scheduler - INFO - Remove client Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:40,061 - distributed.scheduler - INFO - Remove client Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:40,062 - distributed.scheduler - INFO - Close client connection: Client-d63bdbc0-0776-11ed-8b95-000d3aa688e3
2022-07-19 15:24:40,062 - distributed.scheduler - INFO - Scheduler closing...
2022-07-19 15:24:40,063 - distributed.scheduler - INFO - Scheduler closing all comms
The text was updated successfully, but these errors were encountered:
Haven't seen this fail before, but it looks like an instance of #5186
The key part is just that a worker fails to come up and we see
`OSError: [Errno 98] Address already in use`.
The fact that the nanny reports the same error 3 times would be fixed by #6427.
The text was updated successfully, but these errors were encountered: