@@ -241,6 +241,11 @@ function redirect_worker_output(ident, stream)
241
241
end
242
242
end
243
243
244
"""
    LaunchWorkerError(msg::String)

Exception thrown when launching a worker fails, for example when the
host:port string cannot be read from the worker's output. `msg` holds the
human-readable description of the failure.
"""
struct LaunchWorkerError <: Exception
    msg::String
end

# Print only the message itself, without the exception type name.
Base.showerror(io::IO, e::LaunchWorkerError) = print(io, e.msg)
244
249
245
250
# The default TCP transport relies on the worker listening on a free
246
251
# port and printing its bind address and port.
@@ -272,7 +277,7 @@ function read_worker_host_port(io::IO)
272
277
273
278
conninfo = fetch (readtask)
274
279
if isempty (conninfo) && ! isopen (io)
275
- error ( " Unable to read host:port string from worker. Launch command exited with error?" )
280
+ throw ( LaunchWorkerError ( " Unable to read host:port string from worker. Launch command exited with error?" ) )
276
281
end
277
282
278
283
ntries -= 1
@@ -286,13 +291,13 @@ function read_worker_host_port(io::IO)
286
291
end
287
292
close (io)
288
293
if ntries > 0
289
- error ( " Timed out waiting to read host:port string from worker." )
294
+ throw ( LaunchWorkerError ( " Timed out waiting to read host:port string from worker." ) )
290
295
else
291
- error ( " Unexpected output from worker launch command. Host:port string not found." )
296
+ throw ( LaunchWorkerError ( " Unexpected output from worker launch command. Host:port string not found." ) )
292
297
end
293
298
finally
294
299
for line in leader
295
- println (" \t From failed worker startup:\t " , line)
300
+ println (" \t From worker startup:\t " , line)
296
301
end
297
302
end
298
303
end
@@ -354,6 +359,34 @@ the package `ClusterManagers.jl`.
354
359
The number of seconds a newly launched worker waits for connection establishment from the
355
360
master can be specified via variable `JULIA_WORKER_TIMEOUT` in the worker process's
356
361
environment. Relevant only when using TCP/IP as transport.
362
+
363
+ To launch workers without blocking the REPL, or the containing function
364
+ if launching workers programmatically, execute `addprocs` in its own task.
365
+
366
+ # Examples
367
+
368
+ ```
369
+ # On busy clusters, call `addprocs` asynchronously
370
+ t = @async addprocs(...)
371
+ ```
372
+
373
+ ```
374
+ # Utilize workers as and when they come online
375
+ if nprocs() > 1 # Ensure at least one new worker is available
376
+ .... # perform distributed execution
377
+ end
378
+ ```
379
+
380
+ ```
381
+ # Retrieve newly launched worker IDs, or any error messages
382
+ if istaskdone(t) # Check if `addprocs` has completed to ensure `fetch` doesn't block
383
+ if nworkers() == N
384
+ new_pids = fetch(t)
385
+ else
386
+ fetch(t)
387
+ end
388
+ end
389
+ ```
357
390
"""
358
391
function addprocs (manager:: ClusterManager ; kwargs... )
359
392
init_multi ()
@@ -499,9 +532,13 @@ function create_worker(manager, wconfig)
499
532
local r_s, w_s
500
533
try
501
534
(r_s, w_s) = connect (manager, w. id, wconfig)
502
- catch
503
- deregister_worker (w. id)
504
- rethrow ()
535
+ catch ex
536
+ try
537
+ deregister_worker (w. id)
538
+ kill (manager, w. id, wconfig)
539
+ finally
540
+ rethrow (ex)
541
+ end
505
542
end
506
543
507
544
w = Worker (w. id, r_s, w_s, manager; config= wconfig)
0 commit comments