@@ -124,7 +124,7 @@ def parse_args():
124
124
help = "The SSH user you want to connect as (default: root)" )
125
125
parser .add_option (
126
126
"--delete-groups" , action = "store_true" , default = False ,
127
- help = "When destroying a cluster, delete the security groups that were created" )
127
+ help = "When destroying a cluster, delete the security groups that were created. " )
128
128
parser .add_option (
129
129
"--use-existing-master" , action = "store_true" , default = False ,
130
130
help = "Launch fresh slaves, but use an existing stopped master if possible" )
@@ -138,7 +138,9 @@ def parse_args():
138
138
parser .add_option (
139
139
"--user-data" , type = "string" , default = "" ,
140
140
help = "Path to a user-data file (most AMI's interpret this as an initialization script)" )
141
-
141
+ parser .add_option (
142
+ "--security-group-prefix" , type = "string" , default = None ,
143
+ help = "Use this prefix for the security group rather than the cluster name." )
142
144
143
145
(opts , args ) = parser .parse_args ()
144
146
if len (args ) != 2 :
@@ -285,8 +287,12 @@ def launch_cluster(conn, opts, cluster_name):
285
287
user_data_content = user_data_file .read ()
286
288
287
289
print "Setting up security groups..."
288
- master_group = get_or_make_group (conn , cluster_name + "-master" )
289
- slave_group = get_or_make_group (conn , cluster_name + "-slaves" )
290
+ if opts .security_group_prefix is None :
291
+ master_group = get_or_make_group (conn , cluster_name + "-master" )
292
+ slave_group = get_or_make_group (conn , cluster_name + "-slaves" )
293
+ else :
294
+ master_group = get_or_make_group (conn , opts .security_group_prefix + "-master" )
295
+ slave_group = get_or_make_group (conn , opts .security_group_prefix + "-slaves" )
290
296
if master_group .rules == []: # Group was just now created
291
297
master_group .authorize (src_group = master_group )
292
298
master_group .authorize (src_group = slave_group )
@@ -310,12 +316,11 @@ def launch_cluster(conn, opts, cluster_name):
310
316
slave_group .authorize ('tcp' , 60060 , 60060 , '0.0.0.0/0' )
311
317
slave_group .authorize ('tcp' , 60075 , 60075 , '0.0.0.0/0' )
312
318
313
- # Check if instances are already running in our groups
319
+ # Check if instances are already running with the cluster name
314
320
existing_masters , existing_slaves = get_existing_cluster (conn , opts , cluster_name ,
315
321
die_on_error = False )
316
322
if existing_slaves or (existing_masters and not opts .use_existing_master ):
317
- print >> stderr , ("ERROR: There are already instances running in " +
318
- "group %s or %s" % (master_group .name , slave_group .name ))
323
+ print >> stderr , ("ERROR: There are already instances for name: %s " % cluster_name )
319
324
sys .exit (1 )
320
325
321
326
# Figure out Spark AMI
@@ -371,9 +376,13 @@ def launch_cluster(conn, opts, cluster_name):
371
376
for r in reqs :
372
377
id_to_req [r .id ] = r
373
378
active_instance_ids = []
379
+ outstanding_request_ids = []
374
380
for i in my_req_ids :
375
- if i in id_to_req and id_to_req [i ].state == "active" :
376
- active_instance_ids .append (id_to_req [i ].instance_id )
381
+ if i in id_to_req :
382
+ if id_to_req [i ].state == "active" :
383
+ active_instance_ids .append (id_to_req [i ].instance_id )
384
+ else :
385
+ outstanding_request_ids .append (i )
377
386
if len (active_instance_ids ) == opts .slaves :
378
387
print "All %d slaves granted" % opts .slaves
379
388
reservations = conn .get_all_instances (active_instance_ids )
@@ -382,8 +391,8 @@ def launch_cluster(conn, opts, cluster_name):
382
391
slave_nodes += r .instances
383
392
break
384
393
else :
385
- print "%d of %d slaves granted, waiting longer" % (
386
- len (active_instance_ids ), opts .slaves )
394
+ print "%d of %d slaves granted, waiting longer for request ids including %s " % (
395
+ len (active_instance_ids ), opts .slaves , outstanding_request_ids [ 0 : 10 ] )
387
396
except :
388
397
print "Canceling spot instance requests"
389
398
conn .cancel_spot_instance_requests (my_req_ids )
@@ -440,14 +449,29 @@ def launch_cluster(conn, opts, cluster_name):
440
449
print "Launched master in %s, regid = %s" % (zone , master_res .id )
441
450
442
451
# Give the instances descriptive names
452
+ # TODO: Add retry logic for tagging with name since it's used to identify a cluster.
443
453
for master in master_nodes :
444
- master .add_tag (
445
- key = 'Name' ,
446
- value = '{cn}-master-{iid}' .format (cn = cluster_name , iid = master .id ))
454
+ name = '{cn}-master-{iid}' .format (cn = cluster_name , iid = master .id )
455
+ for i in range (0 , 5 ):
456
+ try :
457
+ master .add_tag (key = 'Name' , value = name )
458
+ except :
459
+ print "Failed attempt %i of 5 to tag %s" % ((i + 1 ), name )
460
+ if (i == 5 ):
461
+ raise "Error - failed max attempts to add name tag"
462
+ time .sleep (5 )
463
+
464
+
447
465
for slave in slave_nodes :
448
- slave .add_tag (
449
- key = 'Name' ,
450
- value = '{cn}-slave-{iid}' .format (cn = cluster_name , iid = slave .id ))
466
+ name = '{cn}-slave-{iid}' .format (cn = cluster_name , iid = slave .id )
467
+ for i in range (0 , 5 ):
468
+ try :
469
+ slave .add_tag (key = 'Name' , value = name )
470
+ except :
471
+ print "Failed attempt %i of 5 to tag %s" % ((i + 1 ), name )
472
+ if (i == 5 ):
473
+ raise "Error - failed max attempts to add name tag"
474
+ time .sleep (5 )
451
475
452
476
# Return all the instances
453
477
return (master_nodes , slave_nodes )
@@ -463,18 +487,18 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True):
463
487
for res in reservations :
464
488
active = [i for i in res .instances if is_active (i )]
465
489
for inst in active :
466
- group_names = [ g . name for g in inst .groups ]
467
- if group_names == [ cluster_name + "-master" ] :
490
+ name = inst .tags . get ( u'Name' , "" )
491
+ if name . startswith ( cluster_name + "-master" ) :
468
492
master_nodes .append (inst )
469
- elif group_names == [ cluster_name + "-slaves" ] :
493
+ elif name . startswith ( cluster_name + "-slave" ) :
470
494
slave_nodes .append (inst )
471
495
if any ((master_nodes , slave_nodes )):
472
496
print ("Found %d master(s), %d slaves" % (len (master_nodes ), len (slave_nodes )))
473
497
if master_nodes != [] or not die_on_error :
474
498
return (master_nodes , slave_nodes )
475
499
else :
476
500
if master_nodes == [] and slave_nodes != []:
477
- print >> sys .stderr , "ERROR: Could not find master in group " + cluster_name + "-master"
501
+ print >> sys .stderr , "ERROR: Could not find master in with name " + cluster_name + "-master"
478
502
else :
479
503
print >> sys .stderr , "ERROR: Could not find any existing cluster"
480
504
sys .exit (1 )
@@ -816,7 +840,10 @@ def real_main():
816
840
# Delete security groups as well
817
841
if opts .delete_groups :
818
842
print "Deleting security groups (this will take some time)..."
819
- group_names = [cluster_name + "-master" , cluster_name + "-slaves" ]
843
+ if opts .security_group_prefix is None :
844
+ group_names = [cluster_name + "-master" , cluster_name + "-slaves" ]
845
+ else :
846
+ group_names = [opts .security_group_prefix + "-master" , opts .security_group_prefix + "-slaves" ]
820
847
821
848
attempt = 1
822
849
while attempt <= 3 :
0 commit comments