Skip to content

Commit 94053a7

Browse files
vidahaJoshRosen
authored and committed
SPARK-2333 - spark_ec2 script should allow option for existing security group
- Uses the name tag to identify machines in a cluster. - Allows overriding the security group name so it doesn't need to coincide with the cluster name. - Outputs the request IDs of up to 10 pending spot instance requests. Author: Vida Ha <vida@databricks.com> Closes #1899 from vidaha/vida/ec2-reuse-security-group and squashes the following commits: c80d5c3 [Vida Ha] wrap retries in a try catch block b2989d5 [Vida Ha] SPARK-2333: spark_ec2 script should allow option for existing security group
1 parent 31f0b07 commit 94053a7

File tree

2 files changed

+57
-28
lines changed

2 files changed

+57
-28
lines changed

docs/ec2-scripts.md

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,16 @@ on the [Amazon Web Services site](http://aws.amazon.com/).
1212

1313
`spark-ec2` is designed to manage multiple named clusters. You can
1414
launch a new cluster (telling the script its size and giving it a name),
15-
shutdown an existing cluster, or log into a cluster. Each cluster is
16-
identified by placing its machines into EC2 security groups whose names
17-
are derived from the name of the cluster. For example, a cluster named
15+
shutdown an existing cluster, or log into a cluster. Each cluster
16+
launches a set of instances, which are tagged with the cluster name,
17+
and placed into EC2 security groups. If you don't specify a security
18+
group, the `spark-ec2` script will create security groups based on the
19+
cluster name you request. For example, a cluster named
1820
`test` will contain a master node in a security group called
1921
`test-master`, and a number of slave nodes in a security group called
20-
`test-slaves`. The `spark-ec2` script will create these security groups
21-
for you based on the cluster name you request. You can also use them to
22-
identify machines belonging to each cluster in the Amazon EC2 Console.
22+
`test-slaves`. You can also specify a security group prefix to be used
23+
in place of the cluster name. Machines in a cluster can be identified
24+
by looking for the "Name" tag of the instance in the Amazon EC2 Console.
2325

2426

2527
# Before You Start

ec2/spark_ec2.py

Lines changed: 49 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def parse_args():
124124
help="The SSH user you want to connect as (default: root)")
125125
parser.add_option(
126126
"--delete-groups", action="store_true", default=False,
127-
help="When destroying a cluster, delete the security groups that were created")
127+
help="When destroying a cluster, delete the security groups that were created.")
128128
parser.add_option(
129129
"--use-existing-master", action="store_true", default=False,
130130
help="Launch fresh slaves, but use an existing stopped master if possible")
@@ -138,7 +138,9 @@ def parse_args():
138138
parser.add_option(
139139
"--user-data", type="string", default="",
140140
help="Path to a user-data file (most AMI's interpret this as an initialization script)")
141-
141+
parser.add_option(
142+
"--security-group-prefix", type="string", default=None,
143+
help="Use this prefix for the security group rather than the cluster name.")
142144

143145
(opts, args) = parser.parse_args()
144146
if len(args) != 2:
@@ -285,8 +287,12 @@ def launch_cluster(conn, opts, cluster_name):
285287
user_data_content = user_data_file.read()
286288

287289
print "Setting up security groups..."
288-
master_group = get_or_make_group(conn, cluster_name + "-master")
289-
slave_group = get_or_make_group(conn, cluster_name + "-slaves")
290+
if opts.security_group_prefix is None:
291+
master_group = get_or_make_group(conn, cluster_name + "-master")
292+
slave_group = get_or_make_group(conn, cluster_name + "-slaves")
293+
else:
294+
master_group = get_or_make_group(conn, opts.security_group_prefix + "-master")
295+
slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves")
290296
if master_group.rules == []: # Group was just now created
291297
master_group.authorize(src_group=master_group)
292298
master_group.authorize(src_group=slave_group)
@@ -310,12 +316,11 @@ def launch_cluster(conn, opts, cluster_name):
310316
slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
311317
slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')
312318

313-
# Check if instances are already running in our groups
319+
# Check if instances are already running with the cluster name
314320
existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
315321
die_on_error=False)
316322
if existing_slaves or (existing_masters and not opts.use_existing_master):
317-
print >> stderr, ("ERROR: There are already instances running in " +
318-
"group %s or %s" % (master_group.name, slave_group.name))
323+
print >> stderr, ("ERROR: There are already instances for name: %s " % cluster_name)
319324
sys.exit(1)
320325

321326
# Figure out Spark AMI
@@ -371,9 +376,13 @@ def launch_cluster(conn, opts, cluster_name):
371376
for r in reqs:
372377
id_to_req[r.id] = r
373378
active_instance_ids = []
379+
outstanding_request_ids = []
374380
for i in my_req_ids:
375-
if i in id_to_req and id_to_req[i].state == "active":
376-
active_instance_ids.append(id_to_req[i].instance_id)
381+
if i in id_to_req:
382+
if id_to_req[i].state == "active":
383+
active_instance_ids.append(id_to_req[i].instance_id)
384+
else:
385+
outstanding_request_ids.append(i)
377386
if len(active_instance_ids) == opts.slaves:
378387
print "All %d slaves granted" % opts.slaves
379388
reservations = conn.get_all_instances(active_instance_ids)
@@ -382,8 +391,8 @@ def launch_cluster(conn, opts, cluster_name):
382391
slave_nodes += r.instances
383392
break
384393
else:
385-
print "%d of %d slaves granted, waiting longer" % (
386-
len(active_instance_ids), opts.slaves)
394+
print "%d of %d slaves granted, waiting longer for request ids including %s" % (
395+
len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10])
387396
except:
388397
print "Canceling spot instance requests"
389398
conn.cancel_spot_instance_requests(my_req_ids)
@@ -440,14 +449,29 @@ def launch_cluster(conn, opts, cluster_name):
440449
print "Launched master in %s, regid = %s" % (zone, master_res.id)
441450

442451
# Give the instances descriptive names
452+
# TODO: Add retry logic for tagging with name since it's used to identify a cluster.
443453
for master in master_nodes:
444-
master.add_tag(
445-
key='Name',
446-
value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
454+
name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
455+
for i in range(0, 5):
456+
try:
457+
master.add_tag(key='Name', value=name)
458+
except:
459+
print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
460+
if (i == 5):
461+
raise "Error - failed max attempts to add name tag"
462+
time.sleep(5)
463+
464+
447465
for slave in slave_nodes:
448-
slave.add_tag(
449-
key='Name',
450-
value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
466+
name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
467+
for i in range(0, 5):
468+
try:
469+
slave.add_tag(key='Name', value=name)
470+
except:
471+
print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
472+
if (i == 5):
473+
raise "Error - failed max attempts to add name tag"
474+
time.sleep(5)
451475

452476
# Return all the instances
453477
return (master_nodes, slave_nodes)
@@ -463,18 +487,18 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True):
463487
for res in reservations:
464488
active = [i for i in res.instances if is_active(i)]
465489
for inst in active:
466-
group_names = [g.name for g in inst.groups]
467-
if group_names == [cluster_name + "-master"]:
490+
name = inst.tags.get(u'Name', "")
491+
if name.startswith(cluster_name + "-master"):
468492
master_nodes.append(inst)
469-
elif group_names == [cluster_name + "-slaves"]:
493+
elif name.startswith(cluster_name + "-slave"):
470494
slave_nodes.append(inst)
471495
if any((master_nodes, slave_nodes)):
472496
print ("Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes)))
473497
if master_nodes != [] or not die_on_error:
474498
return (master_nodes, slave_nodes)
475499
else:
476500
if master_nodes == [] and slave_nodes != []:
477-
print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master"
501+
print >> sys.stderr, "ERROR: Could not find master in with name " + cluster_name + "-master"
478502
else:
479503
print >> sys.stderr, "ERROR: Could not find any existing cluster"
480504
sys.exit(1)
@@ -816,7 +840,10 @@ def real_main():
816840
# Delete security groups as well
817841
if opts.delete_groups:
818842
print "Deleting security groups (this will take some time)..."
819-
group_names = [cluster_name + "-master", cluster_name + "-slaves"]
843+
if opts.security_group_prefix is None:
844+
group_names = [cluster_name + "-master", cluster_name + "-slaves"]
845+
else:
846+
group_names = [opts.security_group_prefix + "-master", opts.security_group_prefix + "-slaves"]
820847

821848
attempt = 1
822849
while attempt <= 3:

0 commit comments

Comments
 (0)