Skip to content

Commit

Permalink
[#17] Enable running variant spark on AWS EMR (#33)
Browse files Browse the repository at this point in the history
* Added Initial CF template from: http://www.mikemorse.tech/2016/09/a-well-populated-aws-cloudformation.html

* Added my custom templating

* First working emr version

* Simplified cluster creation + default configuration

* Added better defaults for variant-spark

* Cleaning up AWS stuff

* Refactored command line interface

* Finalized simple bootstrap script

* Fixed scripts to work with new bootstrap script
  • Loading branch information
piotrszul authored Sep 14, 2017
1 parent 2096c42 commit 7ef8572
Show file tree
Hide file tree
Showing 10 changed files with 579 additions and 84 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,6 @@ tmp
*.iml

variant-spark_2.11.iml

*.pyc
*.egg-info
4 changes: 4 additions & 0 deletions cloud/aws-emr/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Running VariantSpark on AWS EMR
================================

TBP:
71 changes: 57 additions & 14 deletions cloud/aws-emr/bootstrap/install-variant-spark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,67 @@ error_msg ()
echo 1>&2 "Error: $1"
}

INST_DIR=${INST_VOL:-/mnt}
# Print "Fatal error: <message>" on stderr and terminate the script.
#   $1 - message text to report
# Never returns: exits with status 1.
fatal_error_msg ()
{
  printf '%s\n' "Fatal error: $1" >&2
  exit 1
}

VS_BUCKET="variant-spark"
RELEASE_DIR=

# Consume leading command-line options.
#   --release-url <url>  stored in VS_RELEASE_URL
#   any other -option    reported through error_msg, parsing continues
# Parsing stops at the first argument that does not start with '-'.
while [ $# -gt 0 ]; do
  if [[ "$1" == "--release-url" ]]; then
    # the option value is the next positional argument
    shift
    VS_RELEASE_URL="$1"
  elif [[ "$1" == -* ]]; then
    # do not exit out, just note failure
    error_msg "unrecognized option: $1"
  else
    break
  fi
  shift
done

# The release location is mandatory: abort when --release-url was not given.
[[ -n "${VS_RELEASE_URL}" ]] || fatal_error_msg "Parameter: --release-url is required"

echo "Release location is: ${VS_RELEASE_URL}"

# Basic sanity check of the location: list its 'buildinfo' entry.
# A failing listing is turned into empty output so it is reported below
# instead of aborting on the aws command itself.
VS_BUILD_INFO=$(aws s3 ls "${VS_RELEASE_URL}/buildinfo" || echo "")

[ -n "${VS_BUILD_INFO}" ] || fatal_error_msg "There is no variant-spark release in: ${VS_RELEASE_URL}. Please check the '--release-url' parameter value"

# Installation target: ${INST_VOL}/variant-spark, with INST_VOL defaulting to /mnt.
INST_VOL="${INST_VOL:-/mnt}"
VS_INST_DIR="${INST_VOL}/variant-spark"

echo "Bootstraping variant-spark"

# Install on the master node only.
# NOTE(review): IS_MASTER is set earlier in the script, outside this section;
# it is compared against the literal 'true', as the replaced version of this
# block did -- confirm that matches how IS_MASTER is assigned.
if [ "$IS_MASTER" = true ]; then
  echo "Installing variant-spark in: ${VS_INST_DIR}"
  mkdir -p "${VS_INST_DIR}"
  # download the whole release directory into the installation directory
  cd "${VS_INST_DIR}"
  aws s3 cp --recursive "${VS_RELEASE_URL}" .
  # Resolve the versioned assembly jar name through glob expansion. When
  # nothing matches, the expansion leaves the pattern itself unchanged, which
  # is how a missing jar is detected below.
  VS_UNVERSIONED_JAR="lib/variant-spark_2.11-*-all.jar"
  VS_VERSIONED_JAR="$(echo ${VS_UNVERSIONED_JAR})"
  if [[ "${VS_UNVERSIONED_JAR}" == ${VS_VERSIONED_JAR} ]]; then
    fatal_error_msg "Could not find variant-spark assembly jar. Check if ${VS_RELEASE_URL} is a valid release url in S3"
  fi
  echo "Found variant-spark assembly jar: ${VS_VERSIONED_JAR}"
  # create a version-independent symbolic link to the assembly jar
  ln -s "${VS_VERSIONED_JAR}" variant-spark_2.11-all.jar
  echo "Installed variant-spark in: ${VS_INST_DIR}"
fi

echo "Finished bootstraping variant-spark"
132 changes: 132 additions & 0 deletions cloud/aws-emr/cf-templates/template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
---
# CloudFormation template for an EMR 5.x cluster that runs variant-spark.
#
# Resources:
#   EMRClusterV5 - the cluster: one on-demand master and one on-demand core
#                  node, Ganglia + Spark, and a bootstrap action that runs
#                  install-variant-spark.sh from the release location.
#   EMRTaskNodes - a spot-priced task instance group attached to the cluster.
#
# The subnet, key name and security group are fixed values in this template.
AWSTemplateFormatVersion: '2010-09-09'
Description: Cloudformation Template to spin up EMR clusters V3 (Version 5 of EMR only)
Parameters:
  clusterName:
    Description: Name of the cluster
    Type: String
  taskInstanceCount:
    Description: Number of task instances
    Type: String
  emrVersion:
    Description: Version of EMR
    Type: String
    Default: "emr-5.7.0"
    # Dots are escaped and multi-digit components are allowed, so release
    # labels such as emr-5.10.0 are accepted.
    AllowedPattern: 'emr-5\.[0-9]+\.[0-9]+'
    ConstraintDescription: 'Must be EMR Version 5 (i.e: emr-5.3.0)'
  masterInstanceType:
    Description: Instance type of Master Node
    Type: String
    Default: "m4.large"
  coreInstanceType:
    Description: Instance type of Core Node
    Type: String
    Default: "m4.large"
  taskInstanceType:
    Description: Instance type of Task Node
    Type: String
    Default: "m4.large"
  environmentType:
    Description: What environment do you want the cluster to be in
    Type: String
  s3BucketBasePath:
    Description: Bucket to log EMR actions to
    Type: String
  taskBidPrice:
    Description: Bid price for Task nodes
    Type: String
  terminationProtected:
    Description: Is the cluster to have termination protection enabled
    Type: String
    AllowedValues:
      - 'true'
      - 'false'
    ConstraintDescription: Boolean
  awsRegion:
    Description: awsRegion
    Default: ap-southeast-2
    AllowedValues:
      - ap-southeast-2
    Type: String
  variantSparkReleaseUrl:
    Description: S3 URL of the variant-spark release passed to the bootstrap script as --release-url
    Type: String
    # The default keeps the bootstrap script path that was previously fixed
    # in this template.
    Default: "s3://au.csiro.pbdava.test/variant-spark"
Conditions:
  # Not referenced by any resource in this template.
  isLive:
    Fn::Equals:
      - Ref: environmentType
      - live
Resources:
  EMRClusterV5:
    Type: AWS::EMR::Cluster
    Properties:
      Instances:
        MasterInstanceGroup:
          InstanceCount: 1
          InstanceType:
            Ref: masterInstanceType
          Market: ON_DEMAND
          Name: Master instance group - 1
        CoreInstanceGroup:
          InstanceCount: 1
          InstanceType:
            Ref: coreInstanceType
          Market: ON_DEMAND
          Name: Core instance group - 2
        TerminationProtected:
          Ref: terminationProtected
        Ec2SubnetId: "subnet-a23d0fd4"
        Ec2KeyName: "default"
        AdditionalMasterSecurityGroups:
          - "sg-14ffe073"
      BootstrapActions:
        - Name: Install VariantSpark
          ScriptBootstrapAction:
            Path:
              Fn::Join:
                - ''
                - - Ref: variantSparkReleaseUrl
                  - /bootstrap/install-variant-spark.sh
            # install-variant-spark.sh aborts unless --release-url is given.
            Args:
              - "--release-url"
              - Ref: variantSparkReleaseUrl
      Configurations:
        - Classification: spark-defaults
          ConfigurationProperties:
            spark.dynamicAllocation.enabled: 'false'
            spark.history.fs.logDirectory: "s3://au.csiro.pbdava.test/variant-spark/sparklog/"
            spark.eventLog.dir: "s3://au.csiro.pbdava.test/variant-spark/sparklog/"
      Applications:
        - Name: Ganglia
        - Name: Spark
      Name:
        Ref: clusterName
      JobFlowRole: "EMR_EC2_DefaultRole"
      ServiceRole: "EMR_DefaultRole"
      ReleaseLabel:
        Ref: emrVersion
      LogUri:
        Ref: s3BucketBasePath
      VisibleToAllUsers: false
      Tags:
        - Key: Name
          Value:
            Fn::Join:
              - ''
              - - emr-instance-
                - Ref: AWS::StackName
                - ''
        - Key: Environment
          Value:
            Ref: environmentType
        - Key: Stack ID
          Value:
            Ref: AWS::StackName
  EMRTaskNodes:
    Type: AWS::EMR::InstanceGroupConfig
    Properties:
      InstanceCount:
        Ref: taskInstanceCount
      InstanceType:
        Ref: taskInstanceType
      BidPrice:
        Ref: taskBidPrice
      Market: SPOT
      InstanceRole: TASK
      Name: Task instance group - 3
      JobFlowId:
        Ref: EMRClusterV5
Outputs:
  ClusterID:
    Description: "EMR Cluster ID"
    Value:
      Ref: EMRClusterV5
24 changes: 24 additions & 0 deletions cloud/aws-emr/conf/default.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# Site configuration for the EMR cluster template.
#   default  - values for the template settings of the same names
#   profiles - named sets of worker settings
# NOTE(review): presumably a profile is selected by name and laid over
# `default` by the vs-emr tool -- confirm against vs_emr.py.
default:
  region: ap-southeast-2
  instanceType: 'm4.large'
  ec2Attributes:
    AdditionalMasterSecurityGroups:
      - 'sg-14ffe073'
    KeyName: 'default'
    SubnetId: 'subnet-01dea958'
    InstanceProfile: 'EMR_EC2_DefaultRole'
    EmrManagedSlaveSecurityGroup: 'sg-82ffe0e5'
    EmrManagedMasterSecurityGroup: 'sg-aefde2c9'
  worker:
    instanceCount: 2
    bidPrice: 0.07
  conf:
    # bucket and prefix only, without a URL scheme
    logBucketBase: 'au.csiro.pbdava.test/variant-spark'

profiles:
  m4_16xlarge:
    worker:
      instanceType: 'm4.16xlarge'
      bidPrice: 0.55
  r4_16xlarge:
    worker:
      instanceType: 'r4.16xlarge'
      bidPrice: 0.6
14 changes: 0 additions & 14 deletions cloud/aws-emr/deploy/deploy-to-aws.sh

This file was deleted.

15 changes: 15 additions & 0 deletions cloud/aws-emr/python/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from setuptools import setup

# Console entry point: the `vs-emr` command maps to `cli` in the vs_emr module.
_ENTRY_POINTS = '''
[console_scripts]
vs-emr=vs_emr:cli
'''

# Distribution metadata for the single-module `vs_emr` package.
setup(
    name='vs_emr',
    version='0.1',
    py_modules=['vs_emr'],
    install_requires=['Click'],
    package_data={'': ['*.yaml']},
    entry_points=_ENTRY_POINTS,
)
77 changes: 77 additions & 0 deletions cloud/aws-emr/python/templates/spot-cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
---
# Mustache template describing an EMR cluster for variant-spark.
#   defaults - settings referenced by the tags in this file
#   options  - cluster options assembled from those settings
# NOTE(review): the option keys look like `aws emr create-cluster` options and
# the rendering is done outside this file -- confirm against vs_emr.py.
# Keep Mustache tags out of comments: they would be rendered as well.
defaults:
  region: null
  releaseLabel: "emr-5.7.0"
  autoTerminate: true
  instanceType: "m4.large"
  ebsRootVolumeSize: 10
  ebsSizeInGB: 32
  variantSparkReleaseUrl: null
  variantSparkBootstrap: "{{variantSparkReleaseUrl}}/bootstrap/install-variant-spark.sh"
  master:
    instanceType: "{{instanceType}}"
    ebsSizeInGB: {{ebsSizeInGB}}
    bidPrice: null
  worker:
    instanceCount: 1
    instanceType: "{{instanceType}}"
    ebsSizeInGB: {{ebsSizeInGB}}
    bidPrice: null
  conf:
    logBucketBase: null
  ec2Attributes: {}
options:
  region: "{{region}}"
  name: "variant-spark_{{worker.instanceType}}-{{worker.instanceCount}}"
  release-label: "{{releaseLabel}}"
  auto-terminate: {{autoTerminate}}
  applications:
    - Name: Ganglia
    - Name: Spark
  tags:
    application: "variant-spark"
    use: "testing"
  ec2-attributes: {{ec2Attributes}}
  {{#conf.logBucketBase}}log-uri: "s3n://{{conf.logBucketBase}}/logs/"{{/conf.logBucketBase}}
  # taken from the root volume size setting instead of a fixed number
  ebs-root-volume-size: {{ebsRootVolumeSize}}
  service-role: "EMR_DefaultRole"
  enable-debugging: true
  scale-down-behavior: "TERMINATE_AT_INSTANCE_HOUR"
  instance-groups:
    - Name: "Master Instance Group"
      InstanceCount: 1
      InstanceGroupType: "MASTER"
      InstanceType: "{{master.instanceType}}"
      {{#master.bidPrice}}BidPrice: "{{master.bidPrice}}"{{/master.bidPrice}}
      EbsConfiguration:
        EbsBlockDeviceConfigs:
          - VolumeSpecification:
              SizeInGB: {{master.ebsSizeInGB}}
              VolumeType: "gp2"
            VolumesPerInstance: 1
    - Name: "Core Instance Group"
      InstanceCount: {{worker.instanceCount}}
      InstanceGroupType: "CORE"
      InstanceType: "{{worker.instanceType}}"
      {{#worker.bidPrice}}BidPrice: "{{worker.bidPrice}}"{{/worker.bidPrice}}
      EbsConfiguration:
        EbsBlockDeviceConfigs:
          - VolumeSpecification:
              # honours the worker volume size setting, like the master group
              SizeInGB: {{worker.ebsSizeInGB}}
              VolumeType: "gp2"
            VolumesPerInstance: 1
  configurations:
    - Classification: "spark"
      Properties:
        maximizeResourceAllocation: "true"
    - Classification: "spark-defaults"
      Properties:
        spark.dynamicAllocation.enabled: "false"
        spark.serializer: "org.apache.spark.serializer.KryoSerializer"
        spark.locality.wait: "10s"
        {{#conf.logBucketBase}}spark.eventLog.dir: "s3://{{conf.logBucketBase}}/sparklog/"{{/conf.logBucketBase}}
        {{#conf.logBucketBase}}spark.history.fs.logDirectory: "s3://{{conf.logBucketBase}}/sparklog/"{{/conf.logBucketBase}}
  bootstrap-actions:
    - Name: "Install Variant Spark"
      Path: "{{variantSparkBootstrap}}"
      Args: ["--release-url", "{{variantSparkReleaseUrl}}"]
Loading

0 comments on commit 7ef8572

Please sign in to comment.