Merge pull request #2 from slicelife/update-upstream

cyclops23 · web-flow · commit ab6ca9f5d352 · 2020-07-22T14:27:13.000+01:00
diff --git a/README.md b/README.md
@@ -32,7 +32,7 @@ Terraform modules which create AWS resources for a Segment Data Lake.
 * Authorized [AWS account](https://aws.amazon.com/account/).
 * Ability to run Terraform with your AWS Account. You must use Terraform 0.11 or higher.
 * A subnet within a VPC for the EMR cluster to run in.
-* [S3 Bucket](https://github.com/terraform-aws-modules/terraform-aws-s3-bucket) to send data from Segment to and to store logs.
+* An [S3 Bucket](https://github.com/terraform-aws-modules/terraform-aws-s3-bucket) for Segment to load data into. You can create a new one just for this, or re-use an existing one you already have.
 
 ## VPC
 
@@ -50,6 +50,33 @@ The repository is split into multiple modules, and each can be used independentl
 
 # Usage
 
+## Terraform Installation
+*Note*  - Skip this section if you already have a working Terraform setup
+### OSX:
+`brew` on OSX should install the latest version of Terraform.
+```
+brew install terraform
+```
+
+### CentOS/Ubuntu:
+* Follow the instructions [here](https://phoenixnap.com/kb/how-to-install-terraform-centos-ubuntu) to install Terraform on CentOS or Ubuntu.
+* Ensure that the installed version is > 0.11.x
+
+Verify installation works by running:
+```
+terraform help
+```
+
+## Set up Project
+* Create project directory
+```
+mkdir segment-datalakes-tf
+```
+* Create `main.tf` file
+    * Update the `segment_sources` variable in the `locals` to the sources you want to sync 
+    * Update the `name` in the `aws_s3_bucket` resource to the desired name of your S3 bucket
+    * Update the `subnet_id` in the `emr` module to the subnet in which to create the EMR cluster
+
 ```hcl
 provider "aws" {
   region = "us-west-2"  # Replace this with the AWS region your infrastructure is set up in.
@@ -70,14 +97,6 @@ resource "aws_s3_bucket" "segment_datalake_s3" {
   name = "my-first-segment-datalake"
 }
 
-# This is optional.
-# Segment will create a DB for you if it does not exist already.
-module "glue" {
-  source = "git@github.com:segmentio/terraform-aws-data-lake//modules/glue?ref=v0.1.5"
-
-  name = "segment_data_lake"
-}
-
 # Creates the IAM Policy that allows Segment to access the necessary resources
 # in your AWS account for loading your data.
 module "iam" {
@@ -91,18 +110,53 @@ module "iam" {
 # Creates an EMR Cluster that Segment uses for performing the final ETL on your
 # data that lands in S3.
 module "emr" {
-  source = "git@github.com:segmentio/terraform-aws-data-lake//modules/emr?ref=v0.1.5"
+  source = "git@github.com:segmentio/terraform-aws-data-lake//modules/emr?ref=v0.2.0"
 
   s3_bucket = "${aws_s3_bucket.segment_datalake_s3.name}"
   subnet_id = "subnet-XXX" # Replace this with the subnet ID you want the EMR cluster to run in.
+ 
+  # LEAVE THIS AS-IS
+  iam_emr_autoscaling_role = "${module.iam.iam_emr_autoscaling_role}"
+  iam_emr_service_role     = "${module.iam.iam_emr_service_role}"
+  iam_emr_instance_profile = "${module.iam.iam_emr_instance_profile}"
 }
 ```
-
-With the Terraform CLI, you can run `terraform plan` to preview the changes by the modules, and `terraform apply` to generate the resources.
+## Provision Resources
+* Provide AWS credentials of the account being used. More details here: https://www.terraform.io/docs/providers/aws/index.html
+  ```
+  export AWS_ACCESS_KEY_ID="anaccesskey"
+  export AWS_SECRET_ACCESS_KEY="asecretkey"
+  export AWS_DEFAULT_REGION="us-west-2"
+  ```
+* Initialize the referenced modules
+  ```
+  terraform init
+  ```
+  You should see a success message once initialization completes:
+  ```
+  Terraform has been successfully initialized!
+  ```
+* Run plan
+  This does not create any resources. It only outputs what will be created when you run apply (the next step).
+  ```
+  terraform plan
+  ```
+  You should see something like this towards the end of the plan output:
+  ```
+  Plan: 13 to add, 0 to change, 0 to destroy.
+  ```
+* Run apply - this step creates the resources in your AWS infrastructure
+  ```
+  terraform apply
+  ```
+  You should see:
+  ```
+  Apply complete! Resources: 13 added, 0 changed, 0 destroyed.
+  ```
 
 Note that creating the EMR cluster can take a while (typically 5 minutes).
 
-Once applied, make a note of the following (you'll need to provide this information to your Segment contact):
+Once applied, make a note of the following (you'll need to enter these as settings when configuring the Data Lake):
 * The **AWS Region** and **AWS Account ID** where your Data Lake was configured
 * The **Source ID and Slug** for _each_ Segment source that will be connected to the data lake
 * The generated **EMR Cluster ID**
@@ -148,6 +202,8 @@ If all else fails, teardown and start over.
 
 Terraform 0.11 or higher is supported.
 
+NOTE: From release v0.2.0 onwards, only Terraform 0.12 or higher is supported.
+
 # Development
 
 To develop in this repository, you'll want the following tools set up:
diff --git a/modules/emr/README.md b/modules/emr/README.md
@@ -48,6 +48,62 @@ Type: `string`
 
 Default: `""`
 
+### master\_instance\_type
+
+Description: EC2 Instance Type for Master
+
+Type: `string`
+
+Default: `"m5.xlarge"`
+
+### core\_instance\_type
+
+Description: EC2 Instance Type for Core Nodes
+
+Type: `string`
+
+Default: `"m5.xlarge"`
+
+### task\_instance\_type
+
+Description: EC2 Instance Type for Task Nodes
+
+Type: `string`
+
+Default: `"m5.xlarge"`
+
+### core\_instance\_count
+
+Description: Number of instances of Core Nodes
+
+Type: `string`
+
+Default: `"2"`
+
+### core\_instance\_max\_count
+
+Description: Max number of Core Nodes used on autoscale
+
+Type: `string`
+
+Default: `"4"`
+
+### task\_instance\_count
+
+Description: Number of instances of Task Nodes
+
+Type: `string`
+
+Default: `"2"`
+
+### task\_instance\_max\_count
+
+Description: Max number of Task Nodes used on autoscale
+
+Type: `string`
+
+Default: `"4"`
+
 ### tags
 
 Description: A map of tags to add to all resources. A vendor=segment tag will be added automatically (which is also used by the IAM policy to provide Segment access to submit jobs).
diff --git a/modules/emr/main.tf b/modules/emr/main.tf
@@ -18,7 +18,7 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
   autoscaling_role = "${var.iam_emr_autoscaling_role}"
 
   master_instance_group {
-    instance_type = "m5.xlarge"
+    instance_type = "${var.master_instance_type}"
     name          = "master_group"
 
     ebs_config {
@@ -29,8 +29,8 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
   }
 
   core_instance_group {
-    instance_type  = "m5.xlarge"
-    instance_count = 2
+    instance_type  = "${var.core_instance_type}"
+    instance_count = "${var.core_instance_count}"
     name           = "core_group"
 
 
@@ -43,8 +43,8 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
     autoscaling_policy = <<EOF
 {
 	"Constraints": {
-		"MinCapacity": 2,
-		"MaxCapacity": 4
+		"MinCapacity": ${var.core_instance_count},
+		"MaxCapacity": ${var.core_instance_max_count}
 	},
 	"Rules": [{
 		"Action": {
@@ -120,8 +120,8 @@ resource "aws_emr_instance_group" "task" {
   name       = "task_group"
   cluster_id = join("", aws_emr_cluster.segment_data_lake_emr_cluster.*.id)
 
-  instance_type  = "m5.xlarge"
-  instance_count = "2"
+  instance_type  = "${var.task_instance_type}"
+  instance_count = "${var.task_instance_count}"
 
   ebs_config {
     size                 = "64"
@@ -132,8 +132,8 @@ resource "aws_emr_instance_group" "task" {
   autoscaling_policy = <<EOF
 {
 "Constraints": {
-			"MinCapacity": 2,
-			"MaxCapacity": 4
+			"MinCapacity": ${var.task_instance_count},
+			"MaxCapacity": ${var.task_instance_max_count}
 		},
 		"Rules": [{
 			"Action": {
diff --git a/modules/emr/variables.tf b/modules/emr/variables.tf
@@ -53,6 +53,48 @@ variable "iam_emr_instance_profile" {
   type        = "string"
 }
 
+variable "master_instance_type" {
+  description = "EC2 Instance Type for Master"
+  type        = "string"
+  default     = "m5.xlarge"
+}
+
+variable "core_instance_type" {
+  description = "EC2 Instance Type for Core Nodes"
+  type        = "string"
+  default     = "m5.xlarge"
+}
+
+variable "task_instance_type" {
+  description = "EC2 Instance Type for Task Nodes"
+  type        = "string"
+  default     = "m5.xlarge"
+}
+
+variable "core_instance_count" {
+  description = "Number of Core Nodes"
+  type        = "string"
+  default     = "2"
+}
+
+variable "core_instance_max_count" {
+  description = "Max number of Core Nodes used on autoscale"
+  type        = "string"
+  default     = "4"
+}
+
+variable "task_instance_count" {
+  description = "Number of instances of Task Nodes"
+  type        = "string"
+  default     = "2"
+}
+
+variable "task_instance_max_count" {
+  description = "Max number of Task Nodes used on autoscale"
+  type        = "string"
+  default     = "4"
+}
+
 locals {
   tags = "${merge(map("vendor", "segment"), var.tags)}"
 }