Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions aws_datalake/modules/emr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,30 @@ Type: `string`

Default: `"segment-data-lake"`

### emr\_cluster\_version

Description: Version of the EMR cluster

Type: `string`

Default: `"6.5.0"`

### additional\_applications

Description: List of applications to install on the EMR cluster, besides Hadoop, Hive, and Spark.

Type: `list(string)`

Default: `[]`

### key\_name

Description: Amazon EC2 key pair that can be used to ssh to the master node as the user called hadoop.

Type: `string`

Default: `null`

### core\_instance\_count

Description: Number of Core Nodes
Expand Down Expand Up @@ -102,6 +126,14 @@ Type: `string`

Default: `""`

### additional\_master\_security\_groups

Description: String containing a comma separated list of additional Amazon EC2 security group IDs for the master node.

Type: `string`

Default: `""`

### slave\_security\_group

Description: Identifier of the Amazon EC2 EMR-Managed security group for the slave nodes.
Expand All @@ -110,6 +142,14 @@ Type: `string`

Default: `""`

### additional\_slave\_security\_groups

Description: String containing a comma separated list of additional Amazon EC2 security group IDs for the slave nodes.

Type: `string`

Default: `""`

### tags

Description: A map of tags to add to all resources. A vendor=segment tag will be added automatically (which is also used by the IAM policy to provide Segment access to submit jobs).
Expand Down Expand Up @@ -142,6 +182,22 @@ Type: `string`

Default: `"m5.xlarge"`

### ebs\_size

Description: Volume size, in gibibytes (GiB)

Type: `string`

Default: `"64"`

### ebs\_type

Description: Volume type. Valid options are gp3, gp2, io1, standard, st1 and sc1.

Type: `string`

Default: `"gp2"`

## Outputs

The following outputs are exported:
Expand Down
35 changes: 12 additions & 23 deletions aws_datalake/modules/emr/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,31 @@
resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
name = var.cluster_name
release_label = "emr-${var.emr_cluster_version}"
applications = ["Hadoop", "Hive", "Spark"]
applications = concat(["Hadoop", "Hive", "Spark"], var.additional_applications)

log_uri = "s3://${var.s3_bucket}/${var.emr_logs_s3_prefix}"

ec2_attributes {
subnet_id = var.subnet_id
emr_managed_master_security_group = var.master_security_group
additional_master_security_groups = var.additional_master_security_groups
emr_managed_slave_security_group = var.slave_security_group
additional_slave_security_groups = var.additional_slave_security_groups
instance_profile = var.iam_emr_instance_profile
key_name = var.key_name
}

service_role = var.iam_emr_service_role
autoscaling_role = var.iam_emr_autoscaling_role
#unhealthy_node_replacement = var.unhealthy_node_replacement

master_instance_group {
instance_type = var.master_instance_type
name = "master_group"

ebs_config {
size = "64"
type = "gp2"
size = var.ebs_size
type = var.ebs_type
volumes_per_instance = 1
}
}
Expand All @@ -34,8 +38,8 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
name = "core_group"

ebs_config {
size = "64"
type = "gp2"
size = var.ebs_size
type = var.ebs_type
volumes_per_instance = 1
}

Expand Down Expand Up @@ -94,22 +98,7 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
EOF
}

configurations_json = <<EOF
[
{
"Classification": "hive-site",
"Properties": {
"hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
}
},
{
"Classification": "spark-hive-site",
"Properties": {
"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" }
}
]
EOF

configurations_json = var.configurations_json
tags = local.tags
}

Expand All @@ -121,8 +110,8 @@ resource "aws_emr_instance_group" "task" {
instance_count = var.task_instance_count

ebs_config {
size = "64"
type = "gp2"
size = var.ebs_size
type = var.ebs_type
volumes_per_instance = 1
}

Expand Down
72 changes: 72 additions & 0 deletions aws_datalake/modules/emr/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,24 @@ variable "master_security_group" {
default = ""
}

# Extra security groups for the master node, on top of the EMR-managed one.
# Supplied as one comma separated string (not a list); the empty default adds none.
variable "additional_master_security_groups" {
  type        = string
  default     = ""
  description = "String containing a comma separated list of additional Amazon EC2 security group IDs for the master node."
}

# EMR-managed security group applied to the slave (core/task) nodes.
# Defaults to an empty string when the caller does not provide one.
variable "slave_security_group" {
  type        = string
  default     = ""
  description = "Identifier of the Amazon EC2 EMR-Managed security group for the slave nodes."
}

# Extra security groups for the slave nodes, on top of the EMR-managed one.
# Supplied as one comma separated string (not a list); the empty default adds none.
# The description mirrors additional_master_security_groups so generated docs read consistently.
variable "additional_slave_security_groups" {
  description = "String containing a comma separated list of additional Amazon EC2 security group IDs for the slave nodes."
  type        = string
  default     = ""
}

variable "tags" {
description = "A map of tags to add to all resources. A vendor=segment tag will be added automatically (which is also used by the IAM policy to provide Segment access to submit jobs)."
type = map(string)
Expand Down Expand Up @@ -53,6 +65,19 @@ variable "iam_emr_instance_profile" {
type = string
}

# Optional EC2 key pair name for the cluster's ec2_attributes.
# The null default leaves the argument unset.
variable "key_name" {
  type        = string
  default     = null
  description = "Amazon EC2 key pair that can be used to ssh to the master node as the user called hadoop."
}

# FIXME requires aws provider v5
#variable "unhealthy_node_replacement" {
# description = "Whether Amazon EMR should gracefully replace core nodes that have degraded within the cluster."
# type = bool
# default = false
#}

variable "master_instance_type" {
description = "EC2 Instance Type for Master"
type = string
Expand Down Expand Up @@ -101,6 +126,53 @@ variable "emr_cluster_version" {
default = "6.5.0"
}

# Applications appended to the fixed base set (Hadoop, Hive, Spark) that the
# cluster resource always installs. An empty list installs only the base set.
variable "additional_applications" {
  type        = list(string)
  default     = []
  description = "List of applications to install on the EMR cluster, besides Hadoop, Hive, and Spark."
}

# Size of the EBS volume attached to each instance; the same value is used by
# every ebs_config block in this module (master, core and task groups).
variable "ebs_size" {
  type        = string
  default     = "64"
  description = "Volume size, in gibibytes (GiB)"
}

# Type of the EBS volume attached to each instance; the same value is used by
# every ebs_config block in this module (master, core and task groups).
variable "ebs_type" {
  type        = string
  default     = "gp2"
  description = "Volume type. Valid options are gp3, gp2, io1, standard, st1 and sc1."
}

# JSON configuration list handed to the EMR cluster's configurations_json argument.
#
# The default contains three classifications:
#   - hive-site and spark-hive-site: set hive.metastore.client.factory.class to the
#     AWS Glue Data Catalog client factory.
#   - spark-defaults: enables the Spark history filesystem cleaner with a 1d
#     interval and a 7d max age.
#
# NOTE(review): a caller-supplied value replaces this default wholesale, so an
# override presumably needs to repeat the Glue metastore entries if they are
# still required — confirm against how callers use this module.
variable "configurations_json" {
description = "JSON string for supplying list of configurations for the EMR cluster."
type = string
default = <<-EOF
[
{
"Classification": "hive-site",
"Properties": {
"hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
}
},
{
"Classification": "spark-hive-site",
"Properties": {
"hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
}
},
{
"Classification": "spark-defaults",
"Properties": {
"spark.history.fs.cleaner.enabled": "true",
"spark.history.fs.cleaner.interval": "1d",
"spark.history.fs.cleaner.maxAge": "7d"
}
}
]
EOF
}

locals {
  # Effective tag set: the vendor=segment tag combined with the caller's tags.
  # NOTE(review): merge() gives precedence to later arguments, so a "vendor"
  # key in var.tags would replace the segment value — confirm that is intended.
  tags = merge(
    tomap({
      vendor = "segment"
    }),
    var.tags,
  )
}