segmentio · Ken-Michalak · May 21, 2024 · May 31, 2024
diff --git a/aws_datalake/modules/emr/README.md b/aws_datalake/modules/emr/README.md
@@ -54,6 +54,30 @@ Type: `string`
 
 Default: `"segment-data-lake"`
 
+### emr\_cluster\_version
+
+Description: Version of emr cluster
+
+Type: `string`
+
+Default: `"6.5.0"`
+
+### additional\_applications
+
+Description: List of applications to install on the EMR cluster, besides Hadoop, Hive, and Spark.
+
+Type: `list(string)`
+
+Default: `[]`
+
+### key\_name
+
+Description: Amazon EC2 key pair that can be used to ssh to the master node as the user called hadoop.
+
+Type: `string`
+
+Default: `null`
+
 ### core\_instance\_count
 
 Description: Number of Core Nodes
@@ -102,6 +126,14 @@ Type: `string`
 
 Default: `""`
 
+### additional\_master\_security\_groups
+
+Description: String containing a comma separated list of additional Amazon EC2 security group IDs for the master node.
+
+Type: `string`
+
+Default: `""`
+
 ### slave\_security\_group
 
 Description: Identifier of the Amazon EC2 EMR-Managed security group for the slave nodes.
@@ -110,6 +142,14 @@ Type: `string`
 
 Default: `""`
 
+### additional\_slave\_security\_groups
+
+Description: String containing a comma separated list of additional Amazon EC2 security group IDs for the slave nodes.
+
+Type: `string`
+
+Default: `""`
+
 ### tags
 
 Description: A map of tags to add to all resources. A vendor=segment tag will be added automatically (which is also used by the IAM policy to provide Segment access to submit jobs).
@@ -142,6 +182,22 @@ Type: `string`
 
 Default: `"m5.xlarge"`
 
+### ebs\_size
+
+Description: Volume size, in gibibytes (GiB)
+
+Type: `string`
+
+Default: `"64"`
+
+### ebs\_type
+
+Description: Volume type. Valid options are gp3, gp2, io1, standard, st1 and sc1.
+
+Type: `string`
+
+Default: `"gp2"`
+
 ## Outputs
 
 The following outputs are exported:

diff --git a/aws_datalake/modules/emr/main.tf b/aws_datalake/modules/emr/main.tf
@@ -3,27 +3,31 @@
 resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
   name          = var.cluster_name
   release_label = "emr-${var.emr_cluster_version}"
-  applications  = ["Hadoop", "Hive", "Spark"]
+  applications  = concat(["Hadoop", "Hive", "Spark"], var.additional_applications)
 
   log_uri = "s3://${var.s3_bucket}/${var.emr_logs_s3_prefix}"
 
   ec2_attributes {
     subnet_id                         = var.subnet_id
     emr_managed_master_security_group = var.master_security_group
+    additional_master_security_groups = var.additional_master_security_groups
     emr_managed_slave_security_group  = var.slave_security_group
+    additional_slave_security_groups  = var.additional_slave_security_groups
     instance_profile                  = var.iam_emr_instance_profile
+    key_name                          = var.key_name
   }
 
   service_role     = var.iam_emr_service_role
   autoscaling_role = var.iam_emr_autoscaling_role
+  #unhealthy_node_replacement = var.unhealthy_node_replacement # FIXME: requires aws provider v5; keep disabled until then
 
   master_instance_group {
     instance_type = var.master_instance_type
     name          = "master_group"
 
     ebs_config {
-      size                 = "64"
-      type                 = "gp2"
+      size                 = var.ebs_size
+      type                 = var.ebs_type
       volumes_per_instance = 1
     }
   }
@@ -34,8 +38,8 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
     name           = "core_group"
 
     ebs_config {
-      size                 = "64"
-      type                 = "gp2"
+      size                 = var.ebs_size
+      type                 = var.ebs_type
       volumes_per_instance = 1
     }
 
@@ -94,22 +98,7 @@ resource "aws_emr_cluster" "segment_data_lake_emr_cluster" {
 EOF
   }
 
-  configurations_json = <<EOF
-  [
-    {
-      "Classification": "hive-site",
-      "Properties": {
-        "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
-      }
-    },
-    {
-      "Classification": "spark-hive-site",
-      "Properties": {
-        "hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"      }
-    }
-  ]
-EOF
-
+  configurations_json = var.configurations_json
   tags = local.tags
 }
 
@@ -121,8 +110,8 @@ resource "aws_emr_instance_group" "task" {
   instance_count = var.task_instance_count
 
   ebs_config {
-    size                 = "64"
-    type                 = "gp2"
+    size                 = var.ebs_size
+    type                 = var.ebs_type
     volumes_per_instance = 1
   }
 

diff --git a/aws_datalake/modules/emr/variables.tf b/aws_datalake/modules/emr/variables.tf
@@ -14,12 +14,24 @@ variable "master_security_group" {
   default     = ""
 }
 
+variable "additional_master_security_groups" { # forwarded unchanged to ec2_attributes.additional_master_security_groups in main.tf
+  description = "String containing a comma separated list of additional Amazon EC2 security group IDs for the master node."
+  type        = string # one comma separated string, not list(string)
+  default     = "" # NOTE(review): presumably "" means "no additional groups" — confirm provider behaviour
+}
+
 variable "slave_security_group" {
   description = "Identifier of the Amazon EC2 EMR-Managed security group for the slave nodes."
   type        = string
   default     = ""
 }
 
+variable "additional_slave_security_groups" { # forwarded unchanged to ec2_attributes.additional_slave_security_groups in main.tf
+  description = "String containing a comma separated list of additional Amazon EC2 security group IDs for the slave nodes."
+  type        = string # one comma separated string, not list(string)
+  default     = "" # NOTE(review): presumably "" means "no additional groups" — confirm provider behaviour
+}
+
 variable "tags" {
   description = "A map of tags to add to all resources. A vendor=segment tag will be added automatically (which is also used by the IAM policy to provide Segment access to submit jobs)."
   type        = map(string)
@@ -53,6 +65,19 @@ variable "iam_emr_instance_profile" {
   type        = string
 }
 
+variable "key_name" { # forwarded to ec2_attributes.key_name on the EMR cluster in main.tf
+  description = "Amazon EC2 key pair that can be used to ssh to the master node as the user called hadoop."
+  type        = string
+  default     = null # null leaves the key_name argument unset unless the caller supplies a key pair name
+}
+
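+# FIXME: unhealthy_node_replacement requires aws provider v5; uncomment this variable (and its use in main.tf) once the provider is upgraded.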
+#variable "unhealthy_node_replacement" {
+#  description = "Whether Amazon EMR should gracefully replace core nodes that have degraded within the cluster."
+#  type        = bool
+#  default     = false
+#}
+
 variable "master_instance_type" {
   description = "EC2 Instance Type for Master"
   type        = string
@@ -101,6 +126,53 @@ variable "emr_cluster_version" {
   default     = "6.5.0"
 }
 
+variable "additional_applications" { # appended to the fixed ["Hadoop", "Hive", "Spark"] list via concat() in main.tf
+  description = "List of applications to install on the EMR cluster, besides Hadoop, Hive, and Spark."
+  type        = list(string)
+  default     = [] # empty list keeps the cluster at the three built-in applications
+}
+
+variable "ebs_size" { # used as ebs_config.size for the master, core and task instance groups in main.tf
+  description = "Volume size, in gibibytes (GiB)"
+  type        = string # NOTE(review): the value is numeric; number may be the better type — confirm before changing
+  default     = "64" # same value that was previously hard-coded in each ebs_config block
+}
+
+variable "ebs_type" { # used as ebs_config.type for the master, core and task instance groups in main.tf
+  description = "Volume type. Valid options are gp3, gp2, io1, standard, st1 and sc1."
+  type        = string # NOTE(review): the allowed values are only listed in the description; nothing here validates them
+  default     = "gp2" # same value that was previously hard-coded in each ebs_config block
+}
+
+variable "configurations_json" { # assigned as-is to aws_emr_cluster.configurations_json, replacing the former inline heredoc
+  description = "JSON string for supplying list of configurations for the EMR cluster." # NOTE(review): the default below adds a spark-defaults classification that the previous inline config did not have — confirm the effect on existing clusters
+  type        = string # a caller-supplied value replaces the whole default list; nothing is merged
+  default     = <<-EOF
+    [
+      {
+        "Classification": "hive-site",
+        "Properties": {
+          "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
+        }
+      },
+      {
+        "Classification": "spark-hive-site",
+        "Properties": {
+          "hive.metastore.client.factory.class":"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
+        }
+      },
+      {
+        "Classification": "spark-defaults",
+        "Properties": {
+          "spark.history.fs.cleaner.enabled": "true",
+          "spark.history.fs.cleaner.interval": "1d",
+          "spark.history.fs.cleaner.maxAge": "7d"
+        }
+      }
+    ]
+  EOF
+}
+
 locals {
   tags = merge(tomap({"vendor" = "segment"}), var.tags)
 }