feat: Trainium blueprint Part1 #251

Merged · 10 commits · Jul 17, 2023
232 changes: 232 additions & 0 deletions ai-ml/trainium-inferentia/addons.tf
@@ -0,0 +1,232 @@
#---------------------------------------------------------------
# IRSA for EBS CSI Driver
#---------------------------------------------------------------
module "ebs_csi_driver_irsa" {
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "~> 5.20"
role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver")
attach_ebs_csi_policy = true
oidc_providers = {
main = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
}
}
tags = local.tags
}
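A quick post-apply sanity check that the IRSA wiring works — a hedged sketch using the service account name configured above; the `<cluster-name>` placeholder stands in for your actual cluster name:

# Role ARN should appear as an annotation on the EBS CSI controller service account
kubectl -n kube-system get sa ebs-csi-controller-sa \
  -o jsonpath='{.metadata.annotations.eks\.amazonaws\.com/role-arn}'

# Managed add-on status (the add-on itself is configured below in eks_blueprints_addons)
aws eks describe-addon --cluster-name <cluster-name> --addon-name aws-ebs-csi-driver --query 'addon.status'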

#---------------------------------------------------------------
# EKS Blueprints Addons
#---------------------------------------------------------------
module "eks_blueprints_addons" {
source = "aws-ia/eks-blueprints-addons/aws"
version = "~> 1.2"

cluster_name = module.eks.cluster_name
cluster_endpoint = module.eks.cluster_endpoint
cluster_version = module.eks.cluster_version
oidc_provider_arn = module.eks.oidc_provider_arn

#---------------------------------------
# Amazon EKS Managed Add-ons
#---------------------------------------
eks_addons = {
aws-ebs-csi-driver = {
service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
}
coredns = {
preserve = true
}
kube-proxy = {
preserve = true
}
# VPC CNI uses worker node IAM role policies
vpc-cni = {
preserve = true
}
}

#---------------------------------------
# Kubernetes Add-ons
#---------------------------------------
#---------------------------------------------------------------
# CoreDNS Autoscaler helps scale CoreDNS for large EKS clusters
# For further CoreDNS tuning, consider NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/
#---------------------------------------------------------------
enable_cluster_proportional_autoscaler = true
cluster_proportional_autoscaler = {
values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", {
target = "deployment/coredns"
})]
description = "Cluster Proportional Autoscaler for CoreDNS Service"
}

#---------------------------------------
# Metrics Server
#---------------------------------------
enable_metrics_server = true
metrics_server = {
values = [templatefile("${path.module}/helm-values/metrics-server-values.yaml", {})]
}

#---------------------------------------
# Cluster Autoscaler
#---------------------------------------
enable_cluster_autoscaler = true
cluster_autoscaler = {
values = [templatefile("${path.module}/helm-values/cluster-autoscaler-values.yaml", {
aws_region = var.region,
eks_cluster_id = module.eks.cluster_name
})]
}

#---------------------------------------
# Karpenter Autoscaler for EKS Cluster
#---------------------------------------
enable_karpenter = true
karpenter_enable_spot_termination = true
karpenter = {
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}

#---------------------------------------
# CloudWatch metrics for EKS
#---------------------------------------
enable_aws_cloudwatch_metrics = true
aws_cloudwatch_metrics = {
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
}

#---------------------------------------
# AWS for FluentBit - DaemonSet
#---------------------------------------
enable_aws_for_fluentbit = true
aws_for_fluentbit_cw_log_group = {
use_name_prefix = false
name = "/${local.name}/aws-fluentbit-logs" # Add-on creates this log group
retention_in_days = 30
}
aws_for_fluentbit = {
s3_bucket_arns = [
module.s3_bucket.s3_bucket_arn,
"${module.s3_bucket.s3_bucket_arn}/*}"
]
values = [templatefile("${path.module}/helm-values/aws-for-fluentbit-values.yaml", {
region = local.region,
cloudwatch_log_group = "/${local.name}/aws-fluentbit-logs"
s3_bucket_name = module.s3_bucket.s3_bucket_id
cluster_name = module.eks.cluster_name
})]
}

#---------------------------------------
# Prometheus and Grafana stack
#---------------------------------------
#---------------------------------------------------------------
# Install the monitoring stack with Prometheus and Grafana
# 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
# 2- Grafana Admin user: admin
# 3- Get admin user password: `aws secretsmanager get-secret-value --secret-id <cluster-name>-grafana --region $AWS_REGION --query "SecretString" --output text`
#---------------------------------------------------------------
enable_kube_prometheus_stack = true
kube_prometheus_stack = {
values = [templatefile("${path.module}/helm-values/prom-grafana-values.yaml", {})]
set_sensitive = [
{
name = "grafana.adminPassword"
value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
}
],
set = var.enable_amazon_prometheus ? [
{
name = "prometheus.serviceAccount.name"
value = local.amp_ingest_service_account
},
{
name = "prometheus.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
value = module.amp_ingest_irsa[0].iam_role_arn
},
{
name = "prometheus.prometheusSpec.remoteWrite[0].url"
value = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write"
},
{
name = "prometheus.prometheusSpec.remoteWrite[0].sigv4.region"
value = local.region
}
] : []
}

tags = local.tags
}
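Once the module above has been applied, the individual add-ons can be spot-checked from the CLI. A hedged sketch, assuming the default namespaces used by the blueprint (kube-system for CoreDNS, karpenter for Karpenter) and the log group name configured above:

# CoreDNS replica count should grow with the node count via the proportional autoscaler
kubectl -n kube-system get deploy coredns

# Karpenter, metrics-server and Fluent Bit pods (namespaces may differ if overridden in the helm values)
kubectl get pods -A | grep -E 'karpenter|metrics-server|fluent'

# CloudWatch log group created by the Fluent Bit add-on
aws logs describe-log-groups --log-group-name-prefix "/<cluster-name>/aws-fluentbit-logs"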

#---------------------------------------------------------------
# Data on EKS Kubernetes Addons
#---------------------------------------------------------------
# NOTE: This module will be moved to a dedicated repo and the source will be changed accordingly.
module "kubernetes_data_addons" {
# Please note that local source will be replaced once the below repo is public
# source = "https://github.com/aws-ia/terraform-aws-kubernetes-data-addons"
source = "../../workshop/modules/terraform-aws-eks-data-addons"
oidc_provider_arn = module.eks.oidc_provider_arn

enable_aws_neuron_device_plugin = true
enable_aws_efa_k8s_device_plugin = true
}
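The Neuron and EFA device plugins only advertise capacity once matching instances (e.g. trn1 with EFA) join the cluster. A hedged check, assuming the plugins run as DaemonSets and expose the usual aws.amazon.com/neuron and vpc.amazonaws.com/efa extended resources:

# Device plugin DaemonSets created by the module
kubectl get daemonset -A | grep -E 'neuron|efa'

# Extended resources advertised once trn1 nodes have joined
kubectl describe nodes | grep -E 'aws.amazon.com/neuron|vpc.amazonaws.com/efa'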

#---------------------------------------------------------------
# Grafana Admin credentials resources
# Log in to AWS Secrets Manager with the same role as Terraform to read the Grafana admin password from the secret named "<cluster-name>-grafana"
#---------------------------------------------------------------
data "aws_secretsmanager_secret_version" "admin_password_version" {
secret_id = aws_secretsmanager_secret.grafana.id
depends_on = [aws_secretsmanager_secret_version.grafana]
}

resource "random_password" "grafana" {
length = 16
special = true
override_special = "@_"
}

#tfsec:ignore:aws-ssm-secret-use-customer-key
resource "aws_secretsmanager_secret" "grafana" {
name = "${local.name}-grafana"
recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
}

resource "aws_secretsmanager_secret_version" "grafana" {
secret_id = aws_secretsmanager_secret.grafana.id
secret_string = random_password.grafana.result
}
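With the secret resources above in place, the Grafana admin password can be pulled straight from Secrets Manager and used against a port-forwarded Grafana service. A hedged example; the secret name follows the "${local.name}-grafana" pattern above and the service/namespace names assume the kube-prometheus-stack defaults used earlier:

# Retrieve the generated admin password (secret name is "<cluster-name>-grafana")
aws secretsmanager get-secret-value --secret-id <cluster-name>-grafana \
  --region $AWS_REGION --query "SecretString" --output text

# Port-forward Grafana and log in as "admin" at http://localhost:8080
kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack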

#---------------------------------------
# Karpenter Provisioners
#---------------------------------------
data "kubectl_path_documents" "karpenter_provisioners" {
pattern = "${path.module}/karpenter-provisioners/trainium-*.yaml"
vars = {
azs = local.region
eks_cluster_id = module.eks.cluster_name
}
}

resource "kubectl_manifest" "karpenter_provisioner" {
for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents)
yaml_body = each.value

depends_on = [module.eks_blueprints_addons]
}
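After apply, the rendered Trainium provisioners should be visible through the Karpenter CRDs. A hedged check, assuming the pre-v0.32 Provisioner API in use at the time of this blueprint and the default karpenter namespace:

# Provisioners rendered from karpenter-provisioners/trainium-*.yaml
kubectl get provisioners.karpenter.sh

# Karpenter controller logs if nodes are not launching
kubectl -n karpenter logs deploy/karpenter -c controller --tail=50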

#tfsec:ignore:*
module "s3_bucket" {
source = "terraform-aws-modules/s3-bucket/aws"
version = "~> 3.0"

bucket_prefix = "${local.name}-logs-"
# For example only - please evaluate for your environment
force_destroy = true

tags = local.tags
}
36 changes: 36 additions & 0 deletions ai-ml/trainium-inferentia/amp.tf
@@ -0,0 +1,36 @@
#------------------------------------------
# Amazon Prometheus
#------------------------------------------
locals {
amp_ingest_service_account = "amp-iamproxy-ingest-service-account"
amp_namespace = "kube-prometheus-stack"
}

resource "aws_prometheus_workspace" "amp" {
count = var.enable_amazon_prometheus ? 1 : 0

alias = format("%s-%s", "amp-ws", local.name)
tags = local.tags
}

#---------------------------------------------------------------
# IRSA for VPC AMP
#---------------------------------------------------------------
module "amp_ingest_irsa" {
count = var.enable_amazon_prometheus ? 1 : 0

source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "~> 5.20"
role_name = format("%s-%s", local.name, "amp-ingest")

attach_amazon_managed_service_prometheus_policy = true
amazon_managed_service_prometheus_workspace_arns = [aws_prometheus_workspace.amp[0].arn]

oidc_providers = {
main = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["${local.amp_namespace}:${local.amp_ingest_service_account}"]
}
}
tags = local.tags
}
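A hedged way to confirm the AMP workspace and ingest role exist after apply; the names follow the alias and role_name format patterns above, with <cluster-name> standing in for local.name:

# Managed Prometheus workspace created for this cluster
aws amp list-workspaces --alias amp-ws-<cluster-name> --query 'workspaces[].workspaceId'

# IRSA role used by the Prometheus server for remote write
aws iam get-role --role-name <cluster-name>-amp-ingest --query 'Role.Arn'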
45 changes: 45 additions & 0 deletions ai-ml/trainium-inferentia/cleanup.sh
@@ -0,0 +1,45 @@
#!/bin/bash
set -o errexit
set -o pipefail

targets=(
"module.kubernetes_data_addons"
"module.eks_blueprints_addons"
)

#-------------------------------------------
# Helps delete namespaces stuck in the "Terminating" state
# Rerun cleanup.sh to detect and delete any stuck resources
#-------------------------------------------
terminating_namespaces=$(kubectl get namespaces --field-selector status.phase=Terminating -o json | jq -r '.items[].metadata.name')

# If there are no terminating namespaces, report it and continue with the destroy
if [[ -z $terminating_namespaces ]]; then
echo "No terminating namespaces found"
fi

for ns in $terminating_namespaces; do
echo "Terminating namespace: $ns"
kubectl get namespace $ns -o json | sed 's/"kubernetes"//' | kubectl replace --raw "/api/v1/namespaces/$ns/finalize" -f -
done

for target in "${targets[@]}"
do
# Run the targeted destroy once, capture its output, and verify it completed
if destroy_output=$(terraform destroy -target="$target" -auto-approve 2>&1) && [[ $destroy_output == *"Destroy complete!"* ]]; then
echo "SUCCESS: Terraform destroy of $target completed successfully"
else
echo "FAILED: Terraform destroy of $target failed"
echo "$destroy_output"
exit 1
fi
done

# Final full destroy, verified the same way as the targeted destroys above
if destroy_output=$(terraform destroy -auto-approve 2>&1) && [[ $destroy_output == *"Destroy complete!"* ]]; then
echo "SUCCESS: Terraform destroy of all targets completed successfully"
else
echo "FAILED: Terraform destroy of all targets failed"
echo "$destroy_output"
exit 1
fi
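Typical usage is to run the script from the blueprint directory once you are done with the environment; it assumes kubectl is pointed at the cluster and the same AWS credentials Terraform used:

cd ai-ml/trainium-inferentia
chmod +x cleanup.sh
./cleanup.sh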