Deploy Cellenics infrastructure on AWS #506

Workflow file for this run

.github/workflows/deploy-infra.yaml at c8e0fb7

	name: Deploy Cellenics infrastructure on AWS

	# Runs only on manual dispatch. The three inputs choose the environment name,
	# the group of jobs to run and the environment type.
	on:
	  workflow_dispatch:
	    inputs:
	      environment_name:
	        description: Select the environment name to run the actions on
	        required: true
	        type: string
	        default: 'all'
	      workflow_actions:
	        description: Select actions to perform
	        type: choice
	        default: 'configure cluster'
	        options:
	          - 'create and configure cluster'
	          - 'configure cluster'
	          - 'deploy monitoring'
	      environment_type:
	        description: Select environment type
	        type: choice
	        default: 'staging'
	        options:
	          - 'staging'
	          - 'production'
	          - 'staging and production'

	# Every run shares this one concurrency key, so only a single pipeline
	# can run at a time; this prevents undefined cluster states.
	concurrency: cluster-update-mutex

	# id-token: write is presumably needed for the OIDC-based role-to-assume
	# credential steps in the jobs; confirm before tightening.
	permissions:
	  contents: read
	  id-token: write

	# Job selection once load-config and check-secrets have finished:
	#   "create and configure cluster" -> create-eks-cluster, then configure-cluster
	#   "configure cluster"            -> configure-cluster only
	#   "deploy monitoring"            -> deploy-monitoring only
	jobs:
	  # Calls the reusable load-config workflow with the dispatch inputs. Later
	  # jobs read its environment_names and deployment_matrix outputs.
	  load-config:
	    uses: ./.github/workflows/load-config.yaml
	    with:
	      environment_type: ${{ github.event.inputs.environment_type }}
	      environment_name: ${{ github.event.inputs.environment_name }}

	# Fails early, once per environment name, when a required secret is empty.
	check-secrets:
	  name: Check that sufficient secrets are specified for environment name
	  runs-on: ubuntu-20.04
	  needs: load-config
	  strategy:
	    matrix:
	      environment_name: ${{ fromJson(needs.load-config.outputs.environment_names) }}
	  environment: ${{ matrix.environment_name }}
	  steps:
	    - id: check-secrets-for-environment
	      name: Check if necessary secrets are installed.
	      # Secrets are handed over as environment variables instead of being
	      # interpolated into the script text, so a value containing quotes or
	      # `$` cannot break or alter the script.
	      env:
	        ACM_CERTIFICATE_ARN: ${{ secrets.ACM_CERTIFICATE_ARN }}
	        AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
	        API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
	        PRIMARY_DOMAIN_NAME: ${{ secrets.PRIMARY_DOMAIN_NAME }}
	        DOMAIN_NAME: ${{ secrets.DOMAIN_NAME }}
	      run: |-
	        echo Checking if secrets are defined in the repository.

	        ERROR=""

	        # require_secret NAME MESSAGE...
	        # Prints MESSAGE and flags an error when environment variable NAME is empty.
	        require_secret() {
	          local var_name="$1"
	          shift
	          if [ -z "${!var_name}" ]
	          then
	            echo "$*"
	            ERROR=true
	          fi
	        }

	        require_secret ACM_CERTIFICATE_ARN AWS certificate ARN is not defined.
	        require_secret AWS_ACCOUNT_ID AWS Account ID is not defined.
	        require_secret API_TOKEN_GITHUB GitHub deploy key access token is not defined.
	        require_secret PRIMARY_DOMAIN_NAME Secret PRIMARY_DOMAIN_NAME is not set in repository secrets. Make sure this secret exists in the repository secrets.
	        require_secret DOMAIN_NAME Secret DOMAIN_NAME is not set in repository secrets. Make sure this secret exists in the repository secrets.

	        # Report every missing secret before failing, not just the first one.
	        if [ -n "$ERROR" ]
	        then
	          echo
	          echo This workflow requires some secrets to complete.
	          echo Please make sure they are created by adding/rotating them manually.
	          exit 1
	        fi

	# Runs only for the "create and configure cluster" action; matrix entries
	# are processed one at a time (max-parallel: 1).
	create-eks-cluster:
	  name: Create EKS cluster
	  if: github.event.inputs.workflow_actions == 'create and configure cluster'
	  needs:
	    - check-secrets
	    - load-config
	  runs-on: ubuntu-20.04
	  environment: ${{ matrix.environment.name }}
	  strategy:
	    max-parallel: 1
	    matrix:
	      environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
	  env:
	    # Used by the steps to build names such as biomage-$CLUSTER_ENV.
	    CLUSTER_ENV: ${{ matrix.environment.type }}
	  steps:
	    - id: checkout
	      name: Check out source code
	      uses: actions/checkout@v3

	    - id: setup-aws
	      name: Configure AWS credentials
	      uses: aws-actions/configure-aws-credentials@v4
	      with:
	        aws-region: ${{ secrets.AWS_REGION }}
	        role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
	        role-duration-seconds: 4800

	# Renders /tmp/cluster-$CLUSTER_ENV.yaml, the eksctl config consumed by the
	# following steps, from the cluster template plus optional VPC values.
	- id: fill-metadata
	  name: Add name and region to the eksctl file.
	  env:
	    AWS_REGION: ${{ secrets.AWS_REGION }}
	    CELLENICS_VPC_ID: ${{ secrets.CELLENICS_VPC_ID }}
	  run: |-
	    export CLUSTER_NAME="biomage-$CLUSTER_ENV"

	    yq -i '
	      .metadata.name = strenv(CLUSTER_NAME) |
	      .metadata.region = strenv(AWS_REGION)
	    ' infra/config/cluster/cluster-template.yaml

	    # CELLENICS_VPC_ID is set if using custom cluster deployment. In this case, use the custom template file.
	    # If not set, create an empty template file to let eksctl create a new cluster for the deployment.
	    if [ -n "$CELLENICS_VPC_ID" ]; then

	      # Query each subnet kind once and pick both IDs from the same response.
	      PRIVATE_SUBNETS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=false")
	      PUBLIC_SUBNETS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=true")

	      export PRIVATE_SUBNET_1_ID=$(echo "$PRIVATE_SUBNETS" | jq -r '.Subnets[0].SubnetId')
	      export PRIVATE_SUBNET_2_ID=$(echo "$PRIVATE_SUBNETS" | jq -r '.Subnets[1].SubnetId')
	      export PUBLIC_SUBNET_1_ID=$(echo "$PUBLIC_SUBNETS" | jq -r '.Subnets[0].SubnetId')
	      export PUBLIC_SUBNET_2_ID=$(echo "$PUBLIC_SUBNETS" | jq -r '.Subnets[1].SubnetId')

	      # jq prints "null" for a missing array element; stop here rather than
	      # writing "null" subnet IDs into the cluster config.
	      for subnet_id in "$PRIVATE_SUBNET_1_ID" "$PRIVATE_SUBNET_2_ID" "$PUBLIC_SUBNET_1_ID" "$PUBLIC_SUBNET_2_ID"; do
	        if [ -z "$subnet_id" ] || [ "$subnet_id" = "null" ]; then
	          echo "VPC $CELLENICS_VPC_ID does not have two private and two public subnets."
	          exit 1
	        fi
	      done

	      yq '
	        .vpc.id = strenv(CELLENICS_VPC_ID) |
	        .vpc.subnets.private.private-1 = strenv(PRIVATE_SUBNET_1_ID) |
	        .vpc.subnets.private.private-2 = strenv(PRIVATE_SUBNET_2_ID) |
	        .vpc.subnets.public.public-1 = strenv(PUBLIC_SUBNET_1_ID) |
	        .vpc.subnets.public.public-2 = strenv(PUBLIC_SUBNET_2_ID)
	      ' infra/config/cluster/cluster-config-template.yaml > /tmp/cluster-config-values.yaml
	    else
	      touch /tmp/cluster-config-values.yaml
	    fi

	    # Deep-merge the VPC values (possibly empty) over the cluster template.
	    yq eval-all '. as $item ireduce ({}; . *d $item)' infra/config/cluster/cluster-template.yaml /tmp/cluster-config-values.yaml > /tmp/cluster-$CLUSTER_ENV.yaml
	    cat /tmp/cluster-$CLUSTER_ENV.yaml

	# Downloads the latest eksctl release for this OS and puts it on the PATH.
	- id: install-eksctl
	  name: Install eksctl
	  run: |-
	    curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
	    sudo mv /tmp/eksctl /usr/local/bin

	- id: create-clusters
	  name: Attempt to create clusters from spec.
	  # this step will always pass, irrespective of whether creation was successful or not.
	  # this is because the cluster may already exist. we will check for this condition
	  # on failure in the next step
	  continue-on-error: true
	  run: |-
	    # Mirror all output into a log file that the next step inspects.
	    exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)

	    eksctl create cluster -f /tmp/cluster-$CLUSTER_ENV.yaml

	    echo "outcome=created" >> $GITHUB_OUTPUT

	- id: check-for-failure
	  name: Check for reason of failure if cluster creation failed.
	  if: steps.create-clusters.outcome == 'failure'
	  run: |-
	    # Check if failure was caused by an already exists exception.
	    # If not, the job should fail.
	    # Any number of matches counts: eksctl may log the exception more than
	    # once, which must not be mistaken for an unrelated failure.
	    if ! grep -q AlreadyExistsException /tmp/eksctl-$CLUSTER_ENV.log
	    then
	      echo Step failed for reason other than stack already existing.
	      echo Job failing...
	      echo "reason=error" >> $GITHUB_OUTPUT
	      exit 1
	    fi

	    echo Cluster already exists.
	    echo "reason=already-exists" >> $GITHUB_OUTPUT

	# Best effort: a failure here does not fail the job (continue-on-error).
	- id: update-addons-for-cluster
	  name: Attempt to create addons for cluster.
	  continue-on-error: true
	  run: |-
	    exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)

	    eksctl create addon -f /tmp/cluster-$CLUSTER_ENV.yaml

	# The next two steps only run when cluster creation failed because the
	# cluster already exists; they reconcile it with the rendered config.
	- id: update-nodegroup
	  name: Attempt to update node groups for existing cluster.
	  if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
	  run: |-
	    eksctl create nodegroup --config-file=/tmp/cluster-$CLUSTER_ENV.yaml
	    eksctl delete nodegroup --config-file /tmp/cluster-$CLUSTER_ENV.yaml --only-missing --approve

	# note: iam service accounts should really be created from within the helm chart as seen here:
	# https://docs.aws.amazon.com/eks/latest/userguide/specify-service-account-role.html
	- id: update-serviceaccounts
	  name: Attempt to update IAM service accounts for existing cluster.
	  if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
	  run: |-
	    eksctl utils associate-iam-oidc-provider --config-file=/tmp/cluster-$CLUSTER_ENV.yaml --approve
	    eksctl create iamserviceaccount --config-file=/tmp/cluster-$CLUSTER_ENV.yaml
	    eksctl delete iamserviceaccount --config-file=/tmp/cluster-$CLUSTER_ENV.yaml --only-missing --approve

	configure-cluster:
	  name: Configure Kubernetes resources on the EKS cluster
	  runs-on: ubuntu-20.04
	  needs: [check-secrets, create-eks-cluster, load-config]
	  # always() lets this job run even when create-eks-cluster was skipped.
	  # It still requires check-secrets to have succeeded and cluster creation
	  # to have either succeeded or been skipped.
	  if: >-
	    always()
	    && (github.event.inputs.workflow_actions == 'create and configure cluster' || github.event.inputs.workflow_actions == 'configure cluster')
	    && (needs.check-secrets.result == 'success')
	    && (needs.create-eks-cluster.result == 'success' || needs.create-eks-cluster.result == 'skipped')
	  env:
	    CLUSTER_ENV: ${{ matrix.environment.type }}
	    API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
	  strategy:
	    max-parallel: 1
	    matrix:
	      environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
	  environment: ${{ matrix.environment.name }}
	  steps:
	    - id: checkout
	      name: Check out source code
	      uses: actions/checkout@v3

	    - id: setup-aws
	      name: Configure AWS credentials
	      uses: aws-actions/configure-aws-credentials@v4
	      with:
	        role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
	        role-duration-seconds: 4800
	        aws-region: ${{ secrets.AWS_REGION }}

	# Point kubectl/helm at the biomage-$CLUSTER_ENV cluster.
	- id: add-kubeconfig
	  name: Add k8s config file for existing cluster.
	  run: |-
	    aws eks update-kubeconfig --name biomage-$CLUSTER_ENV

	- id: deploy-metrics-server
	  name: Deploy k8s metrics server
	  run: |-
	    kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml

	- id: install-helm
	  name: Install Helm
	  run: |-
	    sudo snap install helm --classic

	- id: install-eksctl
	  name: Install eksctl
	  run: |-
	    curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
	    sudo mv /tmp/eksctl /usr/local/bin

	- id: deploy-load-balancer-role
	  name: Deploy permissions for AWS load balancer controller
	  run: |-
	    curl -o iam-policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.7.2/docs/install/iam_policy.json
	    # `|| true`: policy creation is best effort, so a failure (for example
	    # because the policy already exists) does not stop the step.
	    aws iam create-policy \
	      --policy-name AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV \
	      --policy-document file://iam-policy.json || true
	    eksctl create iamserviceaccount \
	      --cluster=biomage-$CLUSTER_ENV \
	      --namespace=kube-system \
	      --name=aws-load-balancer-controller \
	      --attach-policy-arn=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:policy/AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV \
	      --role-name eksctl-$CLUSTER_ENV-load-balancer-controller-role \
	      --override-existing-serviceaccounts \
	      --approve

	# we need to retry this due to an active issue with the AWS Load Balancer Controller
	# where there are intermittent failures that are only fixable by retrying
	# see issue at https://github.com/kubernetes-sigs/aws-load-balancer-controller/issues/2071
	- id: install-lbc
	  name: Deploy AWS Load Balancer Controller
	  uses: nick-invision/retry@v2
	  with:
	    timeout_seconds: 600
	    max_attempts: 20
	    retry_on: error
	    # Wait a random 5-15 seconds between attempts.
	    on_retry_command: sleep $(shuf -i 5-15 -n 1)
	    command: |-
	      helm repo add eks https://aws.github.io/eks-charts
	      wget https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml
	      kubectl apply -f crds.yaml
	      helm repo update
	      helm upgrade aws-load-balancer-controller eks/aws-load-balancer-controller \
	        --namespace kube-system \
	        --set serviceAccount.create=false \
	        --set serviceAccount.name=aws-load-balancer-controller \
	        --set clusterName=biomage-$CLUSTER_ENV \
	        --install --wait

	# Reads the publicFacing flag of the current environment from the config
	# file; later steps consume it as steps.platform-public-facing.outputs.result.
	- id: platform-public-facing
	  name: Get config for whether platform should be public facing
	  env:
	    ENVIRONMENT_NAME: ${{ matrix.environment.name }}
	  uses: mikefarah/yq@master
	  with:
	    cmd: yq '.[env(ENVIRONMENT_NAME)].publicFacing' 'infra/config/github-environments-config.yaml'

	# For HMS ACM_CERTIFICATE_ARN_STAGING exists, having different domains for staging and prod
	# so we need to check if it exists, otherwise set it to ACM_CERTIFICATE_ARN
	# same applies for PRIMARY_DOMAIN_NAME_STAGING
	#
	# This step must run before install-elb-503-subscription-endpoint, which
	# reads its acm-certificate-arn output: a step's outputs are empty until
	# that step has run.
	- id: setup-domain
	  name: Compile environment-specific domain name
	  run: |-
	    if [ "${{ matrix.environment.type }}" = "production" ]; then
	      PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
	      DOMAIN_NAME="${{ secrets.DOMAIN_NAME }}"
	      ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
	    fi
	    if [ "${{ matrix.environment.type }}" = "staging" ]; then
	      PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME_STAGING }}"
	      if [ -z "$PRIMARY_DOMAIN_NAME" ]; then
	        PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
	      fi
	      # NOTE(review): unlike the other two values, DOMAIN_NAME has no
	      # fallback to the non-staging secret - confirm this is intended.
	      DOMAIN_NAME="${{ secrets.DOMAIN_NAME_STAGING }}"
	      ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN_STAGING }}"
	      if [ -z "$ACM_CERTIFICATE_ARN" ]; then
	        ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
	      fi
	    fi
	    echo "primary-domain-name=$PRIMARY_DOMAIN_NAME" >> $GITHUB_OUTPUT
	    echo "domain-name=$DOMAIN_NAME" >> $GITHUB_OUTPUT
	    echo "acm-certificate-arn=$ACM_CERTIFICATE_ARN" >> $GITHUB_OUTPUT

	- id: install-elb-503-subscription-endpoint
	  name: Install ELB 503 subscription endpoint
	  env:
	    PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
	    ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
	  run: |-
	    echo "value of publicFacing: $PUBLIC_FACING"

	    # Check that publicFacing is set to true or false
	    if [ "$PUBLIC_FACING" != "true" ] && [ "$PUBLIC_FACING" != "false" ]; then
	      echo "value of publicFacing in infra/config/github-environments-config.yaml is not set to true or false"
	      exit 1
	    fi

	    # this is needed so SNS does not stop trying to subscribe to not-yet-deployed
	    # API staging environments because their endpoints are not yet available.
	    helm upgrade aws-elb-503-subscription-endpoint infra/aws-elb-503-subscription-endpoint \
	      --namespace default \
	      --set clusterEnv=$CLUSTER_ENV \
	      --set acmCertificate="$ACM_CERTIFICATE_ARN" \
	      --set-string publicFacing="$PUBLIC_FACING" \
	      --install --wait

	- id: deploy-env-loadbalancer
	  name: Deploy AWS Application Load Balancer for environment
	  uses: aws-actions/aws-cloudformation-github-deploy@v1
	  with:
	    parameter-overrides: "Environment=${{ matrix.environment.type }},PublicFacing=${{ steps.platform-public-facing.outputs.result }}"
	    name: "biomage-k8s-alb-${{ matrix.environment.type }}"
	    template: 'infra/cf-loadbalancer.yaml'
	    no-fail-on-empty-changeset: "1"


	# This step should be run only once per deployment. The associated Route 53 records to be created
	# e.g. DOMAIN_NAME and *.DOMAIN_NAME should be deleted before running this step; otherwise this step fails.
	# Refer to the new deployment runbook to learn more.
	# - id: deploy-route53
	# name: Deploy Route 53 DNS records to ELB
	# uses: aws-actions/aws-cloudformation-github-deploy@v1
	# with:
	# parameter-overrides: "Environment=${{ matrix.environment.type }},DNSName=${{ steps.deploy-env-loadbalancer.outputs.DNSName }},HostedZoneId=${{ steps.deploy-env-loadbalancer.outputs.CanonicalHostedZoneID }},PrimaryDomainName=${{ steps.setup-domain.outputs.primary-domain-name }},DomainName=${{ steps.setup-domain.outputs.domain-name }}"
	# name: "biomage-alb-route53-${{ matrix.environment.type }}"
	# template: 'infra/cf-route53.yaml'
	# no-fail-on-empty-changeset: "1"

	- id: deploy-xray-daemon
	  name: Deploy AWS X-Ray daemon
	  run: |-
	    helm upgrade "aws-xray-daemon" infra/aws-xray-daemon \
	      --namespace default \
	      --set iamRole=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/xray-daemon-role-$CLUSTER_ENV \
	      --install --wait

	# Installs the chart from a pinned release tarball (2.17.2).
	- id: install-ebs-csi-driver
	  name: Install AWS EBS Container Storage Interface (CSI) drivers
	  run: |-
	    helm upgrade \
	      aws-ebs-csi-driver https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/download/helm-chart-aws-ebs-csi-driver-2.17.2/aws-ebs-csi-driver-2.17.2.tgz \
	      --namespace kube-system \
	      --set enableVolumeScheduling=true \
	      --set enableVolumeResizing=true \
	      --set enableVolumeSnapshot=true \
	      --install --wait

	- id: deploy-read-only-group
	  name: Deploy read-only permission definition for cluster
	  run: |-
	    helm upgrade "biomage-read-only-group" infra/biomage-read-only-group \
	      --install --wait

	- id: deploy-state-machine-role
	  name: Deploy AWS Step Function (state machine) roles
	  uses: aws-actions/aws-cloudformation-github-deploy@v1
	  with:
	    parameter-overrides: "Environment=${{ matrix.environment.type }}"
	    name: "biomage-state-machine-role-${{ matrix.environment.type }}"
	    template: 'infra/cf-state-machine-role.yaml'
	    capabilities: 'CAPABILITY_IAM,CAPABILITY_NAMED_IAM'
	    no-fail-on-empty-changeset: "1"

	# Drops every existing user (not role) identity mapping; the steps below
	# then re-create the mappings that should exist.
	- id: remove-identitymappings
	  name: Remove all previous identity mappings for IAM users
	  run: |-
	    eksctl get iamidentitymapping --cluster=biomage-$CLUSTER_ENV --output=json | \
	      jq -r '.[] | select(.userarn != null) | .userarn' > /tmp/users_to_remove
	    while IFS= read -r user
	    do
	      echo "Remove rights of $user"
	      eksctl delete iamidentitymapping \
	        --cluster=biomage-$CLUSTER_ENV \
	        --arn "$user" \
	        --all
	    done < "/tmp/users_to_remove"

	# see https://eksctl.io/usage/iam-identity-mappings/
	# Grant login rights to ci-iac-role
	- id: add-ci-iac-oidc-cluster-role
	  name: Allow the OIDC role to log in to our cluster.
	  run: |-
	    eksctl create iamidentitymapping \
	      --cluster=biomage-$CLUSTER_ENV \
	      --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/ci-iac-role \
	      --group system:masters \
	      --username ci-iac-role

	# SSO access to cluster is only added if accessing AWS and cluster using SSO
	- id: allow-sso-roles-to-access-cluster
	  env:
	    SSO_ROLE: ${{ secrets.SSO_ROLE }}
	  # Skipped when the SSO_ROLE secret is empty.
	  if: ${{ env.SSO_ROLE != '' }}
	  name: Allow SSO role to log into the cluster
	  run: |-
	    eksctl create iamidentitymapping \
	      --cluster biomage-$CLUSTER_ENV \
	      --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/${{ env.SSO_ROLE }} \
	      --username sso-cluster-admin \
	      --no-duplicate-arns \
	      --group system:masters

	- id: add-state-machine-cluster-role
	  name: Grant rights to the state machine IAM role.
	  run: |-
	    eksctl create iamidentitymapping \
	      --cluster=biomage-$CLUSTER_ENV \
	      --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/state-machine-role-$CLUSTER_ENV \
	      --group state-machine-runner-group \
	      --username state-machine-runner

	# NOTE: after updating this step, make sure you apply the updates in other relevant Github Actions workflows
	- id: update-identitymapping-admin
	  name: Add cluster admin rights to everyone on the admin list.
	  run: |-
	    echo "Setting cluster admin rights for ${{matrix.environment.name}} in ${{matrix.environment.type}} environment"
	    ADMINS="${{ join(matrix.environment.admins, ' ') }}"
	    echo $ADMINS
	    # $ADMINS is left unquoted on purpose: the space-separated list is
	    # split into one user per iteration.
	    for user in $ADMINS; do
	      echo "Adding cluster admin rights to $user"
	      eksctl create iamidentitymapping \
	        --cluster=biomage-$CLUSTER_ENV \
	        --arn "arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:user/$user" \
	        --group system:masters \
	        --username "$user"
	    done

	###
	### INSTALL AND CONFIGURE FLUX V2 ###
	###
	- id: using-self-signed-certificate
	  name: Get config for whether deployment is using self-signed certificate
	  uses: mikefarah/yq@master
	  with:
	    cmd: yq '.[env(ENVIRONMENT_NAME)].selfSignedCertificate' 'infra/config/github-environments-config.yaml'
	  env:
	    ENVIRONMENT_NAME: ${{ matrix.environment.name }}

	# Writes the account-specific values into infra/config/account-config.yaml
	# in place.
	- id: fill-account-specific-metadata
	  name: Fill in account specific metadata in ConfigMap
	  env:
	    AWS_REGION: ${{ secrets.AWS_REGION }}
	    AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
	    DOMAIN_NAME: ${{ steps.setup-domain.outputs.domain-name }}
	    ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
	    PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
	    SELF_SIGNED_CERTIFICATE: ${{ steps.using-self-signed-certificate.outputs.result }}
	  run: |-
	    yq -i '
	      .myAccount.domainName = strenv(DOMAIN_NAME) |
	      .myAccount.region = strenv(AWS_REGION) |
	      .myAccount.accountId = strenv(AWS_ACCOUNT_ID) |
	      .myAccount.publicFacing = strenv(PUBLIC_FACING) |
	      .myAccount.acmCertificate = strenv(ACM_CERTIFICATE_ARN) |
	      .myAccount.selfSignedCertificate = strenv(SELF_SIGNED_CERTIFICATE)
	    ' infra/config/account-config.yaml

	    # The Datadog keys are only added when the API key secret is set.
	    if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]]
	    then
	      export DATADOG_API_KEY="${{ secrets.DATADOG_API_KEY }}"
	      export DATADOG_APP_KEY="${{ secrets.DATADOG_APP_KEY }}"
	      yq -i '
	        .myAccount.datadogAppKey = strenv(DATADOG_APP_KEY) |
	        .myAccount.datadogApiKey = strenv(DATADOG_API_KEY)
	      ' infra/config/account-config.yaml
	    fi

	    cat infra/config/account-config.yaml

	# Best effort: creating the namespace fails when it already exists.
	- id: create-flux-namespace
	  name: Attempt to create flux namespace
	  continue-on-error: true
	  run: |-
	    kubectl create namespace flux-system

	- id: create-account-information-configmap
	  name: Create a configmap containing AWS account specific details
	  continue-on-error: false
	  run: |-
	    # Render the ConfigMap client-side and pipe it to `kubectl apply`, so
	    # the step works both for a new and for an existing configmap.
	    # `--dry-run=client` replaces the deprecated bare `--dry-run`.
	    kubectl create configmap account-config --from-file=infra/config/account-config.yaml -n flux-system -o yaml --dry-run=client | kubectl apply -f -

	- id: install-flux-v2
	  name: Install flux CLI version 2.1.2
	  run: |-
	    curl -s https://fluxcd.io/install.sh | sudo FLUX_VERSION=2.1.2 bash

	# Best effort: the secret may not exist yet.
	- id: delete-old-flux-github-deploy-key
	  name: Attempt to delete previous github flux deploy key
	  continue-on-error: true
	  run: |-
	    kubectl -n flux-system delete secret flux-system

	# Bootstraps Flux against the `releases` repository of the repository
	# owner, and exports flux-full-repo / flux-path through GITHUB_ENV for the
	# later push steps.
	- id: install-flux
	  name: Install Flux to EKS cluster
	  env:
	    GITHUB_TOKEN: ${{ secrets.API_TOKEN_GITHUB }}
	    AWS_REGION: ${{ secrets.AWS_REGION }}
	    AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
	    ENVIRONMENT_NAME: ${{ matrix.environment.name }}
	  run: |-

	    # Refer to https://github.com/fluxcd/flux2/releases
	    FLUX_VERSION=v2.1.2
	    FLUX_REPO=releases
	    FLUX_PATH=deployments/$ENVIRONMENT_NAME-$CLUSTER_ENV
	    REPO_FULL_PATH=$GITHUB_REPOSITORY_OWNER/$FLUX_REPO

	    echo "flux-full-repo=$(echo $REPO_FULL_PATH)" >> $GITHUB_ENV
	    echo "flux-path=$(echo $FLUX_PATH)" >> $GITHUB_ENV

	    # `--interval` must start with two ASCII hyphens; an en dash here makes
	    # the flag unrecognisable as an option.
	    args=(
	      --version $FLUX_VERSION
	      --owner $GITHUB_REPOSITORY_OWNER
	      --repository $FLUX_REPO
	      --branch master
	      --path $FLUX_PATH
	      --timeout 40s
	      --interval 2m
	      --components-extra=image-reflector-controller,image-automation-controller
	      --namespace flux-system
	      --cluster arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
	      --context arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
	    )

	    # Only staging gets a read-write deploy key.
	    if [ "${{ matrix.environment.type }}" = "staging" ]
	    then
	      echo Flux will be deployed in staging with read and write permissions
	      args+=(--read-write-key)
	    elif [ "${{ matrix.environment.type }}" = "production" ]
	    then
	      echo Flux will be deployed in production with read-only permissions
	    fi

	    flux bootstrap github "${args[@]}"

	- id: fill-in-sync-yaml
	  name: Create the sync.yaml file that contains the Kustomization to sync the cluster
	  run: |-
	    export SPEC_PATH="./$CLUSTER_ENV"
	    yq -i '
	      .spec.path = strenv(SPEC_PATH)
	    ' infra/flux/sync.yaml

	    cat infra/flux/sync.yaml

	# destination_repo / destination_folder come from the flux-full-repo and
	# flux-path variables that install-flux wrote to GITHUB_ENV.
	- id: push-sync-yaml
	  name: Push the sync.yaml file that was filled in during the previous step
	  uses: dmnemec/copy_file_to_another_repo_action@v1.0.4
	  env:
	    API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
	  with:
	    source_file: infra/flux/sync.yaml
	    destination_repo: ${{ env.flux-full-repo }}
	    destination_folder: ${{ env.flux-path }}
	    user_email: ci@biomage.net
	    user_name: 'Biomage CI/CD'

	# Substitutes the AWS_ACCOUNT_ID and CLUSTER_ENV placeholders in the
	# template with the values for this run.
	- id: fill-kustomization-template
	  name: Fill in Kustomization template
	  env:
	    AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
	  run: |-
	    cat infra/flux/kustomization-template.yaml \
	      | sed "s/AWS_ACCOUNT_ID/$AWS_ACCOUNT_ID/g" \
	      | sed "s/CLUSTER_ENV/$CLUSTER_ENV/g" \
	      > infra/flux/kustomization.yaml

	    cat infra/flux/kustomization.yaml

	- id: push-kustomization-yaml
	  name: Push the kustomization.yaml file to apply our custom config
	  uses: dmnemec/copy_file_to_another_repo_action@v1.0.4
	  env:
	    API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
	  with:
	    source_file: infra/flux/kustomization.yaml
	    destination_repo: ${{ env.flux-full-repo }}
	    destination_folder: ${{ env.flux-path }}/flux-system
	    user_email: ci@biomage.net
	    user_name: 'Biomage CI/CD'

	- id: install-kubernetes-reflector
	  name: Install kubernetes reflector
	  run: |-
	    helm repo add emberstack https://emberstack.github.io/helm-charts
	    helm repo update
	    helm upgrade --install reflector emberstack/reflector --namespace flux-system

	# Annotates the account-config configmap so the reflector may mirror it
	# into namespaces matching the listed ui-/api-/pipeline-/worker- patterns.
	- id: add-account-config-configmap-annotations
	  name: Add annotations to account-config configmap
	  run: |-
	    kubectl annotate configmap account-config \
	      --overwrite \
	      --namespace flux-system \
	      reflector.v1.k8s.emberstack.com/reflection-allowed="true" \
	      reflector.v1.k8s.emberstack.com/reflection-allowed-namespaces="ui-.,api-.,pipeline-.,worker-." \
	      reflector.v1.k8s.emberstack.com/reflection-auto-enabled="true"
	###
	### END OF INSTALL AND CONFIGURE FLUX V2 ###
	###

	# Runs only for the "deploy monitoring" action. always() keeps the job
	# eligible even though the cluster jobs it lists in `needs` are skipped
	# for that action.
	deploy-monitoring:
	  name: Setup logging and monitoring
	  if: always() && (needs.check-secrets.result == 'success') && (github.event.inputs.workflow_actions == 'deploy monitoring')
	  needs:
	    - check-secrets
	    - create-eks-cluster
	    - configure-cluster
	    - load-config
	  runs-on: ubuntu-20.04
	  environment: ${{ matrix.environment.name }}
	  strategy:
	    matrix:
	      environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix)}}
	  env:
	    API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
	    CLUSTER_ENV: ${{ matrix.environment.type }}
	  steps:
	    - id: checkout
	      name: Check out source code
	      uses: actions/checkout@v3

	    - id: setup-aws
	      name: Configure AWS credentials
	      uses: aws-actions/configure-aws-credentials@v4
	      with:
	        aws-region: ${{ secrets.AWS_REGION }}
	        role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
	        role-duration-seconds: 3600

	- id: add-kubeconfig
	  name: Add k8s config file for existing cluster.
	  run: |-
	    aws eks update-kubeconfig --name biomage-$CLUSTER_ENV

	- id: install-eksctl
	  name: Install eksctl
	  run: |-
	    curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
	    sudo mv /tmp/eksctl /usr/local/bin

	# Later steps read this stack's PolicyARN output.
	- id: setup-cluster-cloudwatch-logging-policy
	  name: Setup permissions required for cluster to log to Cloudwatch
	  uses: aws-actions/aws-cloudformation-github-deploy@v1
	  with:
	    parameter-overrides: "Environment=${{ matrix.environment.type }}"
	    name: "cluster-cloudwatch-logging-policy-${{ matrix.environment.type }}"
	    template: 'infra/cluster-logging/cf-cluster-log-cloudwatch-policy.yaml'
	    no-fail-on-empty-changeset: "1"
	    capabilities: "CAPABILITY_IAM,CAPABILITY_NAMED_IAM"

	# Setting up log forwarding for pods hosted in EC2 nodes
	- id: create-fluent-bit-namespace
	  name: Create namespace for node FluentBit deployment
	  run: kubectl apply -f infra/cluster-logging/node-fluentbit-namespace.yaml

	- id: create-service-account-for-node-fluent-bit
	  name: Create service account for node FluentBit
	  env:
	    LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
	  run: |-
	    eksctl create iamserviceaccount \
	      --name fluent-bit \
	      --namespace node-logging \
	      --cluster biomage-$CLUSTER_ENV \
	      --role-name irsa-fluent-bit-$CLUSTER_ENV \
	      --attach-policy-arn $LOGGING_POLICY_ARN \
	      --override-existing-serviceaccounts \
	      --approve

	- id: deploy-node-fluent-bit
	  name: Deploy FluentBit for EC2 nodes
	  env:
	    AWS_REGION: ${{ secrets.AWS_REGION }}
	  run: |-
	    # FluentBit configuration is determined in infra/cluster-logging/node-fluentbit-config.yaml, specifically under [INPUT] > Path
	    # We do not want to log everything for costs/security concerns

	    # Replace the CI_CLUSTER_ENV / CI_AWS_REGION placeholders in every
	    # string value of the config file.
	    yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/node-fluentbit-config.yaml
	    yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/node-fluentbit-config.yaml

	    kubectl apply -f infra/cluster-logging/node-fluentbit-config.yaml

	# Setting up log forwarding for pods hosted on Fargate nodes
	- id: attach-pod-execution-role-name
	  name: Attach logging policy to pod execution role
	  env:
	    LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
	  run: |-
	    # Pods launched in the same cluster have the same pod execution role, as pod execution role scope is cluster-wide.
	    # See https://eksctl.io/usage/fargate-support/#creating-a-cluster-with-fargate-support
	    # Getting fargate-profile of pipeline or worker in the same cluster gets the same pod execution role.

	    # The role name is the last "/"-separated field of the role ARN; awk
	    # prints an empty string when the value contains no "/".
	    POD_EXEC_ROLE_NAME=$(aws eks describe-fargate-profile \
	      --cluster-name biomage-$CLUSTER_ENV \
	      --fargate-profile-name pipeline-default | jq -r '.fargateProfile.podExecutionRoleArn' | awk -F"/" '{print (NF>1)? $NF : ""}' )

	    # Fail with a clear message instead of calling IAM with an empty role name.
	    if [ -z "$POD_EXEC_ROLE_NAME" ]; then
	      echo "Could not determine the pod execution role of fargate profile pipeline-default in biomage-$CLUSTER_ENV"
	      exit 1
	    fi

	    aws iam attach-role-policy --role-name "$POD_EXEC_ROLE_NAME" --policy-arn "$LOGGING_POLICY_ARN"

	- id: deploy-fargate-fluent-bit
	  name: Deploy FluentBit config for Fargate pods
	  env:
	    AWS_REGION: ${{ secrets.AWS_REGION }}
	  run: |-
	    # FluentBit configuration is determined in infra/cluster-logging/fargate-fluentbit-config.yaml
	    yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/fargate-fluentbit-config.yaml
	    yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/fargate-fluentbit-config.yaml

	    kubectl apply -f infra/cluster-logging/fargate-fluentbit-config.yaml

	# Setting up Datadog to watch pod metrics for pods hosted on EC2 and Fargate nodes
	- id: setup-datadog-cluster-agent
	  name: Setup Datadog cluster agent
	  # The script reads $DATADOG_API_KEY, so the secret has to be exported
	  # here; without this the agent would be installed with an empty API key.
	  env:
	    DATADOG_API_KEY: ${{ secrets.DATADOG_API_KEY }}
	  run: |-
	    if [[ -n "$DATADOG_API_KEY" ]];
	    then
	      helm repo add datadog https://helm.datadoghq.com
	      helm repo update
	      helm upgrade datadog-agent datadog/datadog \
	        -f infra/datadog/cluster-agent-values.yaml \
	        --set datadog.apiKey="$DATADOG_API_KEY" \
	        --set datadog.clusterName=biomage-$CLUSTER_ENV \
	        --install
	    else
	      echo "Datadog api key missing, skipping datadog setup"
	    fi

	# Only applied when the Datadog API key secret is set.
	- id: setup-datadog-sidecar-permissions
	  name: Setup Datadog sidecar permissions
	  run: |-
	    if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]];
	    then
	      kubectl apply -f infra/datadog/datadog-sidecar-rbac.yaml
	    fi

	# The registry output of this step is used by setup-falcon-sensor.
	- id: login-ecr
	  name: Login to Amazon ECR
	  uses: aws-actions/amazon-ecr-login@v1

	- id: create-falcon-ecr-registries
	  name: Create an ECR repositories for the Falcon Sensor (if needed)
	  # This will fail if the registry already exists, which is fine. If there is some other
	  # error, the `push` step will fail instead.
	  continue-on-error: true
	  run: |-
	    if [[ -n "${{ secrets.FALCON_CID }}" ]];
	    then
	      aws ecr create-repository --repository-name falcon-container/falcon-sensor --image-tag-mutability MUTABLE
	      aws ecr create-repository --repository-name falcon-sensor/falcon-sensor --image-tag-mutability MUTABLE
	    else
	      echo "CrowdStrike CID missing, not creating falcon sensor repos"
	    fi

	# Best effort: creating the namespace fails when it already exists.
	- id: create-falcon-namespace
	  name: Attempt to create falcon namespace
	  continue-on-error: true
	  run: |-
	    if [[ -n "${{ secrets.FALCON_CID }}" ]];
	    then
	      kubectl create namespace falcon-system
	    else
	      echo "CrowdStrike CID missing, not creating falcon namespace"
	    fi

	# Copies the Falcon sensor images into ECR and installs the container
	# sensor, the node sensor and the Kubernetes protection agent with helm.
	# The whole step is a no-op when the FALCON_CID secret is empty.
	- id: setup-falcon-sensor
	  name: Setup Falcon Sensor
	  env:
	    ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
	    AWS_REGION: ${{ secrets.AWS_REGION }}
	    AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
	  run: |-
	    if [[ -n "${{ secrets.FALCON_CID }}" ]];
	    then

	      # configure the API client ID and password
	      export FALCON_CLIENT_ID="${{ secrets.FALCON_CLIENT_ID }}"
	      export FALCON_CLIENT_SECRET="${{ secrets.FALCON_CLIENT_SECRET }}"

	      # configure CID
	      export FALCON_CID="${{ secrets.FALCON_CID }}"

	      # URL of falcon-container-sensor-pull.sh
	      PULL_SCRIPT_URL="https://raw.githubusercontent.com/CrowdStrike/falcon-scripts/main/bash/containers/falcon-container-sensor-pull/falcon-container-sensor-pull.sh"

	      # Download the pull script from GitHub and save it to the current directory
	      # --silent - Suppresses standard/error output
	      # --remote-name - Keeps the original filename when saving
	      # --location - Follow redirects
	      curl --silent --remote-name --location "$PULL_SCRIPT_URL"

	      # make script executable
	      chmod +x falcon-container-sensor-pull.sh

	      # download latest version of the Falcon Container (for fargate) and copy it to ECR
	      ./falcon-container-sensor-pull.sh \
	        --client-id "${FALCON_CLIENT_ID}" \
	        --client-secret "${FALCON_CLIENT_SECRET}" \
	        --type falcon-container \
	        --copy "${ECR_REGISTRY}/falcon-container"

	      # download latest version of the Falcon Node Sensor (for EC2) and copy it to ECR
	      ./falcon-container-sensor-pull.sh \
	        --client-id "${FALCON_CLIENT_ID}" \
	        --client-secret "${FALCON_CLIENT_SECRET}" \
	        --type falcon-sensor \
	        --copy "${ECR_REGISTRY}/falcon-sensor"

	      # functions to get image names for helm
	      get_image_path() {
	        local container_type=$1
	        ./falcon-container-sensor-pull.sh \
	          --client-id "${FALCON_CLIENT_ID}" \
	          --client-secret "${FALCON_CLIENT_SECRET}" \
	          --type "${container_type}" \
	          --get-image-path
	      }

	      get_image_name() {
	        local container_type=$1
	        local image_path=$(get_image_path "$container_type")

	        # Extract the part after the first ":" (the image tag) using awk
	        local image_name=$(echo "$image_path" | awk -F':' '{print $2}')

	        echo "$image_name"
	      }


	      FALCON_CONTAINER_IMAGE_TAG=$(get_image_name "falcon-container")
	      FALCON_SENSOR_IMAGE_TAG=$(get_image_name "falcon-sensor")

	      # install container sensor (for fargate) into a customized namespace
	      helm repo add crowdstrike https://crowdstrike.github.io/falcon-helm
	      helm repo update
	      helm upgrade --install falcon-container-helm crowdstrike/falcon-sensor \
	        -n falcon-container-system --create-namespace \
	        --set node.enabled=false \
	        --set container.enabled=true \
	        --set falcon.cid="$FALCON_CID" \
	        --set container.image.repository="${ECR_REGISTRY}/falcon-container/falcon-sensor" \
	        --set container.image.tag="$FALCON_CONTAINER_IMAGE_TAG"

	      # install node sensor (for ec2) with different release name (falcon-sensor-helm)
	      helm upgrade --install falcon-sensor-helm crowdstrike/falcon-sensor \
	        -n falcon-sensor-system --create-namespace \
	        --set falcon.cid="$FALCON_CID" \
	        --set node.image.repository="${ECR_REGISTRY}/falcon-sensor/falcon-sensor" \
	        --set node.image.tag="$FALCON_SENSOR_IMAGE_TAG"

	      # install KPA (kubernetes protection agent)
	      helm upgrade --install kpagent crowdstrike/cs-k8s-protection-agent \
	        -n falcon-kubernetes-protection --create-namespace \
	        --set image.repository="registry.crowdstrike.com/kubernetes_protection/kpagent" \
	        --set image.tag="0.2117.0" \
	        --set crowdstrikeConfig.clientID="$FALCON_CLIENT_ID" \
	        --set crowdstrikeConfig.clientSecret="$FALCON_CLIENT_SECRET" \
	        --set crowdstrikeConfig.clusterName="arn:aws:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/biomage-${CLUSTER_ENV}" \
	        --set crowdstrikeConfig.env="${{ secrets.FALCON_REGION }}" \
	        --set crowdstrikeConfig.cid="${{ secrets.FALCON_CCID }}" \
	        --set crowdstrikeConfig.dockerAPIToken="${{ secrets.FALCON_DOCKER_API_TOKEN }}"

	    else
	      echo "CrowdStrike CID missing, skipping falcon sensor setup"
	    fi

	# Posts a failure message to Slack when any needed job failed, but only
	# for runs on the master branch.
	report-if-failed:
	  name: Report if workflow failed
	  if: failure() && github.ref == 'refs/heads/master'
	  needs:
	    - load-config
	    - check-secrets
	    - create-eks-cluster
	    - configure-cluster
	    - deploy-monitoring
	  runs-on: ubuntu-20.04
	  steps:
	    - id: send-to-slack
	      name: Send failure notification to Slack on failure
	      uses: voxmedia/github-action-slack-notify-build@v1
	      with:
	        status: FAILED
	        color: danger
	        channel: workflow-failures
	      env:
	        SLACK_BOT_TOKEN: ${{ secrets.WORKFLOW_STATUS_BOT_TOKEN }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Deploy Cellenics infrastructure on AWS #506

Workflow file

Deploy Cellenics infrastructure on AWS #506

Jobs

Run details

Workflow file for this run