Skip to content

Commit 9ac1b30

Browse files
committed
feat(): add EIP creation/release in gitlab runner lifecycle
1 parent 4b170bc commit 9ac1b30

File tree

8 files changed

+337
-52
lines changed

8 files changed

+337
-52
lines changed

main.tf

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ locals {
5050
file_yum_update = file("${path.module}/template/yum_update.tftpl")
5151

5252
template_eip = templatefile("${path.module}/template/eip.tftpl", {
53-
eip = join(",", [for eip in aws_eip.gitlab_runner : eip.public_ip])
53+
eip_tags = join(" ", [for k, v in local.tags : "Key=${k},Value=${v}"])
5454
})
5555

5656
template_gitlab_runner = templatefile("${path.module}/template/gitlab-runner.tftpl",
@@ -73,6 +73,7 @@ locals {
7373
secure_parameter_store_gitlab_runner_registration_token_name = var.runner_gitlab_registration_token_secure_parameter_store_name
7474
secure_parameter_store_runner_token_key = local.secure_parameter_store_runner_token_key
7575
secure_parameter_store_runner_sentry_dsn = local.secure_parameter_store_runner_sentry_dsn
76+
use_eip = var.runner_instance.use_eip
7677
secure_parameter_store_gitlab_token_name = var.runner_gitlab.access_token_secure_parameter_store_name
7778
secure_parameter_store_region = data.aws_region.current.name
7879
gitlab_runner_registration_token = var.runner_gitlab_registration_config.registration_token
@@ -357,13 +358,6 @@ resource "aws_iam_instance_profile" "instance" {
357358
tags = local.tags
358359
}
359360

360-
resource "aws_eip" "gitlab_runner" {
361-
# checkov:skip=CKV2_AWS_19:We can't use NAT gateway here as we are contacted from the outside.
362-
count = var.runner_instance.use_eip ? 1 : 0
363-
364-
tags = local.tags
365-
}
366-
367361
# We wait for 5 minutes until we set an EC2 instance to status `InService` so it has time to provision itself and its configured capacity.
368362
resource "aws_autoscaling_lifecycle_hook" "wait_for_gitlab_runner" {
369363
name = "${var.environment}-wait-for-gitlab-runner-up"

modules/terminate-agent-hook/iam.tf

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,20 @@ data "aws_iam_policy_document" "spot_request_housekeeping" {
161161
}
162162
}
163163

164+
# Permissions for the EIP housekeeping done by the terminate-agent-hook Lambda: it lists the addresses,
# disassociates them and releases them.
# NOTE(review): the actions are granted on all resources; the Lambda only narrows the set by the
# `Environment` tag in code. Consider whether a tag condition can be added here as well.
data "aws_iam_policy_document" "eip_cleanup" {
165+
statement {
166+
sid = "EIPCleanup"
167+
168+
effect = "Allow"
169+
actions = [
170+
"ec2:DescribeAddresses",
171+
"ec2:DisassociateAddress",
172+
"ec2:ReleaseAddress"
173+
]
174+
resources = ["*"]
175+
}
176+
}
}
177+
164178
resource "aws_iam_policy" "lambda" {
165179
name = "${var.name_iam_objects}-${var.name}-lambda"
166180
path = "/"
@@ -187,6 +201,19 @@ resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" {
187201
policy_arn = aws_iam_policy.spot_request_housekeeping.arn
188202
}
189203

204+
# Managed policy carrying the EIP cleanup permissions rendered from the `eip_cleanup` policy document.
resource "aws_iam_policy" "eip_cleanup" {
205+
name = "${var.name_iam_objects}-${var.name}-eip-cleanup"
206+
path = "/"
207+
policy = data.aws_iam_policy_document.eip_cleanup.json
208+
209+
tags = var.tags
210+
}
211+
212+
# Attaches the EIP cleanup policy to the Lambda's execution role.
resource "aws_iam_role_policy_attachment" "eip_cleanup" {
213+
role = aws_iam_role.lambda.name
214+
policy_arn = aws_iam_policy.eip_cleanup.arn
215+
}
}
216+
190217
resource "aws_iam_role_policy_attachment" "aws_lambda_vpc_access_execution_role" {
191218
role = aws_iam_role.lambda.name
192219
policy_arn = "arn:${data.aws_partition.current.partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"

modules/terminate-agent-hook/lambda/lambda_function.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,126 @@ def remove_unused_ssh_key_pairs(client, executor_name_part):
223223
}))
224224

225225

226+
def cleanup_orphaned_eips(ec2_client, executor_name_part):
    """
    Clean up orphaned EIPs from terminated instances.

    An EIP is treated as orphaned when its ``Environment`` tag contains ``executor_name_part`` and it is
    either not associated with anything, or associated with an instance that is terminated or no longer
    exists. AWS API errors are logged and swallowed so the remaining housekeeping still runs.

    :param ec2_client: the boto3 EC2 client
    :param executor_name_part: used to filter EIPs by Environment tag to match this value
    """
    print(json.dumps({
        "Level": "info",
        "Message": f"Checking for orphaned EIPs for agent {executor_name_part}"
    }))

    if not executor_name_part:
        # An empty string is a substring of every tag value: without this guard every EIP carrying an
        # Environment tag would be considered ours and released.
        print(json.dumps({
            "Level": "warning",
            "Message": "Skipping EIP cleanup: no executor name given to match the Environment tag against"
        }))
        return

    try:
        # Find all EIPs (we filter by tag content below)
        addresses = ec2_client.describe_addresses().get("Addresses", [])
    except ClientError as error:
        print(json.dumps({
            "Level": "error",
            "Message": "Failed to describe EIPs for cleanup",
            "Exception": str(error)
        }))
        return

    eips_to_cleanup = []

    for eip in addresses:
        # Only EIPs belonging to our environment are candidates
        eip_tags = {tag["Key"]: tag["Value"] for tag in eip.get("Tags", [])}
        if "Environment" not in eip_tags or executor_name_part not in eip_tags["Environment"]:
            continue

        allocation_id = eip.get("AllocationId")
        if not allocation_id:
            # Nothing we could pass to release_address
            continue

        instance_id = eip.get("InstanceId")
        association_id = eip.get("AssociationId")

        if not instance_id:
            if association_id:
                # Attached to a network interface without an instance: still in use, not orphaned
                continue
            reason = "unassociated EIP with matching Environment tag"
        else:
            # Check if the associated instance still exists and is terminated
            try:
                instance_response = ec2_client.describe_instances(InstanceIds=[instance_id])
            except ClientError as error:
                if error.response.get("Error", {}).get("Code") != "InvalidInstanceID.NotFound":
                    print(json.dumps({
                        "Level": "warning",
                        "Message": f"Could not check instance {instance_id} for EIP {allocation_id}",
                        "Exception": str(error)
                    }))
                    continue
                reason = f"associated instance {instance_id} no longer exists"
            else:
                instances = [
                    instance
                    for reservation in instance_response.get("Reservations", [])
                    for instance in reservation.get("Instances", [])
                ]
                if not instances:
                    # State unknown: releasing is destructive, so leave the EIP alone
                    print(json.dumps({
                        "Level": "warning",
                        "Message": f"Could not check instance {instance_id} for EIP {allocation_id}",
                        "Exception": "describe_instances returned no instance"
                    }))
                    continue
                if instances[0]["State"]["Name"] != "terminated":
                    continue
                reason = f"associated instance {instance_id} is terminated"

        eips_to_cleanup.append({
            "allocation_id": allocation_id,
            "association_id": association_id,
            "public_ip": eip.get("PublicIp", "unknown"),
            "reason": reason
        })

    # Clean up identified orphaned EIPs
    released_count = 0
    for eip_info in eips_to_cleanup:
        print(json.dumps({
            "Level": "info",
            "AllocationId": eip_info["allocation_id"],
            "PublicIp": eip_info["public_ip"],
            "Message": f"Releasing orphaned EIP: {eip_info['reason']}"
        }))

        # Disassociate first if still associated. The API identifies the association by its
        # AssociationId; it does not accept an AllocationId.
        if eip_info["association_id"]:
            try:
                ec2_client.disassociate_address(AssociationId=eip_info["association_id"])
            except ClientError as disassociate_error:
                # Best effort: the release below reports the final outcome
                print(json.dumps({
                    "Level": "warning",
                    "Message": f"Failed to disassociate EIP {eip_info['allocation_id']}",
                    "Exception": str(disassociate_error)
                }))

        try:
            ec2_client.release_address(AllocationId=eip_info["allocation_id"])
        except ClientError as error:
            print(json.dumps({
                "Level": "error",
                "AllocationId": eip_info["allocation_id"],
                "Message": "Failed to release orphaned EIP",
                "Exception": str(error)
            }))
            continue

        released_count += 1
        print(json.dumps({
            "Level": "info",
            "AllocationId": eip_info["allocation_id"],
            "Message": "Successfully released orphaned EIP"
        }))

    if not eips_to_cleanup:
        print(json.dumps({
            "Level": "info",
            "Message": "No orphaned EIPs found to clean up"
        }))
    else:
        # Report what was actually released, not merely what was attempted
        print(json.dumps({
            "Level": "info",
            "Message": f"Cleaned up {released_count} of {len(eips_to_cleanup)} orphaned EIP(s)"
        }))
345+
226346
# context not used: this is the interface for an AWS Lambda function defined by AWS
227347
# pylint: disable=unused-argument
228348
def handler(event, context):
@@ -269,6 +389,9 @@ def handler(event, context):
269389

270390
remove_unused_ssh_key_pairs(client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE'])
271391

392+
# Clean up orphaned EIPs from terminated instances
393+
cleanup_orphaned_eips(ec2_client=client, executor_name_part=os.environ['NAME_EXECUTOR_INSTANCE'])
394+
272395
return "Housekeeping done"
273396

274397

outputs.tf

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,6 @@ output "runner_sg_id" {
4343
value = var.runner_worker.type == "docker-autoscaler" ? aws_security_group.docker_autoscaler[0].id : (var.runner_worker.type == "docker+machine" ? aws_security_group.docker_machine[0].id : null)
4444
}
4545

46-
output "runner_eip" {
47-
description = "EIP of the Gitlab Runner"
48-
value = length(aws_eip.gitlab_runner) > 0 ? aws_eip.gitlab_runner[0].public_ip : null
49-
}
50-
5146
output "runner_launch_template_name" {
5247
description = "The name of the runner's launch template."
5348
value = aws_launch_template.gitlab_runner_instance.name
Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,36 @@
11
{
2-
"Version": "2012-10-17",
3-
"Statement": [
4-
{
5-
"Effect": "Allow",
6-
"Action": [
7-
"autoscaling:SetDesiredCapacity",
8-
"autoscaling:TerminateInstanceInAutoScalingGroup"
9-
],
10-
"Resource": "${autoscaler_asg_arn}"
11-
},
12-
{
13-
"Effect": "Allow",
14-
"Action": [
15-
"autoscaling:DescribeAutoScalingGroups",
16-
"ec2:DescribeInstances"
17-
],
18-
"Resource": "*"
19-
},
20-
{
21-
"Effect": "Allow",
22-
"Action": [
23-
"ec2:GetPasswordData",
24-
"ec2-instance-connect:SendSSHPublicKey"
25-
],
26-
"Resource": "arn:${partition}:ec2:${aws_region}:*:instance/*",
27-
"Condition": {
28-
"StringEquals": {
29-
"ec2:ResourceTag/aws:autoscaling:groupName": "${autoscaler_asg_name}"
30-
}
31-
}
2+
"Version": "2012-10-17",
3+
"Statement": [
4+
{
5+
"Effect": "Allow",
6+
"Action": [
7+
"autoscaling:SetDesiredCapacity",
8+
"autoscaling:TerminateInstanceInAutoScalingGroup"
9+
],
10+
"Resource": "${autoscaler_asg_arn}"
11+
},
12+
{
13+
"Effect": "Allow",
14+
"Action": [
15+
"autoscaling:DescribeAutoScalingGroups",
16+
"ec2:DescribeInstances",
17+
"autoscaling:CompleteLifecycleAction",
18+
"autoscaling:DescribeLifecycleHooks"
19+
],
20+
"Resource": "*"
21+
},
22+
{
23+
"Effect": "Allow",
24+
"Action": [
25+
"ec2:GetPasswordData",
26+
"ec2-instance-connect:SendSSHPublicKey"
27+
],
28+
"Resource": "arn:${partition}:ec2:${aws_region}:*:instance/*",
29+
"Condition": {
30+
"StringEquals": {
31+
"ec2:ResourceTag/aws:autoscaling:groupName": "${autoscaler_asg_name}"
3232
}
33-
]
33+
}
34+
}
35+
]
3436
}

policies/instance-eip.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@
44
{
55
"Effect": "Allow",
66
"Action": [
7+
"ec2:AllocateAddress",
78
"ec2:AssociateAddress",
9+
"ec2:DisassociateAddress",
10+
"ec2:ReleaseAddress",
11+
"ec2:CreateTags",
812
"ec2:Describe*"
913
],
1014
"Resource": "*"

template/eip.tftpl

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,67 @@
1-
echo 'installing additional software for assigning EIP'
1+
echo 'Setting up dynamic EIP management'
22

3-
yum install python3 -y
4-
curl --fail --retry 6 -O https://bootstrap.pypa.io/get-pip.py
5-
python3 get-pip.py --user
6-
export PATH=~/.local/bin:$PATH
3+
# Get instance metadata (token already available from user-data)
4+
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-id)
5+
REGION=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r '.region')
76

8-
pip install aws-ec2-assign-elastic-ip
9-
export AWS_DEFAULT_REGION=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/dynamic/instance-identity/document | grep region | awk -F\" '{print $4}')
10-
/usr/local/bin/aws-ec2-assign-elastic-ip --valid-ips ${eip}
7+
export AWS_DEFAULT_REGION=$REGION

# State directory: the allocation ID of the associated EIP is written here on success
mkdir -p /var/lib/ec2-eip

# Print one timestamped line carrying the fixed EIP-ALLOCATION marker
eip_log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] EIP-ALLOCATION: $1"
}

max_retries=5
retry_count=0
wait_time=10

# Allocate, tag and associate an EIP; every failed attempt is retried with a doubling wait time
until [ $retry_count -ge $max_retries ]; do
  eip_log "Attempting to allocate EIP (attempt $((retry_count + 1))/$max_retries)"

  EIP_RESULT=$(aws ec2 allocate-address --domain vpc --query 'AllocationId' --output text 2>&1)
  EIP_EXIT_CODE=$?

  if [ $EIP_EXIT_CODE -ne 0 ] || ! [[ "$EIP_RESULT" =~ ^eipalloc- ]]; then
    eip_log "Error: Failed to allocate EIP (exit code: $EIP_EXIT_CODE): $EIP_RESULT"
  else
    ALLOCATION_ID="$EIP_RESULT"
    eip_log "Successfully allocated EIP with allocation ID: $ALLOCATION_ID"

    # Tagging is best effort: a failure is only reported, the EIP is still associated below.
    # The tag list is left unquoted on purpose so each Key=...,Value=... pair becomes its own argument.
    TAG_RESULT=$(aws ec2 create-tags --resources "$ALLOCATION_ID" --tags ${eip_tags} 2>&1)
    TAG_EXIT_CODE=$?
    if [ $TAG_EXIT_CODE -ne 0 ]; then
      eip_log "Warning: Failed to tag EIP $ALLOCATION_ID: $TAG_RESULT"
    else
      eip_log "Successfully tagged EIP $ALLOCATION_ID"
    fi

    ASSOC_RESULT=$(aws ec2 associate-address --instance-id "$INSTANCE_ID" --allocation-id "$ALLOCATION_ID" 2>&1)
    ASSOC_EXIT_CODE=$?
    if [ $ASSOC_EXIT_CODE -eq 0 ]; then
      echo "$ALLOCATION_ID" > /var/lib/ec2-eip/allocation-id
      eip_log "EIP allocation completed successfully"
      # Leaving the loop before the counter is incremented keeps retry_count below max_retries on success
      break
    fi

    eip_log "Error: Failed to associate EIP $ALLOCATION_ID with instance $INSTANCE_ID: $ASSOC_RESULT"

    # Give the address back so a failed attempt does not leave an allocated EIP behind
    CLEANUP_RESULT=$(aws ec2 release-address --allocation-id "$ALLOCATION_ID" 2>&1)
    if [ $? -eq 0 ]; then
      eip_log "Successfully cleaned up failed EIP allocation"
    else
      eip_log "Warning: Failed to cleanup EIP $ALLOCATION_ID: $CLEANUP_RESULT"
    fi
  fi

  retry_count=$((retry_count + 1))
  if [ $retry_count -lt $max_retries ]; then
    eip_log "Waiting $wait_time seconds before retry..."
    sleep $wait_time
    wait_time=$((wait_time * 2))
  fi
done

# The counter only reaches max_retries when no attempt succeeded
if [ $retry_count -eq $max_retries ]; then
  eip_log "Error: Failed to allocate and associate EIP after $max_retries attempts"
  eip_log "EIP allocation failed - GitLab Runner will not have an EIP associated"
  exit 1
fi

0 commit comments

Comments
 (0)