-
Notifications
You must be signed in to change notification settings - Fork 585
GitHub Actions CI implementation
rr needs self-hosted Github Actions runners because we need cloud VMs that can use the hardware PMU (which Azure does not support) and root privileges to configure the kernel appropriately. To provide these runners at low cost, our Actions workflow creates AWS spot instances when the workflow begins, runs the actual build and tests on those spot instances, and then shuts down the spot instances.
This is hard to implement safely because creating AWS instances requires AWS credentials, but we want rr PRs containing untrusted code to run the workflows that create the instances, and those PRs can steal any credentials accessible to those workflows. We also don't want to give those PRs any kind of write access to the rr repository.
There is a simple solution. We have an AWS Lambda function that exposes a public, unauthenticated HTTP REST API that creates AWS spot instances and registers them as rr runners (and a corresponding API that shuts down those instances). A snapshot of the source code for this Lambda is attached below. The Lambda has an AWS role (not attached here) that lets it manipulate the spot instances. It also contains a Github fine-grained personal access token that provides read-write access to the organization's self-hosted runner registrations and nothing else. Because it is public and unauthenticated, our workflows do not need or have access to any secrets. (Our API endpoint could be invoked directly by someone malicious, but all they can do is spawn idle rr test runners, which achieves nothing useful for the miscreant.)
To make this approach robust against failures to clean up, each spot instance is configured to terminate itself after one hour. Also, every time we unregister any self-hosted runner, we also unregister all offline runners (i.e. runners that have terminated for any reason apart from the usual cleanup). Each instance is labeled with a unique ID for the workflow that created it, and that workflow's job only runs on that runner, so as long as no run takes more than an hour things will work OK. At time of writing our workflows complete in under ten minutes.
import base64
import boto3
import json
import re
import urllib3
# A token for the rr-debugger/rr repository that provides read-write access
# to Actions runner registration --- and nothing else!
github_token = "..."
# GitHub Actions runner release to install on each VM.
runner_version = "2.316.1"
# EC2 instance type per CPU architecture (c5 = Intel, c6g = Graviton/ARM).
instance_types = {"x86_64": "c5.9xlarge", "arm64": "c6g.8xlarge"}
# Security group applied to every spawned runner instance (redacted).
security_group_id = "..."
org = "rr-debugger"
# Maps EC2 architecture names to the arch strings used in Actions runner
# release tarball filenames.
actions_runner_arch = {"x86_64": "x64", "arm64": "arm64"}
# Shared HTTP connection pool and EC2 client, reused across invocations of
# this Lambda while the execution environment stays warm.
http = urllib3.PoolManager()
ec2_client = boto3.client("ec2")
def validate_response(response):
    """Raise ValueError unless the HTTP response has a 2xx success status."""
    if not 200 <= response.status <= 299:
        raise ValueError(f"API call failed with {response.status}: {response.read()}")
def fetch_github_token(kind):
    """POST to the org runners API and return the short-lived token.

    `kind` is the API path suffix (e.g. "registration-token") selecting
    which kind of token GitHub should mint.
    """
    endpoint = f"https://api.github.com/orgs/{org}/actions/runners/{kind}"
    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {github_token}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    response = http.request("POST", endpoint, headers=request_headers)
    validate_response(response)
    return json.loads(response.data)["token"]
def remove_runner(runner):
    """Best-effort unregistration of a single runner by its numeric id.

    Some other instance of this lambda may clean up runners concurrently,
    so the DELETE can legitimately fail; the response is deliberately not
    validated.
    """
    endpoint = f"https://api.github.com/orgs/{org}/actions/runners/{runner['id']}"
    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {github_token}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    http.request("DELETE", endpoint, headers=request_headers)
def remove_runners(labels):
    """Unregister every runner carrying one of `labels`, plus all offline runners."""
    listing_url = f"https://api.github.com/orgs/{org}/actions/runners"
    request_headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {github_token}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    response = http.request("GET", listing_url, headers=request_headers)
    validate_response(response)
    for runner in json.loads(response.data)["runners"]:
        labelled = any(entry["name"] in labels for entry in runner["labels"])
        # Offline runners died outside the normal cleanup path (e.g. the
        # one-hour self-shutdown); sweep them up here too.
        if runner["status"] == "offline" or labelled:
            remove_runner(runner)
def vm_user_data(registration_token, architecture, label):
    """Return the cloud-init user-data shell script for a runner VM.

    The script schedules a one-hour self-shutdown, installs the GitHub
    Actions runner tarball for the given architecture, registers it with
    the rr-debugger org using `registration_token`, tags it with `label`,
    and runs it in the foreground.

    NOTE(review): `registration_token` and `label` are interpolated into a
    shell script, so both must be shell-safe; `label` is expected to have
    passed validate_label() before reaching here.
    """
    # Translate the EC2 architecture name into the runner tarball's arch name.
    arch = actions_runner_arch[architecture]
    return f"""#!/bin/bash
# Make sure the VM doesn't run for more than an hour
shutdown +60
apt-get update -y
apt-get dist-upgrade -f -y
sudo -u ubuntu --login <<EOF
# Install GitHub Actions runner
mkdir /home/ubuntu/actions-runner && cd /home/ubuntu/actions-runner
curl -o actions-runner-linux-{arch}-{runner_version}.tar.gz -L https://github.com/actions/runner/releases/download/v{runner_version}/actions-runner-linux-{arch}-{runner_version}.tar.gz
echo \"Github Runner Installed\"
# Extract the installer
tar xzf ./actions-runner-linux-{arch}-{runner_version}.tar.gz
echo \"Github Runner Installer Extracted\"
# Run GitHub Actions runner configuration
yes '' | ./config.sh --url https://github.com/rr-debugger --token {registration_token} --labels {label}
# Run GitHub Actions runner
yes '' | ./run.sh
EOF
"""
def create(label, architecture):
    """Launch a spot instance that registers itself as an Actions runner.

    `label` is the (already-validated) unique runner label for this
    workflow run; `architecture` is "x86_64" or "arm64". Blocks until the
    instance reaches the "running" state, then returns a Lambda-style
    200 response dict.
    """
    registration_token = fetch_github_token("registration-token")
    image_filters = [
        {"Name": "architecture", "Values": [architecture]},
        {"Name": "name", "Values": ["ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-*"]},
    ]
    # 099720109477 is Canonical's AWS account. describe_images returns the
    # matches in no particular order, so pick the newest AMI explicitly
    # rather than whichever happens to come first.
    images = ec2_client.describe_images(Owners=["099720109477"], Filters=image_filters)["Images"]
    image_id = max(images, key=lambda image: image["CreationDate"])["ImageId"]
    user_data = vm_user_data(registration_token, architecture, label)
    encoded_user_data = base64.b64encode(user_data.encode('utf-8')).decode('utf-8')
    run_instance_params = {
        "ImageId": image_id,
        "InstanceType": instance_types[architecture],
        "UserData": encoded_user_data,
        "MinCount": 1,
        "MaxCount": 1,
        "SecurityGroupIds": [security_group_id],
        "InstanceMarketOptions": {"MarketType": "spot"},
        # The in-VM `shutdown +60` then terminates (not just stops) the
        # instance, so a forgotten runner cannot linger.
        "InstanceInitiatedShutdownBehavior": "terminate",
        "KeyName": "rr-testing",
        # The Label tag is how destroy() finds this instance later.
        "TagSpecifications": [{
            "ResourceType": "instance",
            "Tags": [{
                "Key": "Label",
                "Value": label,
            }],
        }],
    }
    try:
        instance_id = ec2_client.run_instances(**run_instance_params)["Instances"][0]["InstanceId"]
    # Bug fix: the original caught botocore.exceptions.ClientError but never
    # imported botocore, so this handler raised NameError instead of
    # retrying. boto3 clients expose the same exception class directly.
    except ec2_client.exceptions.ClientError as error:
        if error.response['Error']['Code'] == 'InsufficientInstanceCapacity':
            # No spot capacity available; fall back to an on-demand instance.
            del run_instance_params['InstanceMarketOptions']
            instance_id = ec2_client.run_instances(**run_instance_params)["Instances"][0]["InstanceId"]
        else:
            raise
    ec2_client.get_waiter("instance_running").wait(InstanceIds=[instance_id])
    return {
        "statusCode": 200,
        "body": "",
    }
def destroy(labels):
    """Terminate all instances tagged with any of `labels` and unregister
    the corresponding (and any offline) GitHub runners.

    Returns a Lambda-style 200 response dict.
    """
    filters = [{
        "Name": "tag:Label",
        "Values": labels
    }]
    instance_ids = []
    reservations = ec2_client.describe_instances(Filters=filters)['Reservations']
    for reservation in reservations:
        for instance in reservation['Instances']:
            instance_ids.append(instance['InstanceId'])
    if instance_ids:
        try:
            # This might fail if the instance(s) shut themselves down
            # concurrently; termination is best-effort.
            ec2_client.terminate_instances(InstanceIds=instance_ids)
        # Bug fix: the original caught botocore.exceptions.ClientError but
        # never imported botocore, so a failed termination raised NameError.
        # boto3 clients expose the same exception class directly.
        except ec2_client.exceptions.ClientError:
            pass
    remove_runners(labels)
    return {
        "statusCode": 200,
        "body": "",
    }
def validate_label(label):
    """Return `label` if it is a well-formed runner label, else raise ValueError.

    The label is later interpolated into the instance's shell user-data
    script, so the ENTIRE string must match rr_runner_<alphanumerics/underscores>.
    The original used re.match, which only anchors at the start: a label
    like "rr_runner_x; rm -rf /" matched on its prefix and the whole
    unsanitized string was returned. re.fullmatch closes that hole.
    """
    if re.fullmatch(r"rr_runner_[a-zA-Z_0-9]+", label):
        return label
    raise ValueError("Invalid label")
def lambda_handler(event, context):
    """Lambda entry point: dispatch the JSON request body to create/destroy.

    The body must contain an "operation" key plus either a single "label"
    and "architecture" (create) or a list of "labels" (destroy). Labels are
    validated before use since this endpoint is public and unauthenticated.
    """
    print("Request body:", event["body"])
    payload = json.loads(event["body"])
    operation = payload["operation"]
    if operation == "create":
        return create(validate_label(payload['label']), payload["architecture"])
    if operation == "destroy":
        validated = [validate_label(label) for label in payload['labels']]
        return destroy(validated)
    raise ValueError("Invalid operation")