-
Notifications
You must be signed in to change notification settings - Fork 662
feat: Add last chance check before orphan termination for JIT instances #4595
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
3fa09d2
14df9fe
9d7c89a
e09337f
9064512
716e079
e826fbe
a5fcc88
bb8ba2b
97de234
8b61d39
9883b0b
43468a7
bc995ef
1182c8b
6e1c72c
097c14d
971ec2d
5a275e5
9f59abe
65c0b0e
8ead598
ab5b6b0
0b462bb
102edf0
e6d2d88
83610eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ import { | |
CreateFleetCommand, | ||
CreateFleetResult, | ||
CreateTagsCommand, | ||
DeleteTagsCommand, | ||
DescribeInstancesCommand, | ||
DescribeInstancesResult, | ||
EC2Client, | ||
|
@@ -91,6 +92,7 @@ function getRunnerInfo(runningInstances: DescribeInstancesResult) { | |
repo: i.Tags?.find((e) => e.Key === 'ghr:Repo')?.Value as string, | ||
org: i.Tags?.find((e) => e.Key === 'ghr:Org')?.Value as string, | ||
orphan: i.Tags?.find((e) => e.Key === 'ghr:orphan')?.Value === 'true', | ||
runnerId: i.Tags?.find((e) => e.Key === 'ghr:github_runner_id')?.Value as string, | ||
}); | ||
} | ||
} | ||
|
@@ -112,6 +114,12 @@ export async function tag(instanceId: string, tags: Tag[]): Promise<void> { | |
await ec2.send(new CreateTagsCommand({ Resources: [instanceId], Tags: tags })); | ||
} | ||
|
||
export async function untag(instanceId: string, tags: Tag[]): Promise<void> { | ||
logger.debug(`Untagging '${instanceId}'`, { tags }); | ||
const ec2 = getTracedAWSV3Client(new EC2Client({ region: process.env.AWS_REGION })); | ||
await ec2.send(new DeleteTagsCommand({ Resources: [instanceId], Tags: tags })); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you test untagging via an actual deployment. I do not see any update to the permisison of the lmabda role. So assuming the call wil fail. |
||
} | ||
|
||
function generateFleetOverrides( | ||
subnetIds: string[], | ||
instancesTypes: string[], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,10 @@ | ||
import { Octokit } from '@octokit/rest'; | ||
import { Endpoints } from '@octokit/types'; | ||
import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; | ||
import moment from 'moment'; | ||
|
||
import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient } from '../github/auth'; | ||
import { bootTimeExceeded, listEC2Runners, tag, terminateRunner } from './../aws/runners'; | ||
import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; | ||
import { RunnerInfo, RunnerList } from './../aws/runners.d'; | ||
import { GhRunners, githubCache } from './cache'; | ||
import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; | ||
|
@@ -12,6 +13,10 @@ import { getGitHubEnterpriseApiUrl } from './scale-up'; | |
|
||
const logger = createChildLogger('scale-down'); | ||
|
||
type OrgRunnerList = Endpoints['GET /orgs/{org}/actions/runners']['response']['data']['runners']; | ||
type RepoRunnerList = Endpoints['GET /repos/{owner}/{repo}/actions/runners']['response']['data']['runners']; | ||
type RunnerState = OrgRunnerList[number] | RepoRunnerList[number]; | ||
|
||
async function getOrCreateOctokit(runner: RunnerInfo): Promise<Octokit> { | ||
const key = runner.owner; | ||
const cachedOctokit = githubCache.clients.get(key); | ||
|
@@ -46,7 +51,11 @@ async function getOrCreateOctokit(runner: RunnerInfo): Promise<Octokit> { | |
return octokit; | ||
} | ||
|
||
async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise<boolean> { | ||
async function getGitHubSelfHostedRunnerState( | ||
client: Octokit, | ||
ec2runner: RunnerInfo, | ||
runnerId: number, | ||
): Promise<RunnerState> { | ||
stuartp44 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const state = | ||
ec2runner.type === 'Org' | ||
? await client.actions.getSelfHostedRunnerForOrg({ | ||
|
@@ -58,12 +67,24 @@ async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, | |
owner: ec2runner.owner.split('/')[0], | ||
repo: ec2runner.owner.split('/')[1], | ||
}); | ||
|
||
logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.data.busy}`); | ||
|
||
metricGitHubAppRateLimit(state.headers); | ||
stuartp44 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
return state.data.busy; | ||
return { | ||
id: state.data.id, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would it not simpler to re-use either the github state object or simpley explode return the data object instead of this 1-1 mapping. |
||
name: state.data.name, | ||
busy: state.data.busy, | ||
status: state.data.status, | ||
os: state.data.os, | ||
labels: state.data.labels, | ||
runner_group_id: state.data.runner_group_id, | ||
ephemeral: state.data.ephemeral, | ||
}; | ||
} | ||
|
||
async function getGitHubRunnerBusyState(client: Octokit, ec2runner: RunnerInfo, runnerId: number): Promise<boolean> { | ||
const state = await getGitHubSelfHostedRunnerState(client, ec2runner, runnerId); | ||
logger.info(`Runner '${ec2runner.instanceId}' - GitHub Runner ID '${runnerId}' - Busy: ${state.busy}`); | ||
return state.busy; | ||
} | ||
|
||
async function listGitHubRunners(runner: RunnerInfo): Promise<GhRunners> { | ||
|
@@ -200,18 +221,42 @@ async function markOrphan(instanceId: string): Promise<void> { | |
} | ||
} | ||
|
||
async function lastChanceCheckOrphanRunner(runner: RunnerList): Promise<void> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For me the method is confusing, it looks terminationg is now down on two palces. Makes the code flow hard to follow. Also |
||
const client = await getOrCreateOctokit(runner as RunnerInfo); | ||
const runnerId = parseInt(runner.runnerId || '0'); | ||
const ec2Instance = runner as RunnerInfo; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why casting here? Does the input object not contain the right information? |
||
const state = await getGitHubSelfHostedRunnerState(client, ec2Instance, runnerId); | ||
logger.debug(`Runner is currently '${runner.instanceId}' state: ${JSON.stringify(state)}`); | ||
if (state.status === 'online' && state.busy) { | ||
logger.info(`Runner '${runner.instanceId}' is orphan, but is online and busy.`); | ||
await untag(runner.instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); | ||
} else if (state.status === 'offline') { | ||
logger.warn(`Runner '${runner.instanceId}' is orphan.`); | ||
logger.info(`Terminating orphan runner '${runner.instanceId}'`); | ||
await terminateRunner(runner.instanceId).catch((e) => { | ||
logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); | ||
}); | ||
} | ||
} | ||
|
||
async function terminateOrphan(environment: string): Promise<void> { | ||
try { | ||
const orphanRunners = await listEC2Runners({ environment, orphan: true }); | ||
|
||
for (const runner of orphanRunners) { | ||
logger.info(`Terminating orphan runner '${runner.instanceId}'`); | ||
await terminateRunner(runner.instanceId).catch((e) => { | ||
logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); | ||
}); | ||
// do we have a valid runnerId? then we are in a Jit Runner scenario else, use old method | ||
if (runner.runnerId) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the code remains simpler if you update fhe code as follow. pseudeo code
or similar this wil keep the main control flow in one view. And avoiding termination is invoked in two differnet functions. |
||
logger.debug(`Runner '${runner.instanceId}' is orphan, but has a runnerId.`); | ||
await lastChanceCheckOrphanRunner(runner); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. untagging can lead to an exception, exception looks like not handled at all. Would suggest to handle a potential exception inside the untag function and return a boolean. |
||
} else { | ||
logger.info(`Terminating orphan runner '${runner.instanceId}'`); | ||
await terminateRunner(runner.instanceId).catch((e) => { | ||
logger.error(`Failed to terminate orphan runner '${runner.instanceId}'`, { error: e }); | ||
}); | ||
} | ||
} | ||
} catch (e) { | ||
logger.warn(`Failure during orphan runner termination.`, { error: e }); | ||
logger.warn(`Failure during orphan termination processing.`, { error: e }); | ||
} | ||
} | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.