Skip to content

Commit

Permalink
Runner edge cases improvements (#1030)
Browse files Browse the repository at this point in the history
* Runner edge cases improvements

* fix parselog

* refactor ugly return

* fix test

* idle check not needed

* fix tests

* fix gl

* multi logs parser

* feedback fixes

* process lost

Co-authored-by: Daniel Barnes <dabarnes2b@gmail.com>

* job not id

* patterns not entities

Co-authored-by: Daniel Barnes <dabarnes2b@gmail.com>
  • Loading branch information
DavidGOrtega and dacbd authored Jun 19, 2022
1 parent 3f2aadc commit 987ada4
Show file tree
Hide file tree
Showing 7 changed files with 233 additions and 238 deletions.
225 changes: 97 additions & 128 deletions bin/cml/runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ const tf = require('../../src/terraform');

let cml;
let RUNNER;
let RUNNER_ID;
let RUNNER_JOBS_RUNNING = [];
let RUNNER_SHUTTING_DOWN = false;
let RUNNER_TIMER = 0;
Expand Down Expand Up @@ -48,11 +47,6 @@ const shutdown = async (opts) => {
const retryWorkflows = async () => {
try {
if (!noRetry) {
if (cml.driver === 'github') {
const job = await cml.runnerJob({ runnerId: RUNNER_ID });
if (job) RUNNER_JOBS_RUNNING = [job];
}

if (RUNNER_JOBS_RUNNING.length > 0) {
await Promise.all(
RUNNER_JOBS_RUNNING.map(
Expand Down Expand Up @@ -80,7 +74,7 @@ const shutdown = async (opts) => {
};

if (error) {
winston.error(error, { reason, status: 'terminated' });
winston.error(error, { status: 'terminated' });
} else {
winston.info('runner status', { reason, status: 'terminated' });
}
Expand Down Expand Up @@ -128,6 +122,8 @@ const runCloud = async (opts) => {
workdir
} = opts;

await tf.checkMinVersion();

if (gpu === 'tesla')
winston.warn(
'GPU model "tesla" has been deprecated; please use "v100" instead.'
Expand Down Expand Up @@ -163,6 +159,7 @@ const runCloud = async (opts) => {
});

await fs.writeFile(tfMainPath, tpl);

await tf.init({ dir: tfPath });
await tf.apply({ dir: tfPath });

Expand Down Expand Up @@ -211,24 +208,54 @@ const runCloud = async (opts) => {

const runLocal = async (opts) => {
winston.info(`Launching ${cml.driver} runner`);
const { workdir, name, labels, single, idleTimeout, noRetry, dockerVolumes } =
opts;
const {
workdir,
name,
labels,
single,
idleTimeout,
noRetry,
dockerVolumes,
tfResource,
tpiVersion
} = opts;

if (tfResource) {
await tf.checkMinVersion();

const tfPath = workdir;
await fs.mkdir(tfPath, { recursive: true });
const tfMainPath = join(tfPath, 'main.tf');
const tpl = tf.iterativeProviderTpl({ tpiVersion });
await fs.writeFile(tfMainPath, tpl);

await tf.init({ dir: tfPath });
await tf.apply({ dir: tfPath });

const path = join(tfPath, 'terraform.tfstate');
const tfstate = await tf.loadTfState({ path });
tfstate.resources = [
JSON.parse(Buffer.from(tfResource, 'base64').toString('utf-8'))
];
await tf.saveTfState({ tfstate, path });
}

const dataHandler = async (data) => {
const log = cml.parseRunnerLog({ data });
log && winston.info('runner status', log);

if (log && log.status === 'job_started') {
RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date });
} else if (log && log.status === 'job_ended') {
const { job: jobId } = log;
RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter(
(job) => job.id !== jobId
);
RUNNER_TIMER = 0;
const logs = await cml.parseRunnerLog({ data });
for (const log of logs) {
winston.info('runner status', log);

if (log.status === 'job_started') {
RUNNER_JOBS_RUNNING.push({ id: log.job, date: log.date });
}

if (single && cml.driver === 'bitbucket') {
await shutdown({ ...opts, reason: 'single job' });
if (log.status === 'job_ended') {
const { job: jobId } = log;
RUNNER_JOBS_RUNNING = RUNNER_JOBS_RUNNING.filter(
(job) => job.id !== jobId
);

if (single) await shutdown({ ...opts, reason: 'single job' });
}
}
};
Expand All @@ -245,63 +272,25 @@ const runLocal = async (opts) => {
proc.stderr.on('data', dataHandler);
proc.stdout.on('data', dataHandler);
proc.on('disconnect', () =>
shutdown({ ...opts, reason: `runner disconnected` })
);
proc.on('close', (exit) =>
shutdown({ ...opts, reason: `runner closed with exit code ${exit}` })
shutdown({ ...opts, error: new Error('runner proccess lost') })
);
proc.on('close', (exit) => {
const reason = `runner closed with exit code ${exit}`;
if (exit === 0) shutdown({ ...opts, reason });
else shutdown({ ...opts, error: new Error(reason) });
});

RUNNER = proc;
({ id: RUNNER_ID } = await cml.runnerByName({ name }));

if (idleTimeout > 0) {
const watcher = setInterval(async () => {
let idle = RUNNER_JOBS_RUNNING.length === 0;
const idle = RUNNER_JOBS_RUNNING.length === 0;

if (RUNNER_TIMER >= idleTimeout) {
try {
if (cml.driver === 'github') {
const job = await cml.runnerJob({ runnerId: RUNNER_ID });

if (!job && !idle) {
winston.error(
`Runner is idle as per the GitHub API but busy as per CML internal state. Resetting jobs. Retrying in ${idleTimeout} seconds...`
);
winston.warn(
`CML GitHub driver response: ${JSON.stringify(job)}`
);
winston.warn(
`CML internal state: ${JSON.stringify(RUNNER_JOBS_RUNNING)}`
);

RUNNER_JOBS_RUNNING = [];
}

if (job && idle) {
winston.error(
`Runner is busy as per the GitHub API but idle as per CML internal state. Retrying in ${idleTimeout} seconds...`
);

idle = false;
}
}
} catch (err) {
winston.error(
`Error connecting the SCM: ${err.message}. Will try again in ${idleTimeout} secs`
);

idle = false;
}

if (idle) {
shutdown({ ...opts, reason: `timeout:${idleTimeout}` });
clearInterval(watcher);
} else {
RUNNER_TIMER = 0;
}
shutdown({ ...opts, reason: `timeout:${idleTimeout}` });
clearInterval(watcher);
}

RUNNER_TIMER++;
RUNNER_TIMER = idle ? RUNNER_TIMER + 1 : 0;
}, 1000);
}

Expand Down Expand Up @@ -333,23 +322,26 @@ const runLocal = async (opts) => {
};

const run = async (opts) => {
process.on('unhandledRejection', (reason) =>
shutdown({ ...opts, error: new Error(reason) })
);
process.on('uncaughtException', (error) => shutdown({ ...opts, error }));

['SIGTERM', 'SIGINT', 'SIGQUIT'].forEach((signal) => {
process.on(signal, () => shutdown({ ...opts, reason: signal }));
});

opts.workdir = opts.workdir || `${homedir()}/.cml/${opts.name}`;
const {
tpiVersion,
driver,
repo,
token,
workdir,
cloud,
labels,
name,
reuse,
dockerVolumes,
tfResource,
workdir
dockerVolumes
} = opts;

cml = new CML({ driver, repo, token });
Expand All @@ -359,27 +351,8 @@ const run = async (opts) => {
if (dockerVolumes.length && cml.driver !== 'gitlab')
winston.warn('Parameters --docker-volumes is only supported in gitlab');

if (cloud || tfResource) await tf.checkMinVersion();

// prepare tf
if (tfResource) {
const tfPath = workdir;

await fs.mkdir(tfPath, { recursive: true });
const tfMainPath = join(tfPath, 'main.tf');
const tpl = tf.iterativeProviderTpl({ tpiVersion });
await fs.writeFile(tfMainPath, tpl);
await tf.init({ dir: tfPath });
await tf.apply({ dir: tfPath });
const path = join(tfPath, 'terraform.tfstate');
const tfstate = await tf.loadTfState({ path });
tfstate.resources = [
JSON.parse(Buffer.from(tfResource, 'base64').toString('utf-8'))
];
await tf.saveTfState({ tfstate, path });
}

const runners = await cml.runners();

const runner = await cml.runnerByName({ name, runners });
if (runner) {
if (!reuse)
Expand All @@ -402,13 +375,9 @@ const run = async (opts) => {
process.exit(0);
}

try {
winston.info(`Preparing workdir ${workdir}...`);
await fs.mkdir(workdir, { recursive: true });
await fs.chmod(workdir, '766');
} catch (err) {
winston.warn(err.message);
}
winston.info(`Preparing workdir ${workdir}...`);
await fs.mkdir(workdir, { recursive: true });
await fs.chmod(workdir, '766');

if (cloud) await runCloud(opts);
else await runLocal(opts);
Expand All @@ -433,17 +402,21 @@ exports.handler = async (opts) => {
exports.builder = (yargs) =>
yargs.env('CML_RUNNER').options(
kebabcaseKeys({
tpiVersion: {
driver: {
type: 'string',
default: '>= 0.9.10',
choices: ['github', 'gitlab', 'bitbucket'],
description:
'Pin the iterative/iterative terraform provider to a specific version. i.e. "= 0.10.4" See: https://www.terraform.io/language/expressions/version-constraints',
hidden: true
'Platform where the repository is hosted. If not specified, it will be inferred from the environment'
},
dockerVolumes: {
type: 'array',
default: [],
description: 'Docker volumes. This feature is only supported in GitLab'
repo: {
type: 'string',
description:
'Repository to be used for registering the runner. If not specified, it will be inferred from the environment'
},
token: {
type: 'string',
description:
'Personal access token to register a self-hosted runner on the repository. If not specified, it will be inferred from the environment'
},
labels: {
type: 'string',
Expand Down Expand Up @@ -479,21 +452,16 @@ exports.builder = (yargs) =>
description:
"Don't launch a new runner if an existing one has the same name or overlapping labels"
},
driver: {
type: 'string',
choices: ['github', 'gitlab', 'bitbucket'],
description:
'Platform where the repository is hosted. If not specified, it will be inferred from the environment'
},
repo: {
workdir: {
type: 'string',
description:
'Repository to be used for registering the runner. If not specified, it will be inferred from the environment'
hidden: true,
alias: 'path',
description: 'Runner working directory'
},
token: {
type: 'string',
description:
'Personal access token to register a self-hosted runner on the repository. If not specified, it will be inferred from the environment'
dockerVolumes: {
type: 'array',
default: [],
description: 'Docker volumes. This feature is only supported in GitLab'
},
cloud: {
type: 'string',
Expand Down Expand Up @@ -573,6 +541,13 @@ exports.builder = (yargs) =>
description: 'Specifies the subnet to use within AWS',
alias: 'cloud-aws-subnet-id'
},
tpiVersion: {
type: 'string',
default: '>= 0.9.10',
description:
'Pin the iterative/iterative terraform provider to a specific version. i.e. "= 0.10.4" See: https://www.terraform.io/language/expressions/version-constraints',
hidden: true
},
cmlVersion: {
type: 'string',
default: require('../../package.json').version,
Expand All @@ -589,12 +564,6 @@ exports.builder = (yargs) =>
hidden: true,
description:
'Seconds to wait for collecting logs on failure (https://github.com/iterative/cml/issues/413)'
},
workdir: {
type: 'string',
hidden: true,
alias: 'path',
description: 'Runner working directory'
}
})
);
32 changes: 16 additions & 16 deletions bin/cml/runner.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,19 @@ describe('CML e2e', () => {
--version Show version number [boolean]
--log Maximum log level
[string] [choices: \\"error\\", \\"warn\\", \\"info\\", \\"debug\\"] [default: \\"info\\"]
--docker-volumes Docker volumes. This feature is only
supported in GitLab
[array] [default: []]
--driver Platform where the repository is
hosted. If not specified, it will be
inferred from the environment
[string] [choices: \\"github\\", \\"gitlab\\", \\"bitbucket\\"]
--repo Repository to be used for
registering the runner. If not
specified, it will be inferred from
the environment [string]
--token Personal access token to register a
self-hosted runner on the
repository. If not specified, it
will be inferred from the
environment [string]
--labels One or more user-defined labels for
this runner (delimited with commas)
[string] [default: \\"cml\\"]
Expand All @@ -84,19 +94,9 @@ describe('CML e2e', () => {
--reuse Don't launch a new runner if an
existing one has the same name or
overlapping labels [boolean]
--driver Platform where the repository is
hosted. If not specified, it will be
inferred from the environment
[string] [choices: \\"github\\", \\"gitlab\\", \\"bitbucket\\"]
--repo Repository to be used for
registering the runner. If not
specified, it will be inferred from
the environment [string]
--token Personal access token to register a
self-hosted runner on the
repository. If not specified, it
will be inferred from the
environment [string]
--docker-volumes Docker volumes. This feature is only
supported in GitLab
[array] [default: []]
--cloud Cloud to deploy the runner
[string] [choices: \\"aws\\", \\"azure\\", \\"gcp\\", \\"kubernetes\\"]
--cloud-region Region where the instance is
Expand Down
Loading

1 comment on commit 987ada4

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test Comment

CML watermark

Please sign in to comment.