diff --git a/cdk/BackendStack.ts b/cdk/BackendStack.ts index 170f7501c..9cdf4e618 100644 --- a/cdk/BackendStack.ts +++ b/cdk/BackendStack.ts @@ -335,6 +335,11 @@ export class BackendStack extends Stack { mbff.startMultiBundleFOTAFlow.fn, ) + api.addRoute( + 'DELETE /device/{deviceId}/fota/job/{jobId}', + mbff.abortMultiBundleFOTAFlow.fn, + ) + const updateDevice = new UpdateDevice(this, { lambdaSources, layers: [baseLayerVersion], diff --git a/cdk/packBackendLambdas.ts b/cdk/packBackendLambdas.ts index 87d3b73ed..96d1c2f46 100644 --- a/cdk/packBackendLambdas.ts +++ b/cdk/packBackendLambdas.ts @@ -41,6 +41,8 @@ export type BackendLambdas = { createCNAMERecord: PackedLambda multiBundleFOTAFlow: { start: PackedLambda + abort: PackedLambda + onFail: PackedLambda getDeviceFirmwareDetails: PackedLambda getNextBundle: PackedLambda createFOTAJob: PackedLambda @@ -134,6 +136,14 @@ export const packBackendLambdas = async (): Promise => ({ 'multiBundleFOTAFlowStart', 'lambda/fota/multi-bundle-flow/start.ts', ), + abort: await packLambdaFromPath( + 'multiBundleFOTAFlowAbort', + 'lambda/fota/multi-bundle-flow/abort.ts', + ), + onFail: await packLambdaFromPath( + 'multiBundleFOTAFlowOnFail', + 'lambda/fota/multi-bundle-flow/onFail.ts', + ), getDeviceFirmwareDetails: await packLambdaFromPath( 'multiBundleFOTAFlowGetDeviceFirmareDetails', 'lambda/fota/multi-bundle-flow/getDeviceFirmwareDetails.ts', diff --git a/cdk/resources/FOTA/MultiBundleFlow.ts b/cdk/resources/FOTA/MultiBundleFlow.ts index dee9e0cfb..fb982cba2 100644 --- a/cdk/resources/FOTA/MultiBundleFlow.ts +++ b/cdk/resources/FOTA/MultiBundleFlow.ts @@ -8,10 +8,13 @@ import { definitions, LwM2MObjectID } from '@hello.nrfcloud.com/proto-map/lwm2m' import { FOTAJobStatus } from '@hello.nrfcloud.com/proto/hello' import { Duration, + aws_events as Events, aws_lambda_event_sources as EventSources, + aws_events_targets as EventsTargets, aws_iam as IAM, aws_iot as IoT, aws_lambda as Lambda, + Stack, aws_stepfunctions_tasks as 
StepFunctionsTasks, type aws_logs as Logs, } from 'aws-cdk-lib' @@ -27,7 +30,6 @@ import { StateMachineType, Succeed, TaskInput, - type IStateMachine, } from 'aws-cdk-lib/aws-stepfunctions' import { DynamoAttributeValue, @@ -44,7 +46,7 @@ import type { DeviceStorage } from '../DeviceStorage.js' * save the amount of data that needs to be transferred. */ export class MultiBundleFOTAFlow extends Construct { - public readonly stateMachine: IStateMachine + public readonly stateMachine: StateMachine public readonly GetDeviceFirmwareDetails: PackedLambdaFn public readonly GetNextBundle: PackedLambdaFn public readonly CreateFOTAJob: PackedLambdaFn @@ -53,6 +55,7 @@ export class MultiBundleFOTAFlow extends Construct { public readonly WaitForUpdateAppliedCallback: PackedLambdaFn public readonly WaitForUpdateApplied: PackedLambdaFn public readonly startMultiBundleFOTAFlow: PackedLambdaFn + public readonly abortMultiBundleFOTAFlow: PackedLambdaFn public constructor( parent: Construct, @@ -437,7 +440,7 @@ export class MultiBundleFOTAFlow extends Construct { }) deviceFOTA.nrfCloudJobStatusTable.grantReadWriteData(this.stateMachine) - const startMultiBundleFOTAFlow = new PackedLambdaFn( + this.startMultiBundleFOTAFlow = new PackedLambdaFn( this, 'startMultiBundleFOTAFlow', lambdas.start, @@ -459,10 +462,62 @@ export class MultiBundleFOTAFlow extends Construct { logGroup: deviceFOTA.logGroup, }, ) - this.startMultiBundleFOTAFlow = startMultiBundleFOTAFlow - this.stateMachine.grantStartExecution(startMultiBundleFOTAFlow.fn) - deviceStorage.devicesTable.grantReadData(startMultiBundleFOTAFlow.fn) - deviceFOTA.jobTable.grantWriteData(startMultiBundleFOTAFlow.fn) + this.stateMachine.grantStartExecution(this.startMultiBundleFOTAFlow.fn) + deviceStorage.devicesTable.grantReadData(this.startMultiBundleFOTAFlow.fn) + deviceFOTA.jobTable.grantWriteData(this.startMultiBundleFOTAFlow.fn) + + this.abortMultiBundleFOTAFlow = new PackedLambdaFn( + this, + 'abortMultiBundleFOTAFlow', + 
lambdas.abort, + { + description: 'REST entry point for aborting running FOTA flows', + environment: { + DEVICES_TABLE_NAME: deviceStorage.devicesTable.tableName, + STATE_MACHINE_ARN: this.stateMachine.stateMachineArn, + }, + layers, + logGroup: deviceFOTA.logGroup, + initialPolicy: [ + new IAM.PolicyStatement({ + actions: ['states:DescribeExecution', 'states:StopExecution'], + resources: [ + `arn:aws:states:${Stack.of(this).region}:${Stack.of(this).account}:execution:${this.stateMachine.stateMachineName}:*`, + ], + }), + ], + }, + ) + deviceStorage.devicesTable.grantReadData(this.abortMultiBundleFOTAFlow.fn) + + const onStepFunctionFail = new PackedLambdaFn( + this, + 'onFail', + lambdas.onFail, + { + description: 'Handles failed or cancelled step function executions', + environment: { + STATE_MACHINE_ARN: this.stateMachine.stateMachineArn, + JOB_TABLE_NAME: deviceFOTA.jobTable.tableName, + }, + layers, + logGroup: deviceFOTA.logGroup, + }, + ) + deviceFOTA.jobTable.grantWriteData(onStepFunctionFail.fn) + // Step Functions publishes execution status changes to the default event bus, + // so the rule has to be created there instead of on a custom event bus. + new Events.Rule(this, 'onStepFunctionFailRule', { + eventPattern: { + source: ['aws.states'], + detailType: ['Step Functions Execution Status Change'], + detail: { + status: ['FAILED', 'TIMED_OUT', 'ABORTED'], + stateMachineArn: [this.stateMachine.stateMachineArn], + }, + }, + targets: [new EventsTargets.LambdaFunction(onStepFunctionFail.fn)], + }) this.WaitForFOTAJobCompletion = new PackedLambdaFn( this, diff --git a/features/FOTA-abort.feature.md b/features/FOTA-abort.feature.md new file mode 100644 index 000000000..a7beabe66 --- /dev/null +++ b/features/FOTA-abort.feature.md @@ -0,0 +1,213 @@ +--- +exampleContext: + fingerprint: 92b.y7i24q + fingerprint_deviceId: oob-352656108602296 + APIURL: https://api.hello.nordicsemi.cloud + tsJob1CreatedISO: 2023-09-12T00:01:00.000Z + tsJob1CancelledISO: 2023-09-12T00:03:00.000Z + nrfCloudJobId: bc631093-7f7c-4c1b-aa63-a68c759bcd5c + jobId: 01J861VKYH5QVD6QQ5YXXF20EF 
+needs: + - Device FOTA +run: only +--- + +# Abort Device FOTA jobs + +> A user can abort a running firmware update job for a device. + +## Background + +Given I have the fingerprint for a `PCA20065` device in `fingerprint` + +And I have a random UUIDv4 in `nrfCloudJobId` + +And I store `$fromMillis($millis())` into `tsJob1CreatedISO` + +And I store `$fromMillis($millis() + 60 * 1000)` into `tsJob1CancelledISO` + +## The device reports that it is eligible for FOTA + + + +Given there is this device shadow data for `${fingerprint_deviceId}` in nRF +Cloud + +```json +{ + "items": [ + { + "id": "${fingerprint_deviceId}", + "$meta": { + "createdAt": "${$fromMillis($millis())}", + "updatedAt": "${$fromMillis($millis())}" + }, + "state": { + "reported": { + "device": { + "deviceInfo": { + "appVersion": "2.0.0", + "modemFirmware": "mfw_nrf91x1_2.0.1", + "imei": "355025930003908", + "board": "thingy91x", + "hwVer": "nRF9151 LACA ADA" + }, + "serviceInfo": { + "fota_v2": ["BOOT", "MODEM", "APP"] + } + } + }, + "metadata": { + "reported": { + "device": { + "deviceInfo": { + "appVersion": { "timestamp": 1716801888 }, + "modemFirmware": { "timestamp": 1716801888 }, + "imei": { "timestamp": 1716801888 }, + "board": { "timestamp": 1716801888 }, + "hwVer": { "timestamp": 1716801888 } + }, + "serviceInfo": { + "fota_v2": [ + { + "timestamp": 1717409966 + }, + { + "timestamp": 1717409966 + }, + { + "timestamp": 1717409966 + } + ] + } + } + } + }, + "version": 8835 + } + } + ], + "total": 1 +} +``` + +And I connect to the websocket using fingerprint `${fingerprint}` + +Soon I should receive a message on the websocket that matches after 20 retries + +```json +{ + "@context": "https://github.com/hello-nrfcloud/proto/shadow", + "reported": [ + { + "ObjectID": 14401, + "Resources": { + "0": ["BOOT", "MODEM", "APP"], + "99": 1717409966 + } + } + ] +} +``` + +## Schedule the FOTA job + +Given this nRF Cloud API request is queued for a `POST /v1/fota-jobs` request + +``` +HTTP/1.1 200 OK 
+Content-Type: application/json + +{"jobId": "${nrfCloudJobId}"} +``` + +And this nRF Cloud API request is queued for a +`GET /v1/fota-jobs/${nrfCloudJobId}` request + +``` +HTTP/1.1 200 OK +Content-Type: application/json + +{ + "createdAt": "${tsJob1CreatedISO}", + "firmware": { + "bundleId": "APP*1e29dfa3*v2.0.1", + "fileSize": 425860, + "firmwareType": "APP", + "host": "firmware.nrfcloud.com", + "uris": [ + "bbfe6b73-a46a-43ad-94bd-8e4b4a7847ce/APP*1e29dfa3*v2.0.1/hello-nrfcloud-thingy91x-v2.0.1-fwupd.bin" + ], + "version": "v2.0.1" + }, + "jobId": "${nrfCloudJobId}", + "lastUpdatedAt": "${tsJob1CreatedISO}", + "name": "${nrfCloudJobId}", + "status": "IN_PROGRESS", + "statusDetail": "Job auto applied", + "target": { + "deviceIds": [ + "${fingerprint_deviceId}" + ], + "tags": [] + } +} +``` + +When I `POST` +`${APIURL}/device/${fingerprint_deviceId}/fota/app?fingerprint=${fingerprint}` +with + +```json +{ + "upgradePath": { + ">=0.0.0": "APP*1e29dfa3*v2.0.1" + } +} +``` + +Then the status code of the last response should be `201` + +And I should receive a `https://github.com/hello-nrfcloud/proto/fota/job` +response + +And I store `id` of the last response into `jobId` + +## Cancel the job + +When I `DELETE` +`${APIURL}/device/${fingerprint_deviceId}/fota/job/${jobId}?fingerprint=${fingerprint}` + +Then the status code of the last response should be `202` + +## Job is cancelled + +When I `GET` +`${APIURL}/device/${fingerprint_deviceId}/fota/jobs?fingerprint=${fingerprint}` +retrying 10 times + +Soon I should receive a `https://github.com/hello-nrfcloud/proto/fota/jobs` +response + +And `$.jobs[0]` of the last response should match + +```json +{ + "deviceId": "${fingerprint_deviceId}", + "status": "FAILED", + "statusDetail": "The job was cancelled." 
+} +``` + +## Receive a notification + +Soon I should receive a message on the websocket that matches + +```json +{ + "@context": "https://github.com/hello-nrfcloud/proto/fota/job", + "deviceId": "${fingerprint_deviceId}", + "status": "FAILED", + "statusDetail": "The job was cancelled." +} +``` diff --git a/features/FOTA.feature.md b/features/FOTA.feature.md index 5cfb242fc..d8a650c04 100644 --- a/features/FOTA.feature.md +++ b/features/FOTA.feature.md @@ -33,9 +33,6 @@ exampleContext: > > An update routine consists of one or more update jobs to execute to upgrade > the device from one (modem) firmware version to another. -> -> TODO: multi-path FOTA needs to wait for the device to report the updated -> version before progressing to the next update ## Background diff --git a/lambda/fota/multi-bundle-flow/abort.ts b/lambda/fota/multi-bundle-flow/abort.ts new file mode 100644 index 000000000..684d8ef0c --- /dev/null +++ b/lambda/fota/multi-bundle-flow/abort.ts @@ -0,0 +1,92 @@ +import { DynamoDBClient } from '@aws-sdk/client-dynamodb' +import { + DescribeExecutionCommand, + ExecutionStatus, + SFNClient, + StopExecutionCommand, +} from '@aws-sdk/client-sfn' +import { fromEnv } from '@bifravst/from-env' +import { aResponse } from '@hello.nrfcloud.com/lambda-helpers/aResponse' +import { addVersionHeader } from '@hello.nrfcloud.com/lambda-helpers/addVersionHeader' +import { corsOPTIONS } from '@hello.nrfcloud.com/lambda-helpers/corsOPTIONS' +import { + ProblemDetailError, + problemResponse, +} from '@hello.nrfcloud.com/lambda-helpers/problemResponse' +import { requestLogger } from '@hello.nrfcloud.com/lambda-helpers/requestLogger' +import { tryAsJSON } from '@hello.nrfcloud.com/lambda-helpers/tryAsJSON' +import { + validateInput, + type ValidInput, +} from '@hello.nrfcloud.com/lambda-helpers/validateInput' +import { fingerprintRegExp } from '@hello.nrfcloud.com/proto/fingerprint' +import { deviceId, HttpStatusCode } from '@hello.nrfcloud.com/proto/hello' +import middy 
from '@middy/core' +import { Type } from '@sinclair/typebox' +import type { + APIGatewayProxyEventV2, + APIGatewayProxyResultV2, +} from 'aws-lambda' +import { ulidRegEx } from '../../../util/ulid.js' +import { withDevice, type WithDevice } from '../../middleware/withDevice.js' + +const { version, DevicesTableName, StateMachineArn } = fromEnv({ + version: 'VERSION', + DevicesTableName: 'DEVICES_TABLE_NAME', + StateMachineArn: 'STATE_MACHINE_ARN', +})(process.env) + +const db = new DynamoDBClient({}) +const sf = new SFNClient({}) + +const InputSchema = Type.Object({ + deviceId, + jobId: Type.RegExp(ulidRegEx, { title: 'Job ID', description: 'ULID' }), + fingerprint: Type.RegExp(fingerprintRegExp), +}) + +const h = async ( + event: APIGatewayProxyEventV2, + context: ValidInput<typeof InputSchema> & WithDevice, +): Promise<APIGatewayProxyResultV2> => { + const { jobId } = context.validInput + + const executionArn = `${StateMachineArn.replace(':stateMachine:', ':execution:')}:${jobId}` + + const execution = await sf.send( + new DescribeExecutionCommand({ + executionArn, + }), + ) + + // Check ownership first so the status of another device's job is never disclosed + if (tryAsJSON(execution.input)?.deviceId !== context.device.id) { + throw new ProblemDetailError({ + status: HttpStatusCode.FORBIDDEN, + title: `Job ${jobId} does not belong to device ${context.device.id}!`, + }) + } + + if (execution.status !== ExecutionStatus.RUNNING) { + throw new ProblemDetailError({ + status: HttpStatusCode.CONFLICT, + title: `Execution is not running, but ${execution.status}!`, + }) + } + + await sf.send( + new StopExecutionCommand({ + executionArn: execution.executionArn, + }), + ) + + return aResponse(HttpStatusCode.ACCEPTED) +} +export const handler = middy() + .use(corsOPTIONS('DELETE')) + .use(addVersionHeader(version)) + .use(requestLogger()) + .use(validateInput(InputSchema)) + .use(withDevice({ db, DevicesTableName })) + .use(problemResponse()) + .handler(h) diff --git a/lambda/fota/multi-bundle-flow/onFail.ts b/lambda/fota/multi-bundle-flow/onFail.ts new file mode 100644 index 
000000000..fdf169bba --- /dev/null +++ b/lambda/fota/multi-bundle-flow/onFail.ts @@ -0,0 +1,30 @@ +import { requestLogger } from '@hello.nrfcloud.com/lambda-helpers/requestLogger' +import middy from '@middy/core' +import type { EventBridgeEvent } from 'aws-lambda' + +// Receives Step Functions execution status change events. +// Placeholder: the event is not processed yet, `void event` only marks it as used. +const h = async ( + event: EventBridgeEvent< + 'Step Functions Execution Status Change', + { + executionArn: string // e.g. 'arn:aws:states:us-east-2:123456789012:execution:state-machine-name:execution-name' + stateMachineArn: string // e.g. 'arn:aws:states:us-east-2:123456789012:stateMachine:state-machine' + name: string // e.g. 'execution-name' + status: string // e.g. 'ABORTED' + startDate: number // e.g. 1551225014968 + stopDate: number // e.g. 1551225017576 + input: string // e.g. '{}' + inputDetails: null | { + included: true + } + output: null | string // e.g. null or '{}' + outputDetails: null | { + included: true + } + } + >, +): Promise<void> => { + void event +} +export const handler = middy().use(requestLogger()).handler(h)