Skip to content

Commit 6778ded

Browse files
committed
feat: add e2e-aws for nvidia extensions
Add e2e tests for nvidia

Signed-off-by: Noel Georgi <git@frezbo.dev>
1 parent 74c07ed commit 6778ded

File tree

17 files changed

+621
-272
lines changed

17 files changed

+621
-272
lines changed

.drone.jsonnet

Lines changed: 77 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ local ExtensionsStep(with_e2e=true) =
386386
IMAGE_REGISTRY: local_registry,
387387
QEMU_EXTRA_DISKS: '1',
388388
SHORT_INTEGRATION_TEST: 'yes',
389-
EXTRA_TEST_ARGS: '-talos.extensions.testtype=qemu',
389+
EXTRA_TEST_ARGS: '-talos.extensions.qemu',
390390
});
391391

392392
local step_targets = [extensions_build, extensions_artifacts, extensions_patch_manifest, e2e_extensions];
@@ -656,63 +656,84 @@ local capi_docker = Step('e2e-docker', depends_on=[load_artifacts], target='e2e-
656656
});
657657
local e2e_capi = Step('e2e-capi', depends_on=[capi_docker], environment=creds_env_vars);
658658

659-
local e2e_aws_prepare = Step(
660-
'cloud-images',
661-
depends_on=[
662-
load_artifacts,
663-
],
664-
environment=creds_env_vars {
665-
CLOUD_IMAGES_EXTRA_ARGS: '--name-prefix talos-e2e --target-clouds aws --architectures amd64 --aws-regions us-east-1',
666-
},
667-
extra_commands=[
668-
'make e2e-aws-prepare',
669-
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
670-
'az storage blob upload-batch --overwrite -s _out --pattern "e2e-aws-generated/*" -d "${CI_COMMIT_SHA}${DRONE_TAG//./-}"',
671-
]
672-
);
659+
local E2EAWS(target) =
660+
local extensions_artifacts = [step for step in ExtensionsStep(with_e2e=false)];
661+
local depends_on = if std.startsWith(target, 'nvidia') then [load_artifacts] + extensions_artifacts else [load_artifacts];
662+
local test_num_nodes = if std.startsWith(target, 'nvidia') then 4 else 6;
663+
local extra_test_args = if std.startsWith(target, 'nvidia') then '-talos.extensions.nvidia' else '';
664+
665+
local e2e_aws_prepare = Step(
666+
'e2e-aws-prepare',
667+
depends_on=depends_on,
668+
environment=creds_env_vars {
669+
IMAGE_REGISTRY: local_registry,
670+
E2E_AWS_TARGET: target,
671+
},
672+
extra_commands=[
673+
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
674+
'az storage blob upload-batch --overwrite -s _out --pattern "e2e-aws-generated/*" -d "${CI_COMMIT_SHA}${DRONE_TAG//./-}"',
675+
]
676+
);
673677

674-
local tf_apply = TriggerDownstream(
675-
'tf-apply',
676-
'e2e-talos-tf-apply',
677-
['siderolabs/contrib@main'],
678-
params=[
679-
'BUCKET_PATH=${CI_COMMIT_SHA}${DRONE_TAG//./-}',
680-
'TYPE=aws',
681-
'AWS_DEFAULT_REGION=us-east-1',
682-
],
683-
depends_on=[e2e_aws_prepare],
684-
);
678+
local tf_apply = TriggerDownstream(
679+
'tf-apply',
680+
'e2e-talos-tf-apply',
681+
['siderolabs/contrib@main'],
682+
params=[
683+
'BUCKET_PATH=${CI_COMMIT_SHA}${DRONE_TAG//./-}',
684+
'TYPE=aws',
685+
'AWS_DEFAULT_REGION=us-east-1',
686+
],
687+
depends_on=[e2e_aws_prepare],
688+
);
685689

686-
local e2e_aws_tf_apply_post = Step(
687-
'e2e-aws-download-artifacts',
688-
with_make=false,
689-
environment=creds_env_vars,
690-
extra_commands=[
691-
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
692-
'az storage blob download -f _out/e2e-aws-talosconfig -n e2e-aws-talosconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
693-
'az storage blob download -f _out/e2e-aws-kubeconfig -n e2e-aws-kubeconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
694-
],
695-
depends_on=[tf_apply],
696-
);
690+
local e2e_aws_tf_apply_post = Step(
691+
'e2e-aws-download-artifacts',
692+
with_make=false,
693+
environment=creds_env_vars,
694+
extra_commands=[
695+
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
696+
'az storage blob download -f _out/e2e-aws-talosconfig -n e2e-aws-talosconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
697+
'az storage blob download -f _out/e2e-aws-kubeconfig -n e2e-aws-kubeconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
698+
],
699+
depends_on=[tf_apply],
700+
);
697701

698-
local e2e_aws = Step('e2e-aws', depends_on=[e2e_aws_tf_apply_post], environment=creds_env_vars);
702+
local e2e_aws = Step(
703+
'e2e-aws',
704+
depends_on=[e2e_aws_tf_apply_post],
705+
environment=creds_env_vars {
706+
TEST_NUM_NODES: test_num_nodes,
707+
EXTRA_TEST_ARGS: extra_test_args,
708+
}
709+
);
699710

700-
local tf_destroy = TriggerDownstream(
701-
'tf-destroy',
702-
'e2e-talos-tf-destroy',
703-
['siderolabs/contrib@main'],
704-
params=[
705-
'TYPE=aws',
706-
'AWS_DEFAULT_REGION=us-east-1',
707-
],
708-
depends_on=[e2e_aws],
709-
when={
710-
status: [
711-
'failure',
712-
'success',
711+
local tf_destroy = TriggerDownstream(
712+
'tf-destroy',
713+
'e2e-talos-tf-destroy',
714+
['siderolabs/contrib@main'],
715+
params=[
716+
'BUCKET_PATH=${CI_COMMIT_SHA}${DRONE_TAG//./-}',
717+
'TYPE=aws',
718+
'AWS_DEFAULT_REGION=us-east-1',
713719
],
714-
},
715-
);
720+
depends_on=[e2e_aws],
721+
when={
722+
status: [
723+
'failure',
724+
'success',
725+
],
726+
},
727+
);
728+
729+
local step_targets = [e2e_aws_prepare, tf_apply, e2e_aws_tf_apply_post, e2e_aws, tf_destroy];
730+
local targets = if std.startsWith(target, 'nvidia') then extensions_artifacts + step_targets else step_targets;
731+
732+
targets;
733+
734+
735+
local e2e_aws = [step for step in E2EAWS('default')];
736+
local e2e_aws_nvidia_oss = [step for step in E2EAWS('nvidia-oss')];
716737

717738
local e2e_azure = Step('e2e-azure', depends_on=[e2e_capi], environment=creds_env_vars);
718739
local e2e_gcp = Step('e2e-gcp', depends_on=[e2e_capi], environment=creds_env_vars);
@@ -727,11 +748,12 @@ local e2e_trigger(names) = {
727748

728749
local e2e_pipelines = [
729750
// regular pipelines, triggered on promote events
730-
Pipeline('e2e-aws', default_pipeline_steps + [e2e_aws_prepare, tf_apply, e2e_aws_tf_apply_post, e2e_aws, tf_destroy]) + e2e_trigger(['e2e-aws']),
751+
Pipeline('e2e-aws', default_pipeline_steps + e2e_aws) + e2e_trigger(['e2e-aws']),
752+
Pipeline('e2e-aws-nvidia-oss', default_pipeline_steps + e2e_aws_nvidia_oss) + e2e_trigger(['e2e-aws-nvidia-oss']),
731753
Pipeline('e2e-gcp', default_pipeline_steps + [capi_docker, e2e_capi, e2e_gcp]) + e2e_trigger(['e2e-gcp']),
732754

733755
// cron pipelines, triggered on schedule events
734-
Pipeline('cron-e2e-aws', default_pipeline_steps + [e2e_aws_prepare, tf_apply, e2e_aws_tf_apply_post, e2e_aws, tf_destroy], [default_cron_pipeline]) + cron_trigger(['thrice-daily', 'nightly']),
756+
Pipeline('cron-e2e-aws', default_pipeline_steps + e2e_aws, [default_cron_pipeline]) + cron_trigger(['thrice-daily', 'nightly']),
735757
Pipeline('cron-e2e-gcp', default_pipeline_steps + [capi_docker, e2e_capi, e2e_gcp], [default_cron_pipeline]) + cron_trigger(['thrice-daily', 'nightly']),
736758
];
737759

hack/test/e2e-aws-prepare.sh

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,68 @@ source ./hack/test/e2e.sh
66

77
REGION="us-east-1"
88

9-
AMI_ID=$(jq -r ".[] | select(.region == \"${REGION}\") | select (.arch == \"amd64\") | .id" "${ARTIFACTS}/cloud-images.json")
9+
function cloud_image_upload() {
10+
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}")
11+
12+
make cloud-images CLOUD_IMAGES_EXTRA_ARGS="${CLOUD_IMAGES_EXTRA_ARGS[*]}"
13+
}
14+
15+
function get_ami_id() {
16+
jq -r ".[] | select(.region == \"${REGION}\") | select (.arch == \"amd64\") | .id" "${ARTIFACTS}/cloud-images.json"
17+
}
18+
19+
function cloud_image_upload_with_extensions() {
20+
case "${1}" in
21+
nvidia-oss)
22+
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
23+
;;
24+
nvidia-oss-fabricmanager)
25+
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
26+
;;
27+
nvidia-proprietary)
28+
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
29+
;;
30+
nvidia-proprietary-fabricmanager)
31+
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
32+
;;
33+
*)
34+
;;
35+
esac
36+
37+
make image-aws IMAGER_ARGS="${EXTENSIONS}" PLATFORM=linux/amd64
38+
cloud_image_upload "talos-e2e-${1}"
39+
}
40+
41+
cloud_image_upload "talos-e2e"
42+
43+
AMI_ID=$(get_ami_id)
44+
45+
WORKER_GROUP=
46+
NVIDIA_AMI_ID=
47+
48+
case "${E2E_AWS_TARGET:-default}" in
49+
default)
50+
;;
51+
*)
52+
WORKER_GROUP="nvidia"
53+
cloud_image_upload_with_extensions "${E2E_AWS_TARGET}"
54+
NVIDIA_AMI_ID=$(get_ami_id)
55+
# cloud_image_upload_with_extensions "${E2E_AWS_TARGET}-fabricmanager"
56+
# NVIDIA_FM_AMI_ID=$(get_ami_id)
57+
;;
58+
esac
1059

1160
mkdir -p "${ARTIFACTS}/e2e-aws-generated"
1261

1362
NAME_PREFIX="talos-e2e-${SHA}-aws"
1463

15-
jq --null-input --arg AMI_ID "${AMI_ID}" --arg CLUSTER_NAME "${NAME_PREFIX}" --arg KUBERNETES_VERSION "${KUBERNETES_VERSION}" '{ami_id: $AMI_ID, cluster_name: $CLUSTER_NAME, kubernetes_version: $KUBERNETES_VERSION}' \
64+
jq --null-input \
65+
--arg WORKER_GROUP "${WORKER_GROUP}" \
66+
--arg AMI_ID "${AMI_ID}" \
67+
--arg NVIDIA_AMI_ID "${NVIDIA_AMI_ID}" \
68+
--arg CLUSTER_NAME "${NAME_PREFIX}" \
69+
--arg KUBERNETES_VERSION "${KUBERNETES_VERSION}" \
70+
'{worker_group: $WORKER_GROUP, ami_id: $AMI_ID, nvidia_ami_id: $NVIDIA_AMI_ID, cluster_name: $CLUSTER_NAME, kubernetes_version: $KUBERNETES_VERSION}' \
1671
| jq -f hack/test/tfvars/aws.jq > "${ARTIFACTS}/e2e-aws-generated/vars.json"
72+
73+
cp hack/test/tfvars/*.yaml "${ARTIFACTS}/e2e-aws-generated"

hack/test/e2e.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ export KUBERNETES_VERSION=${KUBERNETES_VERSION:-1.28.0}
4444

4545
export NAME_PREFIX="talos-e2e-${SHA}-${PLATFORM}"
4646
export TIMEOUT=1200
47-
export NUM_NODES=6
47+
export NUM_NODES=${TEST_NUM_NODES:-6}
4848

4949
# default values, overridden by talosctl cluster create tests
5050
PROVISIONER=

hack/test/tfvars/aws.jq

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"cluster_name": .cluster_name,
33
"num_control_planes": 3,
4-
"num_workers": 3,
4+
"num_workers": (if .worker_group == "nvidia" then 0 else 3 end),
55
"ami_id": .ami_id,
66
"ccm": true,
77
"kubernetes_version": .kubernetes_version,
@@ -11,5 +11,18 @@
1111
"Name": .cluster_name,
1212
"Project": "talos-e2e-ci",
1313
"Environment": "ci"
14-
}
14+
},
15+
"worker_groups": (if .worker_group == "nvidia" then [
16+
{
17+
"name": "nvidia-t4",
18+
"ami_id": .nvidia_ami_id,
19+
"instance_type": "g4dn.xlarge",
20+
"config_patch_files": [
21+
"nvidia.yaml"
22+
],
23+
"tags": {
24+
"Type": "nvidia-t4"
25+
}
26+
}
27+
] else [] end)
1528
}

hack/test/tfvars/nvidia.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
machine:
2+
kernel:
3+
modules:
4+
- name: nvidia
5+
- name: nvidia_uvm
6+
- name: nvidia_drm
7+
- name: nvidia_modeset
8+
sysctls:
9+
net.core.bpf_jit_harden: 1

internal/integration/api/api.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@ import "github.com/stretchr/testify/suite"
1111

1212
var allSuites []suite.TestingSuite
1313

14-
const (
15-
provisionerDocker = "docker"
16-
provisionerQEMU = "qemu"
17-
)
18-
1914
// GetAllSuites returns all the suites for API test.
2015
//
2116
// Depending on build tags, this might return different lists.

internal/integration/api/common.go

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,15 @@
77
package api
88

99
import (
10-
"bufio"
1110
"context"
11+
"os"
1212
"strings"
1313
"time"
1414

1515
corev1 "k8s.io/api/core/v1"
1616
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1717

1818
"github.com/siderolabs/talos/internal/integration/base"
19-
"github.com/siderolabs/talos/pkg/machinery/client"
2019
)
2120

2221
// CommonSuite verifies some default settings such as ulimits.
@@ -34,10 +33,6 @@ func (suite *CommonSuite) SuiteName() string {
3433

3534
// SetupTest ...
3635
func (suite *CommonSuite) SetupTest() {
37-
if suite.Cluster.Provisioner() == provisionerDocker {
38-
suite.T().Skip("skipping default values tests in docker")
39-
}
40-
4136
// make sure API calls have timeout
4237
suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 10*time.Minute)
4338
}
@@ -51,44 +46,27 @@ func (suite *CommonSuite) TearDownTest() {
5146

5247
// TestVirtioModulesLoaded verifies that the virtio modules are loaded.
5348
func (suite *CommonSuite) TestVirtioModulesLoaded() {
54-
if suite.Cluster.Provisioner() == provisionerQEMU {
55-
suite.T().Skip("skipping virtio modules tests in qemu")
49+
if provisioner := os.Getenv("PROVISIONER"); provisioner != "qemu" {
50+
suite.T().Skip("skipping virtio test since provisioner is not qemu")
5651
}
5752

58-
expectedVirtIOModules := []string{
59-
"virtio_balloon",
60-
"virtio_pci",
61-
"virtio_pci_legacy_dev",
62-
"virtio_pci_modern_dev",
53+
expectedVirtIOModules := map[string]string{
54+
"virtio_balloon": "",
55+
"virtio_pci": "",
56+
"virtio_pci_legacy_dev": "",
57+
"virtio_pci_modern_dev": "",
6358
}
6459

6560
node := suite.RandomDiscoveredNodeInternalIP()
66-
67-
ctx := client.WithNode(suite.ctx, node)
68-
69-
fileReader, err := suite.Client.Read(ctx, "/proc/modules")
70-
defer func() {
71-
err = fileReader.Close()
72-
}()
73-
74-
suite.Require().NoError(err)
75-
76-
scanner := bufio.NewScanner(fileReader)
77-
78-
var loadedModules []string
79-
80-
for scanner.Scan() {
81-
loadedModules = append(loadedModules, strings.Split(scanner.Text(), " ")[0])
82-
}
83-
suite.Require().NoError(scanner.Err())
84-
85-
for _, expectedModule := range expectedVirtIOModules {
86-
suite.Require().Contains(loadedModules, expectedModule, "expected module %s to be loaded", expectedModule)
87-
}
61+
suite.AssertExpectedModules(suite.ctx, node, expectedVirtIOModules)
8862
}
8963

9064
// TestCommonDefaults verifies that the default ulimits are set.
9165
func (suite *CommonSuite) TestCommonDefaults() {
66+
if provisioner := os.Getenv("PROVISIONER"); provisioner == "docker" {
67+
suite.T().Skip("skipping ulimits test since provisioner is docker")
68+
}
69+
9270
expectedUlimit := `
9371
core file size (blocks) (-c) 0
9472
data seg size (kb) (-d) unlimited

0 commit comments

Comments
 (0)