Skip to content

Commit

Permalink
feat: add e2e-aws for nvidia extensions
Browse files Browse the repository at this point in the history
Add e2e tests for nvidia

Signed-off-by: Noel Georgi <git@frezbo.dev>
  • Loading branch information
frezbo committed Aug 24, 2023
1 parent 74c07ed commit 6778ded
Show file tree
Hide file tree
Showing 17 changed files with 621 additions and 272 deletions.
132 changes: 77 additions & 55 deletions .drone.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ local ExtensionsStep(with_e2e=true) =
IMAGE_REGISTRY: local_registry,
QEMU_EXTRA_DISKS: '1',
SHORT_INTEGRATION_TEST: 'yes',
EXTRA_TEST_ARGS: '-talos.extensions.testtype=qemu',
EXTRA_TEST_ARGS: '-talos.extensions.qemu',
});

local step_targets = [extensions_build, extensions_artifacts, extensions_patch_manifest, e2e_extensions];
Expand Down Expand Up @@ -656,63 +656,84 @@ local capi_docker = Step('e2e-docker', depends_on=[load_artifacts], target='e2e-
});
local e2e_capi = Step('e2e-capi', depends_on=[capi_docker], environment=creds_env_vars);

local e2e_aws_prepare = Step(
'cloud-images',
depends_on=[
load_artifacts,
],
environment=creds_env_vars {
CLOUD_IMAGES_EXTRA_ARGS: '--name-prefix talos-e2e --target-clouds aws --architectures amd64 --aws-regions us-east-1',
},
extra_commands=[
'make e2e-aws-prepare',
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
'az storage blob upload-batch --overwrite -s _out --pattern "e2e-aws-generated/*" -d "${CI_COMMIT_SHA}${DRONE_TAG//./-}"',
]
);
local E2EAWS(target) =
local extensions_artifacts = [step for step in ExtensionsStep(with_e2e=false)];
local depends_on = if std.startsWith(target, 'nvidia') then [load_artifacts] + extensions_artifacts else [load_artifacts];
local test_num_nodes = if std.startsWith(target, 'nvidia') then 4 else 6;
local extra_test_args = if std.startsWith(target, 'nvidia') then '-talos.extensions.nvidia' else '';

local e2e_aws_prepare = Step(
'e2e-aws-prepare',
depends_on=depends_on,
environment=creds_env_vars {
IMAGE_REGISTRY: local_registry,
E2E_AWS_TARGET: target,
},
extra_commands=[
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
'az storage blob upload-batch --overwrite -s _out --pattern "e2e-aws-generated/*" -d "${CI_COMMIT_SHA}${DRONE_TAG//./-}"',
]
);

local tf_apply = TriggerDownstream(
'tf-apply',
'e2e-talos-tf-apply',
['siderolabs/contrib@main'],
params=[
'BUCKET_PATH=${CI_COMMIT_SHA}${DRONE_TAG//./-}',
'TYPE=aws',
'AWS_DEFAULT_REGION=us-east-1',
],
depends_on=[e2e_aws_prepare],
);
local tf_apply = TriggerDownstream(
'tf-apply',
'e2e-talos-tf-apply',
['siderolabs/contrib@main'],
params=[
'BUCKET_PATH=${CI_COMMIT_SHA}${DRONE_TAG//./-}',
'TYPE=aws',
'AWS_DEFAULT_REGION=us-east-1',
],
depends_on=[e2e_aws_prepare],
);

local e2e_aws_tf_apply_post = Step(
'e2e-aws-download-artifacts',
with_make=false,
environment=creds_env_vars,
extra_commands=[
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
'az storage blob download -f _out/e2e-aws-talosconfig -n e2e-aws-talosconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
'az storage blob download -f _out/e2e-aws-kubeconfig -n e2e-aws-kubeconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
],
depends_on=[tf_apply],
);
local e2e_aws_tf_apply_post = Step(
'e2e-aws-download-artifacts',
with_make=false,
environment=creds_env_vars,
extra_commands=[
'az login --service-principal -u "$${AZURE_CLIENT_ID}" -p "$${AZURE_CLIENT_SECRET}" --tenant "$${AZURE_TENANT_ID}"',
'az storage blob download -f _out/e2e-aws-talosconfig -n e2e-aws-talosconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
'az storage blob download -f _out/e2e-aws-kubeconfig -n e2e-aws-kubeconfig -c ${CI_COMMIT_SHA}${DRONE_TAG//./-}',
],
depends_on=[tf_apply],
);

local e2e_aws = Step('e2e-aws', depends_on=[e2e_aws_tf_apply_post], environment=creds_env_vars);
local e2e_aws = Step(
'e2e-aws',
depends_on=[e2e_aws_tf_apply_post],
environment=creds_env_vars {
TEST_NUM_NODES: test_num_nodes,
EXTRA_TEST_ARGS: extra_test_args,
}
);

local tf_destroy = TriggerDownstream(
'tf-destroy',
'e2e-talos-tf-destroy',
['siderolabs/contrib@main'],
params=[
'TYPE=aws',
'AWS_DEFAULT_REGION=us-east-1',
],
depends_on=[e2e_aws],
when={
status: [
'failure',
'success',
local tf_destroy = TriggerDownstream(
'tf-destroy',
'e2e-talos-tf-destroy',
['siderolabs/contrib@main'],
params=[
'BUCKET_PATH=${CI_COMMIT_SHA}${DRONE_TAG//./-}',
'TYPE=aws',
'AWS_DEFAULT_REGION=us-east-1',
],
},
);
depends_on=[e2e_aws],
when={
status: [
'failure',
'success',
],
},
);

local step_targets = [e2e_aws_prepare, tf_apply, e2e_aws_tf_apply_post, e2e_aws, tf_destroy];
local targets = if std.startsWith(target, 'nvidia') then extensions_artifacts + step_targets else step_targets;

targets;


local e2e_aws = [step for step in E2EAWS('default')];
local e2e_aws_nvidia_oss = [step for step in E2EAWS('nvidia-oss')];

local e2e_azure = Step('e2e-azure', depends_on=[e2e_capi], environment=creds_env_vars);
local e2e_gcp = Step('e2e-gcp', depends_on=[e2e_capi], environment=creds_env_vars);
Expand All @@ -727,11 +748,12 @@ local e2e_trigger(names) = {

local e2e_pipelines = [
// regular pipelines, triggered on promote events
Pipeline('e2e-aws', default_pipeline_steps + [e2e_aws_prepare, tf_apply, e2e_aws_tf_apply_post, e2e_aws, tf_destroy]) + e2e_trigger(['e2e-aws']),
Pipeline('e2e-aws', default_pipeline_steps + e2e_aws) + e2e_trigger(['e2e-aws']),
Pipeline('e2e-aws-nvidia-oss', default_pipeline_steps + e2e_aws_nvidia_oss) + e2e_trigger(['e2e-aws-nvidia-oss']),
Pipeline('e2e-gcp', default_pipeline_steps + [capi_docker, e2e_capi, e2e_gcp]) + e2e_trigger(['e2e-gcp']),

// cron pipelines, triggered on schedule events
Pipeline('cron-e2e-aws', default_pipeline_steps + [e2e_aws_prepare, tf_apply, e2e_aws_tf_apply_post, e2e_aws, tf_destroy], [default_cron_pipeline]) + cron_trigger(['thrice-daily', 'nightly']),
Pipeline('cron-e2e-aws', default_pipeline_steps + e2e_aws, [default_cron_pipeline]) + cron_trigger(['thrice-daily', 'nightly']),
Pipeline('cron-e2e-gcp', default_pipeline_steps + [capi_docker, e2e_capi, e2e_gcp], [default_cron_pipeline]) + cron_trigger(['thrice-daily', 'nightly']),
];

Expand Down
61 changes: 59 additions & 2 deletions hack/test/e2e-aws-prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,68 @@ source ./hack/test/e2e.sh

REGION="us-east-1"

AMI_ID=$(jq -r ".[] | select(.region == \"${REGION}\") | select (.arch == \"amd64\") | .id" "${ARTIFACTS}/cloud-images.json")
# Build and upload a Talos cloud image to AWS via the `cloud-images` make target.
# $1 - name prefix for the uploaded image (e.g. "talos-e2e" or "talos-e2e-nvidia-oss").
# Target scope is pinned to aws/amd64 in the ${REGION} declared at the top of this script.
# NOTE(review): CLOUD_IMAGES_EXTRA_ARGS is not declared `local`, so it leaks into the
# script's global scope — presumably harmless here, but confirm nothing else reads it.
function cloud_image_upload() {
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}")

# The array is flattened into a single space-separated make variable.
make cloud-images CLOUD_IMAGES_EXTRA_ARGS="${CLOUD_IMAGES_EXTRA_ARGS[*]}"
}

# Print the AMI ID of the most recently uploaded image: selects the amd64 entry
# for ${REGION} from the cloud-images.json manifest that `make cloud-images`
# writes into ${ARTIFACTS}. Assumes exactly one matching entry — TODO confirm.
function get_ami_id() {
jq -r ".[] | select(.region == \"${REGION}\") | select (.arch == \"amd64\") | .id" "${ARTIFACTS}/cloud-images.json"
}

# Build an AWS image bundled with a target-specific set of nvidia system
# extensions, then upload it under the "talos-e2e-<target>" name prefix.
# $1 - extension target; one of the nvidia-* cases below.
#
# Each case derives EXTENSIONS from _out/extensions-metadata (one image
# reference per line): jq -R reads raw lines, -s collects them, then the filter
# keeps lines containing "nvidia", drops the variants that do not belong to the
# target, and rewrites each survivor into a "--system-extension-image=<ref>"
# flag, joined with spaces.
function cloud_image_upload_with_extensions() {
case "${1}" in
# open-source kernel modules; exclude fabricmanager and the proprietary (nonfree) kmod
nvidia-oss)
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
# open-source kernel modules plus fabricmanager; exclude only the proprietary kmod
nvidia-oss-fabricmanager)
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
# proprietary kmod; exclude fabricmanager and the open-gpu-kernel-modules variant
nvidia-proprietary)
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
# proprietary kmod plus fabricmanager; exclude only the open-gpu-kernel-modules variant
nvidia-proprietary-fabricmanager)
EXTENSIONS=$(jq -R < _out/extensions-metadata | jq -rs 'map(select(. | contains("nvidia") and (contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
# NOTE(review): for any other target EXTENSIONS is left unset (or inherits a
# stale global value), so `make image-aws` below runs with empty IMAGER_ARGS.
# Confirm an unknown target should fall through silently rather than error.
*)
;;
esac

# Build the AWS disk image with the selected extensions baked in, then upload it.
make image-aws IMAGER_ARGS="${EXTENSIONS}" PLATFORM=linux/amd64
cloud_image_upload "talos-e2e-${1}"
}

# Always upload the default (extension-free) image and record its AMI ID.
cloud_image_upload "talos-e2e"

AMI_ID=$(get_ami_id)

# Populated only for nvidia targets in the case below; empty values flow into
# the jq tfvars template, which treats an empty worker_group as "no GPU nodes".
WORKER_GROUP=
NVIDIA_AMI_ID=

# For any non-default target, additionally build/upload the extension-bundled
# image and point the "nvidia" worker group at its AMI.
case "${E2E_AWS_TARGET:-default}" in
default)
;;
*)
WORKER_GROUP="nvidia"
cloud_image_upload_with_extensions "${E2E_AWS_TARGET}"
NVIDIA_AMI_ID=$(get_ami_id)
# fabricmanager variant kept disabled for now — presumably pending separate
# infrastructure; confirm before re-enabling.
# cloud_image_upload_with_extensions "${E2E_AWS_TARGET}-fabricmanager"
# NVIDIA_FM_AMI_ID=$(get_ami_id)
;;
esac

# Directory whose contents CI uploads to blob storage for the downstream
# terraform pipeline to consume.
mkdir -p "${ARTIFACTS}/e2e-aws-generated"

NAME_PREFIX="talos-e2e-${SHA}-aws"

jq --null-input --arg AMI_ID "${AMI_ID}" --arg CLUSTER_NAME "${NAME_PREFIX}" --arg KUBERNETES_VERSION "${KUBERNETES_VERSION}" '{ami_id: $AMI_ID, cluster_name: $CLUSTER_NAME, kubernetes_version: $KUBERNETES_VERSION}' \
jq --null-input \
--arg WORKER_GROUP "${WORKER_GROUP}" \
--arg AMI_ID "${AMI_ID}" \
--arg NVIDIA_AMI_ID "${NVIDIA_AMI_ID}" \
--arg CLUSTER_NAME "${NAME_PREFIX}" \
--arg KUBERNETES_VERSION "${KUBERNETES_VERSION}" \
'{worker_group: $WORKER_GROUP, ami_id: $AMI_ID, nvidia_ami_id: $NVIDIA_AMI_ID, cluster_name: $CLUSTER_NAME, kubernetes_version: $KUBERNETES_VERSION}' \
| jq -f hack/test/tfvars/aws.jq > "${ARTIFACTS}/e2e-aws-generated/vars.json"

cp hack/test/tfvars/*.yaml "${ARTIFACTS}/e2e-aws-generated"
2 changes: 1 addition & 1 deletion hack/test/e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ export KUBERNETES_VERSION=${KUBERNETES_VERSION:-1.28.0}

export NAME_PREFIX="talos-e2e-${SHA}-${PLATFORM}"
export TIMEOUT=1200
export NUM_NODES=6
export NUM_NODES=${TEST_NUM_NODES:-6}

# default values, overridden by talosctl cluster create tests
PROVISIONER=
Expand Down
17 changes: 15 additions & 2 deletions hack/test/tfvars/aws.jq
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"cluster_name": .cluster_name,
"num_control_planes": 3,
"num_workers": 3,
"num_workers": (if .worker_group == "nvidia" then 0 else 3 end),
"ami_id": .ami_id,
"ccm": true,
"kubernetes_version": .kubernetes_version,
Expand All @@ -11,5 +11,18 @@
"Name": .cluster_name,
"Project": "talos-e2e-ci",
"Environment": "ci"
}
},
"worker_groups": (if .worker_group == "nvidia" then [
{
"name": "nvidia-t4",
"ami_id": .nvidia_ami_id,
"instance_type": "g4dn.xlarge",
"config_patch_files": [
"nvidia.yaml"
],
"tags": {
"Type": "nvidia-t4"
}
}
] else [] end)
}
9 changes: 9 additions & 0 deletions hack/test/tfvars/nvidia.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Talos machine-config patch for the nvidia GPU worker group
# (referenced as a config_patch_file from hack/test/tfvars/aws.jq).
machine:
  kernel:
    # Load the nvidia driver module stack at boot.
    modules:
      - name: nvidia
      - name: nvidia_uvm
      - name: nvidia_drm
      - name: nvidia_modeset
  sysctls:
    # NOTE(review): presumably required by the nvidia container runtime — confirm
    # against the Talos nvidia extension documentation.
    net.core.bpf_jit_harden: 1
5 changes: 0 additions & 5 deletions internal/integration/api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@ import "github.com/stretchr/testify/suite"

var allSuites []suite.TestingSuite

const (
provisionerDocker = "docker"
provisionerQEMU = "qemu"
)

// GetAllSuites returns all the suites for API test.
//
// Depending on build tags, this might return different lists.
Expand Down
48 changes: 13 additions & 35 deletions internal/integration/api/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,15 @@
package api

import (
"bufio"
"context"
"os"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/siderolabs/talos/internal/integration/base"
"github.com/siderolabs/talos/pkg/machinery/client"
)

// CommonSuite verifies some default settings such as ulimits.
Expand All @@ -34,10 +33,6 @@ func (suite *CommonSuite) SuiteName() string {

// SetupTest ...
func (suite *CommonSuite) SetupTest() {
if suite.Cluster.Provisioner() == provisionerDocker {
suite.T().Skip("skipping default values tests in docker")
}

// make sure API calls have timeout
suite.ctx, suite.ctxCancel = context.WithTimeout(context.Background(), 10*time.Minute)
}
Expand All @@ -51,44 +46,27 @@ func (suite *CommonSuite) TearDownTest() {

// TestVirtioModulesLoaded verifies that the virtio modules are loaded.
func (suite *CommonSuite) TestVirtioModulesLoaded() {
if suite.Cluster.Provisioner() == provisionerQEMU {
suite.T().Skip("skipping virtio modules tests in qemu")
if provisioner := os.Getenv("PROVISIONER"); provisioner != "qemu" {
suite.T().Skip("skipping virtio test since provisioner is not qemu")
}

expectedVirtIOModules := []string{
"virtio_balloon",
"virtio_pci",
"virtio_pci_legacy_dev",
"virtio_pci_modern_dev",
expectedVirtIOModules := map[string]string{
"virtio_balloon": "",
"virtio_pci": "",
"virtio_pci_legacy_dev": "",
"virtio_pci_modern_dev": "",
}

node := suite.RandomDiscoveredNodeInternalIP()

ctx := client.WithNode(suite.ctx, node)

fileReader, err := suite.Client.Read(ctx, "/proc/modules")
defer func() {
err = fileReader.Close()
}()

suite.Require().NoError(err)

scanner := bufio.NewScanner(fileReader)

var loadedModules []string

for scanner.Scan() {
loadedModules = append(loadedModules, strings.Split(scanner.Text(), " ")[0])
}
suite.Require().NoError(scanner.Err())

for _, expectedModule := range expectedVirtIOModules {
suite.Require().Contains(loadedModules, expectedModule, "expected module %s to be loaded", expectedModule)
}
suite.AssertExpectedModules(suite.ctx, node, expectedVirtIOModules)
}

// TestCommonDefaults verifies that the default ulimits are set.
func (suite *CommonSuite) TestCommonDefaults() {
if provisioner := os.Getenv("PROVISIONER"); provisioner == "docker" {
suite.T().Skip("skipping ulimits test since provisioner is docker")
}

expectedUlimit := `
core file size (blocks) (-c) 0
data seg size (kb) (-d) unlimited
Expand Down
Loading

0 comments on commit 6778ded

Please sign in to comment.