kubeflow · briangallagher · Jun 10, 2025 · Jun 13, 2025 · tenzen-y · Jun 20, 2025
diff --git a/.github/workflows/test-e2e.yaml b/.github/workflows/test-e2e.yaml
@@ -0,0 +1,91 @@
+name: E2E Test # Runs the example notebooks against a Kind cluster, once per Kubernetes version in the matrix.
+
+on:
+  pull_request
+
+jobs:
+  e2e-test:
+    name: E2E Test
+    runs-on:
+      labels: ubuntu-latest-16-cores # the runner is selected by this label
+
+    strategy:
+      fail-fast: false # a failure for one Kubernetes version does not cancel the other matrix jobs
+      matrix: # versions are quoted so that YAML keeps them as strings
+        kubernetes-version: ["1.29.14", "1.30.0", "1.31.0", "1.32.3"]
+
+    steps:
+      - name: Checkout Kubeflow SDK repository
+        uses: actions/checkout@v4
+
+      # Checkout the Kubeflow Trainer repository: it provides the Go and KIND versions (go.mod), the controller image sources, the manifests and the example notebooks used below.
+      - name: Checkout Kubeflow Trainer repo for Go version
+        uses: actions/checkout@v4
+        with:
+          repository: kubeflow/trainer
+          ref: master # follows the tip of the Trainer master branch rather than a pinned release
+          path: trainer # checked out to ./trainer inside the SDK workspace
+
+      # Read the Go toolchain version and the sigs.k8s.io/kind module version from the Trainer
+      # repo's go.mod and export both through GITHUB_ENV so that subsequent steps can use them.
+      - name: Get Go and KIND versions from trainer repo
+        run: |
+          set -o pipefail # the default step shell is "bash -e" without pipefail, so a grep that matches nothing would be masked by awk/tr
+          echo "Extracting Go version from trainer/go.mod..."
+          GO_VERSION=$(grep '^go ' ./trainer/go.mod | awk '{print $2}' | tr -d '\n')
+          echo "Detected Go version from trainer/go.mod: $GO_VERSION"
+          echo "GO_VERSION=$GO_VERSION" >> "$GITHUB_ENV"
+
+          echo "Extracting KIND version from trainer/go.mod..."
+          KIND_VERSION=$(grep 'sigs.k8s.io/kind' ./trainer/go.mod | awk '{print $2}' | tr -d '\n')
+          echo "Detected KIND version from trainer/go.mod: $KIND_VERSION"
+          echo "KIND_VERSION=$KIND_VERSION" >> "$GITHUB_ENV"
+
+      - name: Setup Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ env.GO_VERSION }} # GO_VERSION was exported to GITHUB_ENV by the previous step
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11" # quoted so YAML keeps it a string (an unquoted 3.10 would be read as the float 3.1)
+
+      - name: Install Python dependencies
+        run: |
+          echo "Installing Papermill and Jupyter"
+          pip install papermill==2.6.0 jupyter==1.1.1 ipykernel==6.29.5 # pinned versions of the notebook tooling
+
+          echo "Installing Kubeflow SDK locally from ./python"
+          # ./python is resolved against the SDK checkout at the workspace root, not the ./trainer checkout.
+          pip install ./python
+        working-directory: . # workspace root, where the SDK repository was checked out
+
+      - name: Setup cluster # the make target installs Kind, then runs hack/e2e-setup-cluster.sh
+        run: |
+          make test-e2e-setup-cluster \
+            K8S_VERSION=${{ matrix.kubernetes-version }} \
+            KIND_VERSION=${{ env.KIND_VERSION }}
+        working-directory: . # the Makefile lives at the root of the SDK repo
+
+      - name: Run e2e test for example Notebooks
+        run: |
+          mkdir -p artifacts/notebooks # output directory; its contents are uploaded as an artifact by the next step
+          # Run each example notebook from the Trainer checkout; the executed copy is named after the Kubernetes version, and PAPERMILL_TIMEOUT is forwarded to papermill's --execution-timeout.
+          make test-e2e-notebook \
+            NOTEBOOK_INPUT=./trainer/examples/pytorch/image-classification/mnist.ipynb \
+            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb \
+            PAPERMILL_TIMEOUT=900
+          make test-e2e-notebook \
+            NOTEBOOK_INPUT=./trainer/examples/pytorch/question-answering/fine-tune-distilbert.ipynb \
+            NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb \
+            PAPERMILL_TIMEOUT=900
+        working-directory: . # the Makefile lives at the root of the SDK repo
+
+      - name: Upload Artifacts to GitHub
+        uses: actions/upload-artifact@v4
+        if: always() # Ensure artifacts are uploaded even if previous steps fail
+        with:
+          name: ${{ matrix.kubernetes-version }} # one artifact per matrix job, named after its Kubernetes version
+          path: ./artifacts/notebooks/* # executed notebooks written by the previous step (relative to the workspace root)
+          retention-days: 1 # keep the executed notebooks for one day only
diff --git a/Makefile b/Makefile
@@ -0,0 +1,33 @@
+# Resolve the Go install directory: use GOBIN when Go reports one, otherwise fall back to GOPATH/bin.
+ifneq (,$(shell go env GOBIN))
+GOBIN=$(shell go env GOBIN)
+else
+GOBIN=$(shell go env GOPATH)/bin
+endif
+
+PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
+LOCALBIN ?= $(PROJECT_DIR)/bin
+
+# Versions of the tools used by the e2e targets (overridable)
+KIND_VERSION ?= v0.27.0
+K8S_VERSION ?= 1.32.0
+
+# Locations of the tool binaries (overridable)
+KIND ?= $(LOCALBIN)/kind
+
+# Input/output notebook paths and the Papermill timeout; "?=" lets the environment or the make command line override them, like the tool versions.
+NOTEBOOK_INPUT ?= $(PROJECT_DIR)/examples/training/pytorch/image-classification/mnist.ipynb
+NOTEBOOK_OUTPUT ?= $(PROJECT_DIR)/artifacts/notebooks/trainer_output.ipynb
+PAPERMILL_TIMEOUT ?= 900
+
+.PHONY: kind
+kind: ## Install the Kind binary at KIND_VERSION into LOCALBIN (runs "go install" on every invocation).
+	GOBIN="$(LOCALBIN)" go install "sigs.k8s.io/kind@$(KIND_VERSION)"
+
+.PHONY: test-e2e-notebook
+test-e2e-notebook: ## Run Jupyter Notebook with Papermill (values are quoted so that paths containing spaces survive).
+	NOTEBOOK_INPUT="$(NOTEBOOK_INPUT)" NOTEBOOK_OUTPUT="$(NOTEBOOK_OUTPUT)" PAPERMILL_TIMEOUT="$(PAPERMILL_TIMEOUT)" ./hack/e2e-run-notebook.sh
+
+.PHONY: test-e2e-setup-cluster
+test-e2e-setup-cluster: kind ## Setup Kind cluster for e2e test (installs Kind first).
+	KIND="$(KIND)" K8S_VERSION="$(K8S_VERSION)" ./hack/e2e-setup-cluster.sh
diff --git a/hack/e2e-run-notebook.sh b/hack/e2e-run-notebook.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script runs a Jupyter Notebook with Papermill and then prints Kubernetes diagnostics for the run.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -x
+
+if [ -z "${NOTEBOOK_INPUT:-}" ]; then # ":-" lets the check run under nounset instead of aborting with "unbound variable"
+    echo "NOTEBOOK_INPUT env variable must be set to run this script."
+    exit 1
+fi
+
+if [ -z "${NOTEBOOK_OUTPUT:-}" ]; then # unset and empty are both rejected with the message below
+    echo "NOTEBOOK_OUTPUT env variable must be set to run this script."
+    exit 1
+fi
+
+if [ -z "${PAPERMILL_TIMEOUT:-}" ]; then # passed to papermill's --execution-timeout further down
+    echo "PAPERMILL_TIMEOUT env variable must be set to run this script."
+    exit 1
+fi
+
+print_results() { # Dump Pod/TrainJob state and logs for debugging a notebook run.
+    kubectl get pods
+    kubectl describe pod
+    kubectl describe trainjob
+    kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer # pods labelled "trainer" in kubeflow-system (presumably the controller manager)
+    kubectl logs -l jobset.sigs.k8s.io/replicatedjob-name=trainer-node,batch.kubernetes.io/job-completion-index=0 --tail -1 # complete log (--tail -1) of the index-0 trainer-node pods
+    kubectl wait trainjob --for=condition=Complete --all --timeout 3s # last command, so its status is the function's return value when called inside an &&/|| list
+}
+
+(papermill "${NOTEBOOK_INPUT}" "${NOTEBOOK_OUTPUT}" --execution-timeout "${PAPERMILL_TIMEOUT}" && print_results) || # the left side also fails when print_results' final kubectl wait fails after a successful notebook run
+    (print_results && exit 1) # failure path: print diagnostics, then leave with a non-zero status (print_results' own status if it fails)
diff --git a/hack/e2e-setup-cluster.sh b/hack/e2e-setup-cluster.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+
+# Copyright 2025 The Kubeflow Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script sets up a Kind cluster with the Kubeflow Trainer control plane and runtimes for e2e tests.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -x
+
+# Configure variables.
+# KIND and K8S_VERSION can be overridden from the environment (the Makefile passes both); the values below are the fallbacks.
+KIND=${KIND:-./bin/kind}
+K8S_VERSION=${K8S_VERSION:-1.32.0}
+KIND_NODE_VERSION=kindest/node:v${K8S_VERSION}
+NAMESPACE="kubeflow-system"
+TIMEOUT="5m"
+
+# Path to the already cloned Kubeflow Trainer repository.
+# It is relative to the directory this script is executed from (the SDK repo root).
+TRAINER_REPO_PATH="./trainer"
+
+# Kubeflow Trainer images.
+# TODO (andreyvelich): Support initializers images.
+CONTROLLER_MANAGER_CI_IMAGE_NAME="ghcr.io/kubeflow/trainer/trainer-controller-manager"
+CONTROLLER_MANAGER_CI_IMAGE_TAG="test"
+CONTROLLER_MANAGER_CI_IMAGE="${CONTROLLER_MANAGER_CI_IMAGE_NAME}:${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
+
+echo "Build Kubeflow Trainer images"
+# Build the controller-manager image from the Trainer checkout: the checkout root is the Docker
+# build context and the Dockerfile is the one under cmd/trainer-controller-manager.
+docker build "${TRAINER_REPO_PATH}" -f "${TRAINER_REPO_PATH}"/cmd/trainer-controller-manager/Dockerfile -t "${CONTROLLER_MANAGER_CI_IMAGE}"
+
+echo "Create Kind cluster and load Kubeflow Trainer images"
+"${KIND}" create cluster --image "${KIND_NODE_VERSION}" # node image is kindest/node at the requested Kubernetes version
+"${KIND}" load docker-image "${CONTROLLER_MANAGER_CI_IMAGE}" # make the locally built image available inside the cluster
+
+echo "Deploy Kubeflow Trainer control plane and Jobset controller"
+E2E_MANIFESTS_DIR="artifacts/e2e/manifests" # three levels below the repo root; the "../../../trainer" resource path below relies on this and on TRAINER_REPO_PATH being ./trainer
+mkdir -p "${E2E_MANIFESTS_DIR}"
+cat <<EOF > "${E2E_MANIFESTS_DIR}/kustomization.yaml"
+  apiVersion: kustomize.config.k8s.io/v1beta1
+  kind: Kustomization
+  resources:
+  - "../../../trainer/manifests/overlays/manager"
+  images:
+  - name: "${CONTROLLER_MANAGER_CI_IMAGE_NAME}"
+    newTag: "${CONTROLLER_MANAGER_CI_IMAGE_TAG}"
+EOF
+
+# Server-side apply the generated kustomization, which retags the manager image to the one built above.
+# `E2E_MANIFESTS_DIR` is relative to the directory the script is run from (the SDK repo root).
+kubectl apply --server-side -k "${E2E_MANIFESTS_DIR}"
+
+# We should wait until Deployment is in Ready status.
+echo "Wait for Kubeflow Trainer to be ready"
+(kubectl wait deploy/kubeflow-trainer-controller-manager --for=condition=available -n "${NAMESPACE}" --timeout "${TIMEOUT}" &&
+  kubectl wait pods --for=condition=ready -n "${NAMESPACE}" --timeout "${TIMEOUT}" --all) ||
+  (
+    echo "Failed to wait until Kubeflow Trainer is ready" &&
+      kubectl get pods -n "${NAMESPACE}" &&
+      kubectl describe pods -n "${NAMESPACE}" &&
+      exit 1
+  )
+
+print_cluster_info() { # Print kubectl/cluster versions, cluster endpoints, nodes and the Pods in ${NAMESPACE}.
+  kubectl version
+  kubectl cluster-info
+  kubectl get nodes
+  kubectl get pods -n "${NAMESPACE}"
+  kubectl describe pod -n "${NAMESPACE}" # last command: its status is the return value when errexit is suppressed by the caller
+}
+
+# TODO (andreyvelich): Currently, we print manager logs due to flaky test.
+echo "Deploy Kubeflow Trainer runtimes"
+# Apply the runtimes overlay from the Trainer checkout; if that fails, print the manager logs and cluster info, then exit non-zero.
+kubectl apply --server-side -k "${TRAINER_REPO_PATH}/manifests/overlays/runtimes" || (
+  kubectl logs -n "${NAMESPACE}" -l app.kubernetes.io/name=trainer &&
+    print_cluster_info && # runs inside an && list, so errexit does not stop it at the first failing kubectl call
+    exit 1
+)
+
+# TODO (andreyvelich): Discuss how we want to pre-load runtime images to the Kind cluster.
+TORCH_RUNTIME_IMAGE=pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+docker pull "${TORCH_RUNTIME_IMAGE}" # pull into the local Docker daemon first so that Kind can load it
+"${KIND}" load docker-image "${TORCH_RUNTIME_IMAGE}" # pre-load the runtime image into the Kind cluster
+
+print_cluster_info # final status dump; called directly, so errexit applies to each kubectl command in it