MGMT-19860: Revert remove dependencies from OpenShift AI operator #7342

Open · wants to merge 2 commits into master
11 changes: 11 additions & 0 deletions internal/operators/openshiftai/openshift_ai_config.go
@@ -7,9 +7,20 @@ type Config struct {
MinWorkerNodes int64 `envconfig:"OPENSHIFT_AI_MIN_WORKER_NODES" default:"2"`
MinWorkerMemoryGiB int64 `envconfig:"OPENSHIFT_AI_MIN_WORKER_MEMORY_GIB" default:"32"`
MinWorkerCPUCores int64 `envconfig:"OPENSHIFT_AI_MIN_WORKER_CPU_CORES" default:"8"`
RequireGPU bool `envconfig:"OPENSHIFT_AI_REQUIRE_GPU" default:"true"`

// TODO: Currently we use the controller image to run the setup tools because all we need is the shell and the
// `oc` command, and that way we don't need an additional image. But in the future we will probably want to have
// a separate image that contains the things that we need to run these setup jobs.
ControllerImage string `envconfig:"CONTROLLER_IMAGE" default:"quay.io/edge-infrastructure/assisted-installer-controller:latest"`

// SupportedGPUs is a comma-separated list of vendor identifiers of supported GPUs. For example, to enable
// support for NVIDIA and Virtio GPUs the value should be `10de,1af4`. By default only NVIDIA GPUs are
// supported.
SupportedGPUs []string `envconfig:"OPENSHIFT_AI_SUPPORTED_GPUS" default:"10de"`

// SupportUI_2_37 indicates whether the operator should support the old 2.37 version of the UI,
// which doesn't understand operator bundles yet. When this is set to true the dependencies of the operator
// will be installed automatically.
SupportUI_2_37 bool `envconfig:"OPENSHIFT_AI_SUPPORT_UI_2_37" default:"true"`
}
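
For reference, these fields are read through `envconfig` when the operator is constructed. Below is a minimal, self-contained sketch of how the new `OPENSHIFT_AI_SUPPORTED_GPUS` and `OPENSHIFT_AI_SUPPORT_UI_2_37` variables are parsed; the struct only mirrors the relevant fields and is an illustration, not code from this change:

```go
package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

// Config mirrors the two fields added above, with the same tags and defaults.
type Config struct {
	SupportedGPUs  []string `envconfig:"OPENSHIFT_AI_SUPPORTED_GPUS" default:"10de"`
	SupportUI_2_37 bool     `envconfig:"OPENSHIFT_AI_SUPPORT_UI_2_37" default:"true"`
}

func main() {
	var config Config
	// envconfig splits the comma-separated list into a slice and falls back
	// to the defaults when the variables are not set.
	if err := envconfig.Process("", &config); err != nil {
		log.Fatal(err)
	}
	fmt.Println(config.SupportedGPUs, config.SupportUI_2_37)
}
```

With no environment set this prints `[10de] true`; setting `OPENSHIFT_AI_SUPPORT_UI_2_37=false` is what the deployment template change below wires through to disable the automatic dependencies.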
84 changes: 83 additions & 1 deletion internal/operators/openshiftai/openshift_ai_operator.go
@@ -3,13 +3,20 @@ package openshiftai
import (
"context"
"fmt"
"strings"
"text/template"

"github.com/kelseyhightower/envconfig"
"github.com/lib/pq"
"github.com/openshift/assisted-service/internal/common"
"github.com/openshift/assisted-service/internal/operators/api"
"github.com/openshift/assisted-service/internal/operators/authorino"
operatorscommon "github.com/openshift/assisted-service/internal/operators/common"
"github.com/openshift/assisted-service/internal/operators/nvidiagpu"
"github.com/openshift/assisted-service/internal/operators/odf"
"github.com/openshift/assisted-service/internal/operators/pipelines"
"github.com/openshift/assisted-service/internal/operators/serverless"
"github.com/openshift/assisted-service/internal/operators/servicemesh"
"github.com/openshift/assisted-service/internal/templating"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/conversions"
@@ -63,7 +70,21 @@ func (o *operator) GetFullName() string {

// GetDependencies provides a list of dependencies of the Operator
func (o *operator) GetDependencies(c *common.Cluster) (result []string, err error) {
return nil, nil
// TODO: We should probably add the node feature discovery and NVIDIA GPU operators only if there is at least one
// NVIDIA GPU in the cluster, but unfortunately this is calculated and saved to the database only when the
// cluster is created or updated via the API, and at that point we don't have the host inventory yet to
// determine if there are NVIDIA GPUs.
if o.config.SupportUI_2_37 {
result = []string{
authorino.Operator.Name,
nvidiagpu.Operator.Name,
odf.Operator.Name,
pipelines.Operator.Name,
serverless.Operator.Name,
servicemesh.Operator.Name,
}
}
return result, nil
}

// GetClusterValidationID returns cluster validation ID for the operator.
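
For context on why returning these names is sufficient: the service expands an operator's `GetDependencies` result into additional operators to install, which is what restores the automatic installation the 2.37 UI expects. A hypothetical, simplified resolver illustrating the idea (the function and its parameters are made up for illustration and are not the actual assisted-service manager code):

```go
// resolveDependencies expands a list of requested operator names into the
// full set to install, walking dependencies transitively. deps maps an
// operator name to the names of its direct dependencies.
func resolveDependencies(requested []string, deps func(name string) ([]string, error)) ([]string, error) {
	seen := map[string]bool{}
	var order []string
	var visit func(name string) error
	visit = func(name string) error {
		if seen[name] {
			return nil
		}
		seen[name] = true
		children, err := deps(name)
		if err != nil {
			return err
		}
		for _, child := range children {
			if err := visit(child); err != nil {
				return err
			}
		}
		// Append after the dependencies so they come first in the result.
		order = append(order, name)
		return nil
	}
	for _, name := range requested {
		if err := visit(name); err != nil {
			return nil, err
		}
	}
	return order, nil
}
```

With `SupportUI_2_37` enabled, requesting only the OpenShift AI operator through such a resolver would pull in authorino, the NVIDIA GPU operator, ODF, pipelines, serverless, and service mesh automatically.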
Expand Down Expand Up @@ -93,6 +114,33 @@ func (o *operator) ValidateCluster(ctx context.Context, cluster *common.Cluster)
)
}

// Check that there is at least one supported GPU:
if o.config.SupportUI_2_37 && o.config.RequireGPU {
var gpuList []*models.Gpu
gpuList, err = o.gpusInCluster(cluster)
if err != nil {
return
}
var supportedGpuCount int64
for _, gpu := range gpuList {
var isSupported bool
isSupported, err = o.isSupportedGpu(gpu)
if err != nil {
return
}
if isSupported {
supportedGpuCount++
}
}
if supportedGpuCount == 0 {
result.Reasons = append(
result.Reasons,
"OpenShift AI requires at least one supported GPU, but there is none in the "+
"discovered hosts.",
)
}
}

if len(result.Reasons) > 0 {
result.Status = api.Failure
} else {
@@ -101,6 +149,40 @@
return
}

func (o *operator) gpusInCluster(cluster *common.Cluster) (result []*models.Gpu, err error) {
for _, host := range cluster.Hosts {
var gpus []*models.Gpu
gpus, err = o.gpusInHost(host)
if err != nil {
return
}
result = append(result, gpus...)
}
return
}

func (o *operator) gpusInHost(host *models.Host) (result []*models.Gpu, err error) {
if host.Inventory == "" {
return
}
inventory, err := common.UnmarshalInventory(host.Inventory)
if err != nil {
return
}
result = inventory.Gpus
return
}

func (o *operator) isSupportedGpu(gpu *models.Gpu) (result bool, err error) {
for _, supportedGpu := range o.config.SupportedGPUs {
if strings.EqualFold(gpu.VendorID, supportedGpu) {
result = true
return
}
}
return
}

// ValidateHost returns validationResult based on node type requirements such as memory and CPU.
func (o *operator) ValidateHost(ctx context.Context, cluster *common.Cluster, host *models.Host,
_ *models.ClusterHostRequirementsDetails) (result api.ValidationResult, err error) {
126 changes: 126 additions & 0 deletions internal/operators/openshiftai/openshift_ai_operator_test.go
@@ -2,12 +2,19 @@ package openshiftai

import (
"context"
"os"

. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
"github.com/openshift/assisted-service/internal/common"
"github.com/openshift/assisted-service/internal/operators/api"
"github.com/openshift/assisted-service/internal/operators/authorino"
"github.com/openshift/assisted-service/internal/operators/nvidiagpu"
"github.com/openshift/assisted-service/internal/operators/odf"
"github.com/openshift/assisted-service/internal/operators/pipelines"
"github.com/openshift/assisted-service/internal/operators/serverless"
"github.com/openshift/assisted-service/internal/operators/servicemesh"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/conversions"
)
@@ -96,4 +103,123 @@ var _ = Describe("Operator", func() {
},
),
)

DescribeTable(
"Is supported GPU",
func(env map[string]string, gpu *models.Gpu, expected bool) {
// Set the environment variables and restore the previous values when finished:
for name, value := range env {
oldValue, present := os.LookupEnv(name)
if present {
defer func() {
err := os.Setenv(name, oldValue)
Expect(err).ToNot(HaveOccurred())
}()
} else {
defer func() {
err := os.Unsetenv(name)
Expect(err).ToNot(HaveOccurred())
}()
}
err := os.Setenv(name, value)
Expect(err).ToNot(HaveOccurred())
}

// Create the operator:
operator = NewOpenShiftAIOperator(common.GetTestLog())

// Run the check:
actual, err := operator.isSupportedGpu(gpu)
Expect(err).ToNot(HaveOccurred())
Expect(actual).To(Equal(expected))
},
Entry(
"NVIDIA is supported by default",
nil,
&models.Gpu{
VendorID: "10de",
},
true,
),
Entry(
"Virtio isn't supported by default",
nil,
&models.Gpu{
VendorID: "1af4",
},
false,
),
Entry(
"Virtio is supported if explicitly added",
map[string]string{
"OPENSHIFT_AI_SUPPORTED_GPUS": "10de,1af4",
},
&models.Gpu{
VendorID: "1af4",
},
true,
),
Entry(
"Order isn't relevant",
map[string]string{
"OPENSHIFT_AI_SUPPORTED_GPUS": "1af4,10de",
},
&models.Gpu{
VendorID: "10de",
},
true,
),
Entry(
"Case isn't relevant",
nil,
&models.Gpu{
VendorID: "10DE",
},
true,
),
)

Describe("Get dependencies", func() {
const flagEnvVar = "OPENSHIFT_AI_SUPPORT_UI_2_37"

It("Returns the dependencies when compatibility with old UI is explicitly enabled", func() {
os.Setenv(flagEnvVar, "true")
defer os.Unsetenv(flagEnvVar)
operator := NewOpenShiftAIOperator(common.GetTestLog())
dependencies, err := operator.GetDependencies(nil)
Expect(err).ToNot(HaveOccurred())
Expect(dependencies).To(ConsistOf(
authorino.Operator.Name,
nvidiagpu.Operator.Name,
odf.Operator.Name,
pipelines.Operator.Name,
serverless.Operator.Name,
servicemesh.Operator.Name,
))
})

It("Returns the dependencies when compatibility with the old UI isn't explicitly enabled or disabled", func() {
operator := NewOpenShiftAIOperator(common.GetTestLog())
dependencies, err := operator.GetDependencies(nil)
Expect(err).ToNot(HaveOccurred())
Expect(dependencies).To(ConsistOf(
authorino.Operator.Name,
nvidiagpu.Operator.Name,
odf.Operator.Name,
pipelines.Operator.Name,
serverless.Operator.Name,
servicemesh.Operator.Name,
))
})

It("Doesn't return the dependencies when compatibility with the old UI is explicitly disabled", func() {
os.Setenv(flagEnvVar, "false")
defer os.Unsetenv(flagEnvVar)
operator := NewOpenShiftAIOperator(common.GetTestLog())
dependencies, err := operator.GetDependencies(nil)
Expect(err).ToNot(HaveOccurred())
Expect(dependencies).To(BeEmpty())
})

})
})
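
A hypothetical additional test, sketching the path from host inventory to the supported-GPU decision used by `ValidateCluster`. It assumes that `common.UnmarshalInventory` accepts JSON produced by marshalling `models.Inventory` (an `encoding/json` import would be needed in the test file); it is an illustration, not part of this change:

```go
It("Detects a supported NVIDIA GPU in the host inventory", func() {
	// Build a host whose inventory reports a single NVIDIA GPU (vendor ID 10de).
	inventory, err := json.Marshal(&models.Inventory{
		Gpus: []*models.Gpu{{VendorID: "10de"}},
	})
	Expect(err).ToNot(HaveOccurred())
	host := &models.Host{Inventory: string(inventory)}

	operator := NewOpenShiftAIOperator(common.GetTestLog())
	gpus, err := operator.gpusInHost(host)
	Expect(err).ToNot(HaveOccurred())
	Expect(gpus).To(HaveLen(1))

	supported, err := operator.isSupportedGpu(gpus[0])
	Expect(err).ToNot(HaveOccurred())
	Expect(supported).To(BeTrue())
})
```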
4 changes: 4 additions & 0 deletions openshift/template.yaml
@@ -226,6 +226,8 @@ parameters:
value: assisted-service-envoy-config
- name: OPENSHIFT_AI_SUPPORTED_GPUS
value: "10de"
- name: OPENSHIFT_AI_SUPPORT_UI_2_37
value: "true"
apiVersion: v1
kind: Template
metadata:
@@ -504,6 +506,8 @@
value: ${IGNORED_OPENSHIFT_VERSIONS}
- name: OPENSHIFT_AI_SUPPORTED_GPUS
value: ${OPENSHIFT_AI_SUPPORTED_GPUS}
- name: OPENSHIFT_AI_SUPPORT_UI_2_37
value: ${OPENSHIFT_AI_SUPPORT_UI_2_37}
volumeMounts:
- name: route53-creds
mountPath: "/etc/.aws"