MGMT-19860: Revert remove dependencies from OpenShift AI operator #7342

Open · wants to merge 2 commits into master
11 changes: 11 additions & 0 deletions internal/operators/openshiftai/openshift_ai_config.go
@@ -7,9 +7,20 @@ type Config struct {
MinWorkerNodes int64 `envconfig:"OPENSHIFT_AI_MIN_WORKER_NODES" default:"2"`
MinWorkerMemoryGiB int64 `envconfig:"OPENSHIFT_AI_MIN_WORKER_MEMORY_GIB" default:"32"`
MinWorkerCPUCores int64 `envconfig:"OPENSHIFT_AI_MIN_WORKER_CPU_CORES" default:"8"`
RequireGPU bool `envconfig:"OPENSHIFT_AI_REQUIRE_GPU" default:"true"`

// TODO: Currently we use the controller image to run the setup tools because all we need is the shell and the
// `oc` command, and that way we don't need an additional image. But in the future we will probably want to have
// a separate image that contains the things that we need to run these setup jobs.
ControllerImage string `envconfig:"CONTROLLER_IMAGE" default:"quay.io/edge-infrastructure/assisted-installer-controller:latest"`

// SupportedGPUs is a comma-separated list of vendor identifiers of supported GPUs. For example, to enable
// support for NVIDIA and Virtio GPUs the value should be `10de,1af4`. By default only NVIDIA GPUs are
// supported.
SupportedGPUs []string `envconfig:"OPENSHIFT_AI_SUPPORTED_GPUS" default:"10de"`

// SupportUI_2_37 indicates whether the operator should support the old 2.37 version of the UI,
// which doesn't understand operator bundles yet. When this is set to true the dependencies of the operator
// will be installed automatically.
SupportUI_2_37 bool `envconfig:"OPENSHIFT_AI_SUPPORT_UI_2_37" default:"true"`
}
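
For reference, these fields are read through `envconfig` when the operator is constructed. Below is a minimal, self-contained sketch of how the new `OPENSHIFT_AI_SUPPORTED_GPUS` and `OPENSHIFT_AI_SUPPORT_UI_2_37` variables are parsed; the struct only mirrors the relevant fields and is an illustration, not code from this change:

```go
package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

// Config mirrors the two fields added above, with the same tags and defaults.
type Config struct {
	SupportedGPUs  []string `envconfig:"OPENSHIFT_AI_SUPPORTED_GPUS" default:"10de"`
	SupportUI_2_37 bool     `envconfig:"OPENSHIFT_AI_SUPPORT_UI_2_37" default:"true"`
}

func main() {
	var config Config
	// envconfig splits the comma-separated list into a slice and falls back
	// to the defaults when the variables are not set.
	if err := envconfig.Process("", &config); err != nil {
		log.Fatal(err)
	}
	fmt.Println(config.SupportedGPUs, config.SupportUI_2_37)
}
```

With no environment set this prints `[10de] true`; setting `OPENSHIFT_AI_SUPPORT_UI_2_37=false` is what the deployment template change below wires through to disable the automatic dependencies.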
84 changes: 83 additions & 1 deletion internal/operators/openshiftai/openshift_ai_operator.go
@@ -3,13 +3,20 @@ package openshiftai
import (
"context"
"fmt"
"strings"
"text/template"

"github.com/kelseyhightower/envconfig"
"github.com/lib/pq"
"github.com/openshift/assisted-service/internal/common"
"github.com/openshift/assisted-service/internal/operators/api"
"github.com/openshift/assisted-service/internal/operators/authorino"
operatorscommon "github.com/openshift/assisted-service/internal/operators/common"
"github.com/openshift/assisted-service/internal/operators/nvidiagpu"
"github.com/openshift/assisted-service/internal/operators/odf"
"github.com/openshift/assisted-service/internal/operators/pipelines"
"github.com/openshift/assisted-service/internal/operators/serverless"
"github.com/openshift/assisted-service/internal/operators/servicemesh"
"github.com/openshift/assisted-service/internal/templating"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/conversions"
@@ -63,7 +70,21 @@ func (o *operator) GetFullName() string {

// GetDependencies provides a list of dependencies of the Operator
func (o *operator) GetDependencies(c *common.Cluster) (result []string, err error) {
return nil, nil
// TODO: We should probably add the node feature discovery and NVIDIA GPU operators only if there is at least one
// NVIDIA GPU in the cluster, but unfortunately this is calculated and saved to the database only when the
// cluster is created or updated via the API, and at that point we don't have the host inventory yet to
// determine if there are NVIDIA GPUs.
if o.config.SupportUI_2_37 {
result = []string{
authorino.Operator.Name,
nvidiagpu.Operator.Name,
odf.Operator.Name,
pipelines.Operator.Name,
serverless.Operator.Name,
servicemesh.Operator.Name,
}
}
return result, nil
}

// GetClusterValidationID returns cluster validation ID for the operator.
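
For context on why returning these names is sufficient: the service expands an operator's `GetDependencies` result into additional operators to install, which is what restores the automatic installation the 2.37 UI expects. A hypothetical, simplified resolver illustrating the idea (the function and its parameters are made up for illustration and are not the actual assisted-service manager code):

```go
// resolveDependencies expands a list of requested operator names into the
// full set to install, walking dependencies transitively. deps maps an
// operator name to the names of its direct dependencies.
func resolveDependencies(requested []string, deps func(name string) ([]string, error)) ([]string, error) {
	seen := map[string]bool{}
	var order []string
	var visit func(name string) error
	visit = func(name string) error {
		if seen[name] {
			return nil
		}
		seen[name] = true
		children, err := deps(name)
		if err != nil {
			return err
		}
		for _, child := range children {
			if err := visit(child); err != nil {
				return err
			}
		}
		// Append after the dependencies so they come first in the result.
		order = append(order, name)
		return nil
	}
	for _, name := range requested {
		if err := visit(name); err != nil {
			return nil, err
		}
	}
	return order, nil
}
```

With `SupportUI_2_37` enabled, requesting only the OpenShift AI operator through such a resolver would pull in authorino, the NVIDIA GPU operator, ODF, pipelines, serverless, and service mesh automatically.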
Expand Down Expand Up @@ -93,6 +114,33 @@ func (o *operator) ValidateCluster(ctx context.Context, cluster *common.Cluster)
)
}

// Check that there is at least one supported GPU:
if o.config.SupportUI_2_37 && o.config.RequireGPU {
var gpuList []*models.Gpu
gpuList, err = o.gpusInCluster(cluster)
if err != nil {
return
}
var supportedGpuCount int64
for _, gpu := range gpuList {
var isSupported bool
isSupported, err = o.isSupportedGpu(gpu)
if err != nil {
return
}
if isSupported {
supportedGpuCount++
}
}
if supportedGpuCount == 0 {
result.Reasons = append(
result.Reasons,
"OpenShift AI requires at least one supported GPU, but there is none in the "+
"discovered hosts.",
)
}
}

if len(result.Reasons) > 0 {
result.Status = api.Failure
} else {
@@ -101,6 +149,40 @@
return
}

func (o *operator) gpusInCluster(cluster *common.Cluster) (result []*models.Gpu, err error) {
for _, host := range cluster.Hosts {
var gpus []*models.Gpu
gpus, err = o.gpusInHost(host)
if err != nil {
return
}
result = append(result, gpus...)
}
return
}

func (o *operator) gpusInHost(host *models.Host) (result []*models.Gpu, err error) {
if host.Inventory == "" {
return
}
inventory, err := common.UnmarshalInventory(host.Inventory)
if err != nil {
return
}
result = inventory.Gpus
return
}

func (o *operator) isSupportedGpu(gpu *models.Gpu) (result bool, err error) {
for _, supportedGpu := range o.config.SupportedGPUs {
if strings.EqualFold(gpu.VendorID, supportedGpu) {
result = true
return
}
}
return
}

// ValidateHost returns validationResult based on node type requirements such as memory and CPU.
func (o *operator) ValidateHost(ctx context.Context, cluster *common.Cluster, host *models.Host,
_ *models.ClusterHostRequirementsDetails) (result api.ValidationResult, err error) {
126 changes: 126 additions & 0 deletions internal/operators/openshiftai/openshift_ai_operator_test.go
@@ -2,12 +2,19 @@ package openshiftai

import (
"context"
"os"

. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
"github.com/openshift/assisted-service/internal/common"
"github.com/openshift/assisted-service/internal/operators/api"
"github.com/openshift/assisted-service/internal/operators/authorino"
"github.com/openshift/assisted-service/internal/operators/nvidiagpu"
"github.com/openshift/assisted-service/internal/operators/odf"
"github.com/openshift/assisted-service/internal/operators/pipelines"
"github.com/openshift/assisted-service/internal/operators/serverless"
"github.com/openshift/assisted-service/internal/operators/servicemesh"
"github.com/openshift/assisted-service/models"
"github.com/openshift/assisted-service/pkg/conversions"
)
@@ -96,4 +103,123 @@ var _ = Describe("Operator", func() {
},
),
)

DescribeTable(
"Is supported GPU",
func(env map[string]string, gpu *models.Gpu, expected bool) {
// Set the environment variables and restore the previous values when finished:
for name, value := range env {
oldValue, present := os.LookupEnv(name)
if present {
defer func() {
err := os.Setenv(name, oldValue)
Expect(err).ToNot(HaveOccurred())
}()
} else {
defer func() {
err := os.Unsetenv(name)
Expect(err).ToNot(HaveOccurred())
}()
}
err := os.Setenv(name, value)
Expect(err).ToNot(HaveOccurred())
}

// Create the operator:
operator = NewOpenShiftAIOperator(common.GetTestLog())

// Run the check:
actual, err := operator.isSupportedGpu(gpu)
Expect(err).ToNot(HaveOccurred())
Expect(actual).To(Equal(expected))
},
Entry(
"NVIDIA is supported by default",
nil,
&models.Gpu{
VendorID: "10de",
},
true,
),
Entry(
"Virtio isn't supported by default",
nil,
&models.Gpu{
VendorID: "1af4",
},
false,
),
Entry(
"Virtio is supported if explicitly added",
map[string]string{
"OPENSHIFT_AI_SUPPORTED_GPUS": "10de,1af4",
},
&models.Gpu{
VendorID: "1af4",
},
true,
),
Entry(
"Order isn't relevant",
map[string]string{
"OPENSHIFT_AI_SUPPORTED_GPUS": "1af4,10de",
},
&models.Gpu{
VendorID: "10de",
},
true,
),
Entry(
"Case isn't relevant",
nil,
&models.Gpu{
VendorID: "10DE",
},
true,
),
)

Describe("Get dependencies", func() {
const flagEnvVar = "OPENSHIFT_AI_SUPPORT_UI_2_37"

It("Returns the dependencies when compatibility with old UI is explicitly enabled", func() {
os.Setenv(flagEnvVar, "true")
defer os.Unsetenv(flagEnvVar)
operator := NewOpenShiftAIOperator(common.GetTestLog())
dependencies, err := operator.GetDependencies(nil)
Expect(err).ToNot(HaveOccurred())
Expect(dependencies).To(ConsistOf(
authorino.Operator.Name,
nvidiagpu.Operator.Name,
odf.Operator.Name,
pipelines.Operator.Name,
serverless.Operator.Name,
servicemesh.Operator.Name,
))
})

It("Returns the dependencies when compatibility with the old UI isn't explicitly enabled or disabled", func() {
operator := NewOpenShiftAIOperator(common.GetTestLog())
dependencies, err := operator.GetDependencies(nil)
Expect(err).ToNot(HaveOccurred())
Expect(dependencies).To(ConsistOf(
authorino.Operator.Name,
nvidiagpu.Operator.Name,
odf.Operator.Name,
pipelines.Operator.Name,
serverless.Operator.Name,
servicemesh.Operator.Name,
))
})

It("Doesn't return the dependencies when compatibility with the old UI is explicitly disabled", func() {
os.Setenv(flagEnvVar, "false")
defer os.Unsetenv(flagEnvVar)
operator := NewOpenShiftAIOperator(common.GetTestLog())
dependencies, err := operator.GetDependencies(nil)
Expect(err).ToNot(HaveOccurred())
Expect(dependencies).To(BeEmpty())
})

})
})
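
A hypothetical additional test, sketching the path from host inventory to the supported-GPU decision used by `ValidateCluster`. It assumes that `common.UnmarshalInventory` accepts JSON produced by marshalling `models.Inventory` (an `encoding/json` import would be needed in the test file); it is an illustration, not part of this change:

```go
It("Detects a supported NVIDIA GPU in the host inventory", func() {
	// Build a host whose inventory reports a single NVIDIA GPU (vendor ID 10de).
	inventory, err := json.Marshal(&models.Inventory{
		Gpus: []*models.Gpu{{VendorID: "10de"}},
	})
	Expect(err).ToNot(HaveOccurred())
	host := &models.Host{Inventory: string(inventory)}

	operator := NewOpenShiftAIOperator(common.GetTestLog())
	gpus, err := operator.gpusInHost(host)
	Expect(err).ToNot(HaveOccurred())
	Expect(gpus).To(HaveLen(1))

	supported, err := operator.isSupportedGpu(gpus[0])
	Expect(err).ToNot(HaveOccurred())
	Expect(supported).To(BeTrue())
})
```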
4 changes: 4 additions & 0 deletions openshift/template.yaml
@@ -226,6 +226,8 @@ parameters:
value: assisted-service-envoy-config
- name: OPENSHIFT_AI_SUPPORTED_GPUS
value: "10de"
- name: OPENSHIFT_AI_SUPPORT_UI_2_37
value: "true"
apiVersion: v1
kind: Template
metadata:
@@ -504,6 +506,8 @@
value: ${IGNORED_OPENSHIFT_VERSIONS}
- name: OPENSHIFT_AI_SUPPORTED_GPUS
value: ${OPENSHIFT_AI_SUPPORTED_GPUS}
- name: OPENSHIFT_AI_SUPPORT_UI_2_37
value: ${OPENSHIFT_AI_SUPPORT_UI_2_37}
volumeMounts:
- name: route53-creds
mountPath: "/etc/.aws"