Skip to content

Commit

Permalink
RUN-7801 MIG Faker bugfix (#37)
Browse files Browse the repository at this point in the history
* .

* .

* .

* Roi's PR fixes
  • Loading branch information
gshaibi authored Mar 13, 2023
1 parent 072b7dd commit 86d71b3
Show file tree
Hide file tree
Showing 15 changed files with 174 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ const (
GpuFractionAnnotation = "gpu-fraction"
PodGroupNameAnnotation = "pod-group-name"
ReservationPodGpuIdxAnnotation = "run.ai/reserve_for_gpu_index"
MigMappingAnnotation = "run.ai/mig-mapping"

GpuGroupLabel = "runai-pod-group"
GpuGroupLabel = "runai-pod-group"
GpuProductLabel = "nvidia.com/gpu.product"
MigConfigStateLabel = "nvidia.com/mig.config.state"

ReservationNs = "runai-reservation"

Expand Down
10 changes: 10 additions & 0 deletions internal/common/kubeclient/kubeclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
// KubeClientInterface is the set of Kubernetes operations the fake GPU
// operator components depend on. It exists so that the real KubeClient can be
// swapped for KubeClientMock in tests.
type KubeClientInterface interface {
	// SetNodeLabels sets the given labels on the node.
	// NOTE(review): presumably the node named by NODE_NAME, as in
	// GetNodeLabels — confirm against the KubeClient implementation.
	SetNodeLabels(labels map[string]string) error
	// SetNodeAnnotations sets the given annotations on the node.
	SetNodeAnnotations(annotations map[string]string) error
	// GetNodeLabels returns the labels currently present on the node.
	GetNodeLabels() (map[string]string, error)
	// WatchConfigMap returns a channel of ConfigMap objects for the named
	// ConfigMap in the given namespace.
	WatchConfigMap(namespace string, configmapName string) (chan *corev1.ConfigMap, error)
	// GetConfigMap fetches the named ConfigMap from the given namespace.
	// NOTE(review): the bool presumably reports whether it was found — confirm.
	GetConfigMap(namespace string, configmapName string) (*corev1.ConfigMap, bool)
}
Expand Down Expand Up @@ -71,6 +72,15 @@ func (client *KubeClient) SetNodeAnnotations(annotations map[string]string) erro
return err
}

// GetNodeLabels fetches the Node object whose name is given by the NODE_NAME
// configuration value and returns its label map. Any error from the API call
// is returned unchanged together with a nil map.
func (client *KubeClient) GetNodeLabels() (map[string]string, error) {
	name := viper.GetString("NODE_NAME")
	nodes := client.ClientSet.CoreV1().Nodes()

	current, err := nodes.Get(context.TODO(), name, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	return current.Labels, nil
}

func (client *KubeClient) GetConfigMap(namespace string, configmapName string) (*corev1.ConfigMap, bool) {
cm, err := client.ClientSet.CoreV1().ConfigMaps(
namespace).Get(
Expand Down
5 changes: 5 additions & 0 deletions internal/common/kubeclient/kubeclient_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
// KubeClientMock is a test double for the kube client. Each field is a hook
// that the test assigns before exercising the code under test.
type KubeClientMock struct {
// NOTE(review): presumably invoked by SetNodeLabels with the labels it receives — confirm.
ActualSetNodeLabels func(labels map[string]string)
// NOTE(review): presumably invoked by SetNodeAnnotations with the annotations it receives — confirm.
ActualSetNodeAnnotations func(annotations map[string]string)
// ActualGetNodeLabels is called by GetNodeLabels; its results are returned as-is.
ActualGetNodeLabels func() (map[string]string, error)
// ActualWatchConfigMap is called by WatchConfigMap with the requested namespace and name.
ActualWatchConfigMap func(namespace string, configmapName string)
}

Expand All @@ -21,6 +22,10 @@ func (client *KubeClientMock) SetNodeLabels(labels map[string]string) error {
return nil
}

// GetNodeLabels delegates to the ActualGetNodeLabels hook and hands back
// whatever labels and error the hook produces.
func (client *KubeClientMock) GetNodeLabels() (map[string]string, error) {
	nodeLabels, err := client.ActualGetNodeLabels()
	return nodeLabels, err
}

func (client *KubeClientMock) WatchConfigMap(namespace string, configmapName string) (chan *corev1.ConfigMap, error) {
client.ActualWatchConfigMap(namespace, configmapName)
return nil, nil
Expand Down
4 changes: 0 additions & 4 deletions internal/migfaker/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,6 @@ type MigFakeApp struct {

func (app *MigFakeApp) Run() {
ContinuouslySyncMigConfigChanges(app.KubeClient.ClientSet, app.SyncableMigConfig, app.stopCh)
err := app.MigFaker.FakeNodeLabels()
if err != nil {
log.Fatalf("Error faking node labels: %e", err)
}
for {
select {
case <-app.stopCh:
Expand Down
105 changes: 85 additions & 20 deletions internal/migfaker/migfaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,14 @@ import (
"encoding/json"
"fmt"
"log"
"strconv"
"strings"

"github.com/google/uuid"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/kubeclient"
)

var fakeLables = map[string]string{
"feature.node.kubernetes.io/pci-10de.present": "true",
"node-role.kubernetes.io/runai-dynamic-mig": "true",
"node-role.kubernetes.io/runai-mig-enabled": "true",
}

var GenerateUuid = uuid.New

type MigFaker struct {
Expand All @@ -28,23 +25,33 @@ func NewMigFaker(kubeclient kubeclient.KubeClientInterface) *MigFaker {
}
}

func (faker *MigFaker) FakeNodeLabels() error {
return faker.kubeclient.SetNodeLabels(fakeLables)
}

func (faker *MigFaker) FakeMapping(config *MigConfigs) error {
mappings := map[string]map[string]string{}
for id, selectedDevice := range config.SelectedDevices {
mappings[fmt.Sprint(id)] = faker.copyMigDevices(selectedDevice)
mappings := MigMapping{}
for _, selectedDevice := range config.SelectedDevices {
if len(selectedDevice.Devices) == 0 {
continue
}

gpuIdx, err := strconv.Atoi(selectedDevice.Devices[0])
if err != nil {
return fmt.Errorf("failed to parse gpu index %s: %w", selectedDevice.Devices[0], err)
}

migDeviceMappingInfo, err := faker.getGpuMigDeviceMappingInfo(selectedDevice)
if err != nil {
return fmt.Errorf("failed to get gpu mig device mapping info: %w", err)
}

mappings[gpuIdx] = migDeviceMappingInfo
}

smappings, _ := json.Marshal(mappings)

labels := map[string]string{
"nvidia.com/mig.config.state": "success",
constants.MigConfigStateLabel: "success",
}
annotations := map[string]string{
"run.ai/mig-mapping": base64.StdEncoding.EncodeToString(smappings),
constants.MigMappingAnnotation: base64.StdEncoding.EncodeToString(smappings),
}

err := faker.kubeclient.SetNodeLabels(labels)
Expand All @@ -60,10 +67,68 @@ func (faker *MigFaker) FakeMapping(config *MigConfigs) error {
return nil
}

func (*MigFaker) copyMigDevices(devices SelectedDevices) map[string]string {
migDevices := map[string]string{}
for key := range devices.MigDevices {
migDevices[key] = fmt.Sprintf("MIG-%s", GenerateUuid())
// getGpuMigDeviceMappingInfo builds the mapping entries for every MIG device
// requested on one GPU selection. Each entry keeps the requested position,
// receives a freshly generated "MIG-<uuid>" device UUID, and carries the GPU
// instance id that corresponds to the MIG profile name on the node's GPU
// product. The first lookup failure aborts the whole conversion.
func (faker *MigFaker) getGpuMigDeviceMappingInfo(devices SelectedDevices) ([]MigDeviceMappingInfo, error) {
	product, err := faker.getGpuProduct()
	if err != nil {
		return nil, fmt.Errorf("failed to get gpu product: %w", err)
	}

	// Deliberately non-nil: an empty selection must still JSON-encode as [].
	infos := make([]MigDeviceMappingInfo, 0, len(devices.MigDevices))
	for _, requested := range devices.MigDevices {
		instanceId, lookupErr := migInstanceNameToGpuInstanceId(product, requested.Name)
		if lookupErr != nil {
			return nil, fmt.Errorf("failed to get gpu instance id: %w", lookupErr)
		}

		info := MigDeviceMappingInfo{
			Position:      requested.Position,
			DeviceUUID:    fmt.Sprintf("MIG-%s", GenerateUuid()),
			GpuInstanceId: instanceId,
		}
		infos = append(infos, info)
	}

	return infos, nil
}

// getGpuProduct returns the value of the node's GPU product label
// (constants.GpuProductLabel) as reported by the kube client. If the label is
// absent the result is an empty string with a nil error; a failure to fetch
// the node labels is wrapped and returned.
func (faker *MigFaker) getGpuProduct() (string, error) {
nodeLabels, err := faker.kubeclient.GetNodeLabels()
if err != nil {
return "", fmt.Errorf("failed to get node labels: %w", err)
}
// NOTE(review): the next line appears to be the removed tail of the old
// copyMigDevices helper as rendered by the diff view, not part of this function.
return migDevices

return nodeLabels[constants.GpuProductLabel], nil
}

// migInstanceNameToGpuInstanceId translates a MIG profile name (for example
// "4g.20gb") into the numeric id reported for it on the given GPU product.
//
// The product is classified by substring: a name containing "40GB" uses the
// 40GB table, otherwise a name containing "80GB" uses the 80GB table. Any
// other product, or a profile name missing from the selected table, yields -1
// and an error.
func migInstanceNameToGpuInstanceId(gpuProduct string, migInstanceName string) (int, error) {
	var profileIds map[string]int

	if strings.Contains(gpuProduct, "40GB") {
		profileIds = map[string]int{
			"1g.5gb":    19,
			"1g.5gb+me": 20,
			"1g.10gb":   15,
			"2g.10gb":   14,
			"3g.20gb":   9,
			"4g.20gb":   5,
			"7g.40gb":   0,
		}
	} else if strings.Contains(gpuProduct, "80GB") {
		profileIds = map[string]int{
			"1g.10gb":    19,
			"1g.10gb+me": 20,
			"1g.20gb":    15,
			"2g.20gb":    14,
			"3g.40gb":    9,
			"4g.40gb":    5,
			"7g.80gb":    0,
		}
	} else {
		return -1, fmt.Errorf("gpuProduct %s not supported", gpuProduct)
	}

	// Presence must be checked explicitly because 0 is a legitimate id.
	id, known := profileIds[migInstanceName]
	if !known {
		return -1, fmt.Errorf("failed mapping mig instance name %s to gpu instance id", migInstanceName)
	}

	return id, nil
}
32 changes: 28 additions & 4 deletions internal/migfaker/migfaker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import (
"testing"

"encoding/base64"
"encoding/json"

"github.com/google/uuid"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/kubeclient"
"github.com/run-ai/fake-gpu-operator/internal/migfaker"
"github.com/stretchr/testify/assert"
Expand All @@ -19,8 +21,12 @@ func TestFakeMapping(t *testing.T) {
{
Devices: []string{"0"},
MigEnabled: true,
MigDevices: map[string]string{
"4": uid.String(),
MigDevices: []migfaker.MigDevice{
{
Name: "4g.20gb",
Position: 0,
Size: 4,
},
},
},
},
Expand All @@ -30,11 +36,29 @@ func TestFakeMapping(t *testing.T) {
kubeClientMock.ActualSetNodeLabels = func(labels map[string]string) {
assert.Equal(t, labels["nvidia.com/mig.config.state"], "success")
}
kubeClientMock.ActualGetNodeLabels = func() (map[string]string, error) {
return map[string]string{
constants.GpuProductLabel: "NVIDIA-A100-SXM4-40GB",
}, nil
}

kubeClientMock.ActualSetNodeAnnotations = func(labels map[string]string) {
b64mapping := labels["run.ai/mig-mapping"]
mapping, _ := base64.StdEncoding.DecodeString(b64mapping)
assert.JSONEq(t, string(mapping), fmt.Sprintf(`{"0":{"4":"MIG-%s"}}`, uid))
actualMappingJson, _ := base64.StdEncoding.DecodeString(b64mapping)

expectedMapping := migfaker.MigMapping{
0: []migfaker.MigDeviceMappingInfo{
{
Position: 0,
DeviceUUID: fmt.Sprintf("MIG-%s", uid),
GpuInstanceId: 5,
},
},
}
expectedMappingJson, err := json.Marshal(expectedMapping)

assert.NoError(t, err)
assert.JSONEq(t, string(expectedMappingJson), string(actualMappingJson))
}

migFaker := migfaker.NewMigFaker(kubeClientMock)
Expand Down
24 changes: 21 additions & 3 deletions internal/migfaker/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,30 @@ type AnnotationMigConfig struct {
MigConfigs MigConfigs `yaml:"mig-configs"`
}

// A copy of github.com/run-ai/runai-operator/mig-parted/api/spec/v1.Spec
// (not imported to reduce dependencies)
//
// MigConfigs is the list of MIG selections, read from the "selected" YAML key.
type MigConfigs struct {
SelectedDevices []SelectedDevices `yaml:"selected"`
}

type SelectedDevices struct {
Devices []string `yaml:"devices"`
MigEnabled bool `yaml:"mig-enabled"`
MigDevices map[string]string `yaml:"mig-devices"`
Devices []string `yaml:"devices"`
MigEnabled bool `yaml:"mig-enabled"`
MigDevices []MigDevice `yaml:"mig-devices"`
}

// MigDevice is one MIG device requested on a GPU, as parsed from YAML.
type MigDevice struct {
// Name is the MIG profile name (e.g. "4g.20gb"); it is what gets translated
// into a GPU instance id.
Name string `yaml:"name"`
// Position is copied unchanged into the generated mapping entry.
// NOTE(review): presumably the placement slot on the GPU — confirm.
Position int `yaml:"position"`
// NOTE(review): Size is not read by the faker code visible here; meaning to be confirmed.
Size int `yaml:"size"`
}

// A copy of github.com/run-ai/runai-operator/mig-provisioner/pkg/node.MigMapping
// (not imported to reduce dependencies)
//
// MigMapping maps a GPU index to the mapping entries of the MIG devices on
// that GPU. FakeMapping JSON-encodes it and stores the base64 of that JSON in
// the MIG mapping node annotation.
type MigMapping map[int][]MigDeviceMappingInfo

// MigDeviceMappingInfo is the JSON-serialized description of a single faked
// MIG device inside a MigMapping.
type MigDeviceMappingInfo struct {
// Position is taken from the requested MigDevice.
Position int `json:"position"`
// DeviceUUID is the generated device identifier, formatted as "MIG-<uuid>".
DeviceUUID string `json:"device_uuid"`
// GpuInstanceId is the id looked up from the MIG profile name and the GPU product.
GpuInstanceId int `json:"gpu_instance_id"`
}
18 changes: 10 additions & 8 deletions internal/status-exporter/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,10 +183,11 @@ func getTestCases() map[string]testCase {
},
},
expectedLabels: map[string]string{
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "1",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
"feature.node.kubernetes.io/pci-10de.present": "true",
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "1",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
},
expectedMetrics: []*dto.MetricFamily{
{
Expand Down Expand Up @@ -308,10 +309,11 @@ func getTestCases() map[string]testCase {
},
},
expectedLabels: map[string]string{
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "2",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
"feature.node.kubernetes.io/pci-10de.present": "true",
"nvidia.com/gpu.memory": "20000",
"nvidia.com/gpu.count": "2",
"nvidia.com/mig.strategy": "mixed",
"nvidia.com/gpu.product": "Tesla P100",
},
expectedMetrics: []*dto.MetricFamily{
{
Expand Down
2 changes: 1 addition & 1 deletion internal/status-exporter/export/fs/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import (
"path/filepath"
"strconv"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-exporter/export"
"github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/spf13/viper"
)

Expand Down
9 changes: 5 additions & 4 deletions internal/status-exporter/export/labels/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ func (e *LabelsExporter) export(clusterTopology *topology.Cluster) {
}

labels := map[string]string{
"nvidia.com/gpu.memory": strconv.Itoa(node.GpuMemory),
"nvidia.com/gpu.product": node.GpuProduct,
"nvidia.com/mig.strategy": clusterTopology.MigStrategy,
"nvidia.com/gpu.count": strconv.Itoa(len(node.Gpus)),
"nvidia.com/gpu.memory": strconv.Itoa(node.GpuMemory),
"nvidia.com/gpu.product": node.GpuProduct,
"nvidia.com/mig.strategy": clusterTopology.MigStrategy,
"nvidia.com/gpu.count": strconv.Itoa(len(node.Gpus)),
"feature.node.kubernetes.io/pci-10de.present": "true",
}

err := e.kubeclient.SetNodeLabels(labels)
Expand Down
2 changes: 1 addition & 1 deletion internal/status-updater/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ import (
"github.com/google/uuid"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"gopkg.in/yaml.v3"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import (
"fmt"
"log"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"
v1 "k8s.io/api/core/v1"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import (
"strconv"
"strings"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ import (
"strconv"

"github.com/hashicorp/go-multierror"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/status-updater/util"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down
Loading

0 comments on commit 86d71b3

Please sign in to comment.