Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Partitionable model with generated partitions #32

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 116 additions & 17 deletions dra-evolution/pkg/api/capacity_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,10 @@ type ResourcePoolSpec struct {
// vendor of the driver.
DriverName string `json:"driverName" protobuf:"bytes,3,name=driverName"`

// SharedCapacity defines the set of shared capacity consumable by
// devices in this ResourceSlice.
//
// Must not have more than 128 entries.
// DeviceShape defines the common shape of all devices in this pool.
//
// +listType=atomic
// +optional
SharedCapacity []SharedCapacity `json:"sharedCapacity,omitempty"`
// +required
DeviceShape DeviceShape `json:"deviceShape"`

// Devices lists all available devices in this pool.
//
Expand All @@ -81,14 +77,8 @@ type ResourcePoolSpec struct {
// ResourcePoolMaxSharedCapacity is a limit of 128.
// NOTE(review): presumably the constant behind the "Must not have more than
// 128 entries" rule on SharedCapacity lists — confirm where it is enforced.
const ResourcePoolMaxSharedCapacity = 128
// ResourcePoolMaxDevices is a limit of 128.
// NOTE(review): presumably the maximum number of entries in
// ResourcePoolSpec.Devices — confirm against the validation code.
const ResourcePoolMaxDevices = 128

// Device represents one individual hardware instance that can be selected based
// on its attributes.
type Device struct {
// Name is unique identifier among all devices managed by
// the driver on the node. It must be a DNS label.
Name string `json:"name" protobuf:"bytes,1,name=name"`

// Attributes defines the attributes of this device.
type DeviceShape struct {
// Attributes defines the attributes of this device shape.
// The name of each attribute must be unique.
//
// Must not have more than 32 entries.
Expand All @@ -97,14 +87,108 @@ type Device struct {
// +optional
Attributes []DeviceAttribute `json:"attributes,omitempty" protobuf:"bytes,3,opt,name=attributes"`

// Partitions defines the set of partitions into which this device shape
// may be allocated. If not populated, then the device shape is always
// consumed in its entirety.
//
// +listType=atomic
// +optional
Partitions []DevicePartition `json:"partitions,omitempty"`

// SharedCapacity defines the set of shared capacity consumable by
// partitions in this DeviceShape. Not meaningful for non-partitioned
// devices.
//
// Must not have more than 128 entries.
//
// +listType=atomic
// +optional
SharedCapacity []SharedCapacity `json:"sharedCapacity,omitempty"`
}

// StringOrExpression contains either an explicit string Value or
// a CEL expression that will return a string.
//
// NOTE(review): the type itself does not enforce that exactly one of Value
// and Expression is set; confirm that validation elsewhere covers this.
type StringOrExpression struct {
// Value is the explicit string. It is omitted from JSON when nil.
Value *string `json:"value,omitempty"`
// Expression is a CEL expression that will return a string. It is omitted
// from JSON when nil.
Expression *string `json:"expression,omitempty"`
}

// QuantityOrExpression contains either an explicit resource.Quantity Value
// or a CEL expression that results in a resource.Quantity (or a string that parses
// to one).
//
// NOTE(review): the type itself does not enforce that exactly one of Value
// and Expression is set; confirm that validation elsewhere covers this.
type QuantityOrExpression struct {
// Value is the explicit quantity. It is omitted from JSON when nil.
Value *resource.Quantity `json:"value,omitempty"`
// Expression is a CEL expression that results in a resource.Quantity, or
// in a string that parses to one. It is omitted from JSON when nil.
Expression *string `json:"expression,omitempty"`
}

// BoolOrExpression contains either an explicit bool Value
// or a CEL expression that results in a bool.
//
// NOTE(review): the type itself does not enforce that exactly one of Value
// and Expression is set; confirm that validation elsewhere covers this.
type BoolOrExpression struct {
// Value is the explicit bool. It is omitted from JSON when nil.
Value *bool `json:"value,omitempty"`
// Expression is a CEL expression that results in a bool. It is omitted
// from JSON when nil.
Expression *string `json:"expression,omitempty"`
}

// DevicePartition represents a format for a partition, and a count. The actual
// partitions of the device are generated in-memory by evaluating the format for
// the index values 0..Count.
//
// NOTE(review): whether Count itself is part of the index range is not
// established here — confirm against the code that expands partitions.
type DevicePartition struct {

// Count identifies the number of partitions using this format.
//
// +required
Count int `json:"count"`

// Name is unique identifier among all partitions for this device. The
// device name as recorded in the allocation will be the concatenation
// of the device name and the partition name with a '-' separator.
//
// NOTE: may need a better naming scheme
//
// It must be a DNS label.
//
// +required
Name StringOrExpression `json:"name" protobuf:"bytes,1,name=name"`

// Attributes defines the attributes of this partition.
// The name of each attribute must be unique. The values
// in here are overlaid on top of the values in the device
// shape (overwriting them if the names are the same).
//
// NOTE: probably can get away with fewer
//
// Must not have more than 32 entries.
//
// +listType=atomic
// +optional
Attributes []DeviceAttributeFormat `json:"attributes,omitempty" protobuf:"bytes,3,opt,name=attributes"`

// SharedCapacityConsumed defines the set of shared capacity consumed by
// this partition.
//
// Must not have more than 32 entries.
//
// +listType=atomic
// +optional
SharedCapacityConsumed []SharedCapacityFormat `json:"sharedCapacityConsumed,omitempty"`
}

type Device struct {
// Name is unique identifier among all devices managed by
// the driver on the node. It must be a DNS label.
Name string `json:"name" protobuf:"bytes,1,name=name"`

// Attributes defines the attributes of this device.
// The name of each attribute must be unique. The values
// in here are overlaid on top of the values in the device
// shape (overwriting them if the names are the same).
//
// Must not have more than 32 entries.
//
// NOTE: probably can get away with fewer
//
// +listType=atomic
// +optional
SharedCapacityConsumed []SharedCapacity `json:"sharedCapacityConsumed,omitempty"`
Attributes []DeviceAttribute `json:"attributes,omitempty" protobuf:"bytes,3,opt,name=attributes"`
}

// ResourcePoolMaxAttributesPerDevice is a limit of 32.
// NOTE(review): presumably the constant behind the "Must not have more than
// 32 entries" rule on attribute lists — confirm where it is enforced.
const ResourcePoolMaxAttributesPerDevice = 32
Expand Down Expand Up @@ -180,6 +264,21 @@ type SharedCapacity struct {
Capacity resource.Quantity `json:"capacity"`
}

// DeviceAttributeFormat is the element type of DevicePartition.Attributes.
// Its name and each of its value fields use an "...OrExpression" type, so
// each can be given as an explicit value or as a CEL expression.
//
// NOTE(review): presumably exactly one of the value fields is set per
// attribute — confirm against the validation code.
type DeviceAttributeFormat struct {
// Name is the attribute name, given explicitly or as an expression.
Name StringOrExpression `json:"name"`

// QuantityValue, BoolValue, StringValue and VersionValue are the optional
// value fields, serialized as "quantity", "bool", "string" and "version".
// Each is omitted from JSON when nil.
QuantityValue *QuantityOrExpression `json:"quantity,omitempty"`
BoolValue *BoolOrExpression `json:"bool,omitempty"`
StringValue *StringOrExpression `json:"string,omitempty"`
VersionValue *StringOrExpression `json:"version,omitempty"`
}

// SharedCapacityFormat is the element type of
// DevicePartition.SharedCapacityConsumed. Its name and capacity use
// "...OrExpression" types, so each can be given as an explicit value or as a
// CEL expression.
type SharedCapacityFormat struct {
// Name is given explicitly or as an expression.
// NOTE(review): presumably it must match the name of a SharedCapacity
// entry on the device shape — confirm.
Name StringOrExpression `json:"name"`

// Capacity is the quantity, given explicitly or as an expression. It is
// omitted from JSON when nil.
Capacity *QuantityOrExpression `json:"capacity,omitempty"`
}

// CStyleIdentifierMaxLength is the maximum length of a c-style identifier used for naming.
const CStyleIdentifierMaxLength = 32

Expand Down
81 changes: 62 additions & 19 deletions dra-evolution/pkg/gen/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package gen

import (
"fmt"
"strings"

"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
"github.com/kubernetes-sigs/wg-device-management/dra-evolution/pkg/api"
Expand All @@ -15,13 +16,30 @@ import (
)

func dgxa100Pool(nodeName, poolName string, gpus int) (*api.ResourcePool, error) {
shapeAttrs := map[string]bool{
"product-name": true,
"brand": true,
"architecture": true,
"cuda-compute-capability": true,
"driver-version": true,
"cuda-driver-version": true,
}
partitionAttrs := map[string]bool{
"mig-profile": true,
}
deviceAttrs := map[string]bool{
"uuid": true,
"mig-capable": true,
"mig-enabled": true,
}

// Instantiate an instance of a mock dgxa100 server and build a nvDeviceLib
// from it. The nvDeviceLib is then used to populate the list of allocatable
// devices from this mock server using standard NVML calls.
l := nvdevicelib.New(dgxa100.New())

var shape api.DeviceShape
var devices []api.Device
var shared []api.SharedCapacity
for gpu := 0; gpu < gpus; gpu++ {
// Get the full list of allocatable devices from this GPU on the server
allocatable, err := l.GetPerGpuAllocatableDevices(gpu)
Expand All @@ -39,12 +57,30 @@ func dgxa100Pool(nodeName, poolName string, gpus int) (*api.ResourcePool, error)
return nil, fmt.Errorf("found %d shared limit groups in the resources", len(model.NamedResources.SharedLimits))
}

shared = append(shared, sharedGroupToResources(model.NamedResources.SharedLimits[0], gpu)...)
if gpu == 0 {
// first time through, create the shape
shape.SharedCapacity = sharedGroupToResources(model.NamedResources.SharedLimits[0])
}

var devAttrVals []api.DeviceAttribute
var partitions []api.DevicePartition
for _, instance := range model.NamedResources.Instances {
devices = append(devices, instanceToDevice(instance, gpu))
if gpu == 0 {
shape.Attributes = attributesToDeviceAttributes(instance.Attributes, shapeAttrs)
partitions = append(partitions, instanceToPartition(instance, partitionAttrs))
}
devAttrVals = append(devAttrVals, attributesToDeviceAttributes(instance.Attributes, deviceAttrs)...)
}
if shape.Partitions == nil {
shape.Partitions = partitions
}
}

devices = append(devices, api.Device{
Name: fmt.Sprintf("gpu-%d", gpu),
Attributes: devAttrVals,
})

}
return &api.ResourcePool{
TypeMeta: metav1.TypeMeta{
APIVersion: DevMgmtAPIVersion,
Expand All @@ -54,31 +90,39 @@ func dgxa100Pool(nodeName, poolName string, gpus int) (*api.ResourcePool, error)
Name: nodeName + "-" + poolName,
},
Spec: api.ResourcePoolSpec{
NodeName: nodeName,
DriverName: "gpu.nvidia.com/dra",
SharedCapacity: shared,
Devices: devices,
NodeName: nodeName,
DriverName: "gpu.nvidia.com/dra",
DeviceShape: shape,
Devices: devices,
},
}, nil
}

func instanceToDevice(instance newresourceapi.NamedResourcesInstance, gpu int) api.Device {
device := api.Device{
Name: instance.Name,
Attributes: attributesToDeviceAttributes(instance.Attributes),
func instanceToPartition(instance newresourceapi.NamedResourcesInstance, partitionAttrs map[string]bool) api.DevicePartition {
name := strings.TrimPrefix(instance.Name, "gpu-0-")
if name == "gpu-0" {
name = "whole"
}

partition := api.DevicePartition{
Name: name,
Attributes: attributesToDeviceAttributes(instance.Attributes, partitionAttrs),
}

if len(instance.Resources) > 0 {
device.SharedCapacityConsumed = sharedGroupToResources(instance.Resources[0], gpu)
partition.SharedCapacityConsumed = sharedGroupToResources(instance.Resources[0])
}

return device
return partition
}

func attributesToDeviceAttributes(attrs []resourceapi.NamedResourcesAttribute) []api.DeviceAttribute {
func attributesToDeviceAttributes(attrs []resourceapi.NamedResourcesAttribute, keep map[string]bool) []api.DeviceAttribute {
var attributes []api.DeviceAttribute

for _, a := range attrs {
if _, ok := keep[a.Name]; !ok {
continue
}
if a.QuantityValue != nil {
attributes = append(attributes, api.DeviceAttribute{
Name: a.Name,
Expand Down Expand Up @@ -106,14 +150,13 @@ func attributesToDeviceAttributes(attrs []resourceapi.NamedResourcesAttribute) [
return attributes
}

func sharedGroupToResources(group newresourceapi.NamedResourcesSharedResourceGroup, gpu int) []api.SharedCapacity {
func sharedGroupToResources(group newresourceapi.NamedResourcesSharedResourceGroup) []api.SharedCapacity {
var resources []api.SharedCapacity

for _, item := range group.Items {
name := fmt.Sprintf("gpu-%d-%s", gpu, item.Name)
if item.QuantityValue != nil && !item.QuantityValue.IsZero() {
resources = append(resources, api.SharedCapacity{
Name: name,
Name: item.Name,
Capacity: *item.QuantityValue,
})
} else if item.IntRangeValue != nil {
Expand All @@ -123,7 +166,7 @@ func sharedGroupToResources(group newresourceapi.NamedResourcesSharedResourceGro
single := intrange.NewIntRange(int64(i), 1)
if item.IntRangeValue.Contains(single) {
resources = append(resources, api.SharedCapacity{
Name: fmt.Sprintf("%s-%d", name, i),
Name: fmt.Sprintf("%s-%d", item.Name, i),
Capacity: resource.MustParse("1"),
})
}
Expand Down
Loading