Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions internal/guest/runtime/hcsv2/nvidia_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ import (
const nvidiaDebugFilePath = "nvidia-container.log"
const nvidiaToolBinary = "nvidia-container-cli"

// described here: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
// addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the prestart hook
// addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the createRuntime [OCI hooks].
//
// [OCI hooks]: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec, ociBundlePath string) error {
genericHookBinary := "generichook"
genericHookPath, err := exec.LookPath(genericHookBinary)
Expand Down
19 changes: 19 additions & 0 deletions internal/guest/runtime/hcsv2/workload_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,25 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
if err := addNvidiaDeviceHook(ctx, spec, ociBundlePath); err != nil {
return err
}

// The NVIDIA device hook `nvidia-container-cli` adds `rw` permissions for the
// GPU and ctl nodes (`c 195:*`) to the devices allow list, but CUDA apparently also
// needs `rwm` permission for other device nodes (e.g., `c 235`).
//
// Grant `rwm` to all character devices (`c *:* rwm`) to avoid hard coding exact node
// numbers, which are unknown before the driver runs (GPU devices are presented as I2C
// devices initially) or could change with driver implementation.
//
// Note: runc already grants mknod (`c *:* m`) for all character devices, so this
// effectively only adds `rw` permissions for them. See:
// https://github.com/opencontainers/runc/blob/6bae6cad4759a5b3537d550f43ea37d51c6b518a/libcontainer/specconv/spec_linux.go#L205-L222
spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices,
oci.LinuxDeviceCgroup{
Allow: true,
Type: "c",
Access: "rwm",
},
)
}
// add other assigned devices to the spec
if err := specGuest.AddAssignedDevice(ctx, spec); err != nil {
Expand Down
16 changes: 14 additions & 2 deletions internal/guest/spec/spec_devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ import (
"strings"
"time"

"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/opencontainers/runc/libcontainer/devices"
oci "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"

"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
"github.com/Microsoft/hcsshim/internal/log"
)

const (
Expand All @@ -23,13 +25,17 @@ const (
charType = "char"
blockType = "block"

// TODO: consolidate with `internal/uvm/virtual_device.go` and use in both locations
gpuDeviceIDType = "gpu"
vpciDeviceIDTypeLegacy = "vpci"
vpciDeviceIDType = "vpci-instance-id"
)

// AddAssignedDevice goes through the assigned devices that have been enumerated
// on the spec and updates the spec so that the correct device nodes can be mounted
// into the resulting container by the runtime.
//
// GPU devices are skipped, since they are handled in [addNvidiaDeviceHook].
func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
// Add an explicit timeout before we try to find the dev nodes so we
// aren't waiting forever.
Expand All @@ -52,6 +58,12 @@ func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
for _, dev := range devs {
AddLinuxDeviceToSpec(ctx, dev, spec, true)
}
case gpuDeviceIDType:
default:
log.G(ctx).WithFields(logrus.Fields{
"type": d.IDType,
"id": d.ID,
}).Warn("unknown device type")
}
}

Expand Down
26 changes: 13 additions & 13 deletions internal/hcsoci/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,21 +174,21 @@ func handleAssignedDevicesLCOW(

// assign device into UVM and create corresponding spec windows devices
for _, d := range specDevs {
if uvm.IsValidDeviceType(d.IDType) {
pciID, index := devices.GetDeviceInfoFromPath(d.ID)
vpci, err := vm.AssignDevice(ctx, pciID, index, "")
if err != nil {
return resultDevs, closers, errors.Wrapf(err, "failed to assign device %s, function %d to pod %s", pciID, index, vm.ID())
}
closers = append(closers, vpci)

// update device ID on the spec to the assigned device's resulting vmbus guid so gcs knows which devices to
// map into the container
d.ID = vpci.VMBusGUID
resultDevs = append(resultDevs, d)
} else {
if !uvm.IsValidDeviceType(d.IDType) {
return resultDevs, closers, errors.Errorf("specified device %s has unsupported type %s", d.ID, d.IDType)
}

pciID, index := devices.GetDeviceInfoFromPath(d.ID)
vpci, err := vm.AssignDevice(ctx, pciID, index, "")
if err != nil {
return resultDevs, closers, errors.Wrapf(err, "failed to assign device %s, function %d to pod %s", pciID, index, vm.ID())
}
closers = append(closers, vpci)

// update device ID on the spec to the assigned device's resulting vmbus guid so gcs knows which devices to
// map into the container
d.ID = vpci.VMBusGUID
resultDevs = append(resultDevs, d)
}

return resultDevs, closers, nil
Expand Down