
Commit

Merge pull request microsoft#1383 from katiewasnothere/remove_nvidia_boot_files

Update lcow driver installation path
katiewasnothere authored Sep 8, 2022
2 parents d0ba9f8 + 7229955 commit 8a057cd
Showing 9 changed files with 101 additions and 57 deletions.
76 changes: 57 additions & 19 deletions devices/drivers.go
@@ -7,24 +7,28 @@ import (
"context"
"fmt"

"github.com/Microsoft/go-winio/pkg/guid"
"github.com/Microsoft/hcsshim/internal/cmd"
"github.com/Microsoft/hcsshim/internal/guestpath"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/resources"
"github.com/Microsoft/hcsshim/internal/uvm"
"github.com/pkg/errors"
)

// InstallKernelDriver mounts a specified kernel driver, then installs it in the UVM.
// InstallDriver mounts a share from the host into the UVM, installs any kernel drivers in the share,
// and configures the environment for library files and/or binaries in the share.
//
// `driver` is a directory path on the host that contains driver files for standard installation.
// InstallDriver mounts a specified kernel driver, then installs it in the UVM.
//
// `share` is a directory path on the host that contains files for standard driver installation.
// For windows this means files for pnp installation (.inf, .cat, .sys, .cert files).
// For linux this means a vhd file that contains the drivers under /lib/modules/`uname -r` for use
// with depmod and modprobe.
// with depmod and modprobe. For GPU, this vhd may also contain library files (under /usr/lib) and
// binaries (under /usr/bin or /usr/sbin) to be used in conjunction with the modules.
//
// Returns a ResourceCloser for the added mount. On failure, the mounted share will be released,
// the returned ResourceCloser will be nil, and an error will be returned.
func InstallKernelDriver(ctx context.Context, vm *uvm.UtilityVM, driver string) (closer resources.ResourceCloser, err error) {
func InstallDrivers(ctx context.Context, vm *uvm.UtilityVM, share string, gpuDriver bool) (closer resources.ResourceCloser, err error) {
defer func() {
if err != nil && closer != nil {
// best effort clean up allocated resource on failure
@@ -36,25 +40,53 @@ func InstallKernelDriver(ctx context.Context, vm *uvm.UtilityVM, driver string)
}()
if vm.OS() == "windows" {
options := vm.DefaultVSMBOptions(true)
closer, err = vm.AddVSMB(ctx, driver, options)
closer, err = vm.AddVSMB(ctx, share, options)
if err != nil {
return closer, fmt.Errorf("failed to add VSMB share to utility VM for path %+v: %s", driver, err)
return closer, fmt.Errorf("failed to add VSMB share to utility VM for path %+v: %s", share, err)
}
uvmPath, err := vm.GetVSMBUvmPath(ctx, driver, true)
uvmPath, err := vm.GetVSMBUvmPath(ctx, share, true)
if err != nil {
return closer, err
}
// attempt to install even if the driver has already been installed before so we
// can guarantee the device is ready for use afterwards
return closer, execPnPInstallDriver(ctx, vm, uvmPath)
}

// no need to reinstall if the driver has already been added in LCOW, return early
if _, err := vm.GetScsiUvmPath(ctx, share); err != uvm.ErrNotAttached {
return nil, err
}

// first mount driver as scsi in standard mount location
uvmPathForShare := fmt.Sprintf(guestpath.LCOWGlobalMountPrefixFmt, vm.UVMMountCounter())
scsiCloser, err := vm.AddSCSI(ctx, driver, uvmPathForShare, true, false, []string{}, uvm.VMAccessTypeIndividual)
closer, err = vm.AddSCSI(ctx,
share,
uvmPathForShare,
true,
false,
[]string{},
uvm.VMAccessTypeIndividual)
if err != nil {
return closer, fmt.Errorf("failed to add SCSI disk to utility VM for path %+v: %s", driver, err)
return closer, fmt.Errorf("failed to add SCSI disk to utility VM for path %+v: %s", share, err)
}
return scsiCloser, execModprobeInstallDriver(ctx, vm, uvmPathForShare)

// construct path that the drivers will be remounted as read/write in the UVM
driverGUID, err := guid.NewV4()
if err != nil {
return closer, fmt.Errorf("failed to create a guid path for driver %+v: %s", share, err)
}
uvmReadWritePath := fmt.Sprintf(guestpath.LCOWGlobalDriverPrefixFmt, driverGUID.String())
if gpuDriver {
// if installing gpu drivers in lcow, use the nvidia mount path instead
uvmReadWritePath = guestpath.LCOWNvidiaMountPath
}

// install drivers using gcs tool `install-drivers`
return closer, execGCSInstallDriver(ctx, vm, uvmPathForShare, uvmReadWritePath)
}

func execModprobeInstallDriver(ctx context.Context, vm *uvm.UtilityVM, driverDir string) error {
func execGCSInstallDriver(ctx context.Context, vm *uvm.UtilityVM, driverDir string, driverReadWriteDir string) error {
p, l, err := cmd.CreateNamedPipeListener()
if err != nil {
return err
@@ -68,29 +100,35 @@ func execModprobeInstallDriver(ctx context.Context, vm *uvm.UtilityVM, driverDir

args := []string{
"/bin/install-drivers",
driverReadWriteDir,
driverDir,
}
req := &cmd.CmdProcessRequest{
Args: args,
Stderr: p,
}

// A call to `ExecInUvm` may fail in the following ways:
// - The process runs and exits with a non-zero exit code. In this case we need to wait on the output
// from stderr so we can log it for debugging.
// - There's an error trying to run the process. No need to wait for stderr logs.
// - There's an error copying IO. No need to wait for stderr logs.
//
// Since we cannot distinguish between the cases above, we should always wait to read the stderr output.
exitCode, execErr := cmd.ExecInUvm(ctx, vm, req)

// wait to finish parsing stdout results
select {
case err := <-errChan:
if err != nil {
return errors.Wrap(err, execErr.Error())
if err != nil && err != noExecOutputErr {
return fmt.Errorf("failed to get stderr output from command %s: %v", driverDir, err)
}
case <-ctx.Done():
return errors.Wrap(ctx.Err(), execErr.Error())
return fmt.Errorf("timed out waiting for the console output from installing driver %s: %v", driverDir, ctx.Err())
}

if execErr != nil && execErr != noExecOutputErr {
return errors.Wrapf(execErr, "failed to install driver %s in uvm with exit code %d: %v", driverDir, exitCode, stderrOutput)
if execErr != nil {
return fmt.Errorf("%v: failed to install driver %s in uvm with exit code %d: %v", execErr, driverDir, exitCode, stderrOutput)
}

log.G(ctx).WithField("added drivers", driverDir).Debug("installed drivers")
return nil
}
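
For illustration, a minimal sketch of how the renamed devices.InstallDrivers entry point is invoked from host-side code, mirroring the hcsoci callers later in this diff. The function name and share path below are hypothetical, and the hcsshim packages involved are internal to the repository, so this only compiles inside the hcsshim module.

package example

import (
	"context"

	"github.com/Microsoft/hcsshim/internal/devices"
	"github.com/Microsoft/hcsshim/internal/resources"
	"github.com/Microsoft/hcsshim/internal/uvm"
)

// installGPUDriverShare is a hypothetical caller: gpuDriver=true asks LCOW to
// remount the driver VHD read/write at the nvidia mount path, while false uses
// a per-share /run/drivers/<guid> location.
func installGPUDriverShare(ctx context.Context, vm *uvm.UtilityVM) (resources.ResourceCloser, error) {
	closer, err := devices.InstallDrivers(ctx, vm, "/path/to/nvidia-driver.vhd", true)
	if err != nil {
		return nil, err
	}
	// A nil closer means the share was already attached to the UVM, so there is
	// nothing new to track or release.
	return closer, nil
}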
1 change: 1 addition & 0 deletions guest/kmsg/kmsg.go
@@ -1,4 +1,5 @@
//go:build linux
// +build linux

package kmsg

25 changes: 19 additions & 6 deletions guest/runtime/hcsv2/nvidia_utils.go
@@ -24,10 +24,9 @@ const nvidiaDebugFilePath = "/nvidia-container.log"

const nvidiaToolBinary = "nvidia-container-cli"

// TODO katiewasnothere: prestart hooks will be depracated, this needs to be moved to a createRuntime hook
// described here: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
// addNvidiaDevicePreHook builds the arguments for nvidia-container-cli and creates the prestart hook
func addNvidiaDevicePreHook(ctx context.Context, spec *oci.Spec) error {
// addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the prestart hook
func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec) error {
genericHookBinary := "generichook"
genericHookPath, err := exec.LookPath(genericHookBinary)
if err != nil {
@@ -72,13 +71,27 @@ func addNvidiaDevicePreHook(ctx context.Context, spec *oci.Spec) error {
hookLogDebugFileEnvOpt := fmt.Sprintf("%s=%s", generichook.LogDebugFileEnvKey, nvidiaDebugFilePath)
hookEnv := append(updateEnvWithNvidiaVariables(), hookLogDebugFileEnvOpt)
nvidiaHook := hooks.NewOCIHook(genericHookPath, args, hookEnv)
return hooks.AddOCIHook(spec, hooks.Prestart, nvidiaHook)
return hooks.AddOCIHook(spec, hooks.CreateRuntime, nvidiaHook)
}

// Helper function to find the usr/lib path for the installed nvidia library files.
// This function assumes that the drivers have been installed using
// gcstool's `install-drivers` binary.
func getNvidiaDriversUsrLibPath() string {
return fmt.Sprintf("%s/content/usr/lib", guestpath.LCOWNvidiaMountPath)

}

// Helper function to find the usr/bin path for the installed nvidia tools.
// This function assumes that the drivers have been installed using
// gcstool's `install-drivers` binary.
func getNvidiaDriverUsrBinPath() string {
return fmt.Sprintf("%s/content/usr/bin", guestpath.LCOWNvidiaMountPath)
}

// updateEnvWithNvidiaVariables creates an env with the nvidia gpu vhd in PATH and insecure mode set
func updateEnvWithNvidiaVariables() []string {
nvidiaBin := fmt.Sprintf("%s/bin", guestpath.LCOWNvidiaMountPath)
env := updatePathEnv(nvidiaBin)
env := updatePathEnv(getNvidiaDriverUsrBinPath())
// NVC_INSECURE_MODE allows us to run nvidia-container-cli without seccomp
// we don't currently use seccomp in the uvm, so avoid using it here for now as well
env = append(env, "NVC_INSECURE_MODE=1")
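
As a rough sketch of the layout the two helpers above rely on: gcstool's install-drivers remounts the driver VHD read/write and exposes its contents under a content/ subdirectory of the mount point. The concrete "/run/nvidia" value below is an assumption, inferred from the hard-coded "/run/nvidia/lib" ldconfig argument this change removes.

package example

import "fmt"

// Assumed value of guestpath.LCOWNvidiaMountPath; not confirmed by this diff.
const assumedNvidiaMountPath = "/run/nvidia"

// exampleNvidiaPaths mirrors getNvidiaDriversUsrLibPath/getNvidiaDriverUsrBinPath:
// library files and binaries live below <mount>/content once install-drivers
// has remounted the GPU driver VHD.
func exampleNvidiaPaths() (libPath, binPath string) {
	libPath = fmt.Sprintf("%s/content/usr/lib", assumedNvidiaMountPath) // => /run/nvidia/content/usr/lib
	binPath = fmt.Sprintf("%s/content/usr/bin", assumedNvidiaMountPath) // => /run/nvidia/content/usr/bin
	return libPath, binPath
}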
7 changes: 0 additions & 7 deletions guest/runtime/hcsv2/spec.go
@@ -15,7 +15,6 @@ import (
oci "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"

"github.com/Microsoft/hcsshim/internal/hooks"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/pkg/annotations"
)
@@ -239,12 +238,6 @@ func applyAnnotationsToSpec(ctx context.Context, spec *oci.Spec) error {
return nil
}

// Helper function to create an oci prestart hook to run ldconfig
func addLDConfigHook(_ context.Context, spec *oci.Spec, args, env []string) error {
ldConfigHook := hooks.NewOCIHook("/sbin/ldconfig", args, env)
return hooks.AddOCIHook(spec, hooks.Prestart, ldConfigHook)
}

// devShmMountWithSize returns a /dev/shm device mount with size set to
// `sizeString` if it represents a valid size in KB, returns error otherwise.
func devShmMountWithSize(sizeString string) (*oci.Mount, error) {
13 changes: 8 additions & 5 deletions guest/runtime/hcsv2/workload_container.go
@@ -16,6 +16,7 @@ import (

specInternal "github.com/Microsoft/hcsshim/internal/guest/spec"
"github.com/Microsoft/hcsshim/internal/guestpath"
"github.com/Microsoft/hcsshim/internal/hooks"
"github.com/Microsoft/hcsshim/internal/oc"
"github.com/Microsoft/hcsshim/pkg/annotations"
)
@@ -152,14 +153,16 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
spec.Linux.CgroupsPath = "/containers/" + id

if spec.Windows != nil {
// we only support Nvidia gpus right now
if specHasGPUDevice(spec) {
// we only support Nvidia gpus right now
ldConfigargs := []string{"-l", "/run/nvidia/lib"}
env := updateEnvWithNvidiaVariables()
if err := addLDConfigHook(ctx, spec, ldConfigargs, env); err != nil {
// run ldconfig as a `CreateRuntime` hook so that it's executed in the runtime namespace. This is
// necessary so that when we execute the nvidia hook to assign the device, the runtime can
// find the nvidia library files.
ldconfigHook := hooks.NewOCIHook("/sbin/ldconfig", []string{getNvidiaDriversUsrLibPath()}, os.Environ())
if err := hooks.AddOCIHook(spec, hooks.CreateRuntime, ldconfigHook); err != nil {
return err
}
if err := addNvidiaDevicePreHook(ctx, spec); err != nil {
if err := addNvidiaDeviceHook(ctx, spec); err != nil {
return err
}
}
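
For context, a sketch of roughly what the ldconfig hook added above ends up looking like in the OCI spec, using the runtime-spec Go types. hooks.NewOCIHook/AddOCIHook are assumed to map the path, args, and env straight onto a CreateRuntime hook entry; the nvidia device hook's generichook arguments are elided in this diff, so they are not reproduced here.

package example

import (
	"os"

	oci "github.com/opencontainers/runtime-spec/specs-go"
)

// addLdconfigCreateRuntimeHook sketches the hook wired up above: it runs in the
// runtime namespace so that, by the time the nvidia device hook executes, the
// freshly installed library files under the drivers' usr/lib path are resolvable.
func addLdconfigCreateRuntimeHook(spec *oci.Spec, nvidiaUsrLibPath string) {
	if spec.Hooks == nil {
		spec.Hooks = &oci.Hooks{}
	}
	spec.Hooks.CreateRuntime = append(spec.Hooks.CreateRuntime, oci.Hook{
		Path: "/sbin/ldconfig",
		// The guest code passes only the library directory as the argument list.
		Args: []string{nvidiaUsrLibPath},
		Env:  os.Environ(),
	})
}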
3 changes: 3 additions & 0 deletions guestpath/paths.go
@@ -22,6 +22,9 @@ const (
// LCOWGlobalMountPrefixFmt is the path format in the LCOW UVM where global
// mounts are added
LCOWGlobalMountPrefixFmt = "/run/mounts/m%d"
// LCOWGlobalDriverPrefixFmt is the path format in the LCOW UVM where drivers
// are mounted as read/write
LCOWGlobalDriverPrefixFmt = "/run/drivers/%s"
// WCOWGlobalMountPrefixFmt is the path prefix format in the WCOW UVM where
// mounts are added
WCOWGlobalMountPrefixFmt = "C:\\mounts\\m%d"
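
A small usage sketch for the new constant, mirroring the drivers.go hunk earlier in this diff: each driver share gets a unique read/write remount target derived from a freshly generated GUID.

package example

import (
	"fmt"

	"github.com/Microsoft/go-winio/pkg/guid"
	"github.com/Microsoft/hcsshim/internal/guestpath"
)

// driverReadWriteMount returns a per-share mount target such as
// /run/drivers/<guid> inside the LCOW UVM.
func driverReadWriteMount() (string, error) {
	driverGUID, err := guid.NewV4()
	if err != nil {
		return "", err
	}
	return fmt.Sprintf(guestpath.LCOWGlobalDriverPrefixFmt, driverGUID.String()), nil
}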
2 changes: 1 addition & 1 deletion hcsoci/create.go
@@ -299,7 +299,7 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C
// install kernel drivers if necessary.
// do this before network setup in case any of the drivers requested are
// network drivers
driverClosers, err := installPodDrivers(ctx, coi.HostingSystem, coi.Spec.Annotations)
driverClosers, err := addSpecGuestDrivers(ctx, coi.HostingSystem, coi.Spec.Annotations)
if err != nil {
return nil, r, err
}
29 changes: 11 additions & 18 deletions hcsoci/devices.go
@@ -14,7 +14,6 @@ import (
"github.com/pkg/errors"

"github.com/Microsoft/hcsshim/internal/devices"
"github.com/Microsoft/hcsshim/internal/guestpath"
hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/oci"
@@ -216,29 +215,21 @@ func handleAssignedDevicesLCOW(
if err != nil {
return resultDevs, closers, errors.Wrapf(err, "failed to add gpu vhd to %v", vm.ID())
}
// use lcowNvidiaMountPath since we only support nvidia gpus right now
// must use scsi here since DDA'ing a hyper-v pci device is not supported on VMs that have ANY virtual memory
// gpuvhd must be granted VM Group access.
options := []string{"ro"}
scsiMount, err := vm.AddSCSI(
ctx,
gpuSupportVhdPath,
guestpath.LCOWNvidiaMountPath,
true,
false,
options,
uvm.VMAccessTypeNoop,
)
driverCloser, err := devices.InstallDrivers(ctx, vm, gpuSupportVhdPath, true)
if err != nil {
return resultDevs, closers, errors.Wrapf(err, "failed to add scsi device %s in the UVM %s at %s", gpuSupportVhdPath, vm.ID(), guestpath.LCOWNvidiaMountPath)
return resultDevs, closers, err
}
if driverCloser != nil {
closers = append(closers, driverCloser)
}
closers = append(closers, scsiMount)
}

return resultDevs, closers, nil
}

func installPodDrivers(ctx context.Context, vm *uvm.UtilityVM, annotations map[string]string) (closers []resources.ResourceCloser, err error) {
// addSpecGuestDrivers is a helper function to install kernel drivers specified on a spec into the guest
func addSpecGuestDrivers(ctx context.Context, vm *uvm.UtilityVM, annotations map[string]string) (closers []resources.ResourceCloser, err error) {
defer func() {
if err != nil {
// best effort clean up allocated resources on failure
@@ -256,11 +247,13 @@ return closers, err
return closers, err
}
for _, d := range drivers {
driverCloser, err := devices.InstallKernelDriver(ctx, vm, d)
driverCloser, err := devices.InstallDrivers(ctx, vm, d, false)
if err != nil {
return closers, err
}
closers = append(closers, driverCloser)
if driverCloser != nil {
closers = append(closers, driverCloser)
}
}
return closers, err
}
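
A compact sketch of the best-effort cleanup pattern that addSpecGuestDrivers relies on when installation fails partway. The helper name is hypothetical, and the Release(ctx) error signature of resources.ResourceCloser is assumed from how closers are used elsewhere in hcsshim.

package example

import (
	"context"

	"github.com/Microsoft/hcsshim/internal/log"
	"github.com/Microsoft/hcsshim/internal/resources"
)

// releaseAll best-effort releases every tracked closer. It skips nil entries
// defensively, since devices.InstallDrivers may return a nil closer for
// already-attached shares (the callers above also nil-check before tracking).
func releaseAll(ctx context.Context, closers []resources.ResourceCloser) {
	for _, c := range closers {
		if c == nil {
			continue
		}
		if err := c.Release(ctx); err != nil {
			log.G(ctx).WithError(err).Warn("failed to release driver resource")
		}
	}
}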
2 changes: 1 addition & 1 deletion hcsoci/resources_wcow.go
@@ -112,7 +112,7 @@ func allocateWindowsResources(ctx context.Context, coi *createOptionsInternal, r
// when driver installation completes, we are guaranteed that the device is ready for use,
// so reinstall drivers to make sure the devices are ready when we proceed.
// TODO katiewasnothere: we should find a way to avoid reinstalling drivers
driverClosers, err := installPodDrivers(ctx, coi.HostingSystem, coi.Spec.Annotations)
driverClosers, err := addSpecGuestDrivers(ctx, coi.HostingSystem, coi.Spec.Annotations)
if err != nil {
return err
}
