2 changes: 1 addition & 1 deletion cmd/runhcs/container.go
@@ -153,7 +153,7 @@ func launchShim(cmd, pidFile, logFile string, args []string, data interface{}) (
}

fullargs = append(fullargs, "--log-format", logFormat)
- if logrus.GetLevel() == logrus.DebugLevel {
+ if logrus.IsLevelEnabled(logrus.DebugLevel) {
fullargs = append(fullargs, "--debug")
}
}
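A note on the semantics, since the same pattern recurs in the bridge and netns changes below: `logrus.IsLevelEnabled(lvl)` reports whether messages at `lvl` would actually be emitted, i.e. it is true for the configured level and anything more verbose, while the old comparison checks encoded that ordering by hand. A minimal sketch of the difference:

```go
package main

import (
	"fmt"

	"github.com/sirupsen/logrus"
)

func main() {
	// At trace verbosity, debug output is wanted too.
	logrus.SetLevel(logrus.TraceLevel)

	// The old equality check misses this case: trace != debug,
	// so --debug would not have been forwarded to the shim.
	fmt.Println(logrus.GetLevel() == logrus.DebugLevel) // false

	// IsLevelEnabled captures the intent: debug messages would be logged.
	fmt.Println(logrus.IsLevelEnabled(logrus.DebugLevel)) // true
}
```

The bridge changes below apply the same idea from the other direction: `GetLevel() > logrus.DebugLevel` is a roundabout way of asking "is trace enabled?", which `IsLevelEnabled(logrus.TraceLevel)` states directly.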
39 changes: 22 additions & 17 deletions internal/annotations/annotations.go
@@ -5,8 +5,7 @@
// Do not rely on these annotations to customize production workload behavior.
package annotations

- // uVM specific annotations
-
+ // uVM annotations.
const (
// UVMHyperVSocketConfigPrefix is the prefix of an annotation to map a [hyper-v socket] service GUID
// to a JSON-encoded string of its [configuration].
@@ -30,24 +29,15 @@ const (
// [configuration]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#HvSocketServiceConfig
UVMHyperVSocketConfigPrefix = "io.microsoft.virtualmachine.hv-socket.service-table."

- // AdditionalRegistryValues specifies additional registry keys and their values to set in the WCOW UVM.
- // The format is a JSON-encoded string of an array containing [HCS RegistryValue] objects.
- //
- // Registry values will be available under `HKEY_LOCAL_MACHINE` root key.
- //
- // For example:
- //
- // "[{\"Key\": {\"Hive\": \"System\", \"Name\": \"registry\\key\\path"}, \"Name\": \"ValueName\", \"Type\": \"String\", \"StringValue\": \"value\"}]"
- //
- // [HCS RegistryValue]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#registryvalue
- AdditionalRegistryValues = "io.microsoft.virtualmachine.wcow.additional-reg-keys"

- // ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use.
- ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports"

// UVMConsolePipe is the name of the named pipe that the UVM console is connected to. This works only for non-SNP
// scenarios; for SNP, use a debugger.
UVMConsolePipe = "io.microsoft.virtualmachine.console.pipe"
)

+ // LCOW uVM annotations.
+ const (
+ // ExtraVSockPorts adds additional ports to the list of ports that the UVM is allowed to use.
+ ExtraVSockPorts = "io.microsoft.virtualmachine.lcow.extra-vsock-ports"

// NetworkingPolicyBasedRouting toggles on the ability to set policy based routing in the
// guest for LCOW.
@@ -57,3 +47,18 @@
// LCOW scenarios. Ideally, this annotation should be removed if no issues are found.
NetworkingPolicyBasedRouting = "io.microsoft.virtualmachine.lcow.network.policybasedrouting"
)

+ // WCOW uVM annotations.
+ const (
+ // AdditionalRegistryValues specifies additional registry keys and their values to set in the WCOW UVM.
+ // The format is a JSON-encoded string of an array containing [HCS RegistryValue] objects.
+ //
+ // Registry values will be available under the `HKEY_LOCAL_MACHINE` root key.
+ //
+ // For example:
+ //
+ // "[{\"Key\": {\"Hive\": \"System\", \"Name\": \"registry\\key\\path\"}, \"Name\": \"ValueName\", \"Type\": \"String\", \"StringValue\": \"value\"}]"
+ //
+ // [HCS RegistryValue]: https://learn.microsoft.com/en-us/virtualization/api/hcs/schemareference#registryvalue
+ AdditionalRegistryValues = "io.microsoft.virtualmachine.wcow.additional-reg-keys"
+ )
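Rather than hand-writing the escaped JSON above, a caller can marshal a value shaped like the HCS `RegistryValue` schema. A minimal sketch, using local illustrative types rather than the real `hcsschema` definitions:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative stand-ins for the documented HCS RegistryValue shape;
// the real definitions live in the HCS schema.
type registryKey struct {
	Hive string `json:"Hive"`
	Name string `json:"Name"`
}

type registryValue struct {
	Key         registryKey `json:"Key"`
	Name        string      `json:"Name"`
	Type        string      `json:"Type"`
	StringValue string      `json:"StringValue"`
}

func main() {
	vals := []registryValue{{
		Key:         registryKey{Hive: "System", Name: `registry\key\path`},
		Name:        "ValueName",
		Type:        "String",
		StringValue: "value",
	}}

	b, err := json.Marshal(vals)
	if err != nil {
		panic(err)
	}
	// This string is the value to set for the
	// "io.microsoft.virtualmachine.wcow.additional-reg-keys" annotation.
	fmt.Println(string(b))
}
```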
2 changes: 1 addition & 1 deletion internal/gcs/bridge.go
@@ -404,7 +404,7 @@ func (brdg *bridge) writeMessage(buf *bytes.Buffer, enc *json.Encoder, typ msgTy
// Update the message header with the size.
binary.LittleEndian.PutUint32(buf.Bytes()[hdrOffSize:], uint32(buf.Len()))

- if brdg.log.Logger.GetLevel() > logrus.DebugLevel {
+ if brdg.log.Logger.IsLevelEnabled(logrus.TraceLevel) {
b := buf.Bytes()[hdrSize:]
switch typ {
// container environment vars are in rpCreate for linux; rpcExecuteProcess for windows
2 changes: 1 addition & 1 deletion internal/guest/bridge/bridge.go
@@ -310,7 +310,7 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser
trace.StringAttribute("cid", base.ContainerID))

entry := log.G(ctx)
- if entry.Logger.GetLevel() > logrus.DebugLevel {
+ if entry.Logger.IsLevelEnabled(logrus.TraceLevel) {
var err error
var msgBytes []byte
switch header.Type {
2 changes: 1 addition & 1 deletion internal/guest/network/netns.go
@@ -170,7 +170,7 @@ func NetNSConfig(ctx context.Context, ifStr string, nsPid int, adapter *guestres
}

// Add some debug logging
- if entry.Logger.GetLevel() >= logrus.DebugLevel {
+ if entry.Logger.IsLevelEnabled(logrus.DebugLevel) {
curNS, _ := netns.Get()
// Refresh link attributes/state
link, _ = netlink.LinkByIndex(link.Attrs().Index)
5 changes: 3 additions & 2 deletions internal/guest/runtime/hcsv2/nvidia_utils.go
@@ -23,8 +23,9 @@ import (
const nvidiaDebugFilePath = "nvidia-container.log"
const nvidiaToolBinary = "nvidia-container-cli"

- // described here: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
- // addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the prestart hook
+ // addNvidiaDeviceHook builds the arguments for nvidia-container-cli and creates the createRuntime [OCI hook].
+ //
+ // [OCI hook]: https://github.com/opencontainers/runtime-spec/blob/39c287c415bf86fb5b7506528d471db5405f8ca8/config.md#posix-platform-hooks
func addNvidiaDeviceHook(ctx context.Context, spec *oci.Spec, ociBundlePath string) error {
genericHookBinary := "generichook"
genericHookPath, err := exec.LookPath(genericHookBinary)
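For orientation, a createRuntime hook is simply an entry appended to the spec's `Hooks`; per the linked spec, createRuntime hooks run during the create operation, after the runtime environment is set up but before `pivot_root`. A rough sketch of the pattern (the helper name and arguments are placeholders, not the actual generichook wiring):

```go
import (
	oci "github.com/opencontainers/runtime-spec/specs-go"
)

// addCreateRuntimeHook appends a createRuntime hook to an OCI spec.
// Hypothetical helper for illustration only.
func addCreateRuntimeHook(spec *oci.Spec, path string, args, env []string) {
	if spec.Hooks == nil {
		spec.Hooks = &oci.Hooks{}
	}
	spec.Hooks.CreateRuntime = append(spec.Hooks.CreateRuntime, oci.Hook{
		Path: path, // must be an absolute path in the runtime namespace
		Args: args,
		Env:  env,
	})
}
```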
19 changes: 19 additions & 0 deletions internal/guest/runtime/hcsv2/workload_container.go
@@ -220,6 +220,25 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
if err := addNvidiaDeviceHook(ctx, spec, ociBundlePath); err != nil {
return err
}

+ // The NVIDIA device hook `nvidia-container-cli` adds `rw` permissions for the
+ // GPU and ctl nodes (`c 195:*`) to the devices allow list, but CUDA apparently also
+ // needs `rwm` permissions for other device nodes (e.g., `c 235`).
+ //
+ // Grant `rwm` to all character devices (`c *:* rwm`) to avoid hard-coding exact node
+ // numbers, which are unknown before the driver runs (GPU devices are presented as I2C
+ // devices initially) or could change with driver implementation.
+ //
+ // Note: runc already grants mknod, `c *:* m`, so this really adds `rw` permissions for
+ // all character devices:
+ // https://github.com/opencontainers/runc/blob/6bae6cad4759a5b3537d550f43ea37d51c6b518a/libcontainer/specconv/spec_linux.go#L205-L222
+ spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices,
+ oci.LinuxDeviceCgroup{
+ Allow: true,
+ Type: "c",
+ Access: "rwm",
+ },
+ )
}
// add other assigned devices to the spec
if err := specGuest.AddAssignedDevice(ctx, spec); err != nil {
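To make the cgroup effect concrete: the device allow-list is evaluated on top of a default deny, so the appended rule is purely additive. A rough sketch of the rule set a container ends up with under this change (runc defaults paraphrased from the linked specconv code):

```go
// Illustrative only; types from github.com/opencontainers/runtime-spec/specs-go.
rules := []oci.LinuxDeviceCgroup{
	{Allow: false, Type: "a", Access: "rwm"}, // implicit default: deny everything
	{Allow: true, Type: "c", Access: "m"},    // runc default: mknod any char device
	{Allow: true, Type: "b", Access: "m"},    // runc default: mknod any block device
	{Allow: true, Type: "c", Access: "rwm"},  // added above: read/write any char device
}
_ = rules
```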
16 changes: 14 additions & 2 deletions internal/guest/spec/spec_devices.go
@@ -10,11 +10,13 @@ import (
"strings"
"time"

"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/opencontainers/runc/libcontainer/devices"
oci "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"

"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
"github.com/Microsoft/hcsshim/internal/log"
)

const (
@@ -23,13 +25,17 @@
charType = "char"
blockType = "block"

+ // TODO: consolidate with `internal\uvm\virtual_device.go` and use in both locations
+ gpuDeviceIDType = "gpu"
vpciDeviceIDTypeLegacy = "vpci"
vpciDeviceIDType = "vpci-instance-id"
)

// AddAssignedDevice goes through the assigned devices that have been enumerated
// on the spec and updates the spec so that the correct device nodes can be mounted
// into the resulting container by the runtime.
+ //
+ // GPU devices are skipped, since they are handled in [addNvidiaDeviceHook].
func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
// Add an explicit timeout before we try to find the dev nodes so we
// aren't waiting forever.
@@ -52,6 +58,12 @@ func AddAssignedDevice(ctx context.Context, spec *oci.Spec) error {
for _, dev := range devs {
AddLinuxDeviceToSpec(ctx, dev, spec, true)
}
+ case gpuDeviceIDType:
+ default:
+ log.G(ctx).WithFields(logrus.Fields{
+ "type": d.IDType,
+ "id": d.ID,
+ }).Warn("unknown device type")
}
}

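One Go subtlety in the switch above: an empty `case` is a deliberate no-op, not a fallthrough, so `case gpuDeviceIDType:` quietly skips GPU devices rather than falling into the `default` warning. A tiny sketch of the idiom, reusing the constants defined above:

```go
switch idType {
case gpuDeviceIDType:
	// Intentionally empty: GPUs are set up by the NVIDIA device hook,
	// and matching them here keeps them out of the default branch.
	// Go switch cases never fall through implicitly.
case vpciDeviceIDTypeLegacy, vpciDeviceIDType:
	// add the enumerated device nodes to the container spec
default:
	// unknown device type: log a warning and continue
}
```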
26 changes: 13 additions & 13 deletions internal/hcsoci/devices.go
@@ -174,21 +174,21 @@ func handleAssignedDevicesLCOW(

// assign device into UVM and create corresponding spec windows devices
for _, d := range specDevs {
- if uvm.IsValidDeviceType(d.IDType) {
- pciID, index := devices.GetDeviceInfoFromPath(d.ID)
- vpci, err := vm.AssignDevice(ctx, pciID, index, "")
- if err != nil {
- return resultDevs, closers, errors.Wrapf(err, "failed to assign device %s, function %d to pod %s", pciID, index, vm.ID())
- }
- closers = append(closers, vpci)
-
- // update device ID on the spec to the assigned device's resulting vmbus guid so gcs knows which devices to
- // map into the container
- d.ID = vpci.VMBusGUID
- resultDevs = append(resultDevs, d)
- } else {
+ if !uvm.IsValidDeviceType(d.IDType) {
return resultDevs, closers, errors.Errorf("specified device %s has unsupported type %s", d.ID, d.IDType)
}

+ pciID, index := devices.GetDeviceInfoFromPath(d.ID)
+ vpci, err := vm.AssignDevice(ctx, pciID, index, "")
+ if err != nil {
+ return resultDevs, closers, errors.Wrapf(err, "failed to assign device %s, function %d to pod %s", pciID, index, vm.ID())
+ }
+ closers = append(closers, vpci)
+
+ // update device ID on the spec to the assigned device's resulting vmbus guid so gcs knows which devices to
+ // map into the container
+ d.ID = vpci.VMBusGUID
+ resultDevs = append(resultDevs, d)
}

return resultDevs, closers, nil
2 changes: 1 addition & 1 deletion internal/oci/annotations.go
@@ -34,7 +34,7 @@ func ProcessAnnotations(ctx context.Context, s *specs.Spec) error {

// expand annotations
var errs []error
- for key, exps := range annotations.AnnotationExpansions {
+ for key, exps := range annotations.AnnotationExpansionMap() {
// check if annotation is present
if val, ok := s.Annotations[key]; ok {
// ideally, some normalization would occur here (ie, "True" -> "true")
3 changes: 2 additions & 1 deletion internal/oci/annotations_test.go
@@ -186,7 +186,8 @@ func TestProccessAnnotations_Expansion(t *testing.T) {
subtest.Fatalf("could not update spec from options: %v", err)
}

- for _, k := range annotations.AnnotationExpansions[annotations.DisableUnsafeOperations] {
+ ae := annotations.AnnotationExpansionMap()
+ for _, k := range ae[annotations.DisableUnsafeOperations] {
if vv := tt.spec.Annotations[k]; vv != v {
subtest.Fatalf("annotation %q was incorrectly expanded to %q, expected %q", k, vv, v)
}
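The move from the exported `AnnotationExpansions` variable to an `AnnotationExpansionMap()` call suggests the package now hands callers a copy instead of the shared map, so mutations can't leak into package state. A plausible sketch (the unexported backing map is hypothetical):

```go
// annotationExpansions is the hypothetical unexported backing map.
var annotationExpansions = map[string][]string{
	// annotation -> annotations it expands to
}

// AnnotationExpansionMap returns a copy of the expansion map, so callers
// can iterate over or modify the result without mutating package state.
func AnnotationExpansionMap() map[string][]string {
	m := make(map[string][]string, len(annotationExpansions))
	for k, v := range annotationExpansions {
		// Copy each slice as well; sharing the backing array would let
		// an append by the caller clobber the original values.
		m[k] = append([]string(nil), v...)
	}
	return m
}
```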
5 changes: 4 additions & 1 deletion internal/oci/uvm.go
@@ -268,8 +268,10 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe
opts.ProcessDumpLocation = ParseAnnotationsString(s.Annotations, annotations.ContainerProcessDumpLocation, opts.ProcessDumpLocation)
opts.NoWritableFileShares = ParseAnnotationsBool(ctx, s.Annotations, annotations.DisableWritableFileShares, opts.NoWritableFileShares)
opts.DumpDirectoryPath = ParseAnnotationsString(s.Annotations, annotations.DumpDirectoryPath, opts.DumpDirectoryPath)

+ // NUMA settings
opts.MaxProcessorsPerNumaNode = ParseAnnotationsUint32(ctx, s.Annotations, annotations.NumaMaximumProcessorsPerNode, opts.MaxProcessorsPerNumaNode)
- opts.MaxSizePerNode = ParseAnnotationsUint64(ctx, s.Annotations, annotations.NumaMaximumSizePerNode, opts.MaxSizePerNode)
+ opts.MaxMemorySizePerNumaNode = ParseAnnotationsUint64(ctx, s.Annotations, annotations.NumaMaximumMemorySizePerNode, opts.MaxMemorySizePerNumaNode)
opts.PreferredPhysicalNumaNodes = ParseAnnotationCommaSeparatedUint32(ctx, s.Annotations, annotations.NumaPreferredPhysicalNodes,
opts.PreferredPhysicalNumaNodes)
opts.NumaMappedPhysicalNodes = ParseAnnotationCommaSeparatedUint32(ctx, s.Annotations, annotations.NumaMappedPhysicalNodes,
@@ -278,6 +280,7 @@ func specToUVMCreateOptionsCommon(ctx context.Context, opts *uvm.Options, s *spe
opts.NumaProcessorCounts)
opts.NumaMemoryBlocksCounts = ParseAnnotationCommaSeparatedUint64(ctx, s.Annotations, annotations.NumaCountOfMemoryBlocks,
opts.NumaMemoryBlocksCounts)

maps.Copy(opts.AdditionalHyperVConfig, parseHVSocketServiceTable(ctx, s.Annotations))
}

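For the implicit-topology knobs above, the annotation values are plain strings parsed by the `ParseAnnotations*` helpers: single integers for the per-node maximums and a comma-separated list for the preferred physical nodes. A hedged usage sketch (values are illustrative; the constants are the ones referenced in this diff, assumed to live in the internal annotations package):

```go
import (
	"github.com/Microsoft/hcsshim/internal/annotations"
	specs "github.com/opencontainers/runtime-spec/specs-go"
)

func setImplicitNumaAnnotations(s *specs.Spec) {
	if s.Annotations == nil {
		s.Annotations = map[string]string{}
	}
	// At most 4 processors and 2048 MiB of memory per vNUMA node.
	s.Annotations[annotations.NumaMaximumProcessorsPerNode] = "4"
	s.Annotations[annotations.NumaMaximumMemorySizePerNode] = "2048"
	// Prefer carving the vNUMA nodes out of physical nodes 0 and 1.
	s.Annotations[annotations.NumaPreferredPhysicalNodes] = "0,1"
}
```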
4 changes: 2 additions & 2 deletions internal/uvm/create.go
@@ -108,8 +108,8 @@ type Options struct {
AdditionalHyperVConfig map[string]hcsschema.HvSocketServiceConfig

// The following options are for implicit vNUMA topology settings.
- // MaxSizePerNode is the maximum size of memory per vNUMA node.
- MaxSizePerNode uint64
+ // MaxMemorySizePerNumaNode is the maximum size of memory (in MiB) per vNUMA node.
+ MaxMemorySizePerNumaNode uint64
// MaxProcessorsPerNumaNode is the maximum number of processors per vNUMA node.
MaxProcessorsPerNumaNode uint32
// PhysicalNumaNodes are the preferred physical NUMA nodes to map to vNUMA nodes.
2 changes: 1 addition & 1 deletion internal/uvm/create_lcow.go
@@ -596,7 +596,7 @@ func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcs
return nil, err
}

- numa, numaProcessors, err := prepareVNumaTopology(opts.Options)
+ numa, numaProcessors, err := prepareVNumaTopology(ctx, opts.Options)
if err != nil {
return nil, err
}
2 changes: 1 addition & 1 deletion internal/uvm/create_wcow.go
@@ -174,7 +174,7 @@ func prepareCommonConfigDoc(ctx context.Context, uvm *UtilityVM, opts *OptionsWC
Weight: uint64(opts.ProcessorWeight),
}

- numa, numaProcessors, err := prepareVNumaTopology(opts.Options)
+ numa, numaProcessors, err := prepareVNumaTopology(ctx, opts.Options)
if err != nil {
return nil, err
}
66 changes: 49 additions & 17 deletions internal/uvm/vnuma.go
@@ -3,45 +3,68 @@
package uvm

import (
"context"
"fmt"

"github.com/sirupsen/logrus"

hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/osversion"
)

// prepareVNumaTopology creates vNUMA settings for implicit (platform) or explicit (user-defined) topology.
//
- // For implicit topology we look for `MaxProcessorsPerNumaNode`, `MaxSizePerNode` and `preferredNumaNodes` create options. Setting them
- // in HCS doc, will trigger platform to create vNUMA topology based on the given values. Based on experiments, the
- // platform will create an evenly distributed topology based on requested memory and processor count for the HCS VM.
+ // For implicit topology we look for the `MaxProcessorsPerNumaNode`, `MaxMemorySizePerNumaNode` and
+ // `PreferredPhysicalNumaNodes` create options.
+ // Setting them in the HCS doc will trigger the platform to create a vNUMA topology based on the given values.
+ // Based on experiments, the platform will create an evenly distributed topology based on the
+ // requested memory and processor count for the HCS VM.
//
- // For explicit topology we look for `NumaMappedPhysicalNodes`, `NumaProcessorCounts` and `NumaMemoryBlocksCounts` create
- // options. The above options are number slices, where a value at index `i` in each slice represents the corresponding
+ // For explicit topology we look for `NumaMappedPhysicalNodes`, `NumaProcessorCounts` and
+ // `NumaMemoryBlocksCounts` create options.
+ // The above options are number slices, where a value at index `i` in each slice represents the corresponding
// value for the `i`th vNUMA node.
//
// Limitations:
- // - only hcsschema.MemoryBackingType_PHYSICAL is supported
- // - `PhysicalNumaNodes` values at index `i` will be mapped to virtual node number `i`
- // - client is responsible for setting wildcard physical node numbers
- // TODO: Add exact OS build version for vNUMA support.
- func prepareVNumaTopology(opts *Options) (*hcsschema.Numa, *hcsschema.NumaProcessors, error) {
+ //
+ // - only `hcsschema.MemoryBackingType_PHYSICAL` is supported
+ // - `PhysicalNumaNodes` values at index `i` will be mapped to virtual node number `i`
+ // - client is responsible for setting wildcard physical node numbers
+ func prepareVNumaTopology(ctx context.Context, opts *Options) (*hcsschema.Numa, *hcsschema.NumaProcessors, error) {
if opts.MaxProcessorsPerNumaNode == 0 && len(opts.NumaMappedPhysicalNodes) == 0 {
+ // warn if vNUMA settings are partially specified, since it's likely an error on the client's side
+ if opts.MaxMemorySizePerNumaNode > 0 || len(opts.PreferredPhysicalNumaNodes) > 0 {
+ log.G(ctx).WithFields(logrus.Fields{
+ "max-memory-size-per-numa-node": opts.MaxMemorySizePerNumaNode,
+ "max-processors-per-numa-node": opts.MaxProcessorsPerNumaNode,
+ "preferred-physical-numa-nodes": log.Format(ctx, opts.PreferredPhysicalNumaNodes),
+ }).Warn("potentially incomplete implicit vNUMA topology")
+ }
+ if len(opts.NumaProcessorCounts) > 0 || len(opts.NumaMemoryBlocksCounts) > 0 {
+ log.G(ctx).WithFields(logrus.Fields{
+ "numa-mapped-physical-nodes": log.Format(ctx, opts.NumaMappedPhysicalNodes),
+ "numa-processor-counts": log.Format(ctx, opts.NumaProcessorCounts),
+ "numa-memory-block-counts": log.Format(ctx, opts.NumaMemoryBlocksCounts),
+ }).Warn("potentially incomplete explicit vNUMA topology")
+ }
+ // vNUMA settings are missing, return empty topology
return nil, nil, nil
}

+ // TODO: Add exact OS build version for vNUMA support, or check for dedicated NUMA feature.
+ if build := osversion.Build(); build < osversion.V25H1Server {
+ return nil, nil, fmt.Errorf("vNUMA topology is not supported on %d version of Windows", build)
+ }

var preferredNumaNodes []int64
for _, pn := range opts.PreferredPhysicalNumaNodes {
preferredNumaNodes = append(preferredNumaNodes, int64(pn))
}

- build := osversion.Get().Build
- if build < osversion.V25H1Server {
- return nil, nil, fmt.Errorf("vNUMA topology is not supported on %d version of Windows", build)
- }

// Implicit vNUMA topology.
if opts.MaxProcessorsPerNumaNode > 0 {
- if opts.MaxSizePerNode == 0 {
+ if opts.MaxMemorySizePerNumaNode == 0 {
return nil, nil, fmt.Errorf("max size per node must be set when max processors per numa node is set")
}
numaProcessors := &hcsschema.NumaProcessors{
@@ -50,9 +73,15 @@ func prepareVNumaTopology(opts *Options) (*hcsschema.Numa, *hcsschema.NumaProces
},
}
numa := &hcsschema.Numa{
- MaxSizePerNode: opts.MaxSizePerNode,
+ MaxSizePerNode: opts.MaxMemorySizePerNumaNode,
PreferredPhysicalNodes: preferredNumaNodes,
}
+ if entry := log.G(ctx); entry.Logger.IsLevelEnabled(logrus.DebugLevel) {
+ entry.WithFields(logrus.Fields{
+ "numa": log.Format(ctx, numa),
+ "numa-processors": log.Format(ctx, numaProcessors),
+ }).Debug("created implicit NUMA topology")
+ }
return numa, numaProcessors, nil
}

@@ -79,6 +108,9 @@ func prepareVNumaTopology(opts *Options) (*hcsschema.Numa, *hcsschema.NumaProces
}
numa.Settings = append(numa.Settings, nodeTopology)
}
+ if entry := log.G(ctx); entry.Logger.IsLevelEnabled(logrus.DebugLevel) {
+ entry.WithField("numa", log.Format(ctx, numa)).Debug("created explicit NUMA topology")
+ }
return numa, nil, validate(numa)
}

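Putting the two modes together, a sketch of the `uvm.Options` each path consumes (field names are from this PR; the values are illustrative):

```go
// Implicit: the platform distributes the UVM's resources evenly,
// capping each vNUMA node at 4 processors and 2048 MiB of memory.
implicit := &uvm.Options{
	MaxProcessorsPerNumaNode:   4,
	MaxMemorySizePerNumaNode:   2048,
	PreferredPhysicalNumaNodes: []uint32{0, 1},
}

// Explicit: index i of each slice describes vNUMA node i, so vNUMA 0
// maps to physical node 0 with 2 processors and 1024 memory blocks,
// and vNUMA 1 likewise to physical node 1.
explicit := &uvm.Options{
	NumaMappedPhysicalNodes: []uint32{0, 1},
	NumaProcessorCounts:     []uint32{2, 2},
	NumaMemoryBlocksCounts:  []uint64{1024, 1024},
}
```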