Skip to content

Commit 6fbcfd0

Browse files
committed
Fix: adding support for SWiftV2 Linux stateless CNI mode
1 parent d66b2f9 commit 6fbcfd0

File tree

7 files changed

+283
-32
lines changed

7 files changed

+283
-32
lines changed

cni/network/network.go

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ import (
1818
"github.com/Azure/azure-container-networking/cni/util"
1919
"github.com/Azure/azure-container-networking/cns"
2020
cnscli "github.com/Azure/azure-container-networking/cns/client"
21-
"github.com/Azure/azure-container-networking/cns/fsnotify"
2221
"github.com/Azure/azure-container-networking/common"
2322
"github.com/Azure/azure-container-networking/dhcp"
2423
"github.com/Azure/azure-container-networking/iptables"
@@ -716,7 +715,7 @@ func (plugin *NetPlugin) createEpInfo(opt *createEpInfoOpt) (*network.EndpointIn
716715
*opt.infraSeen = true
717716
} else {
718717
ifName = "eth" + strconv.Itoa(opt.endpointIndex)
719-
endpointID = plugin.nm.GetEndpointID(opt.args.ContainerID, ifName)
718+
endpointID = plugin.nm.GetEndpointIDByNicType(opt.args.ContainerID, ifName, opt.ifInfo.NICType)
720719
}
721720

722721
endpointInfo := network.EndpointInfo{
@@ -1069,32 +1068,45 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
10691068
if plugin.nm.IsStatelessCNIMode() {
10701069
// network ID is passed in and used only for migration
10711070
// otherwise, in stateless, we don't need the network id for deletion
1072-
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID)
1073-
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found
1071+
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID, args.Netns)
1072+
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure
1073+
// return a retriable error so the container runtime will retry this DEL later
1074+
// the implementation of this function returns nil if the endpoint doesn't exist, so
1075+
// we don't have to check that here
10741076
if err != nil {
1075-
if errors.Is(err, network.ErrConnectionFailure) {
1076-
logger.Info("failed to connect to CNS", zap.String("containerID", args.ContainerID), zap.Error(err))
1077-
addErr := fsnotify.AddFile(args.ContainerID, args.ContainerID, watcherPath)
1078-
logger.Info("add containerid file for Asynch delete", zap.String("containerID", args.ContainerID), zap.Error(addErr))
1079-
if addErr != nil {
1080-
logger.Error("failed to add file to watcher", zap.String("containerID", args.ContainerID), zap.Error(addErr))
1081-
return errors.Wrap(addErr, fmt.Sprintf("failed to add file to watcher with containerID %s", args.ContainerID))
1077+
switch {
1078+
case errors.Is(err, network.ErrConnectionFailure):
1079+
logger.Error("Failed to connect to CNS", zap.Error(err))
1080+
logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID))
1081+
// In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS,
1082+
// we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state.
1083+
// This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations.
1084+
// When that happens, kubelet and containerd hang during sandbox creation or teardown.
1085+
// The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace,
1086+
// triggering hot-unplug/re-register events and leaving the node in an unhealthy state.
1087+
// This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state.
1088+
if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil {
1089+
logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err))
1090+
return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err))
10821091
}
1083-
return nil
1084-
}
1085-
if errors.Is(err, network.ErrEndpointStateNotFound) {
1092+
case errors.Is(err, network.ErrEndpointStateNotFound):
10861093
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
10871094
return nil
1095+
default:
1096+
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
1097+
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
1098+
}
1099+
} else {
1100+
for _, epInfo := range epInfos {
1101+
logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType))
10881102
}
1089-
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
1090-
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
10911103
}
10921104
} else {
10931105
epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID)
10941106
}
10951107

1096-
// for when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1097-
// this block is not applied to stateless CNI
1108+
// for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1109+
// this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file
10981110
if len(epInfos) == 0 {
10991111
endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName)
11001112
if !nwCfg.MultiTenancy {
@@ -1120,7 +1132,7 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
11201132
if err = plugin.nm.DeleteEndpoint(epInfo.NetworkID, epInfo.EndpointID, epInfo); err != nil {
11211133
// An error will not be returned if the endpoint is not found
11221134
// return a retriable error so the container runtime will retry this DEL later
1123-
// the implementation of this function returns nil if the endpoint doens't exist, so
1135+
// the implementation of this function returns nil if the endpoint doesn't exist, so
11241136
// we don't have to check that here
11251137
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
11261138
}

network/endpoint_linux.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,3 +547,12 @@ func getDefaultGateway(routes []RouteInfo) net.IP {
547547
func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*EndpointInfo, error) {
548548
return epInfo, nil
549549
}
550+
551+
// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace.
552+
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error {
553+
secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep)
554+
if err := secondaryepClient.RemoveInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil {
555+
return err
556+
}
557+
return nil
558+
}

network/endpoint_windows.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,3 +746,8 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri
746746
logger.Info("Retrieved device problem code", zap.String("code", devpkeyDeviceProblemCode))
747747
return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil
748748
}
749+
750+
// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace.
751+
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error {
752+
return nil
753+
}

network/manager.go

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,13 @@ type NetworkManager interface {
116116
UpdateEndpoint(networkID string, existingEpInfo *EndpointInfo, targetEpInfo *EndpointInfo) error
117117
GetNumberOfEndpoints(ifName string, networkID string) int
118118
GetEndpointID(containerID, ifName string) string
119+
GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string
119120
IsStatelessCNIMode() bool
120121
SaveState(eps []*endpoint) error
121122
DeleteState(epInfos []*EndpointInfo) error
122123
GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo
123-
GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error)
124+
GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error)
125+
RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error
124126
}
125127

126128
// Creates a new network manager.
@@ -455,7 +457,7 @@ func validateUpdateEndpointState(endpointID string, ifNameToIPInfoMap map[string
455457
// GetEndpointState will make a call to CNS GetEndpointState API in the stateless CNI mode to fetch the endpointInfo
456458
// TODO unit tests need to be added, WorkItem: 26606939
457459
// In stateless cni, container id is the endpoint id, so you can pass in either
458-
func (nm *networkManager) GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error) {
460+
func (nm *networkManager) GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error) {
459461
endpointResponse, err := nm.CnsClient.GetEndpoint(context.TODO(), containerID)
460462
if err != nil {
461463
if endpointResponse.Response.ReturnCode == types.NotFound {
@@ -466,7 +468,7 @@ func (nm *networkManager) GetEndpointState(networkID, containerID string) ([]*En
466468
}
467469
return nil, ErrGetEndpointStateFailure
468470
}
469-
epInfos := cnsEndpointInfotoCNIEpInfos(endpointResponse.EndpointInfo, containerID)
471+
epInfos := cnsEndpointInfotoCNIEpInfos(endpointResponse.EndpointInfo, containerID, netns)
470472

471473
for i := 0; i < len(epInfos); i++ {
472474
if epInfos[i].NICType == cns.InfraNIC {
@@ -514,7 +516,7 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
514516
nw := &network{
515517
Id: networkID, // currently unused in stateless cni
516518
HnsId: epInfo.HNSNetworkID,
517-
Mode: opModeTransparentVlan,
519+
Mode: opModeTransparent,
518520
SnatBridgeIP: "",
519521
NetNs: dummyGUID, // to trigger hns v2, windows
520522
extIf: &externalInterface{
@@ -529,6 +531,7 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
529531
HNSNetworkID: epInfo.HNSNetworkID, // unused (we use nw.HnsId for deleting the network)
530532
HostIfName: epInfo.HostIfName,
531533
LocalIP: "",
534+
IPAddresses: epInfo.IPAddresses,
532535
VlanID: 0,
533536
AllowInboundFromHostToNC: false, // stateless currently does not support apipa
534537
AllowInboundFromNCToHost: false,
@@ -537,11 +540,12 @@ func (nm *networkManager) DeleteEndpointState(networkID string, epInfo *Endpoint
537540
NetworkContainerID: epInfo.NetworkContainerID, // we don't use this as long as AllowInboundFromHostToNC and AllowInboundFromNCToHost are false
538541
NetNs: dummyGUID, // to trigger hnsv2, windows
539542
NICType: epInfo.NICType,
543+
NetworkNameSpace: epInfo.NetNsPath,
540544
IfName: epInfo.IfName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
541545
}
542546
logger.Info("Deleting endpoint with", zap.String("Endpoint Info: ", epInfo.PrettyString()), zap.String("HNISID : ", ep.HnsId))
543547

544-
err := nw.deleteEndpointImpl(netlink.NewNetlink(), platform.NewExecClient(logger), nil, nil, nil, nil, nil, ep)
548+
err := nw.deleteEndpointImpl(nm.netlink, nm.plClient, nil, nm.netio, nm.nsClient, nm.iptablesClient, nm.dhcpClient, ep)
545549
if err != nil {
546550
return err
547551
}
@@ -562,7 +566,7 @@ func (nm *networkManager) GetEndpointInfo(networkID, endpointID string) (*Endpoi
562566

563567
if nm.IsStatelessCNIMode() {
564568
logger.Info("calling cns getEndpoint API")
565-
epInfos, err := nm.GetEndpointState(networkID, endpointID)
569+
epInfos, err := nm.GetEndpointState(networkID, endpointID, "")
566570
if err != nil {
567571
return nil, err
568572
}
@@ -745,6 +749,16 @@ func (nm *networkManager) GetEndpointID(containerID, ifName string) string {
745749
return containerID + "-" + ifName
746750
}
747751

752+
// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type.
753+
func (nm *networkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string {
754+
// For stateless CNI, secondary NICs use containerID-ifName as endpointID.
755+
if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC {
756+
return containerID + "-" + ifName
757+
}
758+
// For InfraNIC, use GetEndpointID() logic.
759+
return nm.GetEndpointID(containerID, ifName)
760+
}
761+
748762
// saves the map of network ids to endpoints to the state file
749763
func (nm *networkManager) SaveState(eps []*endpoint) error {
750764
nm.Lock()
@@ -779,7 +793,7 @@ func (nm *networkManager) DeleteState(_ []*EndpointInfo) error {
779793
}
780794

781795
// called to convert a cns restserver EndpointInfo into a network EndpointInfo
782-
func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointID string) []*EndpointInfo {
796+
func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointID, netns string) []*EndpointInfo {
783797
ret := []*EndpointInfo{}
784798

785799
for ifName, ipInfo := range endpointInfo.IfnameToIPMap {
@@ -809,6 +823,10 @@ func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointI
809823
epInfo.NICType = ipInfo.NICType
810824
epInfo.HNSNetworkID = ipInfo.HnsNetworkID
811825
epInfo.MacAddress = net.HardwareAddr(ipInfo.MacAddress)
826+
// fill out the netns if it is empty via args passed by container runtime
827+
if epInfo.NetNsPath == "" {
828+
epInfo.NetNsPath = netns
829+
}
812830
ret = append(ret, epInfo)
813831
}
814832
return ret
@@ -847,3 +865,14 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo {
847865

848866
return ifNametoIPInfoMap
849867
}
868+
869+
// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns
870+
func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error {
871+
ep := &endpoint{
872+
NetworkNameSpace: netns,
873+
IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
874+
}
875+
logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns))
876+
err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient)
877+
return err
878+
}

network/manager_mock.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package network
22

33
import (
4+
"github.com/Azure/azure-container-networking/cns"
45
"github.com/Azure/azure-container-networking/common"
56
)
67

@@ -94,6 +95,16 @@ func (nm *MockNetworkManager) GetEndpointID(containerID, ifName string) string {
9495
return containerID + "-" + ifName
9596
}
9697

98+
// GetEndpointIDByNicType returns a unique endpoint ID based on the CNI mode and NIC type.
99+
func (nm *MockNetworkManager) GetEndpointIDByNicType(containerID, ifName string, nicType cns.NICType) string {
100+
// For stateless CNI, secondary NICs use containerID-ifName as endpointID.
101+
if nm.IsStatelessCNIMode() && nicType != cns.InfraNIC {
102+
return containerID + "-" + ifName
103+
}
104+
// For InfraNIC, use GetEndpointID() logic.
105+
return nm.GetEndpointID(containerID, ifName)
106+
}
107+
97108
func (nm *MockNetworkManager) GetAllEndpoints(networkID string) (map[string]*EndpointInfo, error) {
98109
return nm.TestEndpointInfoMap, nil
99110
}
@@ -207,6 +218,10 @@ func (nm *MockNetworkManager) GetEndpointInfosFromContainerID(containerID string
207218
return ret
208219
}
209220

210-
func (nm *MockNetworkManager) GetEndpointState(_, _ string) ([]*EndpointInfo, error) {
221+
func (nm *MockNetworkManager) GetEndpointState(_, _, _ string) ([]*EndpointInfo, error) {
211222
return []*EndpointInfo{}, nil
212223
}
224+
225+
func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error {
226+
return nil
227+
}

network/manager_test.go

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ var _ = Describe("Test Manager", func() {
350350
PodNamespace: "test-pod-ns",
351351
}
352352

353-
epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID)
353+
epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID, "")
354354

355355
Expect(len(epInfos)).To(Equal(1))
356356
Expect(epInfos[0]).To(Equal(
@@ -400,7 +400,7 @@ var _ = Describe("Test Manager", func() {
400400
PodNamespace: "test-pod-ns",
401401
}
402402

403-
epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID)
403+
epInfos := cnsEndpointInfotoCNIEpInfos(cnsEndpointInfo, endpointID, "")
404404

405405
Expect(len(epInfos)).To(Equal(2))
406406
Expect(epInfos).To(ContainElement(
@@ -489,3 +489,109 @@ var _ = Describe("Test Manager", func() {
489489
})
490490
})
491491
})
492+
493+
func TestGetEndpointIDByNicType_Cases(t *testing.T) {
494+
nm := &networkManager{}
495+
496+
cases := []struct {
497+
name string
498+
stateless bool
499+
containerID string
500+
ifName string
501+
nicType cns.NICType
502+
expectedResult string
503+
}{
504+
{
505+
name: "Stateless InfraNIC",
506+
stateless: true,
507+
containerID: "container123",
508+
ifName: "eth0",
509+
nicType: cns.InfraNIC,
510+
expectedResult: "container123",
511+
},
512+
{
513+
name: "Stateless SecondaryNIC",
514+
stateless: true,
515+
containerID: "container123",
516+
ifName: "eth1",
517+
nicType: cns.DelegatedVMNIC,
518+
expectedResult: "container123-eth1",
519+
},
520+
{
521+
name: "Stateful InfraNIC",
522+
stateless: false,
523+
containerID: "container123456789",
524+
ifName: "eth0",
525+
nicType: cns.InfraNIC,
526+
expectedResult: "containe-eth0", // truncated to 8 chars
527+
},
528+
}
529+
530+
for _, tc := range cases {
531+
t.Run(tc.name, func(t *testing.T) {
532+
nm.statelessCniMode = tc.stateless
533+
id := nm.GetEndpointIDByNicType(tc.containerID, tc.ifName, tc.nicType)
534+
if id != tc.expectedResult {
535+
t.Errorf("expected %s, got %s", tc.expectedResult, id)
536+
}
537+
})
538+
}
539+
}
540+
541+
func TestCnsEndpointInfotoCNIEpInfos_Cases(t *testing.T) {
542+
cases := []struct {
543+
name string
544+
ifName string
545+
ipInfo restserver.IPInfo
546+
netNs string
547+
expectedNetNs string
548+
expectedIfName string
549+
expectedNICType cns.NICType
550+
}{
551+
{
552+
name: "DelegatedVMNIC",
553+
ifName: "eth1",
554+
netNs: "/var/run/netns/testns",
555+
ipInfo: restserver.IPInfo{
556+
NICType: cns.DelegatedVMNIC,
557+
},
558+
expectedNetNs: "/var/run/netns/testns",
559+
expectedIfName: "eth1",
560+
expectedNICType: cns.DelegatedVMNIC,
561+
},
562+
{
563+
name: "InfraNIC",
564+
ifName: "eth0",
565+
netNs: "",
566+
ipInfo: restserver.IPInfo{
567+
NICType: cns.InfraNIC,
568+
},
569+
expectedNetNs: "",
570+
expectedIfName: "eth0",
571+
expectedNICType: cns.InfraNIC,
572+
},
573+
}
574+
575+
for _, tc := range cases {
576+
t.Run(tc.name, func(t *testing.T) {
577+
endpointInfo := restserver.EndpointInfo{
578+
IfnameToIPMap: map[string]*restserver.IPInfo{
579+
tc.ifName: &tc.ipInfo,
580+
},
581+
}
582+
epInfos := cnsEndpointInfotoCNIEpInfos(endpointInfo, "container123", tc.netNs)
583+
if len(epInfos) == 0 {
584+
t.Fatalf("expected at least one epInfo")
585+
}
586+
if epInfos[0].NetNsPath != tc.expectedNetNs {
587+
t.Errorf("expected NetNsPath %q, got %q", tc.expectedNetNs, epInfos[0].NetNsPath)
588+
}
589+
if epInfos[0].IfName != tc.expectedIfName {
590+
t.Errorf("expected IfName %q, got %q", tc.expectedIfName, epInfos[0].IfName)
591+
}
592+
if epInfos[0].NICType != tc.expectedNICType {
593+
t.Errorf("expected NICType %v, got %v", tc.expectedNICType, epInfos[0].NICType)
594+
}
595+
})
596+
}
597+
}

0 commit comments

Comments
 (0)