Skip to content

Commit e0bea0b

Browse files
committed
fix: generating endpoint locally when CNS is not reachabe.
1 parent 9600774 commit e0bea0b

File tree

8 files changed

+428
-199
lines changed

8 files changed

+428
-199
lines changed

cni/network/network.go

Lines changed: 23 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,49 +1070,11 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
10701070
}
10711071
logger.Info("Retrieved network info, populating endpoint infos with container id", zap.String("containerID", args.ContainerID))
10721072

1073-
var epInfos []*network.EndpointInfo
1074-
if plugin.nm.IsStatelessCNIMode() {
1075-
// network ID is passed in and used only for migration
1076-
// otherwise, in stateless, we don't need the network id for deletion
1077-
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID, args.Netns)
1078-
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure
1079-
// return a retriable error so the container runtime will retry this DEL later
1080-
// the implementation of this function returns nil if the endpoint doesn't exist, so
1081-
// we don't have to check that here
1082-
if err != nil {
1083-
switch {
1084-
case errors.Is(err, network.ErrConnectionFailure):
1085-
logger.Error("Failed to connect to CNS", zap.Error(err))
1086-
logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID))
1087-
// In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS,
1088-
// we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state.
1089-
// This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations.
1090-
// When that happens, kubelet and containerd hang during sandbox creation or teardown.
1091-
// The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace,
1092-
// triggering hot-unplug/re-register events and leaving the node in an unhealthy state.
1093-
// This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state.
1094-
if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil {
1095-
logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err))
1096-
return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err))
1097-
}
1098-
case errors.Is(err, network.ErrEndpointStateNotFound):
1099-
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
1100-
return nil
1101-
default:
1102-
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
1103-
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
1104-
}
1105-
} else {
1106-
for _, epInfo := range epInfos {
1107-
logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType))
1108-
}
1109-
}
1110-
} else {
1111-
epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID)
1073+
epInfos, err := plugin.nm.GetEndpointInfos(networkID, args, nwCfg.DisableAsyncDelete)
1074+
if err != nil {
1075+
return plugin.RetriableError(fmt.Errorf("failed to retrieve endpoint: %w", err))
11121076
}
1113-
1114-
// for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1115-
// this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file
1077+
// when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
11161078
if len(epInfos) == 0 {
11171079
endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName)
11181080
if !nwCfg.MultiTenancy {
@@ -1150,15 +1112,26 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
11501112
zap.String("endpointID", epInfo.EndpointID))
11511113
telemetryClient.SendEvent("Deleting endpoint: " + epInfo.EndpointID)
11521114

1115+
// Delegated/secondary nic ips are statically allocated so we don't need to release
1116+
// Call into IPAM plugin to release the endpoint's addresses.
11531117
if !nwCfg.MultiTenancy && (epInfo.NICType == cns.InfraNIC || epInfo.NICType == "") {
1154-
// Delegated/secondary nic ips are statically allocated so we don't need to release
1155-
// Call into IPAM plugin to release the endpoint's addresses.
1156-
for i := range epInfo.IPAddresses {
1157-
logger.Info("Release ip", zap.String("ip", epInfo.IPAddresses[i].IP.String()))
1158-
telemetryClient.SendEvent(fmt.Sprintf("Release ip: %s container id: %s endpoint id: %s", epInfo.IPAddresses[i].IP.String(), args.ContainerID, epInfo.EndpointID))
1159-
err = plugin.ipamInvoker.Delete(&epInfo.IPAddresses[i], nwCfg, args, nwInfo.Options)
1160-
if err != nil {
1161-
return plugin.RetriableError(fmt.Errorf("failed to release address: %w", err))
1118+
// This is an special case for stateless CNI when Asynchronous DEL to CNS will take place for SwiftV2 after the endpointinfo is recreated locally
1119+
// At this point the endpoint is already deleted and since it is created locally the IPAddress is nil. CNS will release the IP asynchronously whenever it is up
1120+
if epInfo.IPAddresses == nil && plugin.nm.IsStatelessCNIMode() && !nwCfg.DisableAsyncDelete {
1121+
logger.Warn("Release ip Asynchronously by CNS",
1122+
zap.String("containerID", args.ContainerID))
1123+
telemetryClient.SendEvent(fmt.Sprintf("Release ip: %s container id: %s endpoint id: %s asynchronously", epInfo.IPAddresses[i].IP.String(), args.ContainerID, epInfo.EndpointID))
1124+
if err = plugin.ipamInvoker.Delete(nil, nwCfg, args, nwInfo.Options); err != nil {
1125+
return plugin.RetriableError(fmt.Errorf("failed to release address(no endpoint): %w", err))
1126+
}
1127+
} else {
1128+
for i := range epInfo.IPAddresses {
1129+
logger.Info("Release ip", zap.String("ip", epInfo.IPAddresses[i].IP.String()))
1130+
telemetryClient.SendEvent(fmt.Sprintf("Release ip: %s container id: %s endpoint id: %s", epInfo.IPAddresses[i].IP.String(), args.ContainerID, epInfo.EndpointID))
1131+
err = plugin.ipamInvoker.Delete(&epInfo.IPAddresses[i], nwCfg, args, nwInfo.Options)
1132+
if err != nil {
1133+
return plugin.RetriableError(fmt.Errorf("failed to release address: %w", err))
1134+
}
11621135
}
11631136
}
11641137
} else if epInfo.EnableInfraVnet { // remove in future PR

network/endpoint_linux.go

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/Azure/azure-container-networking/network/networkutils"
1717
"github.com/Azure/azure-container-networking/ovsctl"
1818
"github.com/Azure/azure-container-networking/platform"
19+
"github.com/pkg/errors"
1920
"go.uber.org/zap"
2021
)
2122

@@ -548,11 +549,28 @@ func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*E
548549
return epInfo, nil
549550
}
550551

551-
// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace.
552-
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error {
553-
secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep)
554-
if err := secondaryepClient.RemoveInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil {
555-
return err
552+
// getEndpointInfoByIfNameImpl returns an array of EndpointInfo for the given endpoint based on the IfName(s) found in the network namespace.
553+
func (nm *networkManager) getEndpointInfoByIfNameImpl(ep *endpoint) ([]*EndpointInfo, error) {
554+
epInfo := &EndpointInfo{
555+
EndpointID: ep.Id,
556+
NetNsPath: ep.NetworkNameSpace,
557+
NICType: cns.InfraNIC,
558+
IfName: ep.IfName,
556559
}
557-
return nil
560+
ret := []*EndpointInfo{}
561+
ret = append(ret, epInfo)
562+
logger.Info("Fetching Secondary Endpoint from", zap.String("NetworkNameSpace", ep.NetworkNameSpace))
563+
secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nm.nsClient, nil, ep)
564+
ifnames, err := secondaryepClient.FetchInterfacesFromNetnsPath(ep.IfName, ep.NetworkNameSpace)
565+
if err != nil {
566+
return nil, errors.Wrap(err, "failed to fetch secondary interfaces")
567+
}
568+
for _, ifName := range ifnames {
569+
ret = append(ret, &EndpointInfo{
570+
NetNsPath: ep.NetworkNameSpace,
571+
IfName: ifName,
572+
NICType: cns.NodeNetworkInterfaceFrontendNIC,
573+
})
574+
}
575+
return ret, nil
558576
}

network/endpoint_windows.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,6 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri
771771
return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil
772772
}
773773

774-
// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace.
775-
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error {
776-
return nil
774+
func (nm *networkManager) getEndpointInfoByIfNameImpl(_ *endpoint) ([]*EndpointInfo, error) {
775+
return nil, nil
777776
}

network/errors.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@ package network
33
import "errors"
44

55
var (
6-
errSubnetV6NotFound = errors.New("Couldn't find ipv6 subnet in network info") // nolint
7-
errV6SnatRuleNotSet = errors.New("ipv6 snat rule not set. Might be VM ipv6 address missing") // nolint
8-
ErrEndpointStateNotFound = errors.New("endpoint state could not be found in the statefile")
9-
ErrConnectionFailure = errors.New("couldn't connect to CNS")
10-
ErrGetEndpointStateFailure = errors.New("failure to obtain the endpoint state")
6+
errSubnetV6NotFound = errors.New("couldn't find ipv6 subnet in network info") // nolint
7+
errV6SnatRuleNotSet = errors.New("ipv6 snat rule not set. Might be VM ipv6 address missing") // nolint
8+
ErrEndpointStateNotFound = errors.New("endpoint state could not be found in the statefile")
9+
ErrConnectionFailure = errors.New("couldn't connect to CNS")
10+
ErrEndpointRemovalFailure = errors.New("failed to remove endpoint")
11+
ErrEndpointRetrievalFailure = errors.New("failed to obtain endpoint")
12+
ErrGetEndpointStateFailure = errors.New("failure to obtain the endpoint state")
1113
)

network/manager.go

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/Azure/azure-container-networking/netlink"
2020
"github.com/Azure/azure-container-networking/platform"
2121
"github.com/Azure/azure-container-networking/store"
22+
cniSkel "github.com/containernetworking/cni/pkg/skel"
2223
"github.com/pkg/errors"
2324
"go.uber.org/zap"
2425
)
@@ -120,9 +121,9 @@ type NetworkManager interface {
120121
IsStatelessCNIMode() bool
121122
SaveState(eps []*endpoint) error
122123
DeleteState(epInfos []*EndpointInfo) error
124+
GetEndpointInfos(networkID string, args *cniSkel.CmdArgs, disableAsyncDelete bool) ([]*EndpointInfo, error)
123125
GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo
124126
GetEndpointState(networkID, containerID, netns string) ([]*EndpointInfo, error)
125-
RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error
126127
}
127128

128129
// Creates a new network manager.
@@ -841,7 +842,9 @@ func cnsEndpointInfotoCNIEpInfos(endpointInfo restserver.EndpointInfo, endpointI
841842
epInfo.HNSNetworkID = ipInfo.HnsNetworkID
842843
epInfo.MacAddress = net.HardwareAddr(ipInfo.MacAddress)
843844
epInfo.NetworkContainerID = ipInfo.NetworkContainerID
844-
845+
if epInfo.NetNsPath == "" {
846+
epInfo.NetNsPath = netns
847+
}
845848
ret = append(ret, epInfo)
846849
}
847850
return ret
@@ -882,13 +885,57 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo {
882885
return ifNametoIPInfoMap
883886
}
884887

885-
// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns
886-
func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error {
888+
// GetEndpointInfos gets all endpoint infos associated with a container id and networkID
889+
// In stateless CNI mode, it calls CNS GetEndpointState API to get the endpoint infos or genreate them locally if CNS is unreachable in SwiftV2 mode and AsyncDelete is enabled
890+
// In stateful CNI mode, it fetches the endpoint infos by calling GetEndpointInfosFromContainerID
891+
func (nm *networkManager) GetEndpointInfos(networkID string, args *cniSkel.CmdArgs, disableAsyncDelete bool) ([]*EndpointInfo, error) {
892+
if nm.IsStatelessCNIMode() {
893+
logger.Info("Calling cns getEndpoint API")
894+
epInfos, err := nm.GetEndpointState(networkID, args.ContainerID, args.Netns)
895+
if err != nil {
896+
switch {
897+
// async delete should be disabled for standalone scenarios but will be enabled for AKS scenarios
898+
case errors.Is(err, ErrConnectionFailure) && !disableAsyncDelete:
899+
logger.Info("Failed to connect to CNS, endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID))
900+
// In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS,
901+
// we still have to remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state.
902+
// This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations.
903+
// When that happens, kubelet and containerd hang during sandbox creation or teardown.
904+
// The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace,
905+
// triggering hot-unplug/re-register events and leaving the node in an unhealthy state.
906+
// This workaround mitigates the issue by generating a minimal endpointInfo via containerd args and netlink APIs that can be then passed to DeleteEndpoint API.
907+
epInfos, err = nm.generateEndpointLocally(args)
908+
if err != nil {
909+
logger.Error("Failed to fetch secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err))
910+
return nil, errors.Wrap(err, "failed to fetch secondary interfaces")
911+
}
912+
case errors.Is(err, ErrEndpointStateNotFound):
913+
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
914+
return nil, nil
915+
default:
916+
logger.Error("Get Endpoint State API returned error", zap.String("containerID", args.ContainerID), zap.Error(err))
917+
return nil, ErrEndpointRetrievalFailure
918+
}
919+
}
920+
for _, epInfo := range epInfos {
921+
logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType))
922+
}
923+
return epInfos, nil
924+
}
925+
// Stateful CNI mode
926+
return nm.GetEndpointInfosFromContainerID(args.ContainerID), nil
927+
}
928+
929+
// generateEndpointLocally fetches the endpoint information using containerd args and netlink APIs
930+
func (nm *networkManager) generateEndpointLocally(args *cniSkel.CmdArgs) ([]*EndpointInfo, error) {
887931
ep := &endpoint{
888-
NetworkNameSpace: netns,
889-
IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
932+
Id: args.ContainerID,
933+
NetworkNameSpace: args.Netns,
934+
IfName: args.IfName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
890935
}
891-
logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns))
892-
err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient)
893-
return err
936+
epInfo, err := nm.getEndpointInfoByIfNameImpl(ep)
937+
if err != nil {
938+
return nil, errors.Wrap(err, "failed to fetch secondary interfaces")
939+
}
940+
return epInfo, nil
894941
}

network/manager_mock.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package network
33
import (
44
"github.com/Azure/azure-container-networking/cns"
55
"github.com/Azure/azure-container-networking/common"
6+
cniSkel "github.com/containernetworking/cni/pkg/skel"
67
)
78

89
// MockNetworkManager is a mock structure for Network Manager
@@ -222,6 +223,6 @@ func (nm *MockNetworkManager) GetEndpointState(_, _, _ string) ([]*EndpointInfo,
222223
return []*EndpointInfo{}, nil
223224
}
224225

225-
func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error {
226-
return nil
226+
func (nm *MockNetworkManager) GetEndpointInfos(_ string, args *cniSkel.CmdArgs, _ bool) ([]*EndpointInfo, error) {
227+
return nm.GetEndpointInfosFromContainerID(args.ContainerID), nil
227228
}

0 commit comments

Comments
 (0)