From f6793767ea21da32759e81c6aed8e8eac8dab915 Mon Sep 17 00:00:00 2001 From: Hongliang Liu Date: Fri, 5 May 2023 20:50:07 +0800 Subject: [PATCH] Improve AntreaProxy route syncing on Windows This PR fixes some issues: 1. AntreaAgent logs "Failed to sync route" when attempting to sync route entries every time. 2. AntreaAgent logs "Failed to install route for Service CIDR" during startup. 3. AntreaAgent may crash during the first installation of Antrea with proxyAll enabled on Windows. 4. AntreaAgent doesn't clean the previously installed calculated route. For the first issue, previously, to recover the connected route of antrea-gw0 (assuming the IP address is 10.10.0.1/24) that may have been deleted by mistake, a route with a destination 10.10.0.1/24 and gateway 10.10.0.1 was periodically synced. However, this caused an error because an existing active route with the same destination but a different gateway 0.0.0.0 should have already been automatically installed when antrea-gw0 was created. To address this issue, this PR changes the gateway of the recover route from 10.10.0.1 to 0.0.0.0, which matches the existing installed route. This ensures that the periodic sync will not cause any errors. For the second issue, when syncing the first ClusterIP, the route entry installed for the first ClusterIP is added to the stale routes and then deleted. This results in the first ClusterIP becoming unavailable until the calculated route entry for the second ClusterIP is installed. This PR resolves the issue by excluding the route entry for the first ClusterIP from the stale routes. For the third issue, if the first ClusterIP is Service `kubernetes` in the second issue, the unavailability of the ClusterIP might result in crashing of Antrea with proxyAll enabled. For the fourth issue, when syncing the first ClusterIP, list all existing routes and clean the routes whose destination CIDR that contains the ClusterIP to resolve the issue. Signed-off-by: Hongliang Liu --- pkg/agent/route/route_windows.go | 33 +++++++++---- .../util/powershell/powershell_windows.go | 2 +- pkg/util/ip/ip.go | 12 +++++ pkg/util/ip/ip_test.go | 49 +++++++++++++++++++ 4 files changed, 86 insertions(+), 10 deletions(-) diff --git a/pkg/agent/route/route_windows.go b/pkg/agent/route/route_windows.go index 82257b9a995..ff96efe9eae 100644 --- a/pkg/agent/route/route_windows.go +++ b/pkg/agent/route/route_windows.go @@ -280,28 +280,43 @@ func (c *Client) addServiceCIDRRoute(serviceCIDR *net.IPNet) error { c.serviceRoutes.Store(serviceIPv4CIDRKey, route) // Collect stale routes. - var staleRoutes []*util.Route - // If current destination CIDR is not nil, the route with current destination CIDR should be uninstalled since - // a new route with a newly calculated destination CIDR has been installed. + staleRoutes := make(map[string]*util.Route) if serviceCIDRRouteExists { - staleRoutes = append(staleRoutes, oldServiceCIDRRoute.(*util.Route)) + // If current destination CIDR is not nil, the route with current destination CIDR should be uninstalled since + // a new route with a newly calculated destination CIDR has been installed. + rt := oldServiceCIDRRoute.(*util.Route) + staleRoutes[rt.String()] = rt + } else { + // If current destination CIDR is nil, which means that Antrea Agent has just started, then all existing routes + // whose destination CIDR contains the first ClusterIP should be uninstalled, except the newly installed route. + // Note that, there may be multiple stale routes prior to this commit. When upgrading, all stale routes will be + // collected. After this commit, there will be only one stale route after Antrea Agent started. + routes, err := c.listIPRoutesOnGW() + if err != nil { + return fmt.Errorf("error listing ip routes: %w", err) + } + for _, rt := range routes { + if rt.GatewayAddress.Equal(gw) && !rt.DestinationSubnet.IP.Equal(serviceCIDR.IP) && rt.DestinationSubnet.Contains(serviceCIDR.IP) { + staleRoutes[rt.String()] = rt + } + } } routes, err := c.listIPRoutesOnGW() if err != nil { return fmt.Errorf("error listing ip routes: %w", err) } // Collect stale per-IP routes for ClusterIPs before this patch. + // TODO(lhongliang): remove this in v1.13.0 or later. for _, rt := range routes { - ones, _ := rt.DestinationSubnet.Mask.Size() - if ones == net.IPv4len*8 && serviceCIDR.Contains(rt.DestinationSubnet.IP) { - staleRoutes = append(staleRoutes, rt) + if iputil.IsSubCIDR(serviceCIDR, rt.DestinationSubnet) { + staleRoutes[rt.String()] = rt } } // Remove stale routes. for _, rt := range staleRoutes { if err := util.RemoveNetRoute(rt); err != nil { - if err.Error() == "No matching MSFT_NetRoute objects" { + if strings.Contains(err.Error(), "No matching MSFT_NetRoute objects") { klog.InfoS("Failed to delete stale Service CIDR route since the route has been deleted", "route", rt) } else { return fmt.Errorf("failed to delete stale Service CIDR route %s: %w", rt.String(), err) @@ -383,7 +398,7 @@ func (c *Client) syncRoute() error { gwAutoconfRoute := &util.Route{ LinkIndex: c.nodeConfig.GatewayConfig.LinkIndex, DestinationSubnet: c.nodeConfig.PodIPv4CIDR, - GatewayAddress: c.nodeConfig.GatewayConfig.IPv4, + GatewayAddress: net.IPv4zero, RouteMetric: util.MetricDefault, } restoreRoute(gwAutoconfRoute) diff --git a/pkg/agent/util/powershell/powershell_windows.go b/pkg/agent/util/powershell/powershell_windows.go index 29c3acb6372..c2e259d733d 100644 --- a/pkg/agent/util/powershell/powershell_windows.go +++ b/pkg/agent/util/powershell/powershell_windows.go @@ -27,7 +27,7 @@ func RunCommand(cmd string) (string, error) { // https://stackoverflow.com/questions/19282870/how-can-i-use-try-catch-and-get-my-script-to-stop-if-theres-an-error/19285405 psCmd := exec.Command("powershell.exe", "-NoLogo", "-NoProfile", "-NonInteractive", "-Command", fmt.Sprintf(`$ErrorActionPreference="Stop";try {%s} catch {Write-Host $_;os.Exit(1)}`, cmd)) // #nosec G204 - stdout, err := psCmd.Output() + stdout, err := psCmd.CombinedOutput() stdoutStr := string(stdout) if err != nil { return "", fmt.Errorf("failed to run command '%s': output '%s', %v", cmd, stdoutStr, err) diff --git a/pkg/util/ip/ip.go b/pkg/util/ip/ip.go index fc504508a43..daa8b5fb6b7 100644 --- a/pkg/util/ip/ip.go +++ b/pkg/util/ip/ip.go @@ -226,3 +226,15 @@ func AppendPortIfMissing(addr, port string) string { return net.JoinHostPort(addr, port) } + +func IsSubCIDR(parentNet, subNet *net.IPNet) bool { + parentMaskOnes, parentMaskBits := parentNet.Mask.Size() + subMaskOnes, subMaskBits := subNet.Mask.Size() + if parentMaskBits != subMaskBits { + return false + } + if parentMaskOnes >= subMaskOnes { + return false + } + return parentNet.Contains(subNet.IP) +} diff --git a/pkg/util/ip/ip_test.go b/pkg/util/ip/ip_test.go index ef812257fba..6522632d487 100644 --- a/pkg/util/ip/ip_test.go +++ b/pkg/util/ip/ip_test.go @@ -239,3 +239,52 @@ func TestAppendPortIfMissing(t *testing.T) { }) } } + +func TestIsSubCIDR(t *testing.T) { + ipNet1 := newCIDR("192.168.1.0/24") + ipNet1.IP = net.ParseIP("192.168.1.255") + + ipNet2 := newCIDR("192.168.1.128/25") + ipNet2.IP = net.ParseIP("192.168.1.129") + + ipNet3 := newCIDR("192.168.1.1/32") + + ipNet4 := newCIDR("192.168.2.0/24") + + tests := []struct { + name string + ipNetA *net.IPNet + ipNetB *net.IPNet + expected bool + }{ + { + name: "a sub CIDR", + ipNetA: ipNet1, + ipNetB: ipNet3, + expected: true, + }, + { + name: "a sub CIDR", + ipNetA: ipNet1, + ipNetB: ipNet2, + expected: true, + }, + { + name: "different CIDRs", + ipNetA: ipNet1, + ipNetB: ipNet4, + expected: false, + }, + { + name: "the same CIDR", + ipNetA: ipNet3, + ipNetB: ipNet3, + expected: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, IsSubCIDR(tt.ipNetA, tt.ipNetB)) + }) + } +}