Skip to content

Commit

Permalink
Improve AntreaProxy route syncing on Windows
Browse files Browse the repository at this point in the history
This PR fixes some issues:

1. AntreaAgent logs "Failed to sync route" when attempting to sync route entries
  every time.
2. AntreaAgent logs "Failed to install route for Service CIDR" during startup.
3. AntreaAgent may crash during the first installation of Antrea with proxyAll
  enabled on Windows.
4. AntreaAgent doesn't clean the previously installed calculated route.

For the first issue, previously, to recover the connected route of antrea-gw0
(assuming the IP address is 10.10.0.1/24) that may have been deleted by mistake,
a route with a destination 10.10.0.1/24 and gateway 10.10.0.1 was periodically
synced. However, this caused an error because an existing active route with the
same destination but a different gateway 0.0.0.0 should have already been
automatically installed when antrea-gw0 was created. To address this issue, this
PR changes the gateway of the recover route from 10.10.0.1 to 0.0.0.0, which
matches the existing installed route. This ensures that the periodic sync will
not cause any errors.

For the second issue, when syncing the first ClusterIP, the route entry installed
for the first ClusterIP is added to the stale routes and then deleted. This results
in the first ClusterIP becoming unavailable until the calculated route entry for
the second ClusterIP is installed. This PR resolves the issue by excluding the route
entry for the first ClusterIP from the stale routes.

For the third issue, if the first ClusterIP is Service `kubernetes` in the second
issue, the unavailability of the ClusterIP might result in crashing of Antrea
with proxyAll enabled.

For the fourth issue, when syncing the first ClusterIP, list all existing routes and
clean the routes whose destination CIDR that contains the ClusterIP to resolve the
issue.

Signed-off-by: Hongliang Liu <lhongliang@vmware.com>
  • Loading branch information
hongliangl committed May 11, 2023
1 parent c9c9559 commit 204864c
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 10 deletions.
33 changes: 24 additions & 9 deletions pkg/agent/route/route_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,28 +280,43 @@ func (c *Client) addServiceCIDRRoute(serviceCIDR *net.IPNet) error {
c.serviceRoutes.Store(serviceIPv4CIDRKey, route)

// Collect stale routes.
var staleRoutes []*util.Route
// If current destination CIDR is not nil, the route with current destination CIDR should be uninstalled since
// a new route with a newly calculated destination CIDR has been installed.
staleRoutes := make(map[string]*util.Route)
if serviceCIDRRouteExists {
staleRoutes = append(staleRoutes, oldServiceCIDRRoute.(*util.Route))
// If current destination CIDR is not nil, the route with current destination CIDR should be uninstalled since
// a new route with a newly calculated destination CIDR has been installed.
rt := oldServiceCIDRRoute.(*util.Route)
staleRoutes[rt.String()] = rt
} else {
// If current destination CIDR is nil, which means that Antrea Agent has just started, then all existing routes
// whose destination CIDR contains the first ClusterIP should be uninstalled, except the newly installed route.
// Note that, there may be multiple stale routes prior to this commit. When upgrading, all stale routes will be
// collected. After this commit, there will be only one stale route after Antrea Agent started.
routes, err := c.listIPRoutesOnGW()
if err != nil {
return fmt.Errorf("error listing ip routes: %w", err)
}
for _, rt := range routes {
if rt.GatewayAddress.Equal(gw) && !rt.DestinationSubnet.IP.Equal(serviceCIDR.IP) && rt.DestinationSubnet.Contains(serviceCIDR.IP) {
staleRoutes[rt.String()] = rt
}
}
}
routes, err := c.listIPRoutesOnGW()
if err != nil {
return fmt.Errorf("error listing ip routes: %w", err)
}
// Collect stale per-IP routes for ClusterIPs before this patch.
// TODO(lhongliang): remove this in v1.13.0 or later.
for _, rt := range routes {
ones, _ := rt.DestinationSubnet.Mask.Size()
if ones == net.IPv4len*8 && serviceCIDR.Contains(rt.DestinationSubnet.IP) {
staleRoutes = append(staleRoutes, rt)
if iputil.IsSubCIDR(serviceCIDR, rt.DestinationSubnet) {
staleRoutes[rt.String()] = rt
}
}

// Remove stale routes.
for _, rt := range staleRoutes {
if err := util.RemoveNetRoute(rt); err != nil {
if err.Error() == "No matching MSFT_NetRoute objects" {
if strings.Contains(err.Error(), "No matching MSFT_NetRoute objects") {
klog.InfoS("Failed to delete stale Service CIDR route since the route has been deleted", "route", rt)
} else {
return fmt.Errorf("failed to delete stale Service CIDR route %s: %w", rt.String(), err)
Expand Down Expand Up @@ -383,7 +398,7 @@ func (c *Client) syncRoute() error {
gwAutoconfRoute := &util.Route{
LinkIndex: c.nodeConfig.GatewayConfig.LinkIndex,
DestinationSubnet: c.nodeConfig.PodIPv4CIDR,
GatewayAddress: c.nodeConfig.GatewayConfig.IPv4,
GatewayAddress: net.IPv4zero,
RouteMetric: util.MetricDefault,
}
restoreRoute(gwAutoconfRoute)
Expand Down
2 changes: 1 addition & 1 deletion pkg/agent/util/powershell/powershell_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func RunCommand(cmd string) (string, error) {
// https://stackoverflow.com/questions/19282870/how-can-i-use-try-catch-and-get-my-script-to-stop-if-theres-an-error/19285405
psCmd := exec.Command("powershell.exe", "-NoLogo", "-NoProfile", "-NonInteractive", "-Command",
fmt.Sprintf(`$ErrorActionPreference="Stop";try {%s} catch {Write-Host $_;os.Exit(1)}`, cmd)) // #nosec G204
stdout, err := psCmd.Output()
stdout, err := psCmd.CombinedOutput()
stdoutStr := string(stdout)
if err != nil {
return "", fmt.Errorf("failed to run command '%s': output '%s', %v", cmd, stdoutStr, err)
Expand Down
12 changes: 12 additions & 0 deletions pkg/util/ip/ip.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,15 @@ func AppendPortIfMissing(addr, port string) string {

return net.JoinHostPort(addr, port)
}

func IsSubCIDR(parentNet, subNet *net.IPNet) bool {
parentMaskOnes, parentMaskBits := parentNet.Mask.Size()
subMaskOnes, subMaskBits := subNet.Mask.Size()
if parentMaskBits != subMaskBits {
return false
}
if parentMaskOnes >= subMaskOnes {
return false
}
return parentNet.Contains(subNet.IP) && subMaskOnes > parentMaskOnes
}
39 changes: 39 additions & 0 deletions pkg/util/ip/ip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,42 @@ func TestAppendPortIfMissing(t *testing.T) {
})
}
}

func TestIsSubCIDR(t *testing.T) {
tests := []struct {
name string
ipNetA *net.IPNet
ipNetB *net.IPNet
expected bool
}{
{
name: "a sub CIDR with the same network IP",
ipNetA: newCIDR("192.168.1.0/24"),
ipNetB: newCIDR("192.168.1.0/25"),
expected: true,
},
{
name: "a sub CIDR with different network IPs",
ipNetA: newCIDR("192.168.1.0/24"),
ipNetB: newCIDR("192.168.1.128/25"),
expected: true,
},
{
name: "different CIDRs",
ipNetA: newCIDR("192.168.1.0/24"),
ipNetB: newCIDR("192.168.21.0/24"),
expected: false,
},
{
name: "the same CIDR",
ipNetA: newCIDR("192.168.1.0/32"),
ipNetB: newCIDR("192.168.1.0/32"),
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expected, IsSubCIDR(tt.ipNetA, tt.ipNetB))
})
}
}

0 comments on commit 204864c

Please sign in to comment.