Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add host-local IPAM GC on startup #5660

Merged
merged 2 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ require (
github.com/MakeNowJust/heredoc v1.0.0 // indirect
github.com/NYTimes/gziphandler v1.1.1 // indirect
github.com/VividCortex/ewma v1.2.0 // indirect
github.com/alexflint/go-filemutex v1.2.0 // indirect
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 // indirect
github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRF
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
github.com/alessio/shellescape v1.2.2/go.mod h1:PZAiSCk0LJaZkiCSkPv8qIobYglO3FPpyFjDCtHLS30=
github.com/alexflint/go-filemutex v0.0.0-20171022225611-72bdc8eae2ae/go.mod h1:CgnQgUtFrFz9mxFNtED3jI5tLDjKlOM+oUF/sTk6ps0=
github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s=
github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c=
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
Expand Down
118 changes: 118 additions & 0 deletions pkg/agent/cniserver/ipam/hostlocal/gc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright 2023 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostlocal

import (
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"strings"

"github.com/containernetworking/plugins/plugins/ipam/host-local/backend/disk"
"github.com/spf13/afero"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
)

// dataDir is a variable so it can be overridden by tests if needed
var dataDir = "/var/lib/cni/networks"

func networkDir(network string) string {
return filepath.Join(dataDir, network)
}

// This is a hacky approach as we access the internals of the host-local plugin,
// instead of using the CNI interface. However, crafting a CNI DEL request from
// scratch would also be hacky.
func GarbageCollectContainerIPs(network string, desiredIPs sets.Set[string]) error {
dir := networkDir(network)

info, err := os.Stat(dir)
if os.IsNotExist(err) {
klog.V(2).InfoS("Host-local IPAM data directory does not exist, nothing to do", "dir", dir)
return nil
}
if !info.IsDir() {
return fmt.Errorf("path '%s' is not a directory: %w", dir, err)
}

lk, err := disk.NewFileLock(dir)
if err != nil {
return err
}
defer lk.Close()
lk.Lock()
defer lk.Unlock()

fs := afero.NewOsFs()
return gcContainerIPs(fs, dir, desiredIPs)
}

// Internal version of GarbageCollectContainerIPs which does not acquire the
// file lock and can work with an arbitrary afero filesystem.
func gcContainerIPs(fs afero.Fs, dir string, desiredIPs sets.Set[string]) error {
paths := make([]string, 0)

if err := afero.Walk(fs, dir, func(path string, info os.FileInfo, err error) error {
if err != nil || info.IsDir() {
return nil
}
paths = append(paths, path)
return nil
}); err != nil {
return fmt.Errorf("error when gathering IP filenames in the host-local data directory: %w", err)
}

hasRemovalError := false
for _, p := range paths {
ip := getIPFromPath(p)
if net.ParseIP(ip) == nil {
// not a valid IP, nothing to do
continue
}
if desiredIPs.Has(ip) {
// IP is in-use
continue
}
if err := fs.Remove(p); err != nil {
klog.ErrorS(err, "Failed to release unused IP from host-local IPAM plugin", "IP", ip)
hasRemovalError = true
continue
}
klog.InfoS("Unused IP was successfully released from host-local IPAM plugin", "IP", ip)
}

if hasRemovalError {
return fmt.Errorf("not all unused IPs could be released from host-local IPAM plugin, some IPs may be leaked")
}

// Note that it is perfectly possible for some IPs to be in desiredIPs but not in the
// host-local data directory. This can be the case when another IPAM plugin (e.g.,
// AntreaIPAM) is also used.

return nil
}

func getIPFromPath(path string) string {
fname := filepath.Base(path)
// need to unespace IPv6 addresses on Windows
// see https://github.com/containernetworking/plugins/blob/38f18d26ecfef550b8bac02656cc11103fd7cff1/plugins/ipam/host-local/backend/disk/backend.go#L197
if runtime.GOOS == "windows" {
fname = strings.ReplaceAll(fname, "_", ":")
}
return fname
}
231 changes: 231 additions & 0 deletions pkg/agent/cniserver/ipam/hostlocal/gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
// Copyright 2023 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostlocal

import (
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"testing"

"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/util/sets"
)

type testFs struct {
afero.Fs
removeError error
removedFiles []string
}

func (fs *testFs) Remove(name string) error {
if fs.removeError != nil {
err := &os.PathError{Op: "remove", Path: name, Err: fs.removeError}
// reset error
fs.removeError = nil
return err
}
if err := fs.Fs.Remove(name); err != nil {
return err
}
fs.removedFiles = append(fs.removedFiles, name)
return nil
}

// forceRemoveError forces the next Remove call to fail. Error will be cleared after the first call.
func (fs *testFs) forceRemoveError() {
fs.removeError = fmt.Errorf("permission denied")
}

func (fs *testFs) removedIPs() sets.Set[string] {
s := sets.New[string]()
for _, p := range fs.removedFiles {
s.Insert(getIPFromPath(p))
}
return s
}

// from https://github.com/containernetworking/plugins/blob/38f18d26ecfef550b8bac02656cc11103fd7cff1/plugins/ipam/host-local/backend/disk/backend.go#L197
func getEscapedPath(dir string, fname string) string {
if runtime.GOOS == "windows" {
fname = strings.ReplaceAll(fname, ":", "_")
}
return filepath.Join(dir, fname)
}

func allocateIPs(t *testing.T, fs afero.Fs, dir string, ips ...string) {
for _, ip := range ips {
path := getEscapedPath(dir, ip)
// The real host-local IPAM plugin writes the container ID + interface name to the
// file, but it is irrelevant in our case.
require.NoError(t, afero.WriteFile(fs, path, []byte("foo"), 0o600))
}
}

func TestGcContainerIPs(t *testing.T) {
dir := networkDir("antrea")

newTestFs := func() *testFs {
return &testFs{
Fs: afero.NewMemMapFs(),
}
}

t.Run("missing directory", func(t *testing.T) {
fs := newTestFs()
// create the plugin data directory, but not the "network" sub-directory
require.NoError(t, fs.MkdirAll(dataDir, 0o755))
assert.NoError(t, gcContainerIPs(fs, dir, sets.New[string]()))
removedIPs := fs.removedIPs()
assert.Empty(t, removedIPs)
})

t.Run("remove error", func(t *testing.T) {
ips := []string{"10.0.0.1", "10.0.0.2"}
fs := newTestFs()
require.NoError(t, fs.MkdirAll(dir, 0o755))
allocateIPs(t, fs, dir, ips...)
fs.forceRemoveError()
require.Error(t, gcContainerIPs(fs, dir, sets.New[string]()))
// one of the IPs will fail to be released, the other one will succeed
removedIPs := fs.removedIPs()
assert.Len(t, removedIPs, 1)
})

resolveIP := func(id int, ipv6 bool) string {
if ipv6 {
return fmt.Sprintf("2001:db8:a::%d", id)
} else {
return fmt.Sprintf("10.0.0.%d", id)
}
}

// some success test cases, will be run for both IPv4 and IPv6
testCases := []struct {
name string
desiredIPs []int
allocatedIPs []int
expectedRemovedIPs []int
}{
{
name: "same sets",
desiredIPs: []int{1, 2},
allocatedIPs: []int{1, 2},
expectedRemovedIPs: []int{},
},
{
name: "multiple removals",
desiredIPs: []int{1, 3},
allocatedIPs: []int{1, 2, 3, 4},
expectedRemovedIPs: []int{2, 4},
},
{
name: "extra ip",
desiredIPs: []int{1, 2, 3},
allocatedIPs: []int{1, 2},
expectedRemovedIPs: []int{},
},
}

runTests := func(t *testing.T, ipv6 bool) {
name := "ipv4"
if ipv6 {
name = "ipv6"
}

toIPSet := func(ids []int) sets.Set[string] {
ips := sets.New[string]()
for _, id := range ids {
ip := resolveIP(id, ipv6)
require.NotEmpty(t, ip)
ips.Insert(ip)
}
return ips
}

t.Run(name, func(t *testing.T) {
for _, tc := range testCases {
fs := newTestFs()
require.NoError(t, fs.MkdirAll(dir, 0o755))
desiredIPs := toIPSet(tc.desiredIPs)
allocatedIPs := toIPSet(tc.allocatedIPs)
expectedRemovedIPs := toIPSet(tc.expectedRemovedIPs)
allocateIPs(t, fs, dir, allocatedIPs.UnsortedList()...)
require.NoError(t, gcContainerIPs(fs, dir, desiredIPs))
assert.Equal(t, expectedRemovedIPs, fs.removedIPs())
}
})
}

runTests(t, false)
runTests(t, true)
}

// TestGarbageCollectContainerIPs tests some edge cases and logic that depends on the real OS
// filesystem. The actual GC logic is tested by TestGcContainerIPs.
func TestGarbageCollectContainerIPs(t *testing.T) {
ips := sets.New[string]()
tempDir, err := os.MkdirTemp("", "test-networks")
require.NoError(t, err)
savedDir := dataDir
defer func() {
dataDir = savedDir
}()
dataDir = tempDir
defer os.RemoveAll(tempDir)

idx := 0
networkName := func() string {
idx++
return fmt.Sprintf("net%d", idx)
}

lockFile := func(network string) string {
return filepath.Join(tempDir, network, "lock")
}

t.Run("missing directory", func(t *testing.T) {
network := networkName()
// there is no directory in tempDir for the "antrea" network
// we don't expect an error, and the lock file should not be created
require.NoError(t, GarbageCollectContainerIPs(network, ips))
assert.NoFileExists(t, lockFile(network))
})

t.Run("not a directory", func(t *testing.T) {
network := networkName()
netDir := filepath.Join(tempDir, network)
// create a file instead of a directory: GarbageCollectContainerIPs should return an
// error
_, err := os.Create(netDir)
require.NoError(t, err)
defer os.Remove(netDir)
assert.ErrorContains(t, GarbageCollectContainerIPs(network, ips), "not a directory")
})

t.Run("lock file created", func(t *testing.T) {
network := networkName()
netDir := filepath.Join(tempDir, network)
require.NoError(t, os.Mkdir(netDir, 0o755))
defer os.RemoveAll(netDir)
// make sure that the lock file is created in the right place
require.NoError(t, GarbageCollectContainerIPs(network, ips))
assert.FileExists(t, lockFile(network))
})
}
14 changes: 14 additions & 0 deletions pkg/agent/cniserver/ipam/ipam_delegator.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ import (
"github.com/containernetworking/cni/pkg/invoke"
"github.com/containernetworking/cni/pkg/types"
current "github.com/containernetworking/cni/pkg/types/100"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"

"antrea.io/antrea/pkg/agent/cniserver/ipam/hostlocal"
argtypes "antrea.io/antrea/pkg/agent/cniserver/types"
)

Expand Down Expand Up @@ -86,6 +88,18 @@ func (d *IPAMDelegator) Check(args *invoke.Args, k8sArgs *argtypes.K8sArgs, netw
return true, nil
}

// GarbageCollectContainerIPs will release IPs allocated by the delegated IPAM
// plugin that are no longer in-use (if there is any). It should be called on an
// agent restart to provide garbage collection for IPs, and to avoid IP leakage
// in case of missed CNI DEL events. Normally, it is not Antrea's responsibility
// to implement this, as the above layers should ensure that there is always one
// successful CNI DEL for every corresponding CNI ADD. However, we include this
// support to increase robustness in case of a container runtime bug.
// Only the host-local plugin is supported.
func GarbageCollectContainerIPs(network string, desiredIPs sets.Set[string]) error {
return hostlocal.GarbageCollectContainerIPs(network, desiredIPs)
}

var defaultExec invoke.Exec = &invoke.DefaultExec{
RawExec: &invoke.RawExec{Stderr: os.Stderr},
}
Expand Down
Loading
Loading