Skip to content

Commit

Permalink
added state restoring from checkpoint file
Browse files Browse the repository at this point in the history
Change-Id: I52072035a80f2cff6b781d530f8a8381b7c81d66
  • Loading branch information
ahalimx86 committed Jul 17, 2018
1 parent fd5e8eb commit 0bbfc91
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 2 deletions.
60 changes: 60 additions & 0 deletions checkpoint/checkpoint.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package checkpoint

import (
"encoding/json"
"fmt"
"io/ioutil"

pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)

const (
checkPointfile = "/var/lib/kubelet/device-plugins/kubelet_internal_checkpoint"
pluginDir = pluginapi.DevicePluginPath
kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"
)

type PodDevicesEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs []string
AllocResp []byte
}

type checkpointData struct {
PodDeviceEntries []PodDevicesEntry
RegisteredDevices map[string][]string
}

type Data struct {
Data checkpointData
Checksum checksum.Checksum
}

// Get all Pod entires for given resourceName
func GetPodEntries(resourceName string) ([]PodDevicesEntry, error) {

podEntries := []PodDevicesEntry{}

cpd := &Data{}
rawBytes, err := ioutil.ReadFile(checkPointfile)
if err != nil {
return podEntries, fmt.Errorf("Error reading file %s\n%v\n", checkPointfile, err)

}

if err = json.Unmarshal(rawBytes, cpd); err != nil {
return podEntries, fmt.Errorf("Error unmarshalling raw bytes %v", err)
}

responseData := cpd.Data.PodDeviceEntries

for _, p := range responseData {
if p.ResourceName == resourceName {
podEntries = append(podEntries, p)
}
}
return podEntries, nil
}
34 changes: 32 additions & 2 deletions cmd/sriovdp/sriov-device-plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"google.golang.org/grpc"

"github.com/intel/sriov-network-device-plugin/api"
"github.com/intel/sriov-network-device-plugin/checkpoint"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
Expand Down Expand Up @@ -79,13 +80,37 @@ func newSriovManager() *sriovManager {
glog.Errorf("Error. Could not create K8s Client using supplied config. %v", err)
return nil
}
return &sriovManager{
sm := &sriovManager{
k8ClientSet: clientset,
devices: make(map[string]pluginapi.Device),
allocatedDevices: make(map[string][]*deviceEntry),
managedDevices: make(map[string]*api.VfInformation),
socketFile: fmt.Sprintf("%s.sock", pluginEndpointPrefix),
}
// get device mapping from checkpoint if there's any
restorePodMapping(sm)

return sm
}

// Restore Pod to device mapping from checkpoint file
func restorePodMapping(sm *sriovManager) {
glog.Infof("Checking checkpoint file")
podEntries, err := checkpoint.GetPodEntries(resourceName)
if err == nil && len(podEntries) > 0 {
for _, p := range podEntries {
logStr := fmt.Sprintf("Restored PodUID: %s DeviceIDs : {", p.PodUID)
dEntry := []*deviceEntry{}
for _, d := range p.DeviceIDs {
// assuming CNI plugin already configured these VFs mark them as configured 'true'
dEntry = append(dEntry, &deviceEntry{deviceID: d, allocated: true})
logStr += fmt.Sprintf("%s,", d)
}
logStr += "}"
sm.allocatedDevices[p.PodUID] = dEntry
glog.Infof(logStr)
}
}
}

// Returns a list of SRIOV capable PF names as string
Expand Down Expand Up @@ -315,6 +340,7 @@ func (sm *sriovManager) ListAndWatch(empty *pluginapi.Empty, stream pluginapi.De
changed = false
time.Sleep(5 * time.Second)
}
return nil
}

func (sm *sriovManager) PreStartContainer(ctx context.Context, psRqt *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
Expand Down Expand Up @@ -427,8 +453,12 @@ func (sm *sriovManager) CNIDelete(ctx context.Context, podInfo *api.PodInformati

func main() {
flag.Parse()
glog.Infof("SRIOV Network Device Plugin started...")
glog.Infof("Starting SRIOV Network Device Plugin...")
sm := newSriovManager()
if sm == nil {
glog.Errorf("Unable to get instance of a SRIOV-Manager")
return
}
sm.cleanup()

// respond to syscalls for termination
Expand Down

0 comments on commit 0bbfc91

Please sign in to comment.