Skip to content

Commit

Permalink
Merge pull request #130 from haitwang-cloud/update-logging
Browse files Browse the repository at this point in the history
Improved log messages
  • Loading branch information
archlitchi authored Jan 19, 2024
2 parents 2d4e2d1 + ebad887 commit 8e4f378
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 40 deletions.
4 changes: 2 additions & 2 deletions cmd/device-plugin/nvidia/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,10 @@ func start(c *cli.Context, flags []cli.Flag) error {
defer watcher.Close()

/*Loading config files*/
fmt.Println("NodeName=", util.NodeName)
klog.Infof("Start working on node %s", util.NodeName)
err = readFromConfigFile()
if err != nil {
fmt.Printf("failed to load config file %s", err.Error())
klog.Errorf("failed to load config file %s", err.Error())
}

klog.Info("Starting OS watcher.")
Expand Down
5 changes: 3 additions & 2 deletions cmd/device-plugin/nvidia/vgpucfg.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"4pd.io/k8s-vgpu/pkg/util"
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
cli "github.com/urfave/cli/v2"
"k8s.io/klog/v2"
)

func addFlags() []cli.Flag {
Expand Down Expand Up @@ -88,10 +89,10 @@ func readFromConfigFile() error {
if err != nil {
return err
}
fmt.Println("json=", deviceConfigs)
klog.Infof("Device Plugin Configs: %v", fmt.Sprintf("%v", deviceConfigs))
for _, val := range deviceConfigs.Nodeconfig {
if strings.Compare(os.Getenv("NodeName"), val.Name) == 0 {
fmt.Println("Reading config from file", val.Name)
klog.Infof("Reading config from file %s", val.Name)
if val.Devicememoryscaling > 0 {
util.DeviceMemoryScaling = &val.Devicememoryscaling
}
Expand Down
5 changes: 3 additions & 2 deletions cmd/scheduler/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
)

// ClusterManager is an example for a system that might have been built without
Expand Down Expand Up @@ -63,7 +64,7 @@ func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) {
// Note that Collect could be called concurrently, so we depend on
// ReallyExpensiveAssessmentOfTheSystemState to be concurrency-safe.
func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
fmt.Println("begin collect")
klog.Infof("Starting to collect metrics for scheduler")
nodevGPUMemoryLimitDesc := prometheus.NewDesc(
"GPUDeviceMemoryLimit",
"Device memory limit for a certain GPU",
Expand Down Expand Up @@ -223,7 +224,7 @@ func NewClusterManager(zone string, reg prometheus.Registerer) *ClusterManager {
func initmetrics() {
// Since we are dealing with custom Collector implementations, it might
// be a good idea to try it out with a pedantic registry.
fmt.Println("Initializing metrics...")
klog.Infof("Initializing metrics for scheduler")
reg := prometheus.NewRegistry()

// Construct cluster managers. In real code, we would assign them to
Expand Down
21 changes: 11 additions & 10 deletions cmd/vGPUmonitor/feedback.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/NVIDIA/go-nvml/pkg/nvml"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
)

var cgroupDriver int
Expand Down Expand Up @@ -226,27 +227,27 @@ func Observe(srlist *map[string]podusage) error {
}
if CheckBlocking(utSwitchOn, int(val.sr.priority), val) {
if (*srlist)[idx].sr.recentKernel >= 0 {
fmt.Println("utSwitchon=", utSwitchOn)
fmt.Println("Setting Blocking to on", idx)
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting Blocking to on %v", idx)
(*srlist)[idx].sr.recentKernel = -1
}
} else {
if (*srlist)[idx].sr.recentKernel < 0 {
fmt.Println("utSwitchon=", utSwitchOn)
fmt.Println("Setting Blocking to off", idx)
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting Blocking to off %v", idx)
(*srlist)[idx].sr.recentKernel = 0
}
}
if CheckPriority(utSwitchOn, int(val.sr.priority), val) {
if (*srlist)[idx].sr.utilizationSwitch != 1 {
fmt.Println("utSwitchon=", utSwitchOn)
fmt.Println("Setting UtilizationSwitch to on", idx)
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting UtilizationSwitch to on %v", idx)
(*srlist)[idx].sr.utilizationSwitch = 1
}
} else {
if (*srlist)[idx].sr.utilizationSwitch != 0 {
fmt.Println("utSwitchon=", utSwitchOn)
fmt.Println("Setting UtilizationSwitch to off", idx)
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting UtilizationSwitch to off %v", idx)
(*srlist)[idx].sr.utilizationSwitch = 0
}
}
Expand All @@ -260,9 +261,9 @@ func watchAndFeedback() {
time.Sleep(time.Second * 5)
err := monitorpath(srPodList)
if err != nil {
fmt.Println("monitorPath failed", err.Error())
klog.Errorf("monitorPath failed %v", err.Error())
}
//fmt.Println("watchAndFeedback", srPodList)
klog.Infof("WatchAndFeedback srPodList=%v", srPodList)
Observe(&srPodList)

}
Expand Down
5 changes: 2 additions & 3 deletions cmd/vGPUmonitor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ func getsrlist() map[string]podusage {
// Note that Collect could be called concurrently, so we depend on
// ReallyExpensiveAssessmentOfTheSystemState to be concurrency-safe.
func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
fmt.Println("begin collect")
klog.Infof("Starting to collect metrics for vGPUMonitor")
if srPodList == nil {
srPodList = make(map[string]podusage)
}
Expand Down Expand Up @@ -276,8 +276,7 @@ func NewClusterManager(zone string, reg prometheus.Registerer) *ClusterManager {
func initmetrics() {
// Since we are dealing with custom Collector implementations, it might
// be a good idea to try it out with a pedantic registry.
fmt.Println("Initializing metrics...")

klog.Infof("Initializing metrics for vGPUmonitor")
reg := prometheus.NewRegistry()
//reg := prometheus.NewPedanticRegistry()
config, err := rest.InClusterConfig()
Expand Down
17 changes: 9 additions & 8 deletions cmd/vGPUmonitor/pathmonitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package main
import (
"context"
"errors"
"fmt"
"io/ioutil"
"log"
"net"
Expand All @@ -16,6 +15,7 @@ import (
"google.golang.org/grpc"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
)

const containerpath = "/usr/local/vgpu/containers"
Expand All @@ -28,7 +28,7 @@ type podusage struct {
var lock sync.Mutex

func checkfiles(fpath string) (*sharedRegionT, error) {
fmt.Println("Checking path", fpath)
klog.Infof("Checking path %s", fpath)
files, err := ioutil.ReadDir(fpath)
if err != nil {
return nil, err
Expand All @@ -53,9 +53,9 @@ func checkfiles(fpath string) (*sharedRegionT, error) {
}
sr, err := getvGPUMemoryInfo(&nc)
if err != nil {
fmt.Println("err=", err.Error())
klog.Errorf("getvGPUMemoryInfo failed: %v", err)
} else {
fmt.Println("sr=", sr.utilizationSwitch, sr.recentKernel, sr.priority)
klog.Infof("getvGPUMemoryInfo success with utilizationSwitch=%d, recentKernel=%d, priority=%d", sr.utilizationSwitch, sr.recentKernel, sr.priority)
return sr, nil
}
}
Expand Down Expand Up @@ -88,21 +88,22 @@ func monitorpath(podmap map[string]podusage) error {
info, err1 := os.Stat(dirname)
if err1 != nil || !checkpodvalid(info.Name(), pods) {
if info.ModTime().Add(time.Second * 300).Before(time.Now()) {
fmt.Println("removing" + dirname)
klog.Infof("Removing dirname %s in in monitorpath", dirname)
//syscall.Munmap(unsafe.Pointer(podmap[dirname].sr))
delete(podmap, dirname)
err2 := os.RemoveAll(dirname)
if err2 != nil {
klog.Errorf("Failed to remove dirname: %s , error: %v", dirname, err)
return err2
}
}
} else {
_, ok := podmap[dirname]
if !ok {
fmt.Println("Adding ctr", dirname)
klog.Infof("Adding ctr dirname %s in monitorpath", dirname)
sr, err2 := checkfiles(dirname)
if err2 != nil {
//fmt.Println("err2=", err2.Error())
klog.Errorf("Failed to checkfiles dirname: %s , error: %v", dirname, err)
return err2
}
if sr == nil {
Expand Down Expand Up @@ -130,7 +131,7 @@ func serveinfo(ch chan error) {
log.Fatalf("failed to listen: %v", err)
}
vGPUmonitor.RegisterNodeVGPUInfoServer(s, &server{})
fmt.Println("server listening at", lis.Addr())
klog.Infof("server listening at %v", lis.Addr())
if err := s.Serve(lis); err != nil {
log.Fatalf("failed to serve: %v", err)
} /*
Expand Down
18 changes: 11 additions & 7 deletions pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,14 @@ func (r *NvidiaDevicePlugin) getApiDevices() *[]*api.DeviceInfo {
Health: health,
})
idx++
klog.V(3).Infoln("nvml registered device id=", idx, "memory=", memory, "type=", Model)
klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, memory, Model, numa)
}
return &res
}

func (r *NvidiaDevicePlugin) RegistrInAnnotation() error {
devices := r.getApiDevices()
klog.InfoS("node devices", "devices", devices)
klog.InfoS("start working on the devices", "devices", devices)
annos := make(map[string]string)
node, err := util.GetNode(util.NodeName)
if err != nil {
Expand All @@ -170,7 +170,7 @@ func (r *NvidiaDevicePlugin) RegistrInAnnotation() error {
encodeddevices := util.EncodeNodeDevices(*devices)
annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String()
annos[nvidia.RegisterAnnos] = encodeddevices
klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String())
klog.Infof("patch node with the following annos %v", fmt.Sprintf("%v", annos))
err = util.PatchNodeAnnotations(node, annos)

if err != nil {
Expand All @@ -180,14 +180,18 @@ func (r *NvidiaDevicePlugin) RegistrInAnnotation() error {
}

func (r *NvidiaDevicePlugin) WatchAndRegister() {
klog.Infof("into WatchAndRegister")
klog.Infof("Starting WatchAndRegister")
errorSleepInterval := time.Second * 5
successSleepInterval := time.Second * 30
for {
err := r.RegistrInAnnotation()
if err != nil {
klog.Errorf("register error, %v", err)
time.Sleep(time.Second * 5)
klog.Errorf("Failed to register annotation: %v", err)
klog.Infof("Retrying in %v seconds...", errorSleepInterval/time.Second)
time.Sleep(errorSleepInterval)
} else {
time.Sleep(time.Second * 30)
klog.Infof("Successfully registered annotation. Next check in %v seconds...", successSleepInterval/time.Second)
time.Sleep(successSleepInterval)
}
}
}
4 changes: 2 additions & 2 deletions pkg/scheduler/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (m *podManager) addPod(pod *corev1.Pod, nodeID string, devices util.PodDevi
if !ok {
pi := &podInfo{Name: pod.Name, Uid: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices}
m.pods[pod.UID] = pi
klog.Info(pod.Name + "Added")
klog.Infof("Pod added: Name: %s, Uid: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID)
}
}

Expand All @@ -59,7 +59,7 @@ func (m *podManager) delPod(pod *corev1.Pod) {
defer m.mutex.Unlock()
pi, ok := m.pods[pod.UID]
if ok {
klog.Infof(pi.Name + " deleted")
klog.Infof("Deleted pod %s with node ID %s", pi.Name, pi.NodeID)
delete(m.pods, pod.UID)
}
}
Expand Down
9 changes: 5 additions & 4 deletions pkg/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func EncodeNodeDevices(dlist []*api.DeviceInfo) string {
for _, val := range dlist {
tmp += val.Id + "," + strconv.FormatInt(int64(val.Count), 10) + "," + strconv.Itoa(int(val.Devmem)) + "," + strconv.Itoa(int(val.Devcore)) + "," + val.Type + "," + strconv.Itoa(val.Numa) + "," + strconv.FormatBool(val.Health) + ":"
}
klog.V(6).Infoln("Encoded node Devices", tmp)
klog.Infof("Encoded node Devices: %s", tmp)
return tmp
}

Expand All @@ -112,7 +112,7 @@ func EncodeContainerDevices(cd ContainerDevices) string {
for _, val := range cd {
tmp += val.UUID + "," + val.Type + "," + strconv.Itoa(int(val.Usedmem)) + "," + strconv.Itoa(int(val.Usedcores)) + ":"
}
fmt.Println("Encoded container Devices=", tmp)
klog.Infof("Encoded container Devices: %s", tmp)
return tmp
//return strings.Join(cd, ",")
}
Expand All @@ -122,6 +122,7 @@ func EncodePodDevices(pd PodDevices) string {
for _, cd := range pd {
ss = append(ss, EncodeContainerDevices(cd))
}
klog.Infof("Encoded pod Devices: %s", strings.Join(ss, ";"))
return strings.Join(ss, ";")
}

Expand All @@ -132,7 +133,7 @@ func DecodeContainerDevices(str string) (ContainerDevices, error) {
cd := strings.Split(str, ":")
contdev := ContainerDevices{}
tmpdev := ContainerDevice{}
//fmt.Println("before container device", str)
klog.Infof("Start to decode container device %s", str)
if len(str) == 0 {
return ContainerDevices{}, nil
}
Expand All @@ -152,7 +153,7 @@ func DecodeContainerDevices(str string) (ContainerDevices, error) {
contdev = append(contdev, tmpdev)
}
}
//fmt.Println("Decoded container device", contdev)
klog.Infof("Finished decoding container devices. Total devices: %d", len(contdev))
return contdev, nil
}

Expand Down

0 comments on commit 8e4f378

Please sign in to comment.