Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: improve accuracy of CPU gauges #26793

Merged
merged 9 commits into from
Mar 6, 2023
Merged
Prev Previous commit
Next Next commit
metrics: compute CPU percentage metrics using float64 as base unit
  • Loading branch information
fjl committed Mar 6, 2023
commit 0c9e7142e2564e6dddd025c0150fa987d373475e
7 changes: 4 additions & 3 deletions metrics/cpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
package metrics

// CPUStats is the system and process CPU stats.
// All values are in seconds.
type CPUStats struct {
GlobalTime int64 // Time spent by the CPU working on all processes
GlobalWait int64 // Time spent by waiting on disk for all processes
LocalTime int64 // Time spent by the CPU working on this process
GlobalTime float64 // Time spent by the CPU working on all processes
GlobalWait float64 // Time spent waiting on disk for all processes
LocalTime float64 // Time spent by the CPU working on this process
}
4 changes: 2 additions & 2 deletions metrics/cpu_enabled.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func ReadCPUStats(stats *CPUStats) {
}
// requesting all cpu times will always return an array with only one time stats entry
timeStat := timeStats[0]
stats.GlobalTime = int64((timeStat.User + timeStat.Nice + timeStat.System) * cpu.ClocksPerSec)
stats.GlobalWait = int64((timeStat.Iowait) * cpu.ClocksPerSec)
stats.GlobalTime = timeStat.User + timeStat.Nice + timeStat.System
stats.GlobalWait = timeStat.Iowait
stats.LocalTime = getProcessCPUTime()
}
2 changes: 1 addition & 1 deletion metrics/cputime_nop.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ package metrics

// getProcessCPUTime returns 0 on Windows as there is no system call to resolve
// the actual process' CPU time.
func getProcessCPUTime() int64 {
func getProcessCPUTime() float64 {
return 0
}
4 changes: 2 additions & 2 deletions metrics/cputime_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ import (
)

// getProcessCPUTime retrieves the process' CPU time since program startup.
func getProcessCPUTime() int64 {
func getProcessCPUTime() float64 {
var usage syscall.Rusage
if err := syscall.Getrusage(syscall.RUSAGE_SELF, &usage); err != nil {
log.Warn("Failed to retrieve CPU time", "err", err)
return 0
}
return int64(usage.Utime.Sec+usage.Stime.Sec)*100 + int64(usage.Utime.Usec+usage.Stime.Usec)/10000 //nolint:unconvert
return float64(usage.Utime.Sec+usage.Stime.Sec) + float64(usage.Utime.Usec+usage.Stime.Usec)/1000000 //nolint:unconvert
}
18 changes: 11 additions & 7 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,9 @@ func CollectProcessMetrics(refresh time.Duration) {

// Define the various metrics to collect
var (
cpuSysLoad = GetOrRegisterGaugeFloat64("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGaugeFloat64("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGaugeFloat64("system/cpu/procload", DefaultRegistry)
cpuSysLoad = GetOrRegisterGauge("system/cpu/sysload", DefaultRegistry)
cpuSysWait = GetOrRegisterGauge("system/cpu/syswait", DefaultRegistry)
cpuProcLoad = GetOrRegisterGauge("system/cpu/procload", DefaultRegistry)
cpuThreads = GetOrRegisterGauge("system/cpu/threads", DefaultRegistry)
cpuGoroutines = GetOrRegisterGauge("system/cpu/goroutines", DefaultRegistry)
cpuSchedLatency = getOrRegisterRuntimeHistogram("system/cpu/schedlatency", secondsToNs, nil)
Expand All @@ -167,12 +167,16 @@ func CollectProcessMetrics(refresh time.Duration) {
// Iterate loading the different stats and updating the meters.
now, prev := 0, 1
for ; ; now, prev = prev, now {
// CPU
// Gather CPU times.
ReadCPUStats(&cpustats[now])
refreshFreq := time.Since(lastCollectionTime).Seconds()
cpuSysLoad.Update(float64(cpustats[now].GlobalTime-cpustats[prev].GlobalTime) / refreshFreq)
cpuSysWait.Update(float64(cpustats[now].GlobalWait-cpustats[prev].GlobalWait) / refreshFreq)
cpuProcLoad.Update(float64(cpustats[now].LocalTime-cpustats[prev].LocalTime) / refreshFreq)
sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq
sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq
procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq
// Convert to integer percentage.
cpuSysLoad.Update(int64(sysLoad * 100))
cpuSysWait.Update(int64(sysWait * 100))
cpuProcLoad.Update(int64(procLoad * 100))

// Threads
cpuThreads.Update(int64(threadCreateProfile.Count()))
Expand Down