Skip to content

Commit

Permalink
Adding /proc/<pid>/schedstat (google#1872)
Browse files Browse the repository at this point in the history
Add /proc/<pid>/schedstat metrics for scheduler metrics
  • Loading branch information
nielsole authored and dashpole committed Mar 8, 2018
1 parent b817801 commit 08f0c23
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 20 deletions.
2 changes: 2 additions & 0 deletions cadvisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ var (
ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
container.NetworkTcpUsageMetrics: struct{}{},
container.NetworkUdpUsageMetrics: struct{}{},
container.ProcessSchedulerMetrics: struct{}{},
}}

// List of metrics that can be ignored.
Expand All @@ -73,6 +74,7 @@ var (
container.NetworkTcpUsageMetrics: struct{}{},
container.NetworkUdpUsageMetrics: struct{}{},
container.PerCpuUsageMetrics: struct{}{},
container.ProcessSchedulerMetrics: struct{}{},
}
)

Expand Down
21 changes: 11 additions & 10 deletions container/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,17 @@ type ContainerHandlerFactory interface {
type MetricKind string

const (
CpuUsageMetrics MetricKind = "cpu"
PerCpuUsageMetrics MetricKind = "percpu"
MemoryUsageMetrics MetricKind = "memory"
CpuLoadMetrics MetricKind = "cpuLoad"
DiskIOMetrics MetricKind = "diskIO"
DiskUsageMetrics MetricKind = "disk"
NetworkUsageMetrics MetricKind = "network"
NetworkTcpUsageMetrics MetricKind = "tcp"
NetworkUdpUsageMetrics MetricKind = "udp"
AppMetrics MetricKind = "app"
CpuUsageMetrics MetricKind = "cpu"
ProcessSchedulerMetrics MetricKind = "sched"
PerCpuUsageMetrics MetricKind = "percpu"
MemoryUsageMetrics MetricKind = "memory"
CpuLoadMetrics MetricKind = "cpuLoad"
DiskIOMetrics MetricKind = "diskIO"
DiskUsageMetrics MetricKind = "disk"
NetworkUsageMetrics MetricKind = "network"
NetworkTcpUsageMetrics MetricKind = "tcp"
NetworkUdpUsageMetrics MetricKind = "udp"
AppMetrics MetricKind = "app"
)

func (mk MetricKind) String() string {
Expand Down
75 changes: 67 additions & 8 deletions container/libcontainer/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/google/cadvisor/container"
info "github.com/google/cadvisor/info/v1"

"bytes"
"github.com/golang/glog"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/cgroups"
Expand All @@ -39,18 +40,20 @@ import (
import "C"

type Handler struct {
cgroupManager cgroups.Manager
rootFs string
pid int
ignoreMetrics container.MetricSet
cgroupManager cgroups.Manager
rootFs string
pid int
ignoreMetrics container.MetricSet
pidMetricsCache map[int]*info.CpuSchedstat
}

func NewHandler(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetrics container.MetricSet) *Handler {
return &Handler{
cgroupManager: cgroupManager,
rootFs: rootFs,
pid: pid,
ignoreMetrics: ignoreMetrics,
cgroupManager: cgroupManager,
rootFs: rootFs,
pid: pid,
ignoreMetrics: ignoreMetrics,
pidMetricsCache: make(map[int]*info.CpuSchedstat),
}
}

Expand All @@ -66,6 +69,18 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
withPerCPU := !h.ignoreMetrics.Has(container.PerCpuUsageMetrics)
stats := newContainerStats(libcontainerStats, withPerCPU)

if !h.ignoreMetrics.Has(container.ProcessSchedulerMetrics) {
pids, err := h.cgroupManager.GetAllPids()
if err != nil {
glog.V(4).Infof("Could not get PIDs for container %d: %v", h.pid, err)
} else {
stats.Cpu.Schedstat, err = schedulerStatsFromProcs(h.rootFs, pids, h.pidMetricsCache)
if err != nil {
glog.V(4).Infof("Unable to get Process Scheduler Stats: %v", err)
}
}
}

// If we know the pid then get network stats from /proc/<pid>/net/dev
if h.pid == 0 {
return stats, nil
Expand Down Expand Up @@ -117,6 +132,50 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
return stats, nil
}

// schedulerStatsFromProcs refreshes pidMetricsCache from
// <rootFs>/proc/<pid>/schedstat for every pid in pids and returns the sum of
// all entries held in the cache.
//
// Cache entries whose pid is not listed in pids are left untouched and still
// contribute to the returned totals.
//
// If a schedstat file cannot be opened, read, or parsed, a zero CpuSchedstat
// and an error naming the pid are returned. Entries refreshed for earlier pids
// in the same call stay refreshed; the entry of the failing pid is not
// modified.
func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]*info.CpuSchedstat) (info.CpuSchedstat, error) {
	for _, pid := range pids {
		current, err := readSchedstat(rootFs, pid)
		if err != nil {
			return info.CpuSchedstat{}, err
		}
		cacheEntry, ok := pidMetricsCache[pid]
		if !ok {
			cacheEntry = &info.CpuSchedstat{}
			pidMetricsCache[pid] = cacheEntry
		}
		// Commit only after all three fields parsed, so a malformed file can
		// never leave a half-updated entry behind.
		*cacheEntry = current
	}
	schedstats := info.CpuSchedstat{}
	for _, v := range pidMetricsCache {
		schedstats.RunPeriods += v.RunPeriods
		schedstats.RunqueueTime += v.RunqueueTime
		schedstats.RunTime += v.RunTime
	}
	return schedstats, nil
}

// readSchedstat reads and parses <rootFs>/proc/<pid>/schedstat, which must
// hold exactly three space-separated unsigned integers: run time, runqueue
// wait time and number of run periods, in that order.
func readSchedstat(rootFs string, pid int) (info.CpuSchedstat, error) {
	f, err := os.Open(path.Join(rootFs, "proc", strconv.Itoa(pid), "schedstat"))
	if err != nil {
		return info.CpuSchedstat{}, fmt.Errorf("couldn't open scheduler statistics for process %d: %v", pid, err)
	}
	// The defer lives in this per-pid helper so the descriptor is released
	// before the next pid is opened, rather than accumulating one open file
	// per process until the whole scan has finished.
	defer f.Close()

	contents, err := ioutil.ReadAll(f)
	if err != nil {
		return info.CpuSchedstat{}, fmt.Errorf("couldn't read scheduler statistics for process %d: %v", pid, err)
	}
	rawMetrics := bytes.Split(bytes.TrimRight(contents, "\n"), []byte(" "))
	if len(rawMetrics) != 3 {
		return info.CpuSchedstat{}, fmt.Errorf("unexpected number of metrics in schedstat file for process %d", pid)
	}

	var values [3]uint64
	for i, rawMetric := range rawMetrics {
		values[i], err = strconv.ParseUint(string(rawMetric), 10, 64)
		if err != nil {
			return info.CpuSchedstat{}, fmt.Errorf("parsing error while reading scheduler statistics for process: %d: %v", pid, err)
		}
	}
	return info.CpuSchedstat{
		RunTime:      values[0],
		RunqueueTime: values[1],
		RunPeriods:   values[2],
	}, nil
}

func networkStatsFromProc(rootFs string, pid int) ([]info.InterfaceStats, error) {
netStatsFile := path.Join(rootFs, "proc", strconv.Itoa(pid), "/net/dev")

Expand Down
17 changes: 15 additions & 2 deletions info/v1/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,23 @@ type CpuCFS struct {
ThrottledTime uint64 `json:"throttled_time"`
}

// CpuSchedstat holds the three counters read from /proc/<pid>/schedstat.
// When attached to container stats the values are sums over the container's
// processes.
type CpuSchedstat struct {
// Field meanings follow the kernel documentation:
// https://www.kernel.org/doc/Documentation/scheduler/sched-stats.txt

// RunTime is the time spent running on the cpu.
// NOTE(review): the Prometheus exporter divides this by time.Second to get
// seconds, so the unit is treated as nanoseconds; confirm against the kernel doc.
RunTime uint64 `json:"run_time"`
// RunqueueTime is the time spent waiting on a runqueue; the exporter
// applies the same time.Second conversion as for RunTime.
RunqueueTime uint64 `json:"runqueue_time"`
// RunPeriods is the number of timeslices run on the cpu; it is a plain
// count and is exported without unit conversion.
RunPeriods uint64 `json:"run_periods"`
}

// All CPU usage metrics are cumulative from the creation of the container
type CpuStats struct {
Usage CpuUsage `json:"usage"`
CFS CpuCFS `json:"cfs"`
Usage CpuUsage `json:"usage"`
CFS CpuCFS `json:"cfs"`
Schedstat CpuSchedstat `json:"schedstat"`
// Smoothed average of number of runnable threads x 1000.
// We multiply by thousand to avoid using floats, but preserving precision.
// Load is smoothed over the last 10 seconds. Instantaneous value can be read
Expand Down
21 changes: 21 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,27 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.CFS.ThrottledTime) / float64(time.Second)}}
},
}, {
name: "container_cpu_schedstat_run_seconds_total",
help: "Time duration the processes of the container have run on the CPU.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.Schedstat.RunTime) / float64(time.Second)}}
},
}, {
name: "container_cpu_schedstat_runqueue_seconds_total",
help: "Time duration processes of the container have been waiting on a runqueue.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.Schedstat.RunqueueTime) / float64(time.Second)}}
},
}, {
name: "container_cpu_schedstat_run_periods_total",
help: "Number of times processes of the cgroup have run on the cpu",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Cpu.Schedstat.RunPeriods)}}
},
}, {
name: "container_cpu_load_average_10s",
help: "Value of container cpu load average over the last 10 seconds.",
Expand Down
5 changes: 5 additions & 0 deletions metrics/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.Container
ThrottledPeriods: 18,
ThrottledTime: 1724314000,
},
Schedstat: info.CpuSchedstat{
RunTime: 53643567,
RunqueueTime: 479424566378,
RunPeriods: 984285,
},
LoadAverage: 2,
},
Memory: info.MemoryStats{
Expand Down
9 changes: 9 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,15 @@ container_cpu_cfs_throttled_seconds_total{container_env_foo_env="prod",container
# HELP container_cpu_load_average_10s Value of container cpu load average over the last 10 seconds.
# TYPE container_cpu_load_average_10s gauge
container_cpu_load_average_10s{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 2
# HELP container_cpu_schedstat_run_periods_total Number of times processes of the cgroup have run on the cpu
# TYPE container_cpu_schedstat_run_periods_total counter
container_cpu_schedstat_run_periods_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 984285
# HELP container_cpu_schedstat_run_seconds_total Time duration the processes of the container have run on the CPU.
# TYPE container_cpu_schedstat_run_seconds_total counter
container_cpu_schedstat_run_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.053643567
# HELP container_cpu_schedstat_runqueue_seconds_total Time duration processes of the container have been waiting on a runqueue.
# TYPE container_cpu_schedstat_runqueue_seconds_total counter
container_cpu_schedstat_runqueue_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 479.424566378
# HELP container_cpu_system_seconds_total Cumulative system cpu time consumed in seconds.
# TYPE container_cpu_system_seconds_total counter
container_cpu_system_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7e-09
Expand Down

0 comments on commit 08f0c23

Please sign in to comment.