diff --git a/cadvisor.go b/cadvisor.go index 5336cb4b8d..6e47e0fa8d 100644 --- a/cadvisor.go +++ b/cadvisor.go @@ -68,6 +68,7 @@ var ( container.NetworkTcpUsageMetrics: struct{}{}, container.NetworkUdpUsageMetrics: struct{}{}, container.ProcessSchedulerMetrics: struct{}{}, + container.ProcessMetrics: struct{}{}, }} // List of metrics that can be ignored. @@ -78,6 +79,7 @@ var ( container.NetworkUdpUsageMetrics: struct{}{}, container.PerCpuUsageMetrics: struct{}{}, container.ProcessSchedulerMetrics: struct{}{}, + container.ProcessMetrics: struct{}{}, } ) @@ -109,7 +111,7 @@ func (ml *metricSetValue) Set(value string) error { } func init() { - flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu'. Note: tcp and udp are disabled by default due to high CPU usage.") + flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'disk', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process'. 
Note: tcp and udp are disabled by default due to high CPU usage.") // Default logging verbosity to V(2) flag.Set("v", "2") @@ -251,6 +253,7 @@ func toIncludedMetrics(ignoreMetrics container.MetricSet) container.MetricSet { container.NetworkUdpUsageMetrics, container.AcceleratorUsageMetrics, container.AppMetrics, + container.ProcessMetrics, } for _, metric := range allMetrics { if !ignoreMetrics.Has(metric) { diff --git a/container/factory.go b/container/factory.go index 47847057e0..8e33ade6d5 100644 --- a/container/factory.go +++ b/container/factory.go @@ -53,6 +53,7 @@ const ( NetworkUdpUsageMetrics MetricKind = "udp" AcceleratorUsageMetrics MetricKind = "accelerator" AppMetrics MetricKind = "app" + ProcessMetrics MetricKind = "process" ) func (mk MetricKind) String() string { diff --git a/docs/storage/prometheus.md b/docs/storage/prometheus.md index adc0e34234..c6bae7d20d 100644 --- a/docs/storage/prometheus.md +++ b/docs/storage/prometheus.md @@ -29,6 +29,7 @@ Metric name | Type | Description | Unit (where applicable) `container_cpu_system_seconds_total` | Counter | Cumulative system cpu time consumed | seconds `container_cpu_usage_seconds_total` | Counter | Cumulative cpu time consumed | seconds `container_cpu_user_seconds_total` | Counter | Cumulative user cpu time consumed | seconds +`container_file_descriptors` | Gauge | Number of open file descriptors for the container | `container_fs_inodes_free` | Gauge | Number of available Inodes | `container_fs_inodes_total` | Gauge | Total number of Inodes | `container_fs_io_current` | Gauge | Number of I/Os currently in progress | @@ -66,6 +67,7 @@ Metric name | Type | Description | Unit (where applicable) `container_network_transmit_errors_total` | Counter | Cumulative count of errors encountered while transmitting | `container_network_tcp_usage_total` | Gauge | tcp connection usage statistic for container | `container_network_udp_usage_total` | Gauge | udp connection usage statistic for container | 
+`container_processes` | Gauge | Number of processes running inside the container | `container_spec_cpu_period` | Gauge | CPU period of the container | `container_spec_cpu_quota` | Gauge | CPU quota of the container | `container_spec_cpu_shares` | Gauge | CPU share of the container | diff --git a/info/v2/container.go b/info/v2/container.go index 0e2fc7ea16..4288d003db 100644 --- a/info/v2/container.go +++ b/info/v2/container.go @@ -254,6 +254,7 @@ type ProcessInfo struct { RunningTime string `json:"running_time"` CgroupPath string `json:"cgroup_path"` Cmd string `json:"cmd"` + FdCount int `json:"fd_count"` } type TcpStat struct { diff --git a/manager/container.go b/manager/container.go index 295479f092..55d07501b5 100644 --- a/manager/container.go +++ b/manager/container.go @@ -47,7 +47,9 @@ import ( var enableLoadReader = flag.Bool("enable_load_reader", false, "Whether to enable cpu load reader") var HousekeepingInterval = flag.Duration("housekeeping_interval", 1*time.Second, "Interval between container housekeepings") -var cgroupPathRegExp = regexp.MustCompile(`devices[^:]*:(.*?)[,;$]`) +// cgroup type chosen to fetch the cgroup path of a process. +// Memory has been chosen, as it is one of the default cgroups that is enabled for most containers. The captured path ends at the next ',' or ';', or at the end of the input when the memory entry comes last. +var cgroupPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)(?:[,;]|$)`) type containerInfo struct { info.ContainerReference @@ -185,8 +187,8 @@ func (c *containerData) getCgroupPath(cgroups string) (string, error) { } matches := cgroupPathRegExp.FindSubmatch([]byte(cgroups)) if len(matches) != 2 { - glog.V(3).Infof("failed to get devices cgroup path from %q", cgroups) - // return root in case of failures - devices hierarchy might not be enabled. + glog.V(3).Infof("failed to get memory cgroup path from %q", cgroups) + // return root in case of failures - memory hierarchy might not be enabled. 
return "/", nil } return string(matches[1]), nil @@ -266,6 +268,10 @@ func (c *containerData) getContainerPids(inHostNamespace bool) ([]string, error) func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace bool) ([]v2.ProcessInfo, error) { // report all processes for root. isRoot := c.info.Name == "/" + rootfs := "/" + if !inHostNamespace { + rootfs = "/rootfs" + } format := "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,cgroup" out, err := c.getPsOutput(inHostNamespace, format) if err != nil { @@ -324,6 +330,15 @@ func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace cgroupPath = cgroup } if isRoot || c.info.Name == cgroup { + // Count open fds only for processes that pass the container filter, + // so filtered-out processes never trigger a /proc/<pid>/fd listing. + dirPath := path.Join(rootfs, "/proc", strconv.Itoa(pid), "fd") + fds, err := ioutil.ReadDir(dirPath) + if err != nil { + glog.V(4).Infof("error while listing directory %q to measure fd count: %v", dirPath, err) + continue + } + fdCount := len(fds) processes = append(processes, v2.ProcessInfo{ User: fields[0], @@ -338,6 +353,7 @@ func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace RunningTime: fields[9], Cmd: fields[10], CgroupPath: cgroupPath, + FdCount: fdCount, }) } } diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 2fff79a2c6..6f01d60605 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -21,6 +21,7 @@ import ( "github.com/google/cadvisor/container" info "github.com/google/cadvisor/info/v1" + "github.com/google/cadvisor/info/v2" "github.com/golang/glog" "github.com/prometheus/client_golang/prometheus" @@ -35,6 +36,8 @@ type infoProvider interface { GetVersionInfo() (*info.VersionInfo, error) // GetMachineInfo provides information about the machine. 
GetMachineInfo() (*info.MachineInfo, error) + // GetProcessList provides information about each container's processes + GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) } // metricValue describes a single metric value for a given set of label values @@ -109,6 +112,7 @@ type PrometheusCollector struct { errors prometheus.Gauge containerMetrics []containerMetric containerLabelsFunc ContainerLabelsFunc + includedMetrics container.MetricSet } // NewPrometheusCollector returns a new PrometheusCollector. The passed @@ -137,6 +141,7 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri }, }, }, + includedMetrics: includedMetrics, } if includedMetrics.Has(container.CpuUsageMetrics) { c.containerMetrics = append(c.containerMetrics, []containerMetric{ @@ -926,10 +931,15 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric) rawLabels[l] = struct{}{} } } - for _, container := range containers { + + psReqOpt := v2.RequestOptions{ + IdType: v2.TypeName, + } + + for _, cont := range containers { values := make([]string, 0, len(rawLabels)) labels := make([]string, 0, len(rawLabels)) - containerLabels := c.containerLabelsFunc(container) + containerLabels := c.containerLabelsFunc(cont) for l := range rawLabels { labels = append(labels, sanitizeLabelName(l)) values = append(values, containerLabels[l]) @@ -937,35 +947,50 @@ func (c *PrometheusCollector) collectContainersInfo(ch chan<- prometheus.Metric) // Container spec desc := prometheus.NewDesc("container_start_time_seconds", "Start time of the container since unix epoch in seconds.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.CreationTime.Unix()), values...) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.CreationTime.Unix()), values...) 
- if container.Spec.HasCpu { + if cont.Spec.HasCpu { desc = prometheus.NewDesc("container_spec_cpu_period", "CPU period of the container.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Period), values...) - if container.Spec.Cpu.Quota != 0 { + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Period), values...) + if cont.Spec.Cpu.Quota != 0 { desc = prometheus.NewDesc("container_spec_cpu_quota", "CPU quota of the container.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Quota), values...) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Quota), values...) } desc := prometheus.NewDesc("container_spec_cpu_shares", "CPU share of the container.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(container.Spec.Cpu.Limit), values...) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(cont.Spec.Cpu.Limit), values...) } - if container.Spec.HasMemory { + if cont.Spec.HasMemory { desc := prometheus.NewDesc("container_spec_memory_limit_bytes", "Memory limit for the container.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Limit), values...) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.Limit), values...) desc = prometheus.NewDesc("container_spec_memory_swap_limit_bytes", "Memory swap limit for the container.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.SwapLimit), values...) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.SwapLimit), values...) 
desc = prometheus.NewDesc("container_spec_memory_reservation_limit_bytes", "Memory reservation limit for the container.", labels, nil) - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(container.Spec.Memory.Reservation), values...) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, specMemoryValue(cont.Spec.Memory.Reservation), values...) + } + + if c.includedMetrics.Has(container.ProcessMetrics) { + psList, err := c.infoProvider.GetProcessList(cont.Name, psReqOpt) + if err == nil { + desc = prometheus.NewDesc("container_processes", "Number of processes running inside the container.", labels, nil) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(len(psList)), values...) + + var fd int + for _, ps := range psList { + fd += ps.FdCount + } + desc = prometheus.NewDesc("container_file_descriptors", "Number of open file descriptors for the container.", labels, nil) + ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(fd), values...) 
+ } } // Now for the actual metrics - if len(container.Stats) == 0 { + if len(cont.Stats) == 0 { continue } - stats := container.Stats[0] + stats := cont.Stats[0] for _, cm := range c.containerMetrics { - if cm.condition != nil && !cm.condition(container.Spec) { + if cm.condition != nil && !cm.condition(cont.Spec) { continue } desc := cm.desc(labels) diff --git a/metrics/prometheus_test.go b/metrics/prometheus_test.go index c6af3a1949..ddaab830ea 100644 --- a/metrics/prometheus_test.go +++ b/metrics/prometheus_test.go @@ -26,6 +26,7 @@ import ( "github.com/google/cadvisor/container" info "github.com/google/cadvisor/info/v1" + "github.com/google/cadvisor/info/v2" "github.com/prometheus/client_golang/prometheus" ) @@ -49,6 +50,26 @@ func (p testSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, erro }, nil } +func (p testSubcontainersInfoProvider) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, error) { + return []v2.ProcessInfo{ + { + User: "user1", + Pid: 1, + Ppid: 2, + StartTime: "OCT2018", + PercentCpu: 0.0, + PercentMemory: 0.0, + RSS: 3, + VirtualSize: 4, + Status: "S", + RunningTime: "00:00:00", + Cmd: "cmd1", + CgroupPath: "path", + FdCount: 5, + }, + }, nil +} + var allMetrics = container.MetricSet{ container.CpuUsageMetrics: struct{}{}, container.ProcessSchedulerMetrics: struct{}{}, @@ -61,6 +82,7 @@ var allMetrics = container.MetricSet{ container.NetworkUsageMetrics: struct{}{}, container.NetworkTcpUsageMetrics: struct{}{}, container.NetworkUdpUsageMetrics: struct{}{}, + container.ProcessMetrics: struct{}{}, } func (p testSubcontainersInfoProvider) SubcontainersInfo(string, *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) { @@ -305,6 +327,13 @@ func (p *erroringSubcontainersInfoProvider) GetMachineInfo() (*info.MachineInfo, return p.successfulProvider.GetMachineInfo() } +func (p *erroringSubcontainersInfoProvider) GetProcessList(containerName string, options v2.RequestOptions) ([]v2.ProcessInfo, 
error) { + if p.shouldFail { + return nil, errors.New("Oops 2") + } + return p.successfulProvider.GetProcessList(containerName, options) +} + func (p *erroringSubcontainersInfoProvider) SubcontainersInfo( a string, r *info.ContainerInfoRequest) ([]*info.ContainerInfo, error) { if p.shouldFail { diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index d323fb7f06..eb42305957 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -46,6 +46,9 @@ container_cpu_usage_seconds_total{container_env_foo_env="prod",container_label_f # HELP container_cpu_user_seconds_total Cumulative user cpu time consumed in seconds. # TYPE container_cpu_user_seconds_total counter container_cpu_user_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6e-09 +# HELP container_file_descriptors Number of open file descriptors for the container. +# TYPE container_file_descriptors gauge +container_file_descriptors{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 5 # HELP container_fs_inodes_free Number of available Inodes # TYPE container_fs_inodes_free gauge container_fs_inodes_free{container_env_foo_env="prod",container_label_foo_label="bar",device="sda1",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 524288 @@ -182,6 +185,9 @@ container_network_udp_usage_total{container_env_foo_env="prod",container_label_f container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="listen",zone_name="hello"} 0 container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="rxqueued",zone_name="hello"} 0 
container_network_udp_usage_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",udp_state="txqueued",zone_name="hello"} 0 +# HELP container_processes Number of processes running inside the container. +# TYPE container_processes gauge +container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 # HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise # TYPE container_scrape_error gauge container_scrape_error 0