diff --git a/METRICS.md b/METRICS.md index 10d8acb..d331c6f 100644 --- a/METRICS.md +++ b/METRICS.md @@ -40,3 +40,10 @@ - io_stalled_avg60 (float) - memory_waiting_avg60 (float) percentage over 60s - memory_stalled_avg60 (float) percentage over 60s +- nodestat_userprocs + - tags: + - group + - user + - fields: + - processes (int) + - threads (int) diff --git a/README.md b/README.md index 182ae26..69bf823 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Current collectors are:\ fc_host - fibrechannel metrics from /sys/class/fc_host/\ net - network interface metrics from /sys/class/net/\ pressure - metrics from /proc/pressure/\ + userprocs - metrics for the number of processes and threads per user\ Metric timestamp precision will be 1s. * Restart or reload Telegraf. @@ -54,6 +55,8 @@ nodestat_fc_host,nodename=20000025ff1bab79,type=NPort\ (fabric\ via\ point-to-po nodestat_net,interface=eno1,protocol=ethernet carrier=1i,flag_running=true,flag_up=true,operstate_code=0i,flag_lower_up=true,dormant=0i,duplex="full",link_mode=0i,operstate="up" 1662965695000000000 nodestat_net,interface=enp3s0f0,protocol=ethernet flag_running=true,flag_up=true,carrier=0i,duplex="unknown",link_mode=0i,operstate="down",operstate_code=5i,flag_lower_up=true,dormant=0i 1662965695000000000 nodestat_pressure cpu_waiting_avg60=0,io_waiting_avg60=21.1,io_stalled_avg60=20.97 1662965695000000000 +nodestat_userprocs,group=root,user=root threads=173i,processes=124i 1662965695000000000 +nodestat_userprocs,group=postfix,user=postfix processes=3i,threads=3i 1662965695000000000 ``` # Metrics diff --git a/internal/collectors/collectors.go b/internal/collectors/collectors.go index 89602bb..e0c6b75 100644 --- a/internal/collectors/collectors.go +++ b/internal/collectors/collectors.go @@ -25,7 +25,7 @@ type CollectorInfo struct { var collectInfos []CollectorInfo func init() { - var c []CollectorInfo + var c = make([]CollectorInfo, 0, 4) var ci CollectorInfo ci = CollectorInfo{"fc_host", true, 
"fibrechannels", fssys.GatherSysFcHostInfo} @@ -34,6 +34,8 @@ func init() { c = append(c, ci) ci = CollectorInfo{"pressure", true, "pressure", fsproc.GatherProcPressureInfo} c = append(c, ci) + ci = CollectorInfo{"userprocs", true, "processes per user", fsproc.GatherProcUserProcsInfo} + c = append(c, ci) collectInfos = c } diff --git a/internal/collectors/fsproc/userprocs.go b/internal/collectors/fsproc/userprocs.go new file mode 100644 index 0000000..064ae7b --- /dev/null +++ b/internal/collectors/fsproc/userprocs.go @@ -0,0 +1,91 @@ +// fsproc functions show metrics from linux /proc filesystem using influx line protocol +// +// Author: Tesifonte Belda +// License: The MIT License (MIT) +// +// References: +// https://github.com/prometheus/node_exporter/tree/master/collector/pressure_linux.go + +package fsproc + +import ( + "fmt" + "os" + "os/user" + "time" + + "github.com/prometheus/procfs" + "github.com/tesibelda/lightmetric/metric" +) + +type userInfo struct { + processes int + threads int +} + +// GatherProcUserProcsInfo prints the number of processes and threads per user, read from /proc/<pid>/status and /proc/<pid>/stat +func GatherProcUserProcsInfo() error { + fs, err := procfs.NewDefaultFS() + if err != nil { + return fmt.Errorf("failed to open procfs: %w", err) + } + + p, err := fs.AllProcs() + if err != nil { + return fmt.Errorf("unable to list all processes: %w", err) + } + + uidprocs := make(map[string]userInfo, 10) + totalProcs, th, ok := 0, 0, false + info := userInfo{} + for _, pid := range p { + status, err := pid.NewStatus() + if err != nil { + // PIDs can vanish between getting the list and getting stats.
+ continue + } + + th = 0 + stat, err := pid.Stat() + if err == nil { + th = stat.NumThreads + } + + if info, ok = uidprocs[status.UIDs[0]]; !ok { + info = userInfo{} + } + info.processes++ + info.threads += th + uidprocs[status.UIDs[0]] = info + totalProcs++ + } + if totalProcs == 0 { + return fmt.Errorf("unable to list any processes") + } + + fields := make(map[string]interface{}, 2) + tags := make(map[string]string, 2) + var t time.Time + var m metric.Metric + for k, v := range uidprocs { + usr, err := user.LookupId(k) + if err != nil { + continue + } + if len(usr.Username) > 0 { + grp, err := user.LookupGroupId(usr.Gid) + if err != nil { + grp = &user.Group{} + } + + fields["processes"] = v.processes + fields["threads"] = v.threads + tags["user"] = usr.Username + tags["group"] = grp.Name + t = metric.TimeWithPrecision(time.Now(), time.Second) + m = metric.New("nodestat_userprocs", tags, fields, t) + fmt.Fprint(os.Stdout, m.String(metric.InfluxLp)) + } + } + return nil +}