Skip to content

Commit

Permalink
Add new collector and metrics for watchdog (prometheus#2309) (prometh…
Browse files Browse the repository at this point in the history
…eus#2880)

Signed-off-by: Gavin Lam <gavin.oss@tutamail.com>
  • Loading branch information
gavinkflam authored and gitperr committed Apr 30, 2024
1 parent 7b13da1 commit 8d38833
Show file tree
Hide file tree
Showing 7 changed files with 348 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux
sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux
systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux
tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux
watchdog | Exposes statistics from `/sys/class/watchdog` | Linux
wifi | Exposes WiFi device and station statistics. | Linux
xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux
zoneinfo | Exposes NUMA memory zone metrics. | Linux
Expand Down
26 changes: 26 additions & 0 deletions collector/fixtures/e2e-64k-page-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2945,6 +2945,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="time"} 1
node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="watchdog"} 1
node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfrm"} 1
node_scrape_collector_success{collector="xfs"} 1
Expand Down Expand Up @@ -3218,6 +3219,31 @@ node_vmstat_pswpin 1476
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
# TYPE node_vmstat_pswpout untyped
node_vmstat_pswpout 35045
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
# TYPE node_wifi_interface_frequency_hertz gauge
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09
Expand Down
26 changes: 26 additions & 0 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2967,6 +2967,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="time"} 1
node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="watchdog"} 1
node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfrm"} 1
node_scrape_collector_success{collector="xfs"} 1
Expand Down Expand Up @@ -3240,6 +3241,31 @@ node_vmstat_pswpin 1476
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
# TYPE node_vmstat_pswpout untyped
node_vmstat_pswpout 35045
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
# TYPE node_wifi_interface_frequency_hertz gauge
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09
Expand Down
69 changes: 69 additions & 0 deletions collector/fixtures/sys.ttar
Original file line number Diff line number Diff line change
Expand Up @@ -1717,6 +1717,75 @@ SymlinkTo: ../../devices/virtual/thermal/cooling_device0
Path: sys/class/thermal/thermal_zone0
SymlinkTo: ../../devices/virtual/thermal/thermal_zone0
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog/watchdog0
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/access_cs0
Lines: 1
0EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/bootstatus
Lines: 1
1EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/fw_version
Lines: 1
2EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/identity
Lines: 1
Software WatchdogEOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/nowayout
Lines: 1
0EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/options
Lines: 1
0x8380EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/pretimeout
Lines: 1
120EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/pretimeout_governor
Lines: 1
noopEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/state
Lines: 1
activeEOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/status
Lines: 1
0x8000EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/timeleft
Lines: 1
300EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/timeout
Lines: 1
60EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog/watchdog1
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/devices
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down
133 changes: 133 additions & 0 deletions collector/watchdog.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux && !nowatchdog
// +build linux,!nowatchdog

package collector

import (
"fmt"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
)

type watchdogCollector struct {
fs sysfs.FS
logger log.Logger
}

func init() {
registerCollector("watchdog", defaultDisabled, NewWatchdogCollector)
}

// NewWatchdogCollector returns a new Collector exposing watchdog stats.
func NewWatchdogCollector(logger log.Logger) (Collector, error) {
fs, err := sysfs.NewFS(*sysPath)
if err != nil {
return nil, fmt.Errorf("failed to open procfs: %w", err)
}

return &watchdogCollector{
fs: fs,
logger: logger,
}, nil
}

var (
watchdogBootstatusDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "bootstatus"),
"Value of /sys/class/watchdog/<watchdog>/bootstatus",
[]string{"name"}, nil,
)
watchdogFwVersionDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "fw_version"),
"Value of /sys/class/watchdog/<watchdog>/fw_version",
[]string{"name"}, nil,
)
watchdogNowayoutDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "nowayout"),
"Value of /sys/class/watchdog/<watchdog>/nowayout",
[]string{"name"}, nil,
)
watchdogTimeleftDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "timeleft_seconds"),
"Value of /sys/class/watchdog/<watchdog>/timeleft",
[]string{"name"}, nil,
)
watchdogTimeoutDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "timeout_seconds"),
"Value of /sys/class/watchdog/<watchdog>/timeout",
[]string{"name"}, nil,
)
watchdogPretimeoutDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "pretimeout_seconds"),
"Value of /sys/class/watchdog/<watchdog>/pretimeout",
[]string{"name"}, nil,
)
watchdogAccessCs0Desc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "access_cs0"),
"Value of /sys/class/watchdog/<watchdog>/access_cs0",
[]string{"name"}, nil,
)
watchdogInfoDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "watchdog", "info"),
"Info of /sys/class/watchdog/<watchdog>",
[]string{"name", "options", "identity", "state", "status", "pretimeout_governor"}, nil,
)
)

func toLabelValue(ptr *string) string {
if ptr == nil {
return ""
}
return *ptr
}

func (c *watchdogCollector) Update(ch chan<- prometheus.Metric) error {
watchdogClass, err := c.fs.WatchdogClass()
if err != nil {
return err
}

for _, wd := range watchdogClass {
if wd.Bootstatus != nil {
ch <- prometheus.MustNewConstMetric(watchdogBootstatusDesc, prometheus.GaugeValue, float64(*wd.Bootstatus), wd.Name)
}
if wd.FwVersion != nil {
ch <- prometheus.MustNewConstMetric(watchdogFwVersionDesc, prometheus.GaugeValue, float64(*wd.FwVersion), wd.Name)
}
if wd.Nowayout != nil {
ch <- prometheus.MustNewConstMetric(watchdogNowayoutDesc, prometheus.GaugeValue, float64(*wd.Nowayout), wd.Name)
}
if wd.Timeleft != nil {
ch <- prometheus.MustNewConstMetric(watchdogTimeleftDesc, prometheus.GaugeValue, float64(*wd.Timeleft), wd.Name)
}
if wd.Timeout != nil {
ch <- prometheus.MustNewConstMetric(watchdogTimeoutDesc, prometheus.GaugeValue, float64(*wd.Timeout), wd.Name)
}
if wd.Pretimeout != nil {
ch <- prometheus.MustNewConstMetric(watchdogPretimeoutDesc, prometheus.GaugeValue, float64(*wd.Pretimeout), wd.Name)
}
if wd.AccessCs0 != nil {
ch <- prometheus.MustNewConstMetric(watchdogAccessCs0Desc, prometheus.GaugeValue, float64(*wd.AccessCs0), wd.Name)
}

ch <- prometheus.MustNewConstMetric(watchdogInfoDesc, prometheus.GaugeValue, 1.0,
wd.Name, toLabelValue(wd.Options), toLabelValue(wd.Identity), toLabelValue(wd.State), toLabelValue(wd.Status), toLabelValue(wd.PretimeoutGovernor))
}

return nil
}
92 changes: 92 additions & 0 deletions collector/watchdog_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file ewcept in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nowatchdog
// +build !nowatchdog

package collector

import (
"fmt"
"os"
"strings"
"testing"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

type testWatchdogCollector struct {
wc Collector
}

func (c testWatchdogCollector) Collect(ch chan<- prometheus.Metric) {
c.wc.Update(ch)
}

func (c testWatchdogCollector) Describe(ch chan<- *prometheus.Desc) {
prometheus.DescribeByCollect(c, ch)
}

func TestWatchdogStats(t *testing.T) {
testcase := `# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
`
*sysPath = "fixtures/sys"

logger := log.NewLogfmtLogger(os.Stderr)
c, err := NewWatchdogCollector(logger)
if err != nil {
t.Fatal(err)
}
reg := prometheus.NewRegistry()
reg.MustRegister(&testWatchdogCollector{wc: c})

sink := make(chan prometheus.Metric)
go func() {
err = c.Update(sink)
if err != nil {
panic(fmt.Errorf("failed to update collector: %s", err))
}
close(sink)
}()

err = testutil.GatherAndCompare(reg, strings.NewReader(testcase))
if err != nil {
t.Fatal(err)
}
}
1 change: 1 addition & 0 deletions end-to-end-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ enabled_collectors=$(cat << COLLECTORS
thermal_zone
udp_queues
vmstat
watchdog
wifi
xfrm
xfs
Expand Down

0 comments on commit 8d38833

Please sign in to comment.