Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new collector and metrics for watchdog #2880

Merged
merged 1 commit into from
Mar 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux
sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux
systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux
tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux
watchdog | Exposes statistics from `/sys/class/watchdog` | Linux
wifi | Exposes WiFi device and station statistics. | Linux
xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux
zoneinfo | Exposes NUMA memory zone metrics. | Linux
Expand Down
26 changes: 26 additions & 0 deletions collector/fixtures/e2e-64k-page-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2945,6 +2945,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="time"} 1
node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="watchdog"} 1
node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfrm"} 1
node_scrape_collector_success{collector="xfs"} 1
Expand Down Expand Up @@ -3218,6 +3219,31 @@ node_vmstat_pswpin 1476
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
# TYPE node_vmstat_pswpout untyped
node_vmstat_pswpout 35045
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
# TYPE node_wifi_interface_frequency_hertz gauge
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09
Expand Down
26 changes: 26 additions & 0 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2967,6 +2967,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="time"} 1
node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="watchdog"} 1
node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfrm"} 1
node_scrape_collector_success{collector="xfs"} 1
Expand Down Expand Up @@ -3240,6 +3241,31 @@ node_vmstat_pswpin 1476
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
# TYPE node_vmstat_pswpout untyped
node_vmstat_pswpout 35045
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
# TYPE node_wifi_interface_frequency_hertz gauge
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09
Expand Down
69 changes: 69 additions & 0 deletions collector/fixtures/sys.ttar
Original file line number Diff line number Diff line change
Expand Up @@ -1717,6 +1717,75 @@ SymlinkTo: ../../devices/virtual/thermal/cooling_device0
Path: sys/class/thermal/thermal_zone0
SymlinkTo: ../../devices/virtual/thermal/thermal_zone0
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog/watchdog0
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/access_cs0
Lines: 1
0EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/bootstatus
Lines: 1
1EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/fw_version
Lines: 1
2EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/identity
Lines: 1
Software WatchdogEOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/nowayout
Lines: 1
0EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/options
Lines: 1
0x8380EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/pretimeout
Lines: 1
120EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/pretimeout_governor
Lines: 1
noopEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/state
Lines: 1
activeEOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/status
Lines: 1
0x8000EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/timeleft
Lines: 1
300EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/timeout
Lines: 1
60EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog/watchdog1
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/devices
Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down
133 changes: 133 additions & 0 deletions collector/watchdog.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux && !nowatchdog
// +build linux,!nowatchdog

package collector

import (
"fmt"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
)

// watchdogCollector is the collector state built by NewWatchdogCollector.
type watchdogCollector struct {
// fs is the sysfs handle opened with sysfs.NewFS(*sysPath).
fs sysfs.FS
// logger is the logger handed to NewWatchdogCollector.
logger log.Logger
}

// init registers NewWatchdogCollector under the collector name "watchdog",
// passing defaultDisabled as its default state.
func init() {
registerCollector("watchdog", defaultDisabled, NewWatchdogCollector)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems low cardinality enough, and useful enough, we could consider enabling this by default.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I have submitted a PR for this. #2953

}

// NewWatchdogCollector returns a new Collector exposing watchdog stats.
//
// It opens the sysfs tree rooted at *sysPath and stores the resulting handle
// together with logger on the collector. An error is returned, wrapping the
// underlying cause, when the sysfs tree cannot be opened.
func NewWatchdogCollector(logger log.Logger) (Collector, error) {
	fs, err := sysfs.NewFS(*sysPath)
	if err != nil {
		// The handle being opened is sysfs (not procfs); say so, so the
		// message points the operator at the right path.
		return nil, fmt.Errorf("failed to open sysfs: %w", err)
	}

	return &watchdogCollector{
		fs:     fs,
		logger: logger,
	}, nil
}

// Metric descriptors used by the watchdog collector. Every descriptor has a
// "name" label; the info descriptor additionally carries the string-valued
// attributes as labels.
var (
	// watchdogInfoDesc describes the per-device info metric.
	watchdogInfoDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "info"),
		"Info of /sys/class/watchdog/<watchdog>",
		[]string{"name", "options", "identity", "state", "status", "pretimeout_governor"},
		nil,
	)

	// watchdogAccessCs0Desc describes the access_cs0 gauge.
	watchdogAccessCs0Desc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "access_cs0"),
		"Value of /sys/class/watchdog/<watchdog>/access_cs0",
		[]string{"name"},
		nil,
	)

	// watchdogBootstatusDesc describes the bootstatus gauge.
	watchdogBootstatusDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "bootstatus"),
		"Value of /sys/class/watchdog/<watchdog>/bootstatus",
		[]string{"name"},
		nil,
	)

	// watchdogFwVersionDesc describes the fw_version gauge.
	watchdogFwVersionDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "fw_version"),
		"Value of /sys/class/watchdog/<watchdog>/fw_version",
		[]string{"name"},
		nil,
	)

	// watchdogNowayoutDesc describes the nowayout gauge.
	watchdogNowayoutDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "nowayout"),
		"Value of /sys/class/watchdog/<watchdog>/nowayout",
		[]string{"name"},
		nil,
	)

	// watchdogPretimeoutDesc describes the pretimeout gauge.
	watchdogPretimeoutDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "pretimeout_seconds"),
		"Value of /sys/class/watchdog/<watchdog>/pretimeout",
		[]string{"name"},
		nil,
	)

	// watchdogTimeleftDesc describes the timeleft gauge.
	watchdogTimeleftDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "timeleft_seconds"),
		"Value of /sys/class/watchdog/<watchdog>/timeleft",
		[]string{"name"},
		nil,
	)

	// watchdogTimeoutDesc describes the timeout gauge.
	watchdogTimeoutDesc = prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "watchdog", "timeout_seconds"),
		"Value of /sys/class/watchdog/<watchdog>/timeout",
		[]string{"name"},
		nil,
	)
)

// toLabelValue dereferences ptr for use as a label value. A nil pointer
// yields the empty string.
func toLabelValue(ptr *string) string {
	var value string
	if ptr != nil {
		value = *ptr
	}
	return value
}

// Update fetches the watchdog class from sysfs and sends the metrics for each
// device on ch.
//
// A numeric attribute is emitted as a gauge only when its pointer is non-nil.
// The info metric is emitted for every device with value 1; string attributes
// that are nil become empty label values. An error from reading the watchdog
// class is returned unchanged.
func (c *watchdogCollector) Update(ch chan<- prometheus.Metric) error {
	devices, err := c.fs.WatchdogClass()
	if err != nil {
		return err
	}

	for _, dev := range devices {
		// gauge sends a single gauge sample labelled with the device name.
		gauge := func(desc *prometheus.Desc, value float64) {
			ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, dev.Name)
		}

		if v := dev.Bootstatus; v != nil {
			gauge(watchdogBootstatusDesc, float64(*v))
		}
		if v := dev.FwVersion; v != nil {
			gauge(watchdogFwVersionDesc, float64(*v))
		}
		if v := dev.Nowayout; v != nil {
			gauge(watchdogNowayoutDesc, float64(*v))
		}
		if v := dev.Timeleft; v != nil {
			gauge(watchdogTimeleftDesc, float64(*v))
		}
		if v := dev.Timeout; v != nil {
			gauge(watchdogTimeoutDesc, float64(*v))
		}
		if v := dev.Pretimeout; v != nil {
			gauge(watchdogPretimeoutDesc, float64(*v))
		}
		if v := dev.AccessCs0; v != nil {
			gauge(watchdogAccessCs0Desc, float64(*v))
		}

		// Label values must follow the label order declared on watchdogInfoDesc.
		infoLabels := []string{
			dev.Name,
			toLabelValue(dev.Options),
			toLabelValue(dev.Identity),
			toLabelValue(dev.State),
			toLabelValue(dev.Status),
			toLabelValue(dev.PretimeoutGovernor),
		}
		ch <- prometheus.MustNewConstMetric(watchdogInfoDesc, prometheus.GaugeValue, 1.0, infoLabels...)
	}

	return nil
}
92 changes: 92 additions & 0 deletions collector/watchdog_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nowatchdog
// +build !nowatchdog

package collector

import (
"fmt"
"os"
"strings"
"testing"

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

// testWatchdogCollector wraps a Collector so that it can be registered with a
// prometheus registry through its Collect and Describe methods.
type testWatchdogCollector struct {
// wc is the wrapped collector whose Update method feeds Collect.
wc Collector
}

// Collect forwards ch to the wrapped collector's Update method.
func (c testWatchdogCollector) Collect(ch chan<- prometheus.Metric) {
	// Collect has no error return, so the error from Update is deliberately
	// discarded here.
	_ = c.wc.Update(ch)
}

// Describe delegates to prometheus.DescribeByCollect, passing the collector
// itself and ch.
func (c testWatchdogCollector) Describe(ch chan<- *prometheus.Desc) {
prometheus.DescribeByCollect(c, ch)
}

// TestWatchdogStats runs the watchdog collector against the fixtures/sys tree
// and compares the gathered metrics with the expected exposition text.
//
// Update is additionally called directly so that its returned error is
// checked; the Collect wrapper used for gathering cannot return it.
func TestWatchdogStats(t *testing.T) {
	testcase := `# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
`
	*sysPath = "fixtures/sys"

	logger := log.NewLogfmtLogger(os.Stderr)
	c, err := NewWatchdogCollector(logger)
	if err != nil {
		t.Fatal(err)
	}
	reg := prometheus.NewRegistry()
	reg.MustRegister(&testWatchdogCollector{wc: c})

	// Run Update in its own goroutine because sink is unbuffered, and drain
	// sink here so that Update can finish instead of blocking on its first
	// send. The error travels back over a buffered channel rather than a
	// shared variable, so there is no data race and the failure is reported
	// through t instead of a panic in the goroutine.
	sink := make(chan prometheus.Metric)
	updateErr := make(chan error, 1)
	go func() {
		defer close(sink)
		updateErr <- c.Update(sink)
	}()
	for range sink {
		// Metrics are verified via GatherAndCompare below; discard them here.
	}
	if err := <-updateErr; err != nil {
		t.Fatal(fmt.Errorf("failed to update collector: %w", err))
	}

	if err := testutil.GatherAndCompare(reg, strings.NewReader(testcase)); err != nil {
		t.Fatal(err)
	}
}
1 change: 1 addition & 0 deletions end-to-end-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ enabled_collectors=$(cat << COLLECTORS
thermal_zone
udp_queues
vmstat
watchdog
wifi
xfrm
xfs
Expand Down
Loading