From 6a2b220748071ec369a0ba5e3de802b8b407dd57 Mon Sep 17 00:00:00 2001 From: Sakerdotes Date: Tue, 17 May 2022 22:14:26 +0200 Subject: [PATCH] =?UTF-8?q?fix(parsers/nagios):=20metrics=20will=20always?= =?UTF-8?q?=20return=20a=20supported=20status=20co=E2=80=A6=20(#11062)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Morten Urban --- plugins/inputs/exec/exec.go | 9 ++--- plugins/parsers/nagios/parser.go | 45 ++++++++++++++------- plugins/parsers/nagios/parser_test.go | 58 +++++++++++++++++---------- 3 files changed, 70 insertions(+), 42 deletions(-) diff --git a/plugins/inputs/exec/exec.go b/plugins/inputs/exec/exec.go index 1e42bd3053332..3369979eafb55 100644 --- a/plugins/inputs/exec/exec.go +++ b/plugins/inputs/exec/exec.go @@ -127,9 +127,9 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync defer wg.Done() _, isNagios := e.parser.(*nagios.NagiosParser) - out, errbuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout)) + out, errBuf, runErr := e.runner.Run(command, e.Environment, time.Duration(e.Timeout)) if !isNagios && runErr != nil { - err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errbuf)) + err := fmt.Errorf("exec: %s for command '%s': %s", runErr, command, string(errBuf)) acc.AddError(err) return } @@ -141,10 +141,7 @@ func (e *Exec) ProcessCommand(command string, acc telegraf.Accumulator, wg *sync } if isNagios { - metrics, err = nagios.TryAddState(runErr, metrics) - if err != nil { - e.Log.Errorf("Failed to add nagios state: %s", err) - } + metrics = nagios.AddState(runErr, errBuf, metrics) } for _, m := range metrics { diff --git a/plugins/parsers/nagios/parser.go b/plugins/parsers/nagios/parser.go index f45e07a82eb1d..22aa5c0dff7a1 100644 --- a/plugins/parsers/nagios/parser.go +++ b/plugins/parsers/nagios/parser.go @@ -4,7 +4,6 @@ import ( "bufio" "bytes" "errors" - "fmt" "os/exec" "regexp" "strconv" @@ -16,6 +15,10 @@ import ( "github.com/influxdata/telegraf/metric" ) +// unknownExitCode is the nagios unknown status code +// the exit code should be used if an error occurs or something unexpected happens +const unknownExitCode = 3 + // getExitCode get the exit code from an error value which is the result // of running a command through exec package api. func getExitCode(err error) (int, error) { @@ -25,10 +28,7 @@ func getExitCode(err error) (int, error) { ee, ok := err.(*exec.ExitError) if !ok { - // If it is not an *exec.ExitError, then it must be - // an io error, but docs do not say anything about the - // exit code in this case. - return 0, err + return unknownExitCode, err } ws, ok := ee.Sys().(syscall.WaitStatus) @@ -39,19 +39,35 @@ func getExitCode(err error) (int, error) { return ws.ExitStatus(), nil } -// TryAddState attempts to add a state derived from the runErr. -// If any error occurs, it is guaranteed to be returned along with -// the initial metric slice. -func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, error) { - state, err := getExitCode(runErr) - if err != nil { - return metrics, fmt.Errorf("exec: get exit code: %s", err) +// AddState adds a state derived from the runErr. Unknown state will be set as fallback. +// If any error occurs, it is guaranteed to be added to the service output. +// An updated slice of metrics will be returned. +func AddState(runErr error, errMessage []byte, metrics []telegraf.Metric) []telegraf.Metric { + state, exitErr := getExitCode(runErr) + // This will ensure that in every error case the valid nagios state 'unknown' will be returned. + // No error needs to be thrown because the output will contain the error information. + // Description found at 'Plugin Return Codes' https://nagios-plugins.org/doc/guidelines.html + if exitErr != nil || state < 0 || state > unknownExitCode { + state = unknownExitCode } for _, m := range metrics { if m.Name() == "nagios_state" { m.AddField("state", state) - return metrics, nil + + if state == unknownExitCode { + errorMessage := string(errMessage) + if exitErr != nil && exitErr.Error() != "" { + errorMessage = exitErr.Error() + } + value, ok := m.GetField("service_output") + if !ok || value == "" { + // By adding the error message as output, the metric contains all needed information to understand + // the problem and fix it + m.AddField("service_output", errorMessage) + } + } + return metrics } } @@ -66,8 +82,7 @@ func TryAddState(runErr error, metrics []telegraf.Metric) ([]telegraf.Metric, er } m := metric.New("nagios_state", nil, f, ts) - metrics = append(metrics, m) - return metrics, nil + return append(metrics, m) } type NagiosParser struct { diff --git a/plugins/parsers/nagios/parser_test.go b/plugins/parsers/nagios/parser_test.go index 48cfce241d173..79a54ffa7a76f 100644 --- a/plugins/parsers/nagios/parser_test.go +++ b/plugins/parsers/nagios/parser_test.go @@ -32,7 +32,7 @@ func TestGetExitCode(t *testing.T) { errF: func() error { return errors.New("I am not *exec.ExitError") }, - expCode: 0, + expCode: 3, expErr: errors.New("I am not *exec.ExitError"), }, } @@ -89,10 +89,11 @@ func assertEqual(t *testing.T, exp, actual []telegraf.Metric) { func TestTryAddState(t *testing.T) { tests := []struct { - name string - runErrF func() error - metrics []telegraf.Metric - assertF func(*testing.T, []telegraf.Metric, error) + name string + runErrF func() error + runErrMessage []byte + metrics []telegraf.Metric + assertF func(*testing.T, []telegraf.Metric) }{ { name: "should append state=0 field to existing metric", @@ -107,7 +108,7 @@ func TestTryAddState(t *testing.T) { n("nagios_state"). f("service_output", "OK: system working").b(), }, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { exp := []telegraf.Metric{ mb(). n("nagios"). @@ -118,7 +119,6 @@ func TestTryAddState(t *testing.T) { f("state", 0).b(), } assertEqual(t, exp, metrics) - require.NoError(t, err) }, }, { @@ -131,7 +131,7 @@ func TestTryAddState(t *testing.T) { n("nagios"). f("perfdata", 0).b(), }, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { exp := []telegraf.Metric{ mb(). n("nagios"). @@ -141,7 +141,6 @@ func TestTryAddState(t *testing.T) { f("state", 0).b(), } assertEqual(t, exp, metrics) - require.NoError(t, err) }, }, { @@ -150,7 +149,7 @@ func TestTryAddState(t *testing.T) { return nil }, metrics: []telegraf.Metric{}, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { require.Len(t, metrics, 1) m := metrics[0] require.Equal(t, "nagios_state", m.Name()) @@ -158,37 +157,54 @@ func TestTryAddState(t *testing.T) { require.True(t, ok) require.Equal(t, int64(0), s) require.WithinDuration(t, time.Now().UTC(), m.Time(), 10*time.Second) - require.NoError(t, err) }, }, { - name: "should return original metrics and an error", + name: "should return metrics with state unknown and thrown error is service_output", runErrF: func() error { return errors.New("non parsable error") }, metrics: []telegraf.Metric{ mb(). - n("nagios"). - f("perfdata", 0).b(), + n("nagios_state").b(), }, - assertF: func(t *testing.T, metrics []telegraf.Metric, err error) { + assertF: func(t *testing.T, metrics []telegraf.Metric) { exp := []telegraf.Metric{ mb(). - n("nagios"). - f("perfdata", 0).b(), + n("nagios_state"). + f("state", 3). + f("service_output", "non parsable error").b(), } - expErr := "exec: get exit code: non parsable error" assertEqual(t, exp, metrics) - require.Equal(t, expErr, err.Error()) + }, + }, + { + name: "should return metrics with state unknown and service_output error from error message parameter", + runErrF: func() error { + return errors.New("") + }, + runErrMessage: []byte("some error message"), + metrics: []telegraf.Metric{ + mb(). + n("nagios_state").b(), + }, + assertF: func(t *testing.T, metrics []telegraf.Metric) { + exp := []telegraf.Metric{ + mb(). + n("nagios_state"). + f("state", 3). + f("service_output", "some error message").b(), + } + assertEqual(t, exp, metrics) }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - metrics, err := TryAddState(tt.runErrF(), tt.metrics) - tt.assertF(t, metrics, err) + metrics := AddState(tt.runErrF(), tt.runErrMessage, tt.metrics) + tt.assertF(t, metrics) }) } }