From 0b2a2f1df61cd745a1b3adfed3ee3ba647181b77 Mon Sep 17 00:00:00 2001 From: Kuba Trojan Date: Tue, 23 Nov 2021 08:08:31 -0800 Subject: [PATCH] fix: Add SMART plugin concurrency configuration option, nvme-cli v1.14+ support and lint fixes. --- plugins/inputs/smart/README.md | 22 ++- plugins/inputs/smart/smart.go | 201 ++++++++++++++++------ plugins/inputs/smart/smart_test.go | 268 +++++++++++++++++++++++++++-- 3 files changed, 422 insertions(+), 69 deletions(-) diff --git a/plugins/inputs/smart/README.md b/plugins/inputs/smart/README.md index 31ab92d3ad9d6..5c9a0a1539102 100644 --- a/plugins/inputs/smart/README.md +++ b/plugins/inputs/smart/README.md @@ -112,6 +112,13 @@ smartctl --scan -d nvme ## Timeout for the cli command to complete. # timeout = "30s" + + ## Optionally call smartctl and nvme-cli sequentially to gather disk attributes. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at the time. In such case please set this configuration option to false + ## to get readings for all drives. + # parallel_reads = true ``` ## Permissions @@ -235,13 +242,26 @@ the DEVICE (name of the device could be taken from the previous command): smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE ``` -If you try to gather vendor specific metrics, please provide this commad +If you try to gather vendor specific metrics, please provide this command and replace vendor and device to match your case: ```sh nvme VENDOR smart-log-add DEVICE ``` +If you have specified devices array in configuration file, and Telegraf only shows data from one device, you should +change the plugin configuration to sequentially gather disk attributes instead of collecting it in separate threads +(goroutines). 
To do this, find parallel_reads in the plugin configuration and change it to false: + +```toml + ## Optionally call smartctl and nvme-cli sequentially to gather disk attributes. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at the time. In such case please set this configuration option to false + ## to get readings for all drives. + parallel_reads = false +``` + ## Example SMART Plugin Outputs ```shell diff --git a/plugins/inputs/smart/smart.go b/plugins/inputs/smart/smart.go index b0f189d69fbf9..457b28ba8e4bc 100644 --- a/plugins/inputs/smart/smart.go +++ b/plugins/inputs/smart/smart.go @@ -53,14 +53,26 @@ var ( attribute = regexp.MustCompile(`^\s*([0-9]+)\s(\S+)\s+([-P][-O][-S][-R][-C][-K])\s+([0-9]+)\s+([0-9]+)\s+([0-9-]+)\s+([-\w]+)\s+([\w\+\.]+).*$`) // Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff + // nvme version 1.14+ metrics: + // ID KEY Normalized Raw + // 0xab program_fail_count 100 0 + + // nvme deprecated metric format: // key normalized raw // program_fail_count : 100% 0 - intelExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\w\s]+)%(.+)`) + + // REGEX pattern supports deprecated metrics (nvme-cli version below 1.14) and metrics from nvme-cli 1.14 (and above). + intelExpressionPattern = regexp.MustCompile(`^([A-Za-z0-9_\s]+)[:|\s]+(\d+)[%|\s]+(.+)`) // vid : 0x8086 // sn : CFGT53260XSP8011P nvmeIDCtrlExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\s\w]+)`) + // Format from nvme-cli 1.14 (and above) gives ID and KEY, this regex is for separating id from key. 
+ // ID KEY + // 0xab program_fail_count + nvmeIDSeparatePattern = regexp.MustCompile(`^([A-Za-z0-9_]+)(.+)`) + deviceFieldIds = map[string]string{ "1": "read_error_rate", "7": "seek_error_rate", @@ -213,12 +225,51 @@ var ( Parse: parseTemperatureSensor, }, } - - // to obtain Intel specific metrics from nvme-cli + // To obtain Intel specific metrics from nvme-cli version 1.14 and above. intelAttributes = map[string]struct { ID string Name string Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error + }{ + "program_fail_count": { + Name: "Program_Fail_Count", + }, + "erase_fail_count": { + Name: "Erase_Fail_Count", + }, + "wear_leveling_count": { // previously: "wear_leveling" + Name: "Wear_Leveling_Count", + }, + "e2e_error_detect_count": { // previously: "end_to_end_error_detection_count" + Name: "End_To_End_Error_Detection_Count", + }, + "crc_error_count": { + Name: "Crc_Error_Count", + }, + "media_wear_percentage": { // previously: "timed_workload_media_wear" + Name: "Media_Wear_Percentage", + }, + "host_reads": { + Name: "Host_Reads", + }, + "timed_work_load": { // previously: "timed_workload_timer" + Name: "Timed_Workload_Timer", + }, + "thermal_throttle_status": { + Name: "Thermal_Throttle_Status", + }, + "retry_buff_overflow_count": { // previously: "retry_buffer_overflow_count" + Name: "Retry_Buffer_Overflow_Count", + }, + "pll_lock_loss_counter": { // previously: "pll_lock_loss_count" + Name: "Pll_Lock_Loss_Count", + }, + } + // to obtain Intel specific metrics from nvme-cli + intelAttributesDeprecatedFormat = map[string]struct { + ID string + Name string + Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error }{ "program_fail_count": { Name: "Program_Fail_Count", @@ -284,6 +335,7 @@ type Smart struct { UseSudo bool `toml:"use_sudo"` Timeout config.Duration `toml:"timeout"` Log telegraf.Logger `toml:"-"` + ParallelReads bool 
`toml:"parallel_reads"` } type nvmeDevice struct { @@ -333,6 +385,13 @@ var sampleConfig = ` ## Timeout for the cli command to complete. # timeout = "30s" + + ## Optionally call smartctl and nvme-cli sequentially to gather disk attributes. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at the time. In such case please set this configuration option to false + ## to get readings for all drives. + # parallel_reads = true ` func newSmart() *Smart { @@ -404,9 +463,9 @@ func (m *Smart) Gather(acc telegraf.Accumulator) error { if err != nil { return err } - NVMeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices) + nvmeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices) - m.getVendorNVMeAttributes(acc, NVMeDevices) + m.getVendorNVMeAttributes(acc, nvmeDevices) } return nil } @@ -434,28 +493,28 @@ func (m *Smart) scanAllDevices(ignoreExcludes bool) ([]string, []string, error) } // this will return only NVMe devices - NVMeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme") + nvmeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme") if err != nil { return nil, nil, err } // to handle all versions of smartctl this will return only non NVMe devices - nonNVMeDevices := difference(devices, NVMeDevices) - return NVMeDevices, nonNVMeDevices, nil + nonNVMeDevices := difference(devices, nvmeDevices) + return nvmeDevices, nonNVMeDevices, nil } func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string { - var NVMeDevices []string + var nvmeDevices []string for _, userDevice := range userDevices { for _, NVMeDevice := range availableNVMeDevices { // double check. E.g. in case when nvme0 is equal nvme0n1, will check if "nvme0" part is present. 
if strings.Contains(NVMeDevice, userDevice) || strings.Contains(userDevice, NVMeDevice) { - NVMeDevices = append(NVMeDevices, userDevice) + nvmeDevices = append(nvmeDevices, userDevice) } } } - return NVMeDevices + return nvmeDevices } // Scan for S.M.A.R.T. devices from smartctl @@ -506,69 +565,77 @@ func excludedDev(excludes []string, deviceLine string) bool { func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) { var wg sync.WaitGroup wg.Add(len(devices)) - for _, device := range devices { - go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.PathSmartctl, m.Nocheck, device, &wg) + if m.ParallelReads { + go m.gatherDisk(acc, device, &wg) + } else { + m.gatherDisk(acc, device, &wg) + } } wg.Wait() } func (m *Smart) getVendorNVMeAttributes(acc telegraf.Accumulator, devices []string) { - NVMeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo) + nvmeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo) var wg sync.WaitGroup - for _, device := range NVMeDevices { + for _, device := range nvmeDevices { if contains(m.EnableExtensions, "auto-on") { + // nolint:revive // one case switch on purpose to demonstrate potential extensions switch device.vendorID { case intelVID: wg.Add(1) - go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + if m.ParallelReads { + go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + } else { + gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + } } } else if contains(m.EnableExtensions, "Intel") && device.vendorID == intelVID { wg.Add(1) - go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + if m.ParallelReads { + go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + } else { + gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + } } } wg.Wait() } func getDeviceInfoForNVMeDisks(acc telegraf.Accumulator, devices 
[]string, nvme string, timeout config.Duration, useSudo bool) []nvmeDevice { - var NVMeDevices []nvmeDevice + var nvmeDevices []nvmeDevice for _, device := range devices { - vid, sn, mn, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo) + newDevice, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo) if err != nil { acc.AddError(fmt.Errorf("cannot find device info for %s device", device)) continue } - newDevice := nvmeDevice{ - name: device, - vendorID: vid, - model: mn, - serialNumber: sn, - } - NVMeDevices = append(NVMeDevices, newDevice) + nvmeDevices = append(nvmeDevices, newDevice) } - return NVMeDevices + return nvmeDevices } -func gatherNVMeDeviceInfo(nvme, device string, timeout config.Duration, useSudo bool) (string, string, string, error) { +func gatherNVMeDeviceInfo(nvme, deviceName string, timeout config.Duration, useSudo bool) (device nvmeDevice, err error) { args := []string{"id-ctrl"} - args = append(args, strings.Split(device, " ")...) + args = append(args, strings.Split(deviceName, " ")...) out, err := runCmd(timeout, useSudo, nvme, args...) 
if err != nil { - return "", "", "", err + return device, err } outStr := string(out) - - vid, sn, mn, err := findNVMeDeviceInfo(outStr) - - return vid, sn, mn, err + device, err = findNVMeDeviceInfo(outStr) + if err != nil { + return device, err + } + device.name = deviceName + return device, nil } -func findNVMeDeviceInfo(output string) (string, string, string, error) { +func findNVMeDeviceInfo(output string) (nvmeDevice, error) { scanner := bufio.NewScanner(strings.NewReader(output)) var vid, sn, mn string @@ -580,7 +647,7 @@ func findNVMeDeviceInfo(output string) (string, string, string, error) { matches[2] = strings.TrimSpace(matches[2]) if matches[1] == "vid" { if _, err := fmt.Sscanf(matches[2], "%s", &vid); err != nil { - return "", "", "", err + return nvmeDevice{}, err } } if matches[1] == "sn" { @@ -591,7 +658,13 @@ func findNVMeDeviceInfo(output string) (string, string, string, error) { } } } - return vid, sn, mn, nil + + newDevice := nvmeDevice{ + vendorID: vid, + model: mn, + serialNumber: sn, + } + return newDevice, nil } func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo bool, nvme string, device nvmeDevice, wg *sync.WaitGroup) { @@ -619,10 +692,31 @@ func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, uses tags["model"] = device.model tags["serial_no"] = device.serialNumber - if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 { - matches[1] = strings.TrimSpace(matches[1]) + // Create struct to initialize later with intel attributes. + var ( + attr = struct { + ID string + Name string + Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error + }{} + attrExists bool + ) + + if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 && len(matches[1]) > 1 { + // Check if nvme shows metrics in deprecated format or in format with ID. + // Based on that, an attribute map with metrics is chosen. 
+ // If string has more than one character it means it has KEY there, otherwise it's empty string (""). + if separatedIDAndKey := nvmeIDSeparatePattern.FindStringSubmatch(matches[1]); len(strings.TrimSpace(separatedIDAndKey[2])) > 1 { + matches[1] = strings.TrimSpace(separatedIDAndKey[2]) + attr, attrExists = intelAttributes[matches[1]] + } else { + matches[1] = strings.TrimSpace(matches[1]) + attr, attrExists = intelAttributesDeprecatedFormat[matches[1]] + } + matches[3] = strings.TrimSpace(matches[3]) - if attr, ok := intelAttributes[matches[1]]; ok { + + if attrExists { tags["name"] = attr.Name if attr.ID != "" { tags["id"] = attr.ID @@ -641,18 +735,18 @@ func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, uses } } -func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) { +func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.WaitGroup) { defer wg.Done() // smartctl 5.41 & 5.42 have are broken regarding handling of --nocheck/-n - args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nocheck, "--format=brief"} + args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", m.Nocheck, "--format=brief"} args = append(args, strings.Split(device, " ")...) - out, e := runCmd(timeout, usesudo, smartctl, args...) + out, e := runCmd(m.Timeout, m.UseSudo, m.PathSmartctl, args...) 
outStr := string(out) // Ignore all exit statuses except if it is a command line parse error exitStatus, er := exitStatus(e) if er != nil { - acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", smartctl, strings.Join(args, " "), e, outStr)) + acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", m.PathSmartctl, strings.Join(args, " "), e, outStr)) return } @@ -712,7 +806,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll tags := map[string]string{} fields := make(map[string]interface{}) - if collectAttributes { + if m.Attributes { //add power mode keys := [...]string{"device", "model", "serial_no", "wwn", "capacity", "enabled", "power"} for _, key := range keys { @@ -724,8 +818,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll attr := attribute.FindStringSubmatch(line) if len(attr) > 1 { - // attribute has been found, add it only if collectAttributes is true - if collectAttributes { + // attribute has been found, add it only if m.Attributes is true + if m.Attributes { tags["id"] = attr[1] tags["name"] = attr[2] tags["flags"] = attr[3] @@ -774,8 +868,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll continue } // if the field is classified as an attribute, only add it - // if collectAttributes is true - if collectAttributes { + // if m.Attributes is true + if m.Attributes { acc.AddFields("smart_attribute", fields, tags) } } @@ -972,13 +1066,13 @@ func parseTemperatureSensor(fields, _ map[string]interface{}, str string) error return nil } -func validatePath(path string) error { - pathInfo, err := os.Stat(path) +func validatePath(filePath string) error { + pathInfo, err := os.Stat(filePath) if os.IsNotExist(err) { - return fmt.Errorf("provided path does not exist: [%s]", path) + return fmt.Errorf("provided path does not exist: [%s]", filePath) } if mode := pathInfo.Mode(); !mode.IsRegular() { - return fmt.Errorf("provided path does not 
point to a regular file: [%s]", path) + return fmt.Errorf("provided path does not point to a regular file: [%s]", filePath) } return nil } @@ -989,6 +1083,7 @@ func init() { inputs.Add("smart", func() telegraf.Input { m := newSmart() + m.ParallelReads = true m.Nocheck = "standby" return m }) diff --git a/plugins/inputs/smart/smart_test.go b/plugins/inputs/smart/smart_test.go index 5a1799381cebe..5fe3aafe7dd08 100644 --- a/plugins/inputs/smart/smart_test.go +++ b/plugins/inputs/smart/smart_test.go @@ -77,6 +77,60 @@ func TestGatherAttributes(t *testing.T) { }) } +func TestGatherInParallelMode(t *testing.T) { + s := newSmart() + s.Attributes = true + s.PathSmartctl = "smartctl" + s.PathNVMe = "nvmeIdentifyController" + s.EnableExtensions = append(s.EnableExtensions, "auto-on") + s.Devices = []string{"/dev/nvme0"} + + runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { + if len(args) > 0 { + if args[0] == "--info" && args[7] == "/dev/ada0" { + return []byte(mockInfoAttributeData), nil + } else if args[0] == "--info" && args[7] == "/dev/nvmeIdentifyController" { + return []byte(smartctlNvmeInfoData), nil + } else if args[0] == "--scan" && len(args) == 1 { + return []byte(mockScanData), nil + } else if args[0] == "--scan" && len(args) >= 2 && args[1] == "--device=nvme" { + return []byte(mockScanNvmeData), nil + } else if args[0] == "intel" && args[1] == "smart-log-add" { + return []byte(nvmeIntelInfoDataCurrentMetricsFormat), nil + } else if args[0] == "id-ctrl" { + return []byte(nvmeIdentifyController), nil + } + } + return nil, errors.New("command not found") + } + + t.Run("Gather nvme device info in goroutine", func(t *testing.T) { + acc := &testutil.Accumulator{} + // Set to read metrics in goroutine. 
+ s.ParallelReads = true + + err := s.Gather(acc) + require.NoError(t, err) + + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, testIntelInvmeNewFormatAttributes, result, + testutil.SortMetrics(), testutil.IgnoreTime()) + }) + + t.Run("Gather nvme device info sequentially", func(t *testing.T) { + acc := &testutil.Accumulator{} + // Set to read metrics sequentially. + s.ParallelReads = false + + err := s.Gather(acc) + require.NoError(t, err) + + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, testIntelInvmeNewFormatAttributes, result, + testutil.SortMetrics(), testutil.IgnoreTime()) + }) +} + func TestGatherNoAttributes(t *testing.T) { s := newSmart() s.Attributes = false @@ -123,6 +177,16 @@ func TestExcludedDev(t *testing.T) { assert.Equal(t, false, excludedDev([]string{"/dev/pass6"}, "/dev/pass1 -d atacam"), "Shouldn't be excluded.") } +var ( + sampleSmart = Smart{ + PathSmartctl: "", + Nocheck: "", + Attributes: true, + UseSudo: true, + Timeout: config.Duration(time.Second * 30), + } +) + func TestGatherSATAInfo(t *testing.T) { runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { return []byte(hgstSATAInfoData), nil @@ -134,7 +198,8 @@ func TestGatherSATAInfo(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 101, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(20), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -150,7 +215,7 @@ func TestGatherSATAInfo65(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 91, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(18), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -166,7 +231,7 @@ func TestGatherHgstSAS(t *testing.T) { ) wg.Add(1) - 
gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -182,7 +247,7 @@ func TestGatherHtSAS(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) testutil.RequireMetricsEqual(t, testHtsasAtributtes, acc.GetTelegrafMetrics(), testutil.SortMetrics(), testutil.IgnoreTime()) } @@ -198,7 +263,7 @@ func TestGatherSSD(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 105, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(26), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -214,7 +279,7 @@ func TestGatherSSDRaid(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 74, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -230,15 +295,38 @@ func TestGatherNvme(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "nvme0", wg) + sampleSmart.gatherDisk(acc, "nvme0", wg) testutil.RequireMetricsEqual(t, testSmartctlNvmeAttributes, acc.GetTelegrafMetrics(), testutil.SortMetrics(), testutil.IgnoreTime()) } -func TestGatherIntelNvme(t *testing.T) { +func TestGatherIntelNvmeCurrentMetrics(t *testing.T) { runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { - return []byte(nvmeIntelInfoData), nil + return []byte(nvmeIntelInfoDataCurrentMetricsFormat), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + device = nvmeDevice{ + name: "nvme0", + 
model: mockModel, + serialNumber: mockSerial, + } + ) + + wg.Add(1) + gatherIntelNVMeDisk(acc, config.Duration(time.Second*30), true, "", device, wg) + + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, testIntelInvmeNewFormatAttributes, result, + testutil.SortMetrics(), testutil.IgnoreTime()) +} + +func TestGatherIntelNvmeDeprecatedFormatMetrics(t *testing.T) { + runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { + return []byte(nvmeIntelInfoDataDeprecatedMetricsFormat), nil } var ( @@ -260,12 +348,12 @@ func TestGatherIntelNvme(t *testing.T) { } func Test_findVIDFromNVMeOutput(t *testing.T) { - vid, sn, mn, err := findNVMeDeviceInfo(nvmeIdentifyController) + device, err := findNVMeDeviceInfo(nvmeIdentifyController) assert.Nil(t, err) - assert.Equal(t, "0x8086", vid) - assert.Equal(t, "CVFT5123456789ABCD", sn) - assert.Equal(t, "INTEL SSDPEDABCDEFG", mn) + assert.Equal(t, "0x8086", device.vendorID) + assert.Equal(t, "CVFT5123456789ABCD", device.serialNumber) + assert.Equal(t, "INTEL SSDPEDABCDEFG", device.model) } func Test_checkForNVMeDevices(t *testing.T) { @@ -303,7 +391,8 @@ func Test_integerOverflow(t *testing.T) { t.Run("If data raw_value is out of int64 range, there should be no metrics for that attribute", func(t *testing.T) { wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "nvme0", wg) + + sampleSmart.gatherDisk(acc, "nvme0", wg) result := acc.GetTelegrafMetrics() testutil.RequireMetricsEqual(t, testOverflowAttributes, result, @@ -1257,6 +1346,141 @@ var ( time.Now(), ), } + + testIntelInvmeNewFormatAttributes = []telegraf.Metric{ + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Program_Fail_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + 
"serial_no": mockSerial, + "model": mockModel, + "name": "Erase_Fail_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Wear_Leveling_Count", + }, + map[string]interface{}{ + "raw_value": int64(700090417315), + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "End_To_End_Error_Detection_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Crc_Error_Count", + }, + map[string]interface{}{ + "raw_value": 13, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Media_Wear_Percentage", + }, + map[string]interface{}{ + "raw_value": 552, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Host_Reads", + }, + map[string]interface{}{ + "raw_value": 73, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Timed_Workload_Timer", + }, + map[string]interface{}{ + "raw_value": int64(2343038), + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Thermal_Throttle_Status", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": 
"Retry_Buffer_Overflow_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Pll_Lock_Loss_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + } // smartctl --scan mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device` @@ -1727,7 +1951,7 @@ Temperature Sensor 3: 9223372036854775807 C Temperature Sensor 4: -9223372036854775808 C ` - nvmeIntelInfoData = `Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff + nvmeIntelInfoDataDeprecatedMetricsFormat = `Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff key normalized raw program_fail_count : 100% 0 erase_fail_count : 100% 0 @@ -1742,6 +1966,20 @@ retry_buffer_overflow_count : 100% 0 pll_lock_loss_count : 100% 0 nand_bytes_written : 0% sectors: 0 host_bytes_written : 0% sectors: 0 +` + nvmeIntelInfoDataCurrentMetricsFormat = `Additional Smart Log for NVME device:nvme0n1 namespace-id:ffffffff +ID KEY Normalized Raw +0xab program_fail_count 100 0 +0xac erase_fail_count 100 0 +0xad wear_leveling_count 100 700090417315 +0xb8 e2e_error_detect_count 100 0 +0xc7 crc_error_count 100 13 +0xe2 media_wear_percentage 100 552 +0xe3 host_reads 100 73 +0xe4 timed_work_load 100 2343038 +0xea thermal_throttle_status 100 0 +0xf0 retry_buff_overflow_count 100 0 +0xf3 pll_lock_loss_counter 100 0 ` nvmeIdentifyController = `NVME Identify Controller: