From 4b2c597757cc85a731939fd2d566a941610b3793 Mon Sep 17 00:00:00 2001 From: Kuba Trojan Date: Tue, 7 Dec 2021 08:10:36 -0800 Subject: [PATCH] feat: Add SMART plugin concurrency configuration option, nvme-cli v1.14+ support and lint fixes. (#10150) --- plugins/inputs/smart/README.md | 24 ++- plugins/inputs/smart/smart.go | 233 +++++++++++++++------ plugins/inputs/smart/smart_test.go | 318 ++++++++++++++++++++++++++--- 3 files changed, 479 insertions(+), 96 deletions(-) diff --git a/plugins/inputs/smart/README.md b/plugins/inputs/smart/README.md index 31ab92d3ad9d6..3fb37c396f9b5 100644 --- a/plugins/inputs/smart/README.md +++ b/plugins/inputs/smart/README.md @@ -112,6 +112,14 @@ smartctl --scan -d nvme ## Timeout for the cli command to complete. # timeout = "30s" + + ## Optionally call smartctl and nvme-cli with a specific concurrency policy. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at the time. In such case please set this configuration option + ## to "sequential" to get readings for all drives. 
+ ## valid options: concurrent, sequential + # read_method = "concurrent" ``` ## Permissions @@ -235,13 +243,27 @@ the DEVICE (name of the device could be taken from the previous command): smartctl --info --health --attributes --tolerance=verypermissive --nocheck NOCHECK --format=brief -d DEVICE ``` -If you try to gather vendor specific metrics, please provide this commad +If you try to gather vendor specific metrics, please provide this command and replace vendor and device to match your case: ```sh nvme VENDOR smart-log-add DEVICE ``` +If you have specified a devices array in the configuration file and Telegraf only shows data from one device, you should +change the plugin configuration to gather disk attributes sequentially instead of collecting them in separate threads +(goroutines). To do this, find `read_method` in the plugin configuration and change it to `sequential`: + +```toml + ## Optionally call smartctl and nvme-cli with a specific concurrency policy. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at the time. In such case please set this configuration option + ## to "sequential" to get readings for all drives. 
+ ## valid options: concurrent, sequential + read_method = "sequential" +``` + ## Example SMART Plugin Outputs ```shell diff --git a/plugins/inputs/smart/smart.go b/plugins/inputs/smart/smart.go index b0f189d69fbf9..cc6b40e94fcec 100644 --- a/plugins/inputs/smart/smart.go +++ b/plugins/inputs/smart/smart.go @@ -43,8 +43,8 @@ var ( // PASSED, FAILED, UNKNOWN smartOverallHealth = regexp.MustCompile(`^(SMART overall-health self-assessment test result|SMART Health Status):\s+(\w+).*$`) - // sasNvmeAttr is a SAS or NVME SMART attribute - sasNvmeAttr = regexp.MustCompile(`^([^:]+):\s+(.+)$`) + // sasNVMeAttr is a SAS or NVMe SMART attribute + sasNVMeAttr = regexp.MustCompile(`^([^:]+):\s+(.+)$`) // ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE // 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0 @@ -53,14 +53,26 @@ var ( attribute = regexp.MustCompile(`^\s*([0-9]+)\s(\S+)\s+([-P][-O][-S][-R][-C][-K])\s+([0-9]+)\s+([0-9]+)\s+([0-9-]+)\s+([-\w]+)\s+([\w\+\.]+).*$`) // Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff + // nvme version 1.14+ metrics: + // ID KEY Normalized Raw + // 0xab program_fail_count 100 0 + + // nvme deprecated metric format: // key normalized raw // program_fail_count : 100% 0 - intelExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\w\s]+)%(.+)`) + + // Regex pattern supports deprecated metrics (nvme-cli version below 1.14) and metrics from nvme-cli 1.14 (and above). + intelExpressionPattern = regexp.MustCompile(`^([A-Za-z0-9_\s]+)[:|\s]+(\d+)[%|\s]+(.+)`) // vid : 0x8086 // sn : CFGT53260XSP8011P nvmeIDCtrlExpressionPattern = regexp.MustCompile(`^([\w\s]+):([\s\w]+)`) + // Format from nvme-cli 1.14 (and above) gives ID and KEY, this regex is for separating id from key. 
+ // ID KEY + // 0xab program_fail_count + nvmeIDSeparatePattern = regexp.MustCompile(`^([A-Za-z0-9_]+)(.+)`) + deviceFieldIds = map[string]string{ "1": "read_error_rate", "7": "seek_error_rate", @@ -70,7 +82,7 @@ var ( } // to obtain metrics from smartctl - sasNvmeAttributes = map[string]struct { + sasNVMeAttributes = map[string]struct { ID string Name string Parse func(fields, deviceFields map[string]interface{}, str string) error @@ -213,12 +225,51 @@ var ( Parse: parseTemperatureSensor, }, } - - // to obtain Intel specific metrics from nvme-cli + // To obtain Intel specific metrics from nvme-cli version 1.14 and above. intelAttributes = map[string]struct { ID string Name string Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error + }{ + "program_fail_count": { + Name: "Program_Fail_Count", + }, + "erase_fail_count": { + Name: "Erase_Fail_Count", + }, + "wear_leveling_count": { // previously: "wear_leveling" + Name: "Wear_Leveling_Count", + }, + "e2e_error_detect_count": { // previously: "end_to_end_error_detection_count" + Name: "End_To_End_Error_Detection_Count", + }, + "crc_error_count": { + Name: "Crc_Error_Count", + }, + "media_wear_percentage": { // previously: "timed_workload_media_wear" + Name: "Media_Wear_Percentage", + }, + "host_reads": { + Name: "Host_Reads", + }, + "timed_work_load": { // previously: "timed_workload_timer" + Name: "Timed_Workload_Timer", + }, + "thermal_throttle_status": { + Name: "Thermal_Throttle_Status", + }, + "retry_buff_overflow_count": { // previously: "retry_buffer_overflow_count" + Name: "Retry_Buffer_Overflow_Count", + }, + "pll_lock_loss_counter": { // previously: "pll_lock_loss_count" + Name: "Pll_Lock_Loss_Count", + }, + } + // to obtain Intel specific metrics from nvme-cli + intelAttributesDeprecatedFormat = map[string]struct { + ID string + Name string + Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error 
}{ "program_fail_count": { Name: "Program_Fail_Count", @@ -269,6 +320,8 @@ var ( Parse: parseBytesWritten, }, } + + knownReadMethods = []string{"concurrent", "sequential"} ) // Smart plugin reads metrics from storage devices supporting S.M.A.R.T. @@ -283,6 +336,7 @@ type Smart struct { Devices []string `toml:"devices"` UseSudo bool `toml:"use_sudo"` Timeout config.Duration `toml:"timeout"` + ReadMethod string `toml:"read_method"` Log telegraf.Logger `toml:"-"` } @@ -333,11 +387,20 @@ var sampleConfig = ` ## Timeout for the cli command to complete. # timeout = "30s" + + ## Optionally call smartctl and nvme-cli with a specific concurrency policy. + ## By default, smartctl and nvme-cli are called in separate threads (goroutines) to gather disk attributes. + ## Some devices (e.g. disks in RAID arrays) may have access limitations that require sequential reading of + ## SMART data - one individual array drive at the time. In such case please set this configuration option + ## to "sequential" to get readings for all drives. 
+ ## valid options: concurrent, sequential + # read_method = "concurrent" ` func newSmart() *Smart { return &Smart{ - Timeout: config.Duration(time.Second * 30), + Timeout: config.Duration(time.Second * 30), + ReadMethod: "concurrent", } } @@ -368,6 +431,10 @@ func (m *Smart) Init() error { m.PathNVMe, _ = exec.LookPath("nvme") } + if !contains(knownReadMethods, m.ReadMethod) { + return fmt.Errorf("provided read method `%s` is not valid", m.ReadMethod) + } + err := validatePath(m.PathSmartctl) if err != nil { m.PathSmartctl = "" @@ -404,9 +471,9 @@ func (m *Smart) Gather(acc telegraf.Accumulator) error { if err != nil { return err } - NVMeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices) + nvmeDevices := distinguishNVMeDevices(devicesFromConfig, scannedNVMeDevices) - m.getVendorNVMeAttributes(acc, NVMeDevices) + m.getVendorNVMeAttributes(acc, nvmeDevices) } return nil } @@ -434,28 +501,28 @@ func (m *Smart) scanAllDevices(ignoreExcludes bool) ([]string, []string, error) } // this will return only NVMe devices - NVMeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme") + nvmeDevices, err := m.scanDevices(ignoreExcludes, "--scan", "--device=nvme") if err != nil { return nil, nil, err } // to handle all versions of smartctl this will return only non NVMe devices - nonNVMeDevices := difference(devices, NVMeDevices) - return NVMeDevices, nonNVMeDevices, nil + nonNVMeDevices := difference(devices, nvmeDevices) + return nvmeDevices, nonNVMeDevices, nil } func distinguishNVMeDevices(userDevices []string, availableNVMeDevices []string) []string { - var NVMeDevices []string + var nvmeDevices []string for _, userDevice := range userDevices { - for _, NVMeDevice := range availableNVMeDevices { + for _, availableNVMeDevice := range availableNVMeDevices { // double check. E.g. in case when nvme0 is equal nvme0n1, will check if "nvme0" part is present. 
- if strings.Contains(NVMeDevice, userDevice) || strings.Contains(userDevice, NVMeDevice) { - NVMeDevices = append(NVMeDevices, userDevice) + if strings.Contains(availableNVMeDevice, userDevice) || strings.Contains(userDevice, availableNVMeDevice) { + nvmeDevices = append(nvmeDevices, userDevice) } } } - return NVMeDevices + return nvmeDevices } // Scan for S.M.A.R.T. devices from smartctl @@ -506,69 +573,86 @@ func excludedDev(excludes []string, deviceLine string) bool { func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) { var wg sync.WaitGroup wg.Add(len(devices)) - for _, device := range devices { - go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.PathSmartctl, m.Nocheck, device, &wg) + switch m.ReadMethod { + case "concurrent": + go m.gatherDisk(acc, device, &wg) + case "sequential": + m.gatherDisk(acc, device, &wg) + default: + wg.Done() + } } wg.Wait() } func (m *Smart) getVendorNVMeAttributes(acc telegraf.Accumulator, devices []string) { - NVMeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo) + nvmeDevices := getDeviceInfoForNVMeDisks(acc, devices, m.PathNVMe, m.Timeout, m.UseSudo) var wg sync.WaitGroup - for _, device := range NVMeDevices { + for _, device := range nvmeDevices { if contains(m.EnableExtensions, "auto-on") { + // nolint:revive // one case switch on purpose to demonstrate potential extensions switch device.vendorID { case intelVID: wg.Add(1) - go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + switch m.ReadMethod { + case "concurrent": + go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + case "sequential": + gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + default: + wg.Done() + } } } else if contains(m.EnableExtensions, "Intel") && device.vendorID == intelVID { wg.Add(1) - go gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + switch m.ReadMethod { + case "concurrent": + go 
gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + case "sequential": + gatherIntelNVMeDisk(acc, m.Timeout, m.UseSudo, m.PathNVMe, device, &wg) + default: + wg.Done() + } } } wg.Wait() } func getDeviceInfoForNVMeDisks(acc telegraf.Accumulator, devices []string, nvme string, timeout config.Duration, useSudo bool) []nvmeDevice { - var NVMeDevices []nvmeDevice + var nvmeDevices []nvmeDevice for _, device := range devices { - vid, sn, mn, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo) + newDevice, err := gatherNVMeDeviceInfo(nvme, device, timeout, useSudo) if err != nil { acc.AddError(fmt.Errorf("cannot find device info for %s device", device)) continue } - newDevice := nvmeDevice{ - name: device, - vendorID: vid, - model: mn, - serialNumber: sn, - } - NVMeDevices = append(NVMeDevices, newDevice) + nvmeDevices = append(nvmeDevices, newDevice) } - return NVMeDevices + return nvmeDevices } -func gatherNVMeDeviceInfo(nvme, device string, timeout config.Duration, useSudo bool) (string, string, string, error) { +func gatherNVMeDeviceInfo(nvme, deviceName string, timeout config.Duration, useSudo bool) (device nvmeDevice, err error) { args := []string{"id-ctrl"} - args = append(args, strings.Split(device, " ")...) + args = append(args, strings.Split(deviceName, " ")...) out, err := runCmd(timeout, useSudo, nvme, args...) 
if err != nil { - return "", "", "", err + return device, err } outStr := string(out) - - vid, sn, mn, err := findNVMeDeviceInfo(outStr) - - return vid, sn, mn, err + device, err = findNVMeDeviceInfo(outStr) + if err != nil { + return device, err + } + device.name = deviceName + return device, nil } -func findNVMeDeviceInfo(output string) (string, string, string, error) { +func findNVMeDeviceInfo(output string) (nvmeDevice, error) { scanner := bufio.NewScanner(strings.NewReader(output)) var vid, sn, mn string @@ -580,7 +664,7 @@ func findNVMeDeviceInfo(output string) (string, string, string, error) { matches[2] = strings.TrimSpace(matches[2]) if matches[1] == "vid" { if _, err := fmt.Sscanf(matches[2], "%s", &vid); err != nil { - return "", "", "", err + return nvmeDevice{}, err } } if matches[1] == "sn" { @@ -591,7 +675,13 @@ func findNVMeDeviceInfo(output string) (string, string, string, error) { } } } - return vid, sn, mn, nil + + newDevice := nvmeDevice{ + vendorID: vid, + model: mn, + serialNumber: sn, + } + return newDevice, nil } func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo bool, nvme string, device nvmeDevice, wg *sync.WaitGroup) { @@ -619,10 +709,31 @@ func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, uses tags["model"] = device.model tags["serial_no"] = device.serialNumber - if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 { - matches[1] = strings.TrimSpace(matches[1]) + // Create struct to initialize later with intel attributes. + var ( + attr = struct { + ID string + Name string + Parse func(acc telegraf.Accumulator, fields map[string]interface{}, tags map[string]string, str string) error + }{} + attrExists bool + ) + + if matches := intelExpressionPattern.FindStringSubmatch(line); len(matches) > 3 && len(matches[1]) > 1 { + // Check if nvme shows metrics in deprecated format or in format with ID. + // Based on that, an attribute map with metrics is chosen. 
+ // If string has more than one character it means it has KEY there, otherwise it's empty string (""). + if separatedIDAndKey := nvmeIDSeparatePattern.FindStringSubmatch(matches[1]); len(strings.TrimSpace(separatedIDAndKey[2])) > 1 { + matches[1] = strings.TrimSpace(separatedIDAndKey[2]) + attr, attrExists = intelAttributes[matches[1]] + } else { + matches[1] = strings.TrimSpace(matches[1]) + attr, attrExists = intelAttributesDeprecatedFormat[matches[1]] + } + matches[3] = strings.TrimSpace(matches[3]) - if attr, ok := intelAttributes[matches[1]]; ok { + + if attrExists { tags["name"] = attr.Name if attr.ID != "" { tags["id"] = attr.ID @@ -641,18 +752,18 @@ func gatherIntelNVMeDisk(acc telegraf.Accumulator, timeout config.Duration, uses } } -func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) { +func (m *Smart) gatherDisk(acc telegraf.Accumulator, device string, wg *sync.WaitGroup) { defer wg.Done() // smartctl 5.41 & 5.42 have are broken regarding handling of --nocheck/-n - args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nocheck, "--format=brief"} + args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", m.Nocheck, "--format=brief"} args = append(args, strings.Split(device, " ")...) - out, e := runCmd(timeout, usesudo, smartctl, args...) + out, e := runCmd(m.Timeout, m.UseSudo, m.PathSmartctl, args...) 
outStr := string(out) // Ignore all exit statuses except if it is a command line parse error exitStatus, er := exitStatus(e) if er != nil { - acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", smartctl, strings.Join(args, " "), e, outStr)) + acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", m.PathSmartctl, strings.Join(args, " "), e, outStr)) return } @@ -712,7 +823,7 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll tags := map[string]string{} fields := make(map[string]interface{}) - if collectAttributes { + if m.Attributes { //add power mode keys := [...]string{"device", "model", "serial_no", "wwn", "capacity", "enabled", "power"} for _, key := range keys { @@ -724,8 +835,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll attr := attribute.FindStringSubmatch(line) if len(attr) > 1 { - // attribute has been found, add it only if collectAttributes is true - if collectAttributes { + // attribute has been found, add it only if m.Attributes is true + if m.Attributes { tags["id"] = attr[1] tags["name"] = attr[2] tags["flags"] = attr[3] @@ -758,8 +869,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll } } else { // what was found is not a vendor attribute - if matches := sasNvmeAttr.FindStringSubmatch(line); len(matches) > 2 { - if attr, ok := sasNvmeAttributes[matches[1]]; ok { + if matches := sasNVMeAttr.FindStringSubmatch(line); len(matches) > 2 { + if attr, ok := sasNVMeAttributes[matches[1]]; ok { tags["name"] = attr.Name if attr.ID != "" { tags["id"] = attr.ID @@ -774,8 +885,8 @@ func gatherDisk(acc telegraf.Accumulator, timeout config.Duration, usesudo, coll continue } // if the field is classified as an attribute, only add it - // if collectAttributes is true - if collectAttributes { + // if m.Attributes is true + if m.Attributes { acc.AddFields("smart_attribute", fields, tags) } } @@ -972,13 +1083,13 @@ func 
parseTemperatureSensor(fields, _ map[string]interface{}, str string) error return nil } -func validatePath(path string) error { - pathInfo, err := os.Stat(path) +func validatePath(filePath string) error { + pathInfo, err := os.Stat(filePath) if os.IsNotExist(err) { - return fmt.Errorf("provided path does not exist: [%s]", path) + return fmt.Errorf("provided path does not exist: [%s]", filePath) } if mode := pathInfo.Mode(); !mode.IsRegular() { - return fmt.Errorf("provided path does not point to a regular file: [%s]", path) + return fmt.Errorf("provided path does not point to a regular file: [%s]", filePath) } return nil } diff --git a/plugins/inputs/smart/smart_test.go b/plugins/inputs/smart/smart_test.go index 5a1799381cebe..6801ca764afa5 100644 --- a/plugins/inputs/smart/smart_test.go +++ b/plugins/inputs/smart/smart_test.go @@ -24,11 +24,11 @@ func TestGatherAttributes(t *testing.T) { if args[0] == "--info" && args[7] == "/dev/ada0" { return []byte(mockInfoAttributeData), nil } else if args[0] == "--info" && args[7] == "/dev/nvme0" { - return []byte(smartctlNvmeInfoData), nil + return []byte(smartctlNVMeInfoData), nil } else if args[0] == "--scan" && len(args) == 1 { return []byte(mockScanData), nil } else if args[0] == "--scan" && len(args) >= 2 && args[1] == "--device=nvme" { - return []byte(mockScanNvmeData), nil + return []byte(mockScanNVMeData), nil } } return nil, errors.New("command not found") @@ -45,7 +45,7 @@ func TestGatherAttributes(t *testing.T) { s.PathSmartctl = "smartctl" s.PathNVMe = "" - t.Run("Only non nvme device", func(t *testing.T) { + t.Run("Only non NVMe device", func(t *testing.T) { s.Devices = []string{"/dev/ada0"} var acc testutil.Accumulator @@ -62,7 +62,7 @@ func TestGatherAttributes(t *testing.T) { acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags) } }) - t.Run("Only nvme device", func(t *testing.T) { + t.Run("Only NVMe device", func(t *testing.T) { s.Devices = []string{"/dev/nvme0"} var acc 
testutil.Accumulator @@ -71,12 +71,78 @@ func TestGatherAttributes(t *testing.T) { require.NoError(t, err) assert.Equal(t, 32, acc.NFields(), "Wrong number of fields gathered") - testutil.RequireMetricsEqual(t, testSmartctlNvmeAttributes, acc.GetTelegrafMetrics(), + testutil.RequireMetricsEqual(t, testSmartctlNVMeAttributes, acc.GetTelegrafMetrics(), testutil.SortMetrics(), testutil.IgnoreTime()) }) }) } +func TestGatherInParallelMode(t *testing.T) { + s := newSmart() + s.Attributes = true + s.PathSmartctl = "smartctl" + s.PathNVMe = "nvmeIdentifyController" + s.EnableExtensions = append(s.EnableExtensions, "auto-on") + s.Devices = []string{"/dev/nvme0"} + + runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { + if len(args) > 0 { + if args[0] == "--info" && args[7] == "/dev/ada0" { + return []byte(mockInfoAttributeData), nil + } else if args[0] == "--info" && args[7] == "/dev/nvmeIdentifyController" { + return []byte(smartctlNVMeInfoData), nil + } else if args[0] == "--scan" && len(args) == 1 { + return []byte(mockScanData), nil + } else if args[0] == "--scan" && len(args) >= 2 && args[1] == "--device=nvme" { + return []byte(mockScanNVMeData), nil + } else if args[0] == "intel" && args[1] == "smart-log-add" { + return []byte(nvmeIntelInfoDataMetricsFormat), nil + } else if args[0] == "id-ctrl" { + return []byte(nvmeIdentifyController), nil + } + } + return nil, errors.New("command not found") + } + + t.Run("Gather NVMe device info in goroutine", func(t *testing.T) { + acc := &testutil.Accumulator{} + s.ReadMethod = "concurrent" + + err := s.Gather(acc) + require.NoError(t, err) + + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, testIntelNVMeNewFormatAttributes, result, + testutil.SortMetrics(), testutil.IgnoreTime()) + }) + + t.Run("Gather NVMe device info sequentially", func(t *testing.T) { + acc := &testutil.Accumulator{} + s.ReadMethod = "sequential" + + err := s.Gather(acc) + 
require.NoError(t, err) + + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, testIntelNVMeNewFormatAttributes, result, + testutil.SortMetrics(), testutil.IgnoreTime()) + }) + + t.Run("Gather NVMe device info - not known read method", func(t *testing.T) { + acc := &testutil.Accumulator{} + s.ReadMethod = "horizontally" + + err := s.Init() + require.Error(t, err) + + err = s.Gather(acc) + require.NoError(t, err) + + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, []telegraf.Metric{}, result) + }) +} + func TestGatherNoAttributes(t *testing.T) { s := newSmart() s.Attributes = false @@ -90,9 +156,9 @@ func TestGatherNoAttributes(t *testing.T) { } else if args[0] == "--info" && args[7] == "/dev/ada0" { return []byte(mockInfoAttributeData), nil } else if args[0] == "--info" && args[7] == "/dev/nvme0" { - return []byte(smartctlNvmeInfoData), nil + return []byte(smartctlNVMeInfoData), nil } else if args[0] == "--scan" && args[1] == "--device=nvme" { - return []byte(mockScanNvmeData), nil + return []byte(mockScanNVMeData), nil } } return nil, errors.New("command not found") @@ -111,7 +177,7 @@ func TestGatherNoAttributes(t *testing.T) { for _, test := range testsAda0Device { acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags) } - for _, test := range testNvmeDevice { + for _, test := range testNVMeDevice { acc.AssertContainsTaggedFields(t, "smart_device", test.fields, test.tags) } }) @@ -123,6 +189,16 @@ func TestExcludedDev(t *testing.T) { assert.Equal(t, false, excludedDev([]string{"/dev/pass6"}, "/dev/pass1 -d atacam"), "Shouldn't be excluded.") } +var ( + sampleSmart = Smart{ + PathSmartctl: "", + Nocheck: "", + Attributes: true, + UseSudo: true, + Timeout: config.Duration(time.Second * 30), + } +) + func TestGatherSATAInfo(t *testing.T) { runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { return []byte(hgstSATAInfoData), nil @@ -134,7 +210,8 @@ func 
TestGatherSATAInfo(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 101, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(20), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -150,7 +227,7 @@ func TestGatherSATAInfo65(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 91, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(18), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -166,7 +243,7 @@ func TestGatherHgstSAS(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 6, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(4), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -182,7 +259,7 @@ func TestGatherHtSAS(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) testutil.RequireMetricsEqual(t, testHtsasAtributtes, acc.GetTelegrafMetrics(), testutil.SortMetrics(), testutil.IgnoreTime()) } @@ -198,7 +275,7 @@ func TestGatherSSD(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 105, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(26), acc.NMetrics(), "Wrong number of metrics gathered") } @@ -214,14 +291,14 @@ func TestGatherSSDRaid(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "", wg) + sampleSmart.gatherDisk(acc, "", wg) assert.Equal(t, 74, acc.NFields(), "Wrong number of fields gathered") assert.Equal(t, uint64(15), acc.NMetrics(), "Wrong number of metrics gathered") } -func 
TestGatherNvme(t *testing.T) { +func TestGatherNVMe(t *testing.T) { runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { - return []byte(smartctlNvmeInfoData), nil + return []byte(smartctlNVMeInfoData), nil } var ( @@ -230,15 +307,38 @@ func TestGatherNvme(t *testing.T) { ) wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "nvme0", wg) + sampleSmart.gatherDisk(acc, "nvme0", wg) + + testutil.RequireMetricsEqual(t, testSmartctlNVMeAttributes, acc.GetTelegrafMetrics(), + testutil.SortMetrics(), testutil.IgnoreTime()) +} + +func TestGatherIntelNVMeMetrics(t *testing.T) { + runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { + return []byte(nvmeIntelInfoDataMetricsFormat), nil + } + + var ( + acc = &testutil.Accumulator{} + wg = &sync.WaitGroup{} + device = nvmeDevice{ + name: "nvme0", + model: mockModel, + serialNumber: mockSerial, + } + ) + + wg.Add(1) + gatherIntelNVMeDisk(acc, config.Duration(time.Second*30), true, "", device, wg) - testutil.RequireMetricsEqual(t, testSmartctlNvmeAttributes, acc.GetTelegrafMetrics(), + result := acc.GetTelegrafMetrics() + testutil.RequireMetricsEqual(t, testIntelNVMeNewFormatAttributes, result, testutil.SortMetrics(), testutil.IgnoreTime()) } -func TestGatherIntelNvme(t *testing.T) { +func TestGatherIntelNVMeDeprecatedFormatMetrics(t *testing.T) { runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { - return []byte(nvmeIntelInfoData), nil + return []byte(nvmeIntelInfoDataDeprecatedMetricsFormat), nil } var ( @@ -255,17 +355,17 @@ func TestGatherIntelNvme(t *testing.T) { gatherIntelNVMeDisk(acc, config.Duration(time.Second*30), true, "", device, wg) result := acc.GetTelegrafMetrics() - testutil.RequireMetricsEqual(t, testIntelInvmeAttributes, result, + testutil.RequireMetricsEqual(t, testIntelNVMeAttributes, result, testutil.SortMetrics(), testutil.IgnoreTime()) } 
func Test_findVIDFromNVMeOutput(t *testing.T) { - vid, sn, mn, err := findNVMeDeviceInfo(nvmeIdentifyController) + device, err := findNVMeDeviceInfo(nvmeIdentifyController) assert.Nil(t, err) - assert.Equal(t, "0x8086", vid) - assert.Equal(t, "CVFT5123456789ABCD", sn) - assert.Equal(t, "INTEL SSDPEDABCDEFG", mn) + assert.Equal(t, "0x8086", device.vendorID) + assert.Equal(t, "CVFT5123456789ABCD", device.serialNumber) + assert.Equal(t, "INTEL SSDPEDABCDEFG", device.model) } func Test_checkForNVMeDevices(t *testing.T) { @@ -293,7 +393,7 @@ func Test_difference(t *testing.T) { func Test_integerOverflow(t *testing.T) { runCmd = func(timeout config.Duration, sudo bool, command string, args ...string) ([]byte, error) { - return []byte(smartctlNvmeInfoDataWithOverflow), nil + return []byte(smartctlNVMeInfoDataWithOverflow), nil } var ( @@ -303,7 +403,8 @@ func Test_integerOverflow(t *testing.T) { t.Run("If data raw_value is out of int64 range, there should be no metrics for that attribute", func(t *testing.T) { wg.Add(1) - gatherDisk(acc, config.Duration(time.Second*30), true, true, "", "", "nvme0", wg) + + sampleSmart.gatherDisk(acc, "nvme0", wg) result := acc.GetTelegrafMetrics() testutil.RequireMetricsEqual(t, testOverflowAttributes, result, @@ -656,7 +757,7 @@ var ( mockModel = "INTEL SSDPEDABCDEFG" mockSerial = "CVFT5123456789ABCD" - testSmartctlNvmeAttributes = []telegraf.Metric{ + testSmartctlNVMeAttributes = []telegraf.Metric{ testutil.MustMetric("smart_device", map[string]string{ "device": "nvme0", @@ -1045,7 +1146,7 @@ var ( }, } - testNvmeDevice = []struct { + testNVMeDevice = []struct { fields map[string]interface{} tags map[string]string }{ @@ -1063,7 +1164,7 @@ var ( }, } - testIntelInvmeAttributes = []telegraf.Metric{ + testIntelNVMeAttributes = []telegraf.Metric{ testutil.MustMetric("smart_attribute", map[string]string{ "device": "nvme0", @@ -1257,11 +1358,146 @@ var ( time.Now(), ), } + + testIntelNVMeNewFormatAttributes = []telegraf.Metric{ + 
testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Program_Fail_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Erase_Fail_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Wear_Leveling_Count", + }, + map[string]interface{}{ + "raw_value": int64(700090417315), + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "End_To_End_Error_Detection_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Crc_Error_Count", + }, + map[string]interface{}{ + "raw_value": 13, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Media_Wear_Percentage", + }, + map[string]interface{}{ + "raw_value": 552, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Host_Reads", + }, + map[string]interface{}{ + "raw_value": 73, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Timed_Workload_Timer", + }, + map[string]interface{}{ + "raw_value": int64(2343038), + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + 
"device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Thermal_Throttle_Status", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Retry_Buffer_Overflow_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + testutil.MustMetric("smart_attribute", + map[string]string{ + "device": "nvme0", + "serial_no": mockSerial, + "model": mockModel, + "name": "Pll_Lock_Loss_Count", + }, + map[string]interface{}{ + "raw_value": 0, + }, + time.Now(), + ), + } // smartctl --scan mockScanData = `/dev/ada0 -d atacam # /dev/ada0, ATA device` // smartctl --scan -d nvme - mockScanNvmeData = `/dev/nvme0 -d nvme # /dev/nvme0, NVMe device` + mockScanNVMeData = `/dev/nvme0 -d nvme # /dev/nvme0, NVMe device` // smartctl --info --health --attributes --tolerance=verypermissive -n standby --format=brief [DEVICE] mockInfoAttributeData = `smartctl 6.5 2016-05-07 r4318 [Darwin 16.4.0 x86_64] (local build) @@ -1670,7 +1906,7 @@ Selective self-test flags (0x0): After scanning selected spans, do NOT read-scan remainder of disk. If Selective self-test is pending on power-up, resume after 0 minute delay. 
` - smartctlNvmeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build) + smartctlNVMeInfoData = `smartctl 6.5 2016-05-07 r4318 [x86_64-linux-4.1.27-gvt-yocto-standard] (local build) Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org === START OF INFORMATION SECTION === @@ -1720,14 +1956,14 @@ Temperature Sensor 7: 44 C Temperature Sensor 8: 43 C ` - smartctlNvmeInfoDataWithOverflow = ` + smartctlNVMeInfoDataWithOverflow = ` Temperature Sensor 1: 9223372036854775808 C Temperature Sensor 2: -9223372036854775809 C Temperature Sensor 3: 9223372036854775807 C Temperature Sensor 4: -9223372036854775808 C ` - nvmeIntelInfoData = `Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff + nvmeIntelInfoDataDeprecatedMetricsFormat = `Additional Smart Log for NVME device:nvme0 namespace-id:ffffffff key normalized raw program_fail_count : 100% 0 erase_fail_count : 100% 0 @@ -1742,6 +1978,20 @@ retry_buffer_overflow_count : 100% 0 pll_lock_loss_count : 100% 0 nand_bytes_written : 0% sectors: 0 host_bytes_written : 0% sectors: 0 +` + nvmeIntelInfoDataMetricsFormat = `Additional Smart Log for NVME device:nvme0n1 namespace-id:ffffffff +ID KEY Normalized Raw +0xab program_fail_count 100 0 +0xac erase_fail_count 100 0 +0xad wear_leveling_count 100 700090417315 +0xb8 e2e_error_detect_count 100 0 +0xc7 crc_error_count 100 13 +0xe2 media_wear_percentage 100 552 +0xe3 host_reads 100 73 +0xe4 timed_work_load 100 2343038 +0xea thermal_throttle_status 100 0 +0xf0 retry_buff_overflow_count 100 0 +0xf3 pll_lock_loss_counter 100 0 ` nvmeIdentifyController = `NVME Identify Controller: