Skip to content

Commit

Permalink
feat: Better SCSI/SAS support
Browse files Browse the repository at this point in the history
Fix the following metrics that were exported as zero because the
exporter did not know how to read them for SCSI devices:
- smartctl_device_bytes_read
- smartctl_device_bytes_written
- smartctl_device_power_cycle_count

New metrics:
- smartctl_read_errors_corrected_by_eccdelayed
- smartctl_read_errors_corrected_by_eccfast
- smartctl_write_errors_corrected_by_eccdelayed
- smartctl_write_errors_corrected_by_eccfast

Fix labels:
- smartctl_device{model_name} is now populated for SCSI/SAS, using
  scsi_model_name.

New labels:
- smartctl_device{} gains:
  scsi_product, scsi_revision, scsi_vendor, scsi_version

Signed-off-by: Robin H. Johnson <[email protected]>
  • Loading branch information
robbat2 committed Oct 16, 2023
1 parent d90594a commit 9113c6c
Show file tree
Hide file tree
Showing 2 changed files with 115 additions and 0 deletions.
37 changes: 37 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ var (
"ata_version",
"sata_version",
"form_factor",
// scsi_model_name is mapped into model_name
"scsi_vendor",
"scsi_product",
"scsi_revision",
"scsi_version",
},
nil,
)
Expand Down Expand Up @@ -293,6 +298,22 @@ var (
},
nil,
)
// metricReadErrorsCorrectedByEccFast describes the
// smartctl_read_errors_corrected_by_eccfast metric, labelled by device.
metricReadErrorsCorrectedByEccFast = prometheus.NewDesc(
	"smartctl_read_errors_corrected_by_eccfast",
	"Read Errors Corrected by ECC Fast",
	[]string{"device"},
	nil,
)
// metricReadErrorsCorrectedByEccDelayed describes the
// smartctl_read_errors_corrected_by_eccdelayed metric, labelled by device.
metricReadErrorsCorrectedByEccDelayed = prometheus.NewDesc(
	"smartctl_read_errors_corrected_by_eccdelayed",
	"Read Errors Corrected by ECC Delayed",
	[]string{"device"},
	nil,
)
metricReadTotalUncorrectedErrors = prometheus.NewDesc(
"smartctl_read_total_uncorrected_errors",
"Read Total Uncorrected Errors",
Expand All @@ -309,6 +330,22 @@ var (
},
nil,
)
// metricWriteErrorsCorrectedByEccFast describes the
// smartctl_write_errors_corrected_by_eccfast metric, labelled by device.
metricWriteErrorsCorrectedByEccFast = prometheus.NewDesc(
	"smartctl_write_errors_corrected_by_eccfast",
	"Write Errors Corrected by ECC Fast",
	[]string{"device"},
	nil,
)
// metricWriteErrorsCorrectedByEccDelayed describes the
// smartctl_write_errors_corrected_by_eccdelayed metric, labelled by device.
metricWriteErrorsCorrectedByEccDelayed = prometheus.NewDesc(
	"smartctl_write_errors_corrected_by_eccdelayed",
	"Write Errors Corrected by ECC Delayed",
	[]string{"device"},
	nil,
)
metricWriteTotalUncorrectedErrors = prometheus.NewDesc(
"smartctl_write_total_uncorrected_errors",
"Write Total Uncorrected Errors",
Expand Down
78 changes: 78 additions & 0 deletions smartctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ func NewSMARTctl(logger log.Logger, json gjson.Result, ch chan<- prometheus.Metr
var model_name string
if obj := json.Get("model_name"); obj.Exists() {
model_name = obj.String()
} else if obj := json.Get("scsi_model_name"); obj.Exists() {
model_name = obj.String()
}
// If the drive returns an empty model name, replace that with unknown.
if model_name == "" {
Expand Down Expand Up @@ -102,6 +104,8 @@ func (smart *SMARTctl) Collect() {
if smart.device.interface_ == "scsi" {
smart.mineSCSIGrownDefectList()
smart.mineSCSIErrorCounterLog()
smart.mineSCSIBytesRead()
smart.mineSCSIBytesWritten()
}
}

Expand Down Expand Up @@ -130,6 +134,11 @@ func (smart *SMARTctl) mineDevice() {
smart.json.Get("ata_version.string").String(),
smart.json.Get("sata_version.string").String(),
smart.json.Get("form_factor.name").String(),
// scsi_model_name is mapped into model_name
smart.json.Get("scsi_vendor").String(),
smart.json.Get("scsi_product").String(),
smart.json.Get("scsi_revision").String(),
smart.json.Get("scsi_version").String(),
)
}

Expand Down Expand Up @@ -173,6 +182,7 @@ func (smart *SMARTctl) mineBlockSize() {
}

func (smart *SMARTctl) mineInterfaceSpeed() {
// TODO: Support scsi_sas_port_[01].phy_N.negotiated_logical_link_rate
iSpeed := smart.json.Get("interface_speed")
if iSpeed.Exists() {
for _, speedType := range []string{"max", "current"} {
Expand Down Expand Up @@ -253,6 +263,7 @@ func (smart *SMARTctl) mineRotationRate() {

func (smart *SMARTctl) mineTemperatures() {
temperatures := smart.json.Get("temperature")
// TODO: Implement scsi_environmental_reports
if temperatures.Exists() {
temperatures.ForEach(func(key, value gjson.Result) bool {
smart.ch <- prometheus.MustNewConstMetric(
Expand All @@ -279,6 +290,18 @@ func (smart *SMARTctl) minePowerCycleCount() {
)
return
}

// SCSI
powerCycleCount = smart.json.Get("scsi_start_stop_cycle_counter.accumulated_start_stop_cycles")
if powerCycleCount.Exists() {
smart.ch <- prometheus.MustNewConstMetric(
metricDevicePowerCycleCount,
prometheus.CounterValue,
powerCycleCount.Float(),
smart.device.device,
)
return
}
}

func (smart *SMARTctl) mineDeviceSCTStatus() {
Expand Down Expand Up @@ -379,6 +402,36 @@ func (smart *SMARTctl) mineNvmeBytesWritten() {
)
}

// mineSCSIBytesRead exports metricDeviceBytesRead for a SCSI/SAS device from
// the read.gigabytes_processed entry of smartctl's scsi_error_counter_log.
//
// Nothing is emitted when the log or that entry is absent: gjson yields 0 for
// a missing value, and exporting it would be indistinguishable from a drive
// that has genuinely read zero bytes.
func (smart *SMARTctl) mineSCSIBytesRead() {
	gigabytes := smart.json.Get("scsi_error_counter_log.read.gigabytes_processed")
	if !gigabytes.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesRead,
		prometheus.CounterValue,
		// This value is reported by smartctl in GB [10^9], so scale to bytes.
		// It is possible that some drives mis-report the value, but
		// that is not the responsibility of the exporter or smartctl.
		gigabytes.Float()*1e9,
		smart.device.device,
	)
}

// mineSCSIBytesWritten exports metricDeviceBytesWritten for a SCSI/SAS device
// from the write.gigabytes_processed entry of smartctl's
// scsi_error_counter_log.
//
// Nothing is emitted when the log or that entry is absent: gjson yields 0 for
// a missing value, and exporting it would be indistinguishable from a drive
// that has genuinely written zero bytes.
func (smart *SMARTctl) mineSCSIBytesWritten() {
	gigabytes := smart.json.Get("scsi_error_counter_log.write.gigabytes_processed")
	if !gigabytes.Exists() {
		return
	}
	smart.ch <- prometheus.MustNewConstMetric(
		metricDeviceBytesWritten,
		prometheus.CounterValue,
		// This value is reported by smartctl in GB [10^9], so scale to bytes.
		// It is possible that some drives mis-report the value, but
		// that is not the responsibility of the exporter or smartctl.
		gigabytes.Float()*1e9,
		smart.device.device,
	)
}

func (smart *SMARTctl) mineSmartStatus() {
smart.ch <- prometheus.MustNewConstMetric(
metricDeviceSmartStatus,
Expand Down Expand Up @@ -504,6 +557,18 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() {
SCSIHealth.Get("read.errors_corrected_by_rereads_rewrites").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricReadErrorsCorrectedByEccFast,
prometheus.GaugeValue,
SCSIHealth.Get("read.errors_corrected_by_eccfast").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricReadErrorsCorrectedByEccDelayed,
prometheus.GaugeValue,
SCSIHealth.Get("read.errors_corrected_by_eccdelayed").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricReadTotalUncorrectedErrors,
prometheus.GaugeValue,
Expand All @@ -516,11 +581,24 @@ func (smart *SMARTctl) mineSCSIErrorCounterLog() {
SCSIHealth.Get("write.errors_corrected_by_rereads_rewrites").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricWriteErrorsCorrectedByEccFast,
prometheus.GaugeValue,
SCSIHealth.Get("write.errors_corrected_by_eccfast").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricWriteErrorsCorrectedByEccDelayed,
prometheus.GaugeValue,
SCSIHealth.Get("write.errors_corrected_by_eccdelayed").Float(),
smart.device.device,
)
smart.ch <- prometheus.MustNewConstMetric(
metricWriteTotalUncorrectedErrors,
prometheus.GaugeValue,
SCSIHealth.Get("write.total_uncorrected_errors").Float(),
smart.device.device,
)
// TODO: Should we also export the verify category?
}
}

0 comments on commit 9113c6c

Please sign in to comment.