mirror of
https://github.com/henrygd/beszel.git
synced 2025-10-30 01:57:04 +00:00
894 lines
28 KiB
Go
894 lines
28 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/henrygd/beszel/internal/entities/smart"
|
|
|
|
"golang.org/x/exp/slog"
|
|
)
|
|
|
|
// SmartManager manages data collection for SMART devices
|
|
type SmartManager struct {
|
|
sync.Mutex
|
|
SmartDataMap map[string]*smart.SmartData
|
|
SmartDevices []*DeviceInfo
|
|
refreshMutex sync.Mutex
|
|
lastScanTime time.Time
|
|
}
|
|
|
|
// scanOutput mirrors the part of the `smartctl --scan -j` JSON document that
// parseScan reads: the list of discovered devices.
type scanOutput struct {
	Devices []struct {
		Name     string `json:"name"`
		Type     string `json:"type"`
		InfoName string `json:"info_name"`
		Protocol string `json:"protocol"`
	} `json:"devices"`
}
|
|
|
|
// DeviceInfo describes one SMART device, either discovered by a smartctl scan
// or listed in the SMART_DEVICES setting, plus the parser bookkeeping that is
// carried between refreshes.
type DeviceInfo struct {
	Name     string `json:"name"`
	Type     string `json:"type"`
	InfoName string `json:"info_name"`
	Protocol string `json:"protocol"`

	// typeVerified is set once SMART data has been parsed successfully with
	// parserType; while it is true the detection step is skipped.
	typeVerified bool
	// parserType names the parser (nvme, sat, scsi) that succeeded most
	// recently, or the one that should be tried first.
	parserType string
}
|
|
|
|
// errNoValidSmartData is the sentinel returned when a scan or a collection
// pass yields no usable SMART data.
var errNoValidSmartData = errors.New("no valid SMART data found")
|
|
|
|
// Refresh updates SMART data for all known devices
|
|
func (sm *SmartManager) Refresh(forceScan bool) error {
|
|
sm.refreshMutex.Lock()
|
|
defer sm.refreshMutex.Unlock()
|
|
|
|
scanErr := sm.ScanDevices(false)
|
|
if scanErr != nil {
|
|
slog.Debug("smartctl scan failed", "err", scanErr)
|
|
}
|
|
|
|
devices := sm.devicesSnapshot()
|
|
var collectErr error
|
|
for _, deviceInfo := range devices {
|
|
if deviceInfo == nil {
|
|
continue
|
|
}
|
|
if err := sm.CollectSmart(deviceInfo); err != nil {
|
|
slog.Debug("smartctl collect failed", "device", deviceInfo.Name, "err", err)
|
|
collectErr = err
|
|
}
|
|
}
|
|
|
|
return sm.resolveRefreshError(scanErr, collectErr)
|
|
}
|
|
|
|
// devicesSnapshot returns a copy of the current device slice to avoid iterating
|
|
// while holding the primary mutex for longer than necessary.
|
|
func (sm *SmartManager) devicesSnapshot() []*DeviceInfo {
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
|
|
devices := make([]*DeviceInfo, len(sm.SmartDevices))
|
|
copy(devices, sm.SmartDevices)
|
|
return devices
|
|
}
|
|
|
|
// hasSmartData reports whether any SMART data has been collected.
|
|
// func (sm *SmartManager) hasSmartData() bool {
|
|
// sm.Lock()
|
|
// defer sm.Unlock()
|
|
|
|
// return len(sm.SmartDataMap) > 0
|
|
// }
|
|
|
|
// resolveRefreshError determines the proper error to return after a refresh.
|
|
func (sm *SmartManager) resolveRefreshError(scanErr, collectErr error) error {
|
|
sm.Lock()
|
|
noDevices := len(sm.SmartDevices) == 0
|
|
noData := len(sm.SmartDataMap) == 0
|
|
sm.Unlock()
|
|
|
|
if noDevices {
|
|
if scanErr != nil {
|
|
return scanErr
|
|
}
|
|
}
|
|
|
|
if !noData {
|
|
return nil
|
|
}
|
|
|
|
if collectErr != nil {
|
|
return collectErr
|
|
}
|
|
if scanErr != nil {
|
|
return scanErr
|
|
}
|
|
return errNoValidSmartData
|
|
}
|
|
|
|
// GetCurrentData returns the current SMART data
|
|
func (sm *SmartManager) GetCurrentData() map[string]smart.SmartData {
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
result := make(map[string]smart.SmartData, len(sm.SmartDataMap))
|
|
for key, value := range sm.SmartDataMap {
|
|
if value != nil {
|
|
result[key] = *value
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// ScanDevices scans for SMART devices
|
|
// Scan devices using `smartctl --scan -j`
|
|
// If scan fails, return error
|
|
// If scan succeeds, parse the output and update the SmartDevices slice
|
|
func (sm *SmartManager) ScanDevices(force bool) error {
|
|
if !force && time.Since(sm.lastScanTime) < 30*time.Minute {
|
|
return nil
|
|
}
|
|
sm.lastScanTime = time.Now()
|
|
currentDevices := sm.devicesSnapshot()
|
|
|
|
var configuredDevices []*DeviceInfo
|
|
if configuredRaw, ok := GetEnv("SMART_DEVICES"); ok {
|
|
slog.Info("SMART_DEVICES", "value", configuredRaw)
|
|
config := strings.TrimSpace(configuredRaw)
|
|
if config == "" {
|
|
return errNoValidSmartData
|
|
}
|
|
|
|
parsedDevices, err := sm.parseConfiguredDevices(config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
configuredDevices = parsedDevices
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(ctx, "smartctl", "--scan", "-j")
|
|
output, err := cmd.Output()
|
|
|
|
var (
|
|
scanErr error
|
|
scannedDevices []*DeviceInfo
|
|
hasValidScan bool
|
|
)
|
|
|
|
if err != nil {
|
|
scanErr = err
|
|
} else {
|
|
scannedDevices, hasValidScan = sm.parseScan(output)
|
|
if !hasValidScan {
|
|
scanErr = errNoValidSmartData
|
|
}
|
|
}
|
|
|
|
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
|
|
sm.updateSmartDevices(finalDevices)
|
|
|
|
if len(finalDevices) == 0 {
|
|
if scanErr != nil {
|
|
slog.Debug("smartctl scan failed", "err", scanErr)
|
|
return scanErr
|
|
}
|
|
return errNoValidSmartData
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (sm *SmartManager) parseConfiguredDevices(config string) ([]*DeviceInfo, error) {
|
|
entries := strings.Split(config, ",")
|
|
devices := make([]*DeviceInfo, 0, len(entries))
|
|
for _, entry := range entries {
|
|
entry = strings.TrimSpace(entry)
|
|
if entry == "" {
|
|
continue
|
|
}
|
|
|
|
parts := strings.SplitN(entry, ":", 2)
|
|
|
|
name := strings.TrimSpace(parts[0])
|
|
if name == "" {
|
|
return nil, fmt.Errorf("invalid SMART_DEVICES entry %q", entry)
|
|
}
|
|
|
|
devType := ""
|
|
if len(parts) == 2 {
|
|
devType = strings.ToLower(strings.TrimSpace(parts[1]))
|
|
}
|
|
|
|
devices = append(devices, &DeviceInfo{
|
|
Name: name,
|
|
Type: devType,
|
|
})
|
|
}
|
|
|
|
if len(devices) == 0 {
|
|
return nil, errNoValidSmartData
|
|
}
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
// detectSmartOutputType inspects sections that are unique to each smartctl
|
|
// JSON schema (NVMe, ATA/SATA, SCSI) to determine which parser should be used
|
|
// when the reported device type is ambiguous or missing.
|
|
func detectSmartOutputType(output []byte) string {
|
|
var hints struct {
|
|
AtaSmartAttributes json.RawMessage `json:"ata_smart_attributes"`
|
|
NVMeSmartHealthInformationLog json.RawMessage `json:"nvme_smart_health_information_log"`
|
|
ScsiErrorCounterLog json.RawMessage `json:"scsi_error_counter_log"`
|
|
}
|
|
|
|
if err := json.Unmarshal(output, &hints); err != nil {
|
|
return ""
|
|
}
|
|
|
|
switch {
|
|
case hasJSONValue(hints.NVMeSmartHealthInformationLog):
|
|
return "nvme"
|
|
case hasJSONValue(hints.AtaSmartAttributes):
|
|
return "sat"
|
|
case hasJSONValue(hints.ScsiErrorCounterLog):
|
|
return "scsi"
|
|
default:
|
|
return "sat"
|
|
}
|
|
}
|
|
|
|
// hasJSONValue reports whether raw holds an actual JSON value. A missing
// section (empty raw), whitespace only, and the literal null all count as
// "no value".
func hasJSONValue(raw json.RawMessage) bool {
	switch strings.TrimSpace(string(raw)) {
	case "", "null":
		return false
	}
	return true
}
|
|
|
|
// normalizeParserType maps a device type string onto the parser name used for
// it. The input is trimmed and lower-cased; "sntasmedia" and "sntrealtek" map
// to "nvme" and "ata" maps to "sat". Any other value, including "nvme", "sat"
// and "scsi", is returned in its trimmed, lower-cased form.
func normalizeParserType(value string) string {
	normalized := strings.ToLower(strings.TrimSpace(value))
	switch normalized {
	case "sntasmedia", "sntrealtek":
		return "nvme"
	case "ata":
		return "sat"
	}
	return normalized
}
|
|
|
|
// parseSmartOutput attempts each SMART parser, optionally detecting the type when
|
|
// it is not provided, and updates the device info when a parser succeeds.
|
|
func (sm *SmartManager) parseSmartOutput(deviceInfo *DeviceInfo, output []byte) bool {
|
|
parsers := []struct {
|
|
Type string
|
|
Parse func([]byte) (bool, int)
|
|
}{
|
|
{Type: "nvme", Parse: sm.parseSmartForNvme},
|
|
{Type: "sat", Parse: sm.parseSmartForSata},
|
|
{Type: "scsi", Parse: sm.parseSmartForScsi},
|
|
}
|
|
|
|
deviceType := normalizeParserType(deviceInfo.parserType)
|
|
if deviceType == "" {
|
|
deviceType = normalizeParserType(deviceInfo.Type)
|
|
}
|
|
if deviceInfo.parserType == "" {
|
|
switch deviceType {
|
|
case "nvme", "sat", "scsi":
|
|
deviceInfo.parserType = deviceType
|
|
}
|
|
}
|
|
|
|
// Only run the type detection when we do not yet know which parser works
|
|
// or the previous attempt failed.
|
|
needsDetection := deviceType == "" || !deviceInfo.typeVerified
|
|
if needsDetection {
|
|
structureType := detectSmartOutputType(output)
|
|
if deviceType != structureType {
|
|
deviceType = structureType
|
|
deviceInfo.parserType = structureType
|
|
deviceInfo.typeVerified = false
|
|
}
|
|
if deviceInfo.Type == "" || strings.EqualFold(deviceInfo.Type, structureType) {
|
|
deviceInfo.Type = structureType
|
|
}
|
|
}
|
|
|
|
// Try the most likely parser first, but keep the remaining parsers in reserve
|
|
// so an incorrect hint never leaves the device unparsed.
|
|
selectedParsers := make([]struct {
|
|
Type string
|
|
Parse func([]byte) (bool, int)
|
|
}, 0, len(parsers))
|
|
if deviceType != "" {
|
|
for _, parser := range parsers {
|
|
if parser.Type == deviceType {
|
|
selectedParsers = append(selectedParsers, parser)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
for _, parser := range parsers {
|
|
alreadySelected := false
|
|
for _, selected := range selectedParsers {
|
|
if selected.Type == parser.Type {
|
|
alreadySelected = true
|
|
break
|
|
}
|
|
}
|
|
if alreadySelected {
|
|
continue
|
|
}
|
|
selectedParsers = append(selectedParsers, parser)
|
|
}
|
|
|
|
// Try the selected parsers in order until we find one that succeeds.
|
|
for _, parser := range selectedParsers {
|
|
hasData, _ := parser.Parse(output)
|
|
if hasData {
|
|
deviceInfo.parserType = parser.Type
|
|
if deviceInfo.Type == "" || strings.EqualFold(deviceInfo.Type, parser.Type) {
|
|
deviceInfo.Type = parser.Type
|
|
}
|
|
// Remember that this parser is valid so future refreshes can bypass
|
|
// detection entirely.
|
|
deviceInfo.typeVerified = true
|
|
return true
|
|
}
|
|
slog.Debug("parser failed", "device", deviceInfo.Name, "parser", parser.Type)
|
|
}
|
|
|
|
// Leave verification false so the next pass will attempt detection again.
|
|
deviceInfo.typeVerified = false
|
|
slog.Debug("parsing failed", "device", deviceInfo.Name)
|
|
return false
|
|
}
|
|
|
|
// CollectSmart collects SMART data for a device
|
|
// Collect data using `smartctl -d <type> -aj /dev/<device>` when device type is known
|
|
// Always attempts to parse output even if command fails, as some data may still be available
|
|
// If collect fails, return error
|
|
// If collect succeeds, parse the output and update the SmartDataMap
|
|
// Uses -n standby to avoid waking up sleeping disks, but bypasses standby mode
|
|
// for initial data collection when no cached data exists
|
|
func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
|
|
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
|
|
|
|
// Check if we have any existing data for this device
|
|
hasExistingData := sm.hasDataForDevice(deviceInfo.Name)
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
|
defer cancel()
|
|
|
|
// Try with -n standby first if we have existing data
|
|
args := sm.smartctlArgs(deviceInfo, true)
|
|
cmd := exec.CommandContext(ctx, "smartctl", args...)
|
|
output, err := cmd.CombinedOutput()
|
|
|
|
// Check if device is in standby (exit status 2)
|
|
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() == 2 {
|
|
if hasExistingData {
|
|
// Device is in standby and we have cached data, keep using cache
|
|
return nil
|
|
}
|
|
// No cached data, need to collect initial data by bypassing standby
|
|
ctx2, cancel2 := context.WithTimeout(context.Background(), 2*time.Second)
|
|
defer cancel2()
|
|
args = sm.smartctlArgs(deviceInfo, false)
|
|
cmd = exec.CommandContext(ctx2, "smartctl", args...)
|
|
output, err = cmd.CombinedOutput()
|
|
}
|
|
|
|
hasValidData := sm.parseSmartOutput(deviceInfo, output)
|
|
|
|
if !hasValidData {
|
|
if err != nil {
|
|
slog.Debug("smartctl failed", "device", deviceInfo.Name, "err", err)
|
|
return err
|
|
}
|
|
slog.Debug("no valid SMART data found", "device", deviceInfo.Name)
|
|
return errNoValidSmartData
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// smartctlArgs returns the arguments for the smartctl command
|
|
// based on the device type and whether to include standby mode
|
|
func (sm *SmartManager) smartctlArgs(deviceInfo *DeviceInfo, includeStandby bool) []string {
|
|
args := make([]string, 0, 7)
|
|
|
|
if deviceInfo != nil {
|
|
deviceType := strings.ToLower(deviceInfo.Type)
|
|
// types sometimes misidentified in scan; see github.com/henrygd/beszel/issues/1345
|
|
if deviceType != "" && deviceType != "scsi" && deviceType != "ata" {
|
|
args = append(args, "-d", deviceInfo.Type)
|
|
}
|
|
}
|
|
|
|
args = append(args, "-aj")
|
|
|
|
if includeStandby {
|
|
args = append(args, "-n", "standby")
|
|
}
|
|
|
|
if deviceInfo != nil {
|
|
args = append(args, deviceInfo.Name)
|
|
}
|
|
|
|
return args
|
|
}
|
|
|
|
// hasDataForDevice checks if we have cached SMART data for a specific device
|
|
func (sm *SmartManager) hasDataForDevice(deviceName string) bool {
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
|
|
// Check if any cached data has this device name
|
|
for _, data := range sm.SmartDataMap {
|
|
if data != nil && data.DiskName == deviceName {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// parseScan parses the output of smartctl --scan -j and returns the discovered devices.
|
|
func (sm *SmartManager) parseScan(output []byte) ([]*DeviceInfo, bool) {
|
|
scan := &scanOutput{}
|
|
|
|
if err := json.Unmarshal(output, scan); err != nil {
|
|
return nil, false
|
|
}
|
|
|
|
if len(scan.Devices) == 0 {
|
|
slog.Debug("no devices found in smartctl scan")
|
|
return nil, false
|
|
}
|
|
|
|
devices := make([]*DeviceInfo, 0, len(scan.Devices))
|
|
for _, device := range scan.Devices {
|
|
slog.Debug("smartctl scan", "name", device.Name, "type", device.Type, "protocol", device.Protocol)
|
|
devices = append(devices, &DeviceInfo{
|
|
Name: device.Name,
|
|
Type: device.Type,
|
|
InfoName: device.InfoName,
|
|
Protocol: device.Protocol,
|
|
})
|
|
}
|
|
|
|
return devices, true
|
|
}
|
|
|
|
// mergeDeviceLists combines scanned and configured SMART devices, preferring
|
|
// configured SMART_DEVICES when both sources reference the same device.
|
|
func mergeDeviceLists(existing, scanned, configured []*DeviceInfo) []*DeviceInfo {
|
|
if len(scanned) == 0 && len(configured) == 0 {
|
|
return existing
|
|
}
|
|
|
|
// preserveVerifiedType copies the verified type/parser metadata from an existing
|
|
// device record so that subsequent scans/config updates never downgrade a
|
|
// previously verified device.
|
|
preserveVerifiedType := func(target, prev *DeviceInfo) {
|
|
if prev == nil || !prev.typeVerified {
|
|
return
|
|
}
|
|
target.Type = prev.Type
|
|
target.typeVerified = true
|
|
target.parserType = prev.parserType
|
|
}
|
|
|
|
existingIndex := make(map[string]*DeviceInfo, len(existing))
|
|
for _, dev := range existing {
|
|
if dev == nil || dev.Name == "" {
|
|
continue
|
|
}
|
|
existingIndex[dev.Name] = dev
|
|
}
|
|
|
|
finalDevices := make([]*DeviceInfo, 0, len(scanned)+len(configured))
|
|
deviceIndex := make(map[string]*DeviceInfo, len(scanned)+len(configured))
|
|
|
|
// Start with the newly scanned devices so we always surface fresh metadata,
|
|
// but ensure we retain any previously verified parser assignment.
|
|
for _, dev := range scanned {
|
|
if dev == nil || dev.Name == "" {
|
|
continue
|
|
}
|
|
|
|
// Work on a copy so we can safely adjust metadata without mutating the
|
|
// input slices that may be reused elsewhere.
|
|
copyDev := *dev
|
|
if prev := existingIndex[copyDev.Name]; prev != nil {
|
|
preserveVerifiedType(©Dev, prev)
|
|
}
|
|
|
|
finalDevices = append(finalDevices, ©Dev)
|
|
deviceIndex[copyDev.Name] = finalDevices[len(finalDevices)-1]
|
|
}
|
|
|
|
// Merge configured devices on top so users can override scan results (except
|
|
// for verified type information).
|
|
for _, dev := range configured {
|
|
if dev == nil || dev.Name == "" {
|
|
continue
|
|
}
|
|
|
|
if existingDev, ok := deviceIndex[dev.Name]; ok {
|
|
// Only update the type if it has not been verified yet; otherwise we
|
|
// keep the existing verified metadata intact.
|
|
if dev.Type != "" && !existingDev.typeVerified {
|
|
newType := strings.TrimSpace(dev.Type)
|
|
existingDev.Type = newType
|
|
existingDev.typeVerified = false
|
|
existingDev.parserType = normalizeParserType(newType)
|
|
}
|
|
if dev.InfoName != "" {
|
|
existingDev.InfoName = dev.InfoName
|
|
}
|
|
if dev.Protocol != "" {
|
|
existingDev.Protocol = dev.Protocol
|
|
}
|
|
continue
|
|
}
|
|
|
|
copyDev := *dev
|
|
if prev := existingIndex[copyDev.Name]; prev != nil {
|
|
preserveVerifiedType(©Dev, prev)
|
|
} else if copyDev.Type != "" {
|
|
copyDev.parserType = normalizeParserType(copyDev.Type)
|
|
}
|
|
|
|
finalDevices = append(finalDevices, ©Dev)
|
|
deviceIndex[copyDev.Name] = finalDevices[len(finalDevices)-1]
|
|
}
|
|
|
|
return finalDevices
|
|
}
|
|
|
|
// updateSmartDevices replaces the cached device list and prunes SMART data
|
|
// entries whose backing device no longer exists.
|
|
func (sm *SmartManager) updateSmartDevices(devices []*DeviceInfo) {
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
|
|
sm.SmartDevices = devices
|
|
|
|
if len(sm.SmartDataMap) == 0 {
|
|
return
|
|
}
|
|
|
|
validNames := make(map[string]struct{}, len(devices))
|
|
for _, device := range devices {
|
|
if device == nil || device.Name == "" {
|
|
continue
|
|
}
|
|
validNames[device.Name] = struct{}{}
|
|
}
|
|
|
|
for key, data := range sm.SmartDataMap {
|
|
if data == nil {
|
|
delete(sm.SmartDataMap, key)
|
|
continue
|
|
}
|
|
|
|
if _, ok := validNames[data.DiskName]; ok {
|
|
continue
|
|
}
|
|
|
|
delete(sm.SmartDataMap, key)
|
|
}
|
|
}
|
|
|
|
// isVirtualDevice checks if a device is a virtual disk that should be filtered out
|
|
func (sm *SmartManager) isVirtualDevice(data *smart.SmartInfoForSata) bool {
|
|
vendorUpper := strings.ToUpper(data.ScsiVendor)
|
|
productUpper := strings.ToUpper(data.ScsiProduct)
|
|
modelUpper := strings.ToUpper(data.ModelName)
|
|
|
|
return sm.isVirtualDeviceFromStrings(vendorUpper, productUpper, modelUpper)
|
|
}
|
|
|
|
// isVirtualDeviceNvme checks if an NVMe device is a virtual disk that should be filtered out
|
|
func (sm *SmartManager) isVirtualDeviceNvme(data *smart.SmartInfoForNvme) bool {
|
|
modelUpper := strings.ToUpper(data.ModelName)
|
|
|
|
return sm.isVirtualDeviceFromStrings(modelUpper)
|
|
}
|
|
|
|
// isVirtualDeviceScsi checks if a SCSI device is a virtual disk that should be filtered out
|
|
func (sm *SmartManager) isVirtualDeviceScsi(data *smart.SmartInfoForScsi) bool {
|
|
vendorUpper := strings.ToUpper(data.ScsiVendor)
|
|
productUpper := strings.ToUpper(data.ScsiProduct)
|
|
modelUpper := strings.ToUpper(data.ScsiModelName)
|
|
|
|
return sm.isVirtualDeviceFromStrings(vendorUpper, productUpper, modelUpper)
|
|
}
|
|
|
|
// isVirtualDeviceFromStrings checks if any of the provided strings indicate a virtual device
|
|
func (sm *SmartManager) isVirtualDeviceFromStrings(fields ...string) bool {
|
|
for _, field := range fields {
|
|
fieldUpper := strings.ToUpper(field)
|
|
switch {
|
|
case strings.Contains(fieldUpper, "IET"), // iSCSI Enterprise Target
|
|
strings.Contains(fieldUpper, "VIRTUAL"),
|
|
strings.Contains(fieldUpper, "QEMU"),
|
|
strings.Contains(fieldUpper, "VBOX"),
|
|
strings.Contains(fieldUpper, "VMWARE"),
|
|
strings.Contains(fieldUpper, "MSFT"): // Microsoft Hyper-V
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// parseSmartForSata parses the output of smartctl --all -j for SATA/ATA devices and updates the SmartDataMap
|
|
// Returns hasValidData and exitStatus
|
|
func (sm *SmartManager) parseSmartForSata(output []byte) (bool, int) {
|
|
var data smart.SmartInfoForSata
|
|
|
|
if err := json.Unmarshal(output, &data); err != nil {
|
|
return false, 0
|
|
}
|
|
|
|
if data.SerialNumber == "" {
|
|
slog.Debug("no serial number", "device", data.Device.Name)
|
|
return false, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
// Skip virtual devices (e.g., Kubernetes PVCs, QEMU, VirtualBox, etc.)
|
|
if sm.isVirtualDevice(&data) {
|
|
slog.Debug("skipping smart", "device", data.Device.Name, "model", data.ModelName)
|
|
return false, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
|
|
keyName := data.SerialNumber
|
|
|
|
// if device does not exist in SmartDataMap, initialize it
|
|
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
|
sm.SmartDataMap[keyName] = &smart.SmartData{}
|
|
}
|
|
|
|
// update SmartData
|
|
smartData := sm.SmartDataMap[keyName]
|
|
// smartData.ModelFamily = data.ModelFamily
|
|
smartData.ModelName = data.ModelName
|
|
smartData.SerialNumber = data.SerialNumber
|
|
smartData.FirmwareVersion = data.FirmwareVersion
|
|
smartData.Capacity = data.UserCapacity.Bytes
|
|
smartData.Temperature = data.Temperature.Current
|
|
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
|
|
smartData.DiskName = data.Device.Name
|
|
smartData.DiskType = data.Device.Type
|
|
|
|
// update SmartAttributes
|
|
smartData.Attributes = make([]*smart.SmartAttribute, 0, len(data.AtaSmartAttributes.Table))
|
|
for _, attr := range data.AtaSmartAttributes.Table {
|
|
smartAttr := &smart.SmartAttribute{
|
|
ID: attr.ID,
|
|
Name: attr.Name,
|
|
Value: attr.Value,
|
|
Worst: attr.Worst,
|
|
Threshold: attr.Thresh,
|
|
RawValue: uint64(attr.Raw.Value),
|
|
RawString: attr.Raw.String,
|
|
WhenFailed: attr.WhenFailed,
|
|
}
|
|
smartData.Attributes = append(smartData.Attributes, smartAttr)
|
|
}
|
|
sm.SmartDataMap[keyName] = smartData
|
|
|
|
return true, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
// getSmartStatus turns the smartctl pass flag into a status string: "PASSED"
// when passed is true, "FAILED" when it is false and a non-zero temperature
// was read, and "UNKNOWN" when it is false and the temperature is zero.
func getSmartStatus(temperature uint8, passed bool) string {
	switch {
	case passed:
		return "PASSED"
	case temperature > 0:
		return "FAILED"
	default:
		return "UNKNOWN"
	}
}
|
|
|
|
func (sm *SmartManager) parseSmartForScsi(output []byte) (bool, int) {
|
|
var data smart.SmartInfoForScsi
|
|
|
|
if err := json.Unmarshal(output, &data); err != nil {
|
|
return false, 0
|
|
}
|
|
|
|
if data.SerialNumber == "" {
|
|
slog.Debug("no serial number", "device", data.Device.Name)
|
|
return false, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
// Skip virtual devices (e.g., Kubernetes PVCs, QEMU, VirtualBox, etc.)
|
|
if sm.isVirtualDeviceScsi(&data) {
|
|
slog.Debug("skipping smart", "device", data.Device.Name, "model", data.ScsiModelName)
|
|
return false, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
|
|
keyName := data.SerialNumber
|
|
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
|
sm.SmartDataMap[keyName] = &smart.SmartData{}
|
|
}
|
|
|
|
smartData := sm.SmartDataMap[keyName]
|
|
smartData.ModelName = data.ScsiModelName
|
|
smartData.SerialNumber = data.SerialNumber
|
|
smartData.FirmwareVersion = data.ScsiRevision
|
|
smartData.Capacity = data.UserCapacity.Bytes
|
|
smartData.Temperature = data.Temperature.Current
|
|
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
|
|
smartData.DiskName = data.Device.Name
|
|
smartData.DiskType = data.Device.Type
|
|
|
|
attributes := make([]*smart.SmartAttribute, 0, 10)
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "PowerOnHours", RawValue: data.PowerOnTime.Hours})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "PowerOnMinutes", RawValue: data.PowerOnTime.Minutes})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "GrownDefectList", RawValue: data.ScsiGrownDefectList})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "StartStopCycles", RawValue: data.ScsiStartStopCycleCounter.AccumulatedStartStopCycles})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "LoadUnloadCycles", RawValue: data.ScsiStartStopCycleCounter.AccumulatedLoadUnloadCycles})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "StartStopSpecified", RawValue: data.ScsiStartStopCycleCounter.SpecifiedCycleCountOverDeviceLifetime})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "LoadUnloadSpecified", RawValue: data.ScsiStartStopCycleCounter.SpecifiedLoadUnloadCountOverDeviceLifetime})
|
|
|
|
readStats := data.ScsiErrorCounterLog.Read
|
|
writeStats := data.ScsiErrorCounterLog.Write
|
|
verifyStats := data.ScsiErrorCounterLog.Verify
|
|
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "ReadTotalErrorsCorrected", RawValue: readStats.TotalErrorsCorrected})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "ReadTotalUncorrectedErrors", RawValue: readStats.TotalUncorrectedErrors})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "ReadCorrectionAlgorithmInvocations", RawValue: readStats.CorrectionAlgorithmInvocations})
|
|
if val := parseScsiGigabytesProcessed(readStats.GigabytesProcessed); val >= 0 {
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "ReadGigabytesProcessed", RawValue: uint64(val)})
|
|
}
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "WriteTotalErrorsCorrected", RawValue: writeStats.TotalErrorsCorrected})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "WriteTotalUncorrectedErrors", RawValue: writeStats.TotalUncorrectedErrors})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "WriteCorrectionAlgorithmInvocations", RawValue: writeStats.CorrectionAlgorithmInvocations})
|
|
if val := parseScsiGigabytesProcessed(writeStats.GigabytesProcessed); val >= 0 {
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "WriteGigabytesProcessed", RawValue: uint64(val)})
|
|
}
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "VerifyTotalErrorsCorrected", RawValue: verifyStats.TotalErrorsCorrected})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "VerifyTotalUncorrectedErrors", RawValue: verifyStats.TotalUncorrectedErrors})
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "VerifyCorrectionAlgorithmInvocations", RawValue: verifyStats.CorrectionAlgorithmInvocations})
|
|
if val := parseScsiGigabytesProcessed(verifyStats.GigabytesProcessed); val >= 0 {
|
|
attributes = append(attributes, &smart.SmartAttribute{Name: "VerifyGigabytesProcessed", RawValue: uint64(val)})
|
|
}
|
|
|
|
smartData.Attributes = attributes
|
|
sm.SmartDataMap[keyName] = smartData
|
|
|
|
return true, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
// parseScsiGigabytesProcessed converts the "gigabytes processed" string of a
// SCSI error counter log into a whole number of gigabytes.
//
// Thousands separators (",") are removed first. Integer strings are returned
// as parsed. Non-negative fractional strings such as "1569.304" are truncated
// toward zero instead of being rejected.
// NOTE(review): smartctl appears to format this field with decimals, which is
// why the fractional form is accepted — confirm against real output.
//
// It returns -1 when the value is empty, is not a number, is a negative
// fraction, or does not fit in an int64.
func parseScsiGigabytesProcessed(value string) int64 {
	if value == "" {
		return -1
	}
	normalized := strings.ReplaceAll(value, ",", "")

	// Plain integers keep their exact value, including very large ones that
	// a float64 could not represent precisely.
	if whole, err := strconv.ParseInt(normalized, 10, 64); err == nil {
		return whole
	}

	fractional, err := strconv.ParseFloat(normalized, 64)
	// Written as a positive range check so that NaN, for which every ordered
	// comparison is false, is rejected together with negatives and overflow.
	if err != nil || !(fractional >= 0 && fractional < 1<<63) {
		return -1
	}
	return int64(fractional)
}
|
|
|
|
// parseSmartForNvme parses the output of smartctl --all -j /dev/nvmeX and updates the SmartDataMap
|
|
// Returns hasValidData and exitStatus
|
|
func (sm *SmartManager) parseSmartForNvme(output []byte) (bool, int) {
|
|
data := &smart.SmartInfoForNvme{}
|
|
|
|
if err := json.Unmarshal(output, &data); err != nil {
|
|
return false, 0
|
|
}
|
|
|
|
if data.SerialNumber == "" {
|
|
slog.Debug("no serial number", "device", data.Device.Name)
|
|
return false, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
// Skip virtual devices (e.g., Kubernetes PVCs, QEMU, VirtualBox, etc.)
|
|
if sm.isVirtualDeviceNvme(data) {
|
|
slog.Debug("skipping smart", "device", data.Device.Name, "model", data.ModelName)
|
|
return false, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
sm.Lock()
|
|
defer sm.Unlock()
|
|
|
|
keyName := data.SerialNumber
|
|
|
|
// if device does not exist in SmartDataMap, initialize it
|
|
if _, ok := sm.SmartDataMap[keyName]; !ok {
|
|
sm.SmartDataMap[keyName] = &smart.SmartData{}
|
|
}
|
|
|
|
// update SmartData
|
|
smartData := sm.SmartDataMap[keyName]
|
|
smartData.ModelName = data.ModelName
|
|
smartData.SerialNumber = data.SerialNumber
|
|
smartData.FirmwareVersion = data.FirmwareVersion
|
|
smartData.Capacity = data.UserCapacity.Bytes
|
|
smartData.Temperature = data.NVMeSmartHealthInformationLog.Temperature
|
|
smartData.SmartStatus = getSmartStatus(smartData.Temperature, data.SmartStatus.Passed)
|
|
smartData.DiskName = data.Device.Name
|
|
smartData.DiskType = data.Device.Type
|
|
|
|
// nvme attributes does not follow the same format as ata attributes,
|
|
// so we manually map each field to SmartAttributes
|
|
log := data.NVMeSmartHealthInformationLog
|
|
smartData.Attributes = []*smart.SmartAttribute{
|
|
{Name: "CriticalWarning", RawValue: uint64(log.CriticalWarning)},
|
|
{Name: "Temperature", RawValue: uint64(log.Temperature)},
|
|
{Name: "AvailableSpare", RawValue: uint64(log.AvailableSpare)},
|
|
{Name: "AvailableSpareThreshold", RawValue: uint64(log.AvailableSpareThreshold)},
|
|
{Name: "PercentageUsed", RawValue: uint64(log.PercentageUsed)},
|
|
{Name: "DataUnitsRead", RawValue: log.DataUnitsRead},
|
|
{Name: "DataUnitsWritten", RawValue: log.DataUnitsWritten},
|
|
{Name: "HostReads", RawValue: uint64(log.HostReads)},
|
|
{Name: "HostWrites", RawValue: uint64(log.HostWrites)},
|
|
{Name: "ControllerBusyTime", RawValue: uint64(log.ControllerBusyTime)},
|
|
{Name: "PowerCycles", RawValue: uint64(log.PowerCycles)},
|
|
{Name: "PowerOnHours", RawValue: uint64(log.PowerOnHours)},
|
|
{Name: "UnsafeShutdowns", RawValue: uint64(log.UnsafeShutdowns)},
|
|
{Name: "MediaErrors", RawValue: uint64(log.MediaErrors)},
|
|
{Name: "NumErrLogEntries", RawValue: uint64(log.NumErrLogEntries)},
|
|
{Name: "WarningTempTime", RawValue: uint64(log.WarningTempTime)},
|
|
{Name: "CriticalCompTime", RawValue: uint64(log.CriticalCompTime)},
|
|
}
|
|
|
|
sm.SmartDataMap[keyName] = smartData
|
|
|
|
return true, data.Smartctl.ExitStatus
|
|
}
|
|
|
|
// detectSmartctl checks if smartctl is installed, returns an error if not
|
|
func (sm *SmartManager) detectSmartctl() error {
|
|
if _, err := exec.LookPath("smartctl"); err == nil {
|
|
slog.Debug("smartctl found")
|
|
return nil
|
|
}
|
|
slog.Debug("smartctl not found")
|
|
return errors.New("smartctl not found")
|
|
}
|
|
|
|
// NewSmartManager creates and initializes a new SmartManager
|
|
func NewSmartManager() (*SmartManager, error) {
|
|
sm := &SmartManager{
|
|
SmartDataMap: make(map[string]*smart.SmartData),
|
|
}
|
|
if err := sm.detectSmartctl(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return sm, nil
|
|
}
|