Compare commits

...

8 Commits

Author SHA1 Message Date
henrygd
283fa9d5c2 include GTT memory in AMD GPU metrics (#1569) 2026-02-13 20:06:37 -05:00
henrygd
7d6c0caafc add amdgpu.ids to docker images (#1569) 2026-02-13 19:55:02 -05:00
henrygd
04d54a3efc update sysfs amd collector to pull pretty name from amdgpu.ids (#1569) 2026-02-13 19:41:40 -05:00
henrygd
14ecb1b069 add nvtop integration and introduce GPU_COLLECTOR env var 2026-02-13 19:41:40 -05:00
henrygd
1f1a448aef ui: small refactoring / auto formatting 2026-02-12 18:40:16 -05:00
VACInc
e816ea143a SMART: add eMMC health via sysfs (#1736)
* SMART: add eMMC health via sysfs

Read eMMC wear/EOL indicators from /sys/class/block/mmcblk*/device and expose in SMART device list. Includes mocked sysfs tests and UI tweaks for unknown temps.

* small optimizations for emmc scan and parsing

* smart: keep smartctl optional only for Linux hosts with eMMC

* update smart alerts to handle warning state

* refactor: rename binPath to smartctlPath and replace hasSmartctl with smartctlPath checks

---------

Co-authored-by: henrygd <hank@henrygd.me>
2026-02-12 15:27:42 -05:00
Sven van Ginkel
2230097dc7 chore: update inactivity-actions (#1742) 2026-02-12 12:29:22 -05:00
henrygd
25c77c5664 make: auto-apply glibc tag for agent on linux/amd64 glibc 2026-02-11 13:49:29 -05:00
29 changed files with 2740 additions and 319 deletions

View File

@@ -6,6 +6,7 @@ on:
workflow_dispatch:
permissions:
actions: write
issues: write
pull-requests: write
@@ -48,6 +49,9 @@ jobs:
# Action can not skip PRs, set it to 100 years to cover it.
days-before-pr-stale: 36524
# Max issues to process before early exit. Next run resumes from cache. GH API limit: 5000.
operations-per-run: 1500
# Labels
stale-issue-label: 'stale'
remove-stale-when-updated: true
@@ -56,4 +60,5 @@ jobs:
# Exemptions
exempt-assignees: true
exempt-milestones: true
exempt-milestones: true

1
.gitignore vendored
View File

@@ -10,6 +10,7 @@ dist
*.exe
internal/cmd/hub/hub
internal/cmd/agent/agent
agent.test
node_modules
build
*timestamp*

View File

@@ -3,6 +3,40 @@ OS ?= $(shell go env GOOS)
ARCH ?= $(shell go env GOARCH)
# Skip building the web UI if true
SKIP_WEB ?= false
# Controls NVML/glibc agent build tag behavior:
# - auto (default): enable on linux/amd64 glibc hosts
# - true: always enable
# - false: always disable
NVML ?= auto
# Detect glibc host for local linux/amd64 builds.
HOST_GLIBC := $(shell \
if [ "$(OS)" = "linux" ] && [ "$(ARCH)" = "amd64" ]; then \
for p in /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2; do \
[ -e "$$p" ] && { echo true; exit 0; }; \
done; \
if command -v ldd >/dev/null 2>&1; then \
if ldd --version 2>&1 | tr '[:upper:]' '[:lower:]' | awk '/gnu libc|glibc/{found=1} END{exit !found}'; then \
echo true; \
else \
echo false; \
fi; \
else \
echo false; \
fi; \
else \
echo false; \
fi)
# Enable glibc build tag for NVML on supported Linux builds.
AGENT_GO_TAGS :=
ifeq ($(NVML),true)
AGENT_GO_TAGS := -tags glibc
else ifeq ($(NVML),auto)
ifeq ($(HOST_GLIBC),true)
AGENT_GO_TAGS := -tags glibc
endif
endif
# Set executable extension based on target OS
EXE_EXT := $(if $(filter windows,$(OS)),.exe,)
@@ -54,7 +88,7 @@ fetch-smartctl-conditional:
# Update build-agent to include conditional .NET build
build-agent: tidy build-dotnet-conditional fetch-smartctl-conditional
GOOS=$(OS) GOARCH=$(ARCH) go build -o ./build/beszel-agent_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/agent
GOOS=$(OS) GOARCH=$(ARCH) go build $(AGENT_GO_TAGS) -o ./build/beszel-agent_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/agent
build-hub: tidy $(if $(filter false,$(SKIP_WEB)),build-web-ui)
GOOS=$(OS) GOARCH=$(ARCH) go build -o ./build/beszel_$(OS)_$(ARCH)$(EXE_EXT) -ldflags "-w -s" ./internal/cmd/hub
@@ -90,9 +124,9 @@ dev-hub:
dev-agent:
@if command -v entr >/dev/null 2>&1; then \
find ./internal/cmd/agent/*.go ./agent/*.go | entr -r go run github.com/henrygd/beszel/internal/cmd/agent; \
find ./internal/cmd/agent/*.go ./agent/*.go | entr -r go run $(AGENT_GO_TAGS) github.com/henrygd/beszel/internal/cmd/agent; \
else \
go run github.com/henrygd/beszel/internal/cmd/agent; \
go run $(AGENT_GO_TAGS) github.com/henrygd/beszel/internal/cmd/agent; \
fi
build-dotnet:

95
agent/emmc_common.go Normal file
View File

@@ -0,0 +1,95 @@
package agent
import (
"fmt"
"strconv"
"strings"
)
// isEmmcBlockName reports whether name is a whole-device eMMC block name,
// i.e. the literal prefix "mmcblk" followed by one or more ASCII digits.
// Anything with extra characters after the digits (such as the partition
// name "mmcblk0p1") is rejected.
func isEmmcBlockName(name string) bool {
	const prefix = "mmcblk"
	if !strings.HasPrefix(name, prefix) {
		return false
	}
	index := name[len(prefix):]
	if index == "" {
		return false
	}
	// IndexFunc returns -1 only when no rune falls outside '0'..'9'.
	notDigit := func(r rune) bool { return r < '0' || r > '9' }
	return strings.IndexFunc(index, notDigit) == -1
}
// parseHexOrDecByte parses s as a single unsigned byte. Surrounding whitespace
// is ignored. A leading "0x"/"0X" selects base 16; otherwise the text is read
// as base 10. The boolean is false when the text is empty, malformed, or does
// not fit in 8 bits.
func parseHexOrDecByte(s string) (uint8, bool) {
	text := strings.TrimSpace(s)
	radix := 10
	if len(text) >= 2 && text[0] == '0' && (text[1] == 'x' || text[1] == 'X') {
		text, radix = text[2:], 16
	}
	// ParseUint rejects the empty string, which also covers a bare "0x".
	value, err := strconv.ParseUint(text, radix, 8)
	if err != nil {
		return 0, false
	}
	return uint8(value), true
}
// parseHexBytePair parses the first two whitespace-separated tokens of s with
// parseHexOrDecByte and returns them in order.
//
// The result is reported as ok when at least one of the two tokens parses; a
// token that fails to parse is returned as 0. It returns (0, 0, false) when
// fewer than two tokens are present or when neither token parses.
func parseHexBytePair(s string) (uint8, uint8, bool) {
	tokens := strings.Fields(s)
	if len(tokens) >= 2 {
		first, firstOK := parseHexOrDecByte(tokens[0])
		second, secondOK := parseHexOrDecByte(tokens[1])
		if firstOK || secondOK {
			return first, second, true
		}
	}
	return 0, 0, false
}
// emmcSmartStatus translates an eMMC pre-EOL indicator into a SMART-style
// status string: 0x01 is "PASSED", 0x02 is "WARNING", 0x03 is "FAILED", and
// every other value is "UNKNOWN".
func emmcSmartStatus(preEOL uint8) string {
	// Indexed by the raw pre-EOL value; slot 0 doubles as the fallback label.
	labels := [...]string{"UNKNOWN", "PASSED", "WARNING", "FAILED"}
	if int(preEOL) < len(labels) {
		return labels[preEOL]
	}
	return labels[0]
}
// emmcPreEOLString renders a pre-EOL indicator as two-digit lowercase hex,
// followed by a parenthesised label for the three recognised values
// (0x01 normal, 0x02 warning, 0x03 urgent). Other values are shown as hex only.
func emmcPreEOLString(preEOL uint8) string {
	hex := fmt.Sprintf("0x%02x", preEOL)

	var label string
	switch preEOL {
	case 0x01:
		label = "normal"
	case 0x02:
		label = "warning"
	case 0x03:
		label = "urgent"
	}
	if label == "" {
		return hex
	}
	return hex + " (" + label + ")"
}
// emmcLifeTimeString renders an eMMC device life time estimate as two-digit
// lowercase hex plus a human-readable range where one is defined:
// 0x00 is "not reported", 0x01..0x0A map to 10% wide usage bands
// (0-10% .. 90-100% used), and 0x0B means more than 100% used. Any other
// value is shown as hex only.
func emmcLifeTimeString(v uint8) string {
	hex := fmt.Sprintf("0x%02x", v)
	if v == 0 {
		return hex + " (not reported)"
	}
	if v <= 0x0A {
		// Each step covers a 10% band ending at v*10.
		upper := int(v) * 10
		return fmt.Sprintf("%s (%d-%d%% used)", hex, upper-10, upper)
	}
	if v == 0x0B {
		return hex + " (>100% used)"
	}
	return hex
}

78
agent/emmc_common_test.go Normal file
View File

@@ -0,0 +1,78 @@
package agent
import "testing"
// TestParseHexOrDecByte checks hex-prefixed, decimal, whitespace-padded, and
// malformed inputs against parseHexOrDecByte.
func TestParseHexOrDecByte(t *testing.T) {
	type testCase struct {
		input   string
		value   uint8
		success bool
	}
	cases := []testCase{
		{input: "0x01", value: 1, success: true},
		{input: "0X0b", value: 11, success: true},
		{input: "01", value: 1, success: true},
		{input: " 3 ", value: 3, success: true},
		{input: "", value: 0, success: false},
		{input: "0x", value: 0, success: false},
		{input: "nope", value: 0, success: false},
	}
	for i := range cases {
		tc := cases[i]
		value, success := parseHexOrDecByte(tc.input)
		if success == tc.success && value == tc.value {
			continue
		}
		t.Fatalf("parseHexOrDecByte(%q) = (%d,%v), want (%d,%v)", tc.input, value, success, tc.value, tc.success)
	}
}
// TestParseHexBytePair checks that a hex pair and a decimal pair both decode
// to (1, 2) and that a single-token input is rejected.
func TestParseHexBytePair(t *testing.T) {
	valid := []struct {
		label string
		input string
	}{
		{label: "hex", input: "0x01 0x02\n"},
		{label: "dec", input: "01 02"},
	}
	for _, tc := range valid {
		first, second, ok := parseHexBytePair(tc.input)
		if ok && first == 1 && second == 2 {
			continue
		}
		t.Fatalf("parseHexBytePair %s = (%d,%d,%v), want (1,2,true)", tc.label, first, second, ok)
	}

	if _, _, ok := parseHexBytePair("0x01"); ok {
		t.Fatalf("parseHexBytePair short input ok=true, want false")
	}
}
// TestEmmcSmartStatus checks the status string for each recognised pre-EOL
// value and for the unrecognised value 0x00.
func TestEmmcSmartStatus(t *testing.T) {
	expectations := []struct {
		preEOL uint8
		status string
	}{
		{0x01, "PASSED"},
		{0x02, "WARNING"},
		{0x03, "FAILED"},
		{0x00, "UNKNOWN"},
	}
	for _, exp := range expectations {
		got := emmcSmartStatus(exp.preEOL)
		if got == exp.status {
			continue
		}
		t.Fatalf("emmcSmartStatus(0x%02x) = %q, want %s", exp.preEOL, got, exp.status)
	}
}
// TestIsEmmcBlockName checks that only "mmcblk<digits>" names are accepted and
// that partitions, other disks, and malformed names are rejected.
func TestIsEmmcBlockName(t *testing.T) {
	type expectation struct {
		blockName string
		accepted  bool
	}
	table := []expectation{
		{blockName: "mmcblk0", accepted: true},
		{blockName: "mmcblk1", accepted: true},
		{blockName: "mmcblk10", accepted: true},
		{blockName: "mmcblk0p1", accepted: false},
		{blockName: "sda", accepted: false},
		{blockName: "mmcblk", accepted: false},
		{blockName: "mmcblkA", accepted: false},
	}
	for i := range table {
		row := table[i]
		got := isEmmcBlockName(row.blockName)
		if got == row.accepted {
			continue
		}
		t.Fatalf("isEmmcBlockName(%q) = %v, want %v", row.blockName, got, row.accepted)
	}
}

227
agent/emmc_linux.go Normal file
View File

@@ -0,0 +1,227 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"strconv"
"strings"
"github.com/henrygd/beszel/internal/entities/smart"
)
// emmcSysfsRoot is the root directory under which eMMC sysfs paths
// ("class/block/<name>/...") are resolved. It is a variable rather than a
// constant so tests can point it at a temporary directory; the production
// value is "/sys".
var emmcSysfsRoot = "/sys"
// emmcHealth holds the identity and wear indicators that readEmmcHealth
// gathers for one eMMC block device from sysfs.
type emmcHealth struct {
// model is the trimmed content of the device's "name" file.
model string
// serial is the trimmed content of the device's "serial" file.
serial string
// revision is the trimmed content of the device's "prv" file.
revision string
// capacity is the device size in bytes as computed by readBlockCapacityBytes
// (0 when it could not be determined).
capacity uint64
// preEOL is the byte parsed from "pre_eol_info" (0 when unavailable).
preEOL uint8
// lifeA and lifeB are the two life time estimate bytes, read either from the
// combined "life_time" file or from the separate
// "device_life_time_est_typ_a"/"_b" files (0 when unavailable).
lifeA uint8
lifeB uint8
}
// scanEmmcDevices lists the eMMC block devices found under
// <emmcSysfsRoot>/class/block that expose at least one health file in their
// "device" directory. Each match is returned as a DeviceInfo named after its
// /dev node with Type "emmc" and Protocol "MMC".
//
// It returns nil when the block directory cannot be read, and an empty
// (non-nil) slice when the directory is readable but holds no matching device.
func scanEmmcDevices() []*DeviceInfo {
	classBlock := filepath.Join(emmcSysfsRoot, "class", "block")
	listing, err := os.ReadDir(classBlock)
	if err != nil {
		return nil
	}

	found := make([]*DeviceInfo, 0, 2)
	for _, entry := range listing {
		blockName := entry.Name()
		// Only whole devices with readable wear indicators are reported.
		if isEmmcBlockName(blockName) && hasEmmcHealthFiles(filepath.Join(classBlock, blockName, "device")) {
			node := filepath.Join("/dev", blockName)
			found = append(found, &DeviceInfo{
				Name:     node,
				Type:     "emmc",
				InfoName: node + " [eMMC]",
				Protocol: "MMC",
			})
		}
	}
	return found
}
// collectEmmcHealth reads eMMC wear indicators for deviceInfo from sysfs and
// stores them in sm.SmartDataMap.
//
// The device is considered only if its base name looks like an eMMC block
// device or its Type is "emmc"/"mmc" (case-insensitive). On success the
// device's Type is normalised to "emmc" and the map entry keyed by the serial
// number (or the /dev path when no serial is available) is created or
// updated while holding sm's lock.
//
// It returns true when health data was stored and false when the device is
// nil, unnamed, not eMMC, or has no readable health data. The error result is
// always nil.
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
	if deviceInfo == nil || deviceInfo.Name == "" {
		return false, nil
	}

	blockName := filepath.Base(deviceInfo.Name)
	looksLikeEmmc := isEmmcBlockName(blockName) ||
		strings.EqualFold(deviceInfo.Type, "emmc") ||
		strings.EqualFold(deviceInfo.Type, "mmc")
	if !looksLikeEmmc {
		return false, nil
	}

	health, ok := readEmmcHealth(blockName)
	if !ok {
		return false, nil
	}

	// Normalize the device type to keep pruning logic stable across refreshes.
	deviceInfo.Type = "emmc"

	devNode := filepath.Join("/dev", blockName)
	mapKey := health.serial
	if mapKey == "" {
		mapKey = devNode
	}

	attributes := []*smart.SmartAttribute{
		{
			Name:      "PreEOLInfo",
			RawValue:  uint64(health.preEOL),
			RawString: emmcPreEOLString(health.preEOL),
		},
		{
			Name:      "DeviceLifeTimeEstA",
			RawValue:  uint64(health.lifeA),
			RawString: emmcLifeTimeString(health.lifeA),
		},
		{
			Name:      "DeviceLifeTimeEstB",
			RawValue:  uint64(health.lifeB),
			RawString: emmcLifeTimeString(health.lifeB),
		},
	}
	smartStatus := emmcSmartStatus(health.preEOL)

	sm.Lock()
	defer sm.Unlock()

	if _, exists := sm.SmartDataMap[mapKey]; !exists {
		sm.SmartDataMap[mapKey] = &smart.SmartData{}
	}
	entry := sm.SmartDataMap[mapKey]
	entry.DiskName = devNode
	entry.DiskType = "emmc"
	entry.ModelName = health.model
	entry.SerialNumber = health.serial
	entry.FirmwareVersion = health.revision
	entry.Capacity = health.capacity
	// No temperature is read for eMMC devices, so it is reset to zero.
	entry.Temperature = 0
	entry.SmartStatus = smartStatus
	entry.Attributes = attributes
	return true, nil
}
// readEmmcHealth gathers health and identity data for the eMMC block device
// blockName from <emmcSysfsRoot>/class/block/<blockName>/device.
//
// The boolean is false when blockName is not a whole-device eMMC name or when
// neither the pre-EOL indicator nor any life time estimate could be read.
// Identity fields and capacity are best-effort and left at their zero values
// when unavailable.
func readEmmcHealth(blockName string) (emmcHealth, bool) {
	if !isEmmcBlockName(blockName) {
		return emmcHealth{}, false
	}

	dir := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "device")
	preEOL, havePreEOL := readHexByteFile(filepath.Join(dir, "pre_eol_info"))
	// Some kernels expose EXT_CSD lifetime via "life_time" (two bytes), others as
	// separate files. readLifeTime supports both.
	lifeA, lifeB, haveLife := readLifeTime(dir)
	if !(havePreEOL || haveLife) {
		return emmcHealth{}, false
	}

	health := emmcHealth{
		preEOL:   preEOL,
		lifeA:    lifeA,
		lifeB:    lifeB,
		model:    readStringFile(filepath.Join(dir, "name")),
		serial:   readStringFile(filepath.Join(dir, "serial")),
		revision: readStringFile(filepath.Join(dir, "prv")),
	}
	if size, sized := readBlockCapacityBytes(blockName); sized {
		health.capacity = size
	}
	return health, true
}
// readLifeTime returns the two device life time estimate bytes found in
// deviceDir.
//
// When a "life_time" file is readable its parsed pair is returned as-is, with
// no fallback even if parsing fails. Otherwise the separate
// "device_life_time_est_typ_a" and "device_life_time_est_typ_b" files are
// tried, and the result is ok if at least one of them yields a value (the
// other is reported as 0).
func readLifeTime(deviceDir string) (uint8, uint8, bool) {
	combined, haveCombined := readStringFileOK(filepath.Join(deviceDir, "life_time"))
	if haveCombined {
		return parseHexBytePair(combined)
	}

	typA, haveA := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_a"))
	typB, haveB := readHexByteFile(filepath.Join(deviceDir, "device_life_time_est_typ_b"))
	if !haveA && !haveB {
		return 0, 0, false
	}
	return typA, typB, true
}
func readBlockCapacityBytes(blockName string) (uint64, bool) {
sizePath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "size")
lbsPath := filepath.Join(emmcSysfsRoot, "class", "block", blockName, "queue", "logical_block_size")
sizeStr, ok := readStringFileOK(sizePath)
if !ok {
return 0, false
}
sectors, err := strconv.ParseUint(sizeStr, 10, 64)
if err != nil || sectors == 0 {
return 0, false
}
lbsStr, ok := readStringFileOK(lbsPath)
logicalBlockSize := uint64(512)
if ok {
if parsed, err := strconv.ParseUint(lbsStr, 10, 64); err == nil && parsed > 0 {
logicalBlockSize = parsed
}
}
return sectors * logicalBlockSize, true
}
// readHexByteFile reads the file at path and parses its trimmed content as a
// single byte via parseHexOrDecByte. The boolean is false when the file cannot
// be read or its content does not parse.
func readHexByteFile(path string) (uint8, bool) {
	if text, readable := readStringFileOK(path); readable {
		return parseHexOrDecByte(text)
	}
	return 0, false
}
// readStringFile returns the whitespace-trimmed content of the file at path,
// or the empty string when the file cannot be read.
func readStringFile(path string) string {
	if text, readable := readStringFileOK(path); readable {
		return text
	}
	return ""
}
// readStringFileOK returns the content of the file at path with leading and
// trailing whitespace removed. The boolean is false (and the string empty)
// when the file cannot be read.
func readStringFileOK(path string) (string, bool) {
	raw, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(raw)), true
	}
	return "", false
}
// hasEmmcHealthFiles reports whether deviceDir contains at least one of the
// eMMC health files: "pre_eol_info", "life_time",
// "device_life_time_est_typ_a", or "device_life_time_est_typ_b". It returns
// false when the directory cannot be read.
func hasEmmcHealthFiles(deviceDir string) bool {
	listing, err := os.ReadDir(deviceDir)
	if err != nil {
		return false
	}
	for _, entry := range listing {
		name := entry.Name()
		if name == "pre_eol_info" ||
			name == "life_time" ||
			name == "device_life_time_est_typ_a" ||
			name == "device_life_time_est_typ_b" {
			return true
		}
	}
	return false
}

80
agent/emmc_linux_test.go Normal file
View File

@@ -0,0 +1,80 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"testing"
"github.com/henrygd/beszel/internal/entities/smart"
)
// TestEmmcMockSysfsScanAndCollect builds a fake sysfs tree for one eMMC device
// in a temporary directory, then verifies that scanEmmcDevices discovers it
// and that collectEmmcHealth stores the expected SMART data.
func TestEmmcMockSysfsScanAndCollect(t *testing.T) {
	root := t.TempDir()
	original := emmcSysfsRoot
	emmcSysfsRoot = root
	t.Cleanup(func() { emmcSysfsRoot = original })

	// Fake: /sys/class/block/mmcblk0
	blockDir := filepath.Join(root, "class", "block", "mmcblk0")
	deviceDir := filepath.Join(blockDir, "device")
	queueDir := filepath.Join(blockDir, "queue")
	for _, dir := range []string{deviceDir, queueDir} {
		if err := os.MkdirAll(dir, 0o755); err != nil {
			t.Fatal(err)
		}
	}

	fixtures := []struct {
		path    string
		content string
	}{
		{filepath.Join(deviceDir, "pre_eol_info"), "0x02\n"},
		{filepath.Join(deviceDir, "life_time"), "0x04 0x05\n"},
		{filepath.Join(deviceDir, "name"), "H26M52103FMR\n"},
		{filepath.Join(deviceDir, "serial"), "01234567\n"},
		{filepath.Join(deviceDir, "prv"), "0x08\n"},
		{filepath.Join(queueDir, "logical_block_size"), "512\n"},
		{filepath.Join(blockDir, "size"), "1024\n"}, // sectors
	}
	for _, fixture := range fixtures {
		if err := os.WriteFile(fixture.path, []byte(fixture.content), 0o644); err != nil {
			t.Fatal(err)
		}
	}

	devices := scanEmmcDevices()
	if len(devices) != 1 {
		t.Fatalf("scanEmmcDevices() = %d devices, want 1", len(devices))
	}
	device := devices[0]
	if device.Name != "/dev/mmcblk0" || device.Type != "emmc" {
		t.Fatalf("scanEmmcDevices()[0] = %+v, want Name=/dev/mmcblk0 Type=emmc", device)
	}

	manager := &SmartManager{SmartDataMap: map[string]*smart.SmartData{}}
	collected, err := manager.collectEmmcHealth(device)
	if err != nil || !collected {
		t.Fatalf("collectEmmcHealth() = (ok=%v, err=%v), want (true,nil)", collected, err)
	}
	if len(manager.SmartDataMap) != 1 {
		t.Fatalf("SmartDataMap len=%d, want 1", len(manager.SmartDataMap))
	}

	// The map holds exactly one entry; grab it without assuming its key.
	var stored *smart.SmartData
	for _, entry := range manager.SmartDataMap {
		stored = entry
		break
	}
	if stored == nil {
		t.Fatalf("SmartDataMap value nil")
	}
	if stored.DiskType != "emmc" || stored.DiskName != "/dev/mmcblk0" {
		t.Fatalf("disk fields = (type=%q name=%q), want (emmc,/dev/mmcblk0)", stored.DiskType, stored.DiskName)
	}
	if stored.SmartStatus != "WARNING" {
		t.Fatalf("SmartStatus=%q, want WARNING", stored.SmartStatus)
	}
	if stored.SerialNumber != "01234567" || stored.ModelName == "" || stored.Capacity == 0 {
		t.Fatalf("identity fields = (model=%q serial=%q cap=%d), want non-empty model, serial 01234567, cap>0", stored.ModelName, stored.SerialNumber, stored.Capacity)
	}
	if len(stored.Attributes) < 3 {
		t.Fatalf("attributes len=%d, want >= 3", len(stored.Attributes))
	}
}

14
agent/emmc_stub.go Normal file
View File

@@ -0,0 +1,14 @@
//go:build !linux
package agent
// Non-Linux builds: eMMC health via sysfs is not available.
// scanEmmcDevices is the non-Linux stub: it never discovers any device and
// always returns a nil slice.
func scanEmmcDevices() []*DeviceInfo {
	var none []*DeviceInfo
	return none
}
// collectEmmcHealth is the non-Linux stub: it collects nothing and always
// returns (false, nil).
func (sm *SmartManager) collectEmmcHealth(deviceInfo *DeviceInfo) (bool, error) {
	const collected = false
	return collected, nil
}

View File

@@ -21,13 +21,10 @@ const (
// Commands
nvidiaSmiCmd string = "nvidia-smi"
rocmSmiCmd string = "rocm-smi"
amdgpuCmd string = "amdgpu" // internal cmd for sysfs collection
tegraStatsCmd string = "tegrastats"
nvtopCmd string = "nvtop"
noGPUFoundMsg string = "no GPU found - see https://beszel.dev/guide/gpu"
// Polling intervals
nvidiaSmiInterval string = "4" // in seconds
tegraStatsInterval string = "3700" // in milliseconds
rocmSmiInterval time.Duration = 4300 * time.Millisecond
// Command retry and timeout constants
retryWaitTime time.Duration = 5 * time.Second
maxFailureRetries int = 5
@@ -40,13 +37,7 @@ const (
// GPUManager manages data collection for GPUs (either Nvidia or AMD)
type GPUManager struct {
sync.Mutex
nvidiaSmi bool
rocmSmi bool
amdgpu bool
tegrastats bool
intelGpuStats bool
nvml bool
GpuDataMap map[string]*system.GPUData
GpuDataMap map[string]*system.GPUData
// lastAvgData stores the last calculated averages for each GPU
// Used when a collection happens before new data arrives (Count == 0)
lastAvgData map[string]system.GPUData
@@ -87,6 +78,51 @@ type gpuCollector struct {
var errNoValidData = fmt.Errorf("no valid GPU data found") // Error for missing data
// collectorSource identifies a selectable GPU collector in GPU_COLLECTOR.
// Its string value is the name matched against the comma-separated entries of
// that setting.
type collectorSource string
const (
// Collector names. Several reuse the command-name constants so the
// configuration name matches the executable that is looked up.
collectorSourceNVTop collectorSource = collectorSource(nvtopCmd)
collectorSourceNVML collectorSource = "nvml"
collectorSourceNvidiaSMI collectorSource = collectorSource(nvidiaSmiCmd)
collectorSourceIntelGpuTop collectorSource = collectorSource(intelGpuStatsCmd)
collectorSourceAmdSysfs collectorSource = "amd_sysfs"
collectorSourceRocmSMI collectorSource = collectorSource(rocmSmiCmd)
// Vendor groups. Collectors that share a group are alternatives for the same
// vendor's GPUs.
collectorGroupNvidia string = "nvidia"
collectorGroupIntel string = "intel"
collectorGroupAmd string = "amd"
)
// isValidCollectorSource reports whether source is one of the collector names
// this package knows how to start.
func isValidCollectorSource(source collectorSource) bool {
	known := [...]collectorSource{
		collectorSourceNVTop,
		collectorSourceNVML,
		collectorSourceNvidiaSMI,
		collectorSourceIntelGpuTop,
		collectorSourceAmdSysfs,
		collectorSourceRocmSMI,
	}
	for _, candidate := range known {
		if source == candidate {
			return true
		}
	}
	return false
}
// gpuCapabilities describes detected GPU tooling and sysfs support on the host.
// Each field only records that the tool or sysfs node was found; it does not
// say whether the matching collector will be used.
type gpuCapabilities struct {
hasNvidiaSmi bool
hasRocmSmi bool
hasAmdSysfs bool
hasTegrastats bool
hasIntelGpuTop bool
hasNvtop bool
}
// collectorDefinition describes how one collectorSource is selected and started.
type collectorDefinition struct {
// group is the vendor group the collector belongs to; it is left empty for
// collectors that are not tied to a single vendor.
group string
// available reports whether the host capability this collector needs was detected.
available bool
// start launches the collector and reports whether it was started. onFailure
// may be nil; collectors that do not support a failure callback ignore it.
start func(onFailure func()) bool
// deprecationWarning, when non-empty, is logged before the collector is started.
deprecationWarning string
}
// starts and manages the ongoing collection of GPU data for the specified GPU management utility
func (c *gpuCollector) start() {
for {
@@ -392,93 +428,257 @@ func (gm *GPUManager) storeSnapshot(id string, gpu *system.GPUData, cacheKey uin
gm.lastSnapshots[cacheKey][id] = snapshot
}
// detectGPUs checks for the presence of GPU management tools (nvidia-smi, rocm-smi, tegrastats)
// in the system path. It sets the corresponding flags in the GPUManager struct if any of these
// tools are found. If none of the tools are found, it returns an error indicating that no GPU
// management tools are available.
func (gm *GPUManager) detectGPUs() error {
// discoverGpuCapabilities checks for available GPU tooling and sysfs support.
// It only reports capability presence and does not apply policy decisions.
func (gm *GPUManager) discoverGpuCapabilities() gpuCapabilities {
caps := gpuCapabilities{
hasAmdSysfs: gm.hasAmdSysfs(),
}
if _, err := exec.LookPath(nvidiaSmiCmd); err == nil {
gm.nvidiaSmi = true
caps.hasNvidiaSmi = true
}
if _, err := exec.LookPath(rocmSmiCmd); err == nil {
if val, _ := GetEnv("AMD_SYSFS"); val == "true" {
gm.amdgpu = true
} else {
gm.rocmSmi = true
}
} else if gm.hasAmdSysfs() {
gm.amdgpu = true
caps.hasRocmSmi = true
}
if _, err := exec.LookPath(tegraStatsCmd); err == nil {
gm.tegrastats = true
gm.nvidiaSmi = false
caps.hasTegrastats = true
}
if _, err := exec.LookPath(intelGpuStatsCmd); err == nil {
gm.intelGpuStats = true
caps.hasIntelGpuTop = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.amdgpu || gm.tegrastats || gm.intelGpuStats || gm.nvml {
return nil
if _, err := exec.LookPath(nvtopCmd); err == nil {
caps.hasNvtop = true
}
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or intel_gpu_top")
return caps
}
// startCollector starts the appropriate GPU data collector based on the command
func (gm *GPUManager) startCollector(command string) {
collector := gpuCollector{
name: command,
bufSize: 10 * 1024,
}
switch command {
case intelGpuStatsCmd:
go func() {
failures := 0
for {
if err := gm.collectIntelStats(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
time.Sleep(retryWaitTime)
continue
// hasAnyGpuCollector reports whether at least one GPU tool or sysfs capability
// was detected in caps.
func hasAnyGpuCollector(caps gpuCapabilities) bool {
	detected := [...]bool{
		caps.hasNvidiaSmi,
		caps.hasRocmSmi,
		caps.hasAmdSysfs,
		caps.hasTegrastats,
		caps.hasIntelGpuTop,
		caps.hasNvtop,
	}
	for _, present := range detected {
		if present {
			return true
		}
	}
	return false
}
func (gm *GPUManager) startIntelCollector() {
go func() {
failures := 0
for {
if err := gm.collectIntelStats(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting Intel GPU data; see https://beszel.dev/guide/gpu", "err", err)
time.Sleep(retryWaitTime)
continue
}
}()
case nvidiaSmiCmd:
collector.cmdArgs = []string{
"-l", nvidiaSmiInterval,
}
}()
}
func (gm *GPUManager) startNvidiaSmiCollector(intervalSeconds string) {
collector := gpuCollector{
name: nvidiaSmiCmd,
bufSize: 10 * 1024,
cmdArgs: []string{
"-l", intervalSeconds,
"--query-gpu=index,name,temperature.gpu,memory.used,memory.total,utilization.gpu,power.draw",
"--format=csv,noheader,nounits",
}
collector.parse = gm.parseNvidiaData
go collector.start()
case tegraStatsCmd:
collector.cmdArgs = []string{"--interval", tegraStatsInterval}
collector.parse = gm.getJetsonParser()
go collector.start()
case amdgpuCmd:
go func() {
if err := gm.collectAmdStats(); err != nil {
slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
}
}()
case rocmSmiCmd:
collector.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
collector.parse = gm.parseAmdData
go func() {
failures := 0
for {
if err := collector.collect(); err != nil {
failures++
if failures > maxFailureRetries {
break
}
slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
}
time.Sleep(rocmSmiInterval)
}
}()
},
parse: gm.parseNvidiaData,
}
go collector.start()
}
// startTegraStatsCollector launches tegrastats in a background goroutine with
// the given "--interval" value and feeds its output to the Jetson parser.
func (gm *GPUManager) startTegraStatsCollector(intervalMilliseconds string) {
	tegra := gpuCollector{name: tegraStatsCmd, bufSize: 10 * 1024}
	tegra.cmdArgs = []string{"--interval", intervalMilliseconds}
	tegra.parse = gm.getJetsonParser()
	go tegra.start()
}
// startRocmSmiCollector polls rocm-smi in a background goroutine, running one
// collection and then sleeping for pollInterval. Failures are counted across
// the goroutine's lifetime (the counter is never reset); each failure is
// logged, and polling stops for good once the count exceeds maxFailureRetries.
func (gm *GPUManager) startRocmSmiCollector(pollInterval time.Duration) {
	rocm := gpuCollector{name: rocmSmiCmd, bufSize: 10 * 1024}
	rocm.cmdArgs = []string{"--showid", "--showtemp", "--showuse", "--showpower", "--showproductname", "--showmeminfo", "vram", "--json"}
	rocm.parse = gm.parseAmdData

	poll := func() {
		// The post statement sleeps after every iteration, successful or not.
		for failures := 0; ; time.Sleep(pollInterval) {
			err := rocm.collect()
			if err == nil {
				continue
			}
			failures++
			if failures > maxFailureRetries {
				return
			}
			slog.Warn("Error collecting AMD GPU data via rocm-smi", "err", err)
		}
	}
	go poll()
}
// collectorDefinitions builds the table of known collectors for the detected
// capabilities. Each entry records the vendor group, whether the collector is
// available on this host, and a closure that starts it. Only the nvtop entry
// has no group and makes use of the failure callback passed to start.
func (gm *GPUManager) collectorDefinitions(caps gpuCapabilities) map[collectorSource]collectorDefinition {
	defs := make(map[collectorSource]collectorDefinition, 6)

	// NVIDIA: both collectors are gated on nvidia-smi being present.
	defs[collectorSourceNVML] = collectorDefinition{
		group:     collectorGroupNvidia,
		available: caps.hasNvidiaSmi,
		start:     func(func()) bool { return gm.startNvmlCollector() },
	}
	defs[collectorSourceNvidiaSMI] = collectorDefinition{
		group:     collectorGroupNvidia,
		available: caps.hasNvidiaSmi,
		start: func(func()) bool {
			gm.startNvidiaSmiCollector("4") // seconds
			return true
		},
	}

	// Intel
	defs[collectorSourceIntelGpuTop] = collectorDefinition{
		group:     collectorGroupIntel,
		available: caps.hasIntelGpuTop,
		start: func(func()) bool {
			gm.startIntelCollector()
			return true
		},
	}

	// AMD
	defs[collectorSourceAmdSysfs] = collectorDefinition{
		group:     collectorGroupAmd,
		available: caps.hasAmdSysfs,
		start:     func(func()) bool { return gm.startAmdSysfsCollector() },
	}
	defs[collectorSourceRocmSMI] = collectorDefinition{
		group:              collectorGroupAmd,
		available:          caps.hasRocmSmi,
		deprecationWarning: "rocm-smi is deprecated and may be removed in a future release",
		start: func(func()) bool {
			gm.startRocmSmiCollector(4300 * time.Millisecond)
			return true
		},
	}

	// nvtop is vendor-neutral, so it carries no group.
	defs[collectorSourceNVTop] = collectorDefinition{
		available: caps.hasNvtop,
		start: func(onFailure func()) bool {
			gm.startNvtopCollector("30", onFailure) // tens of milliseconds
			return true
		},
	}

	return defs
}
// parseCollectorPriority splits the comma-separated GPU_COLLECTOR value and
// returns the recognised collector names in the order given. Entries are
// lowercased and trimmed before matching; empty entries are dropped silently
// and unknown names are dropped with a warning. Duplicates are kept.
func parseCollectorPriority(value string) []collectorSource {
	fields := strings.Split(value, ",")
	ordered := make([]collectorSource, 0, len(fields))
	for _, field := range fields {
		candidate := collectorSource(strings.TrimSpace(strings.ToLower(field)))
		switch {
		case isValidCollectorSource(candidate):
			ordered = append(ordered, candidate)
		case candidate != "":
			slog.Warn("Ignoring unknown GPU collector", "collector", candidate)
		}
	}
	return ordered
}
// startNvmlCollector initialises an NVML collector and, on success, runs its
// polling loop in a background goroutine. It logs a warning and returns false
// when initialisation fails.
func (gm *GPUManager) startNvmlCollector() bool {
	nvml := &nvmlCollector{gm: gm}
	err := nvml.init()
	if err == nil {
		go nvml.start()
		return true
	}
	slog.Warn("Failed to initialize NVML", "err", err)
	return false
}
// startAmdSysfsCollector runs AMD sysfs collection in a background goroutine
// and logs a warning if that collection returns an error. It always returns
// true, because the outcome is not known at the time it returns.
func (gm *GPUManager) startAmdSysfsCollector() bool {
	collect := func() {
		err := gm.collectAmdStats()
		if err == nil {
			return
		}
		slog.Warn("Error collecting AMD GPU data via sysfs", "err", err)
	}
	go collect()
	return true
}
// startCollectorsByPriority starts collectors in order with one source per vendor group.
//
// priorities is walked front to back. Sources that are unknown or whose
// capability was not detected are skipped. For vendor collectors, the first
// source of each group that starts successfully claims that group and later
// sources of the same group are ignored. nvtop is handled separately (see
// below). The return value is the number of collectors started by this call;
// collectors started later by the nvtop failure callback are not counted.
func (gm *GPUManager) startCollectorsByPriority(priorities []collectorSource, caps gpuCapabilities) int {
definitions := gm.collectorDefinitions(caps)
// selectedGroups records the vendor groups that already have a running collector.
selectedGroups := make(map[string]bool, 3)
started := 0
for i, source := range priorities {
definition, ok := definitions[source]
if !ok || !definition.available {
continue
}
// nvtop is not a vendor-specific collector, so should only be used if no other collectors are selected or it is first in GPU_COLLECTOR.
if source == collectorSourceNVTop {
if len(selectedGroups) > 0 {
slog.Warn("Skipping nvtop because other collectors are selected")
continue
}
// if nvtop fails, fall back to remaining collectors.
// The remaining sources are copied so the callback does not alias priorities.
// NOTE(review): when (and on which goroutine) the callback runs is decided by
// the nvtop collector, which is not visible here.
remaining := append([]collectorSource(nil), priorities[i+1:]...)
if definition.start(func() {
gm.startCollectorsByPriority(remaining, caps)
}) {
// nvtop started: stop here so no vendor collector runs alongside it.
started++
return started
}
// nvtop did not start: fall through. Its definition has no group, so the
// empty-group check below skips it and the loop moves on.
}
group := definition.group
if group == "" || selectedGroups[group] {
continue
}
if definition.deprecationWarning != "" {
slog.Warn(definition.deprecationWarning)
}
// Vendor collectors get no failure callback.
if definition.start(nil) {
selectedGroups[group] = true
started++
}
}
return started
}
// resolveLegacyCollectorPriority returns the default collector order used when
// GPU_COLLECTOR is unset, based on detected capabilities and the legacy NVML
// and AMD_SYSFS environment settings:
//
//   - NVIDIA (only when nvidia-smi is present and tegrastats is not): NVML
//     first if NVML is "true", then nvidia-smi.
//   - AMD: with rocm-smi present, sysfs if AMD_SYSFS is "true", otherwise
//     rocm-smi; without rocm-smi, sysfs when available.
//   - Intel: intel_gpu_top when present.
//   - nvtop: only when nothing else was selected.
func (gm *GPUManager) resolveLegacyCollectorPriority(caps gpuCapabilities) []collectorSource {
	order := make([]collectorSource, 0, 4)

	if caps.hasNvidiaSmi && !caps.hasTegrastats {
		if useNvml, _ := GetEnv("NVML"); useNvml == "true" {
			order = append(order, collectorSourceNVML)
		}
		order = append(order, collectorSourceNvidiaSMI)
	}

	switch {
	case caps.hasRocmSmi:
		amdSource := collectorSourceRocmSMI
		if preferSysfs, _ := GetEnv("AMD_SYSFS"); preferSysfs == "true" {
			amdSource = collectorSourceAmdSysfs
		}
		order = append(order, amdSource)
	case caps.hasAmdSysfs:
		order = append(order, collectorSourceAmdSysfs)
	}

	if caps.hasIntelGpuTop {
		order = append(order, collectorSourceIntelGpuTop)
	}

	// Keep nvtop as a legacy last resort only when no vendor collector exists.
	if caps.hasNvtop && len(order) == 0 {
		order = append(order, collectorSourceNVTop)
	}
	return order
}
// NewGPUManager creates and initializes a new GPUManager
@@ -487,38 +687,30 @@ func NewGPUManager() (*GPUManager, error) {
return nil, nil
}
var gm GPUManager
if err := gm.detectGPUs(); err != nil {
return nil, err
caps := gm.discoverGpuCapabilities()
if !hasAnyGpuCollector(caps) {
return nil, fmt.Errorf(noGPUFoundMsg)
}
gm.GpuDataMap = make(map[string]*system.GPUData)
if gm.nvidiaSmi {
if nvml, _ := GetEnv("NVML"); nvml == "true" {
gm.nvml = true
gm.nvidiaSmi = false
collector := &nvmlCollector{gm: &gm}
if err := collector.init(); err == nil {
go collector.start()
} else {
slog.Warn("Failed to initialize NVML, falling back to nvidia-smi", "err", err)
gm.nvidiaSmi = true
gm.startCollector(nvidiaSmiCmd)
}
} else {
gm.startCollector(nvidiaSmiCmd)
// Jetson devices should always use tegrastats (ignore GPU_COLLECTOR).
if caps.hasTegrastats {
gm.startTegraStatsCollector("3700")
return &gm, nil
}
// if GPU_COLLECTOR is set, start user-defined collectors.
if collectorConfig, ok := GetEnv("GPU_COLLECTOR"); ok && strings.TrimSpace(collectorConfig) != "" {
priorities := parseCollectorPriority(collectorConfig)
if gm.startCollectorsByPriority(priorities, caps) == 0 {
return nil, fmt.Errorf("no configured GPU collectors are available")
}
return &gm, nil
}
if gm.rocmSmi {
gm.startCollector(rocmSmiCmd)
}
if gm.amdgpu {
gm.startCollector(amdgpuCmd)
}
if gm.tegrastats {
gm.startCollector(tegraStatsCmd)
}
if gm.intelGpuStats {
gm.startCollector(intelGpuStatsCmd)
// auto-detect and start collectors when GPU_COLLECTOR is unset.
if gm.startCollectorsByPriority(gm.resolveLegacyCollectorPriority(caps), caps) == 0 {
return nil, fmt.Errorf(noGPUFoundMsg)
}
return &gm, nil

View File

@@ -3,6 +3,7 @@
package agent
import (
"bufio"
"fmt"
"log/slog"
"os"
@@ -15,6 +16,15 @@ import (
"github.com/henrygd/beszel/internal/entities/system"
)
// amdgpuNameCache caches the outcome of AMD GPU product-name lookups so a
// lookup does not have to be repeated for the same device. The embedded
// RWMutex guards both maps.
// NOTE(review): the code that populates the maps is not visible in this part
// of the file — confirm the key format used by writers.
var amdgpuNameCache = struct {
sync.RWMutex
// hits maps a cache key to the name that was resolved for it.
hits map[string]string
// misses is the set of cache keys recorded as having no name.
misses map[string]struct{}
}{
hits: make(map[string]string),
misses: make(map[string]struct{}),
}
// hasAmdSysfs returns true if any AMD GPU sysfs nodes are found
func (gm *GPUManager) hasAmdSysfs() bool {
cards, err := filepath.Glob("/sys/class/drm/card*/device/vendor")
@@ -32,6 +42,7 @@ func (gm *GPUManager) hasAmdSysfs() bool {
// collectAmdStats collects AMD GPU metrics directly from sysfs to avoid the overhead of rocm-smi
func (gm *GPUManager) collectAmdStats() error {
sysfsPollInterval := 3000 * time.Millisecond
cards, err := filepath.Glob("/sys/class/drm/card*")
if err != nil {
return err
@@ -70,10 +81,11 @@ func (gm *GPUManager) collectAmdStats() error {
continue
}
failures = 0
time.Sleep(rocmSmiInterval)
time.Sleep(sysfsPollInterval)
}
}
// isAmdGpu checks whether a DRM card path belongs to AMD vendor ID 0x1002.
func isAmdGpu(cardPath string) bool {
vendorPath := filepath.Join(cardPath, "device/vendor")
vendor, err := os.ReadFile(vendorPath)
@@ -91,8 +103,17 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
// Read all sysfs values first (no lock needed - these can be slow)
usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
vramUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
vramTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
memUsed := vramUsed
memTotal := vramTotal
// if gtt is present, add it to the memory used and total (https://github.com/henrygd/beszel/issues/1569#issuecomment-3837640484)
if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 {
if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil {
memUsed += gttUsed
memTotal += gttTotal
}
}
var temp, power float64
hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))
@@ -133,6 +154,7 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
return true
}
// readSysfsFloat reads and parses a numeric value from a sysfs file.
func readSysfsFloat(path string) (float64, error) {
val, err := os.ReadFile(path)
if err != nil {
@@ -141,6 +163,110 @@ func readSysfsFloat(path string) (float64, error) {
return strconv.ParseFloat(strings.TrimSpace(string(val)), 64)
}
// normalizeHexID canonicalises a hex identifier: surrounding whitespace is
// removed, letters are lowercased, and a single leading "0x" is stripped.
func normalizeHexID(id string) string {
	cleaned := strings.ToLower(strings.TrimSpace(id))
	if strings.HasPrefix(cleaned, "0x") {
		return cleaned[2:]
	}
	return cleaned
}
// cacheKeyForAmdgpu returns the name-cache key for a device: "device:revision"
// when a revision is given, or the device ID alone when revisionID is empty.
func cacheKeyForAmdgpu(deviceID, revisionID string) string {
	if revisionID == "" {
		return deviceID
	}
	return deviceID + ":" + revisionID
}
// lookupAmdgpuNameInFile scans an amdgpu.ids-style file for a product name.
//
// Each usable line has the form "device_id, revision_id, product_name"; blank
// lines, lines starting with "#", and lines without three comma-separated
// fields are ignored. deviceID and revisionID are compared against the
// normalized (see normalizeHexID) IDs from the file, so callers are expected to
// pass them already normalized.
//
// Results:
//   - exact=true, found=true: a line matched both device and revision.
//   - exact=false, found=true: no revision match (or revisionID is empty); name
//     is the first product listed for the device.
//   - found=false: the file could not be opened or no line matched the device.
//
// NOTE(review): a scanner read error is not distinguished from "not found".
func lookupAmdgpuNameInFile(deviceID, revisionID, filePath string) (name string, exact bool, found bool) {
	f, err := os.Open(filePath)
	if err != nil {
		return "", false, false
	}
	defer f.Close()

	// First product seen for this device; used when no revision matches.
	fallback := ""
	for sc := bufio.NewScanner(f); sc.Scan(); {
		entry := strings.TrimSpace(sc.Text())
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		fields := strings.SplitN(entry, ",", 3)
		if len(fields) != 3 {
			continue
		}
		entryDevice := normalizeHexID(fields[0])
		product := strings.TrimSpace(fields[2])
		if entryDevice == "" || product == "" || entryDevice != deviceID {
			continue
		}
		if fallback == "" {
			fallback = product
		}
		if revisionID != "" && normalizeHexID(fields[1]) == revisionID {
			return product, true, true
		}
	}
	if fallback == "" {
		return "", false, false
	}
	return fallback, false, true
}
// getCachedAmdgpuName consults the in-memory name cache for a device/revision.
//
// It checks the exact device+revision key first and, when a revision is given,
// the device-only key as well (the cache may hold a device-only fallback).
//
// Returns:
//   - found=true, done=true with the cached name on any hit.
//   - found=false, done=true when every checked key is a recorded miss, meaning
//     the slow file lookup was already tried and can be skipped.
//   - found=false, done=false when the caller should still attempt the lookup.
func getCachedAmdgpuName(deviceID, revisionID string) (name string, found bool, done bool) {
	candidates := make([]string, 0, 2)
	candidates = append(candidates, cacheKeyForAmdgpu(deviceID, revisionID))
	if revisionID != "" {
		candidates = append(candidates, deviceID)
	}

	amdgpuNameCache.RLock()
	defer amdgpuNameCache.RUnlock()

	missCount := 0
	for _, key := range candidates {
		if cached, hit := amdgpuNameCache.hits[key]; hit {
			return cached, true, true
		}
		if _, missed := amdgpuNameCache.misses[key]; missed {
			missCount++
		}
	}
	return "", false, missCount == len(candidates)
}
// normalizeAmdgpuName cleans up a product name for display: surrounding
// whitespace is removed and one trailing " Graphics" suffix is dropped.
func normalizeAmdgpuName(name string) string {
	const suffix = " Graphics"
	trimmed := strings.TrimSpace(name)
	return strings.TrimSuffix(trimmed, suffix)
}
// cacheAmdgpuName records a resolved product name in the name cache.
//
// The name is normalized (see normalizeAmdgpuName) before storing. The
// device-only key is always written; the device+revision key is written only
// for an exact match with a non-empty revision.
//
// NOTE(review): because the device-only entry is always overwritten, a later
// cached lookup for a different revision of the same device that has no exact
// entry resolves to this name via the device-only key.
func cacheAmdgpuName(deviceID, revisionID, name string, exact bool) {
	cleaned := normalizeAmdgpuName(name)
	storeExact := exact && revisionID != ""

	amdgpuNameCache.Lock()
	defer amdgpuNameCache.Unlock()

	amdgpuNameCache.hits[deviceID] = cleaned
	if storeExact {
		amdgpuNameCache.hits[cacheKeyForAmdgpu(deviceID, revisionID)] = cleaned
	}
}
// cacheMissingAmdgpuName records unresolved device/revision lookups.
func cacheMissingAmdgpuName(deviceID, revisionID string) {
amdgpuNameCache.Lock()
defer amdgpuNameCache.Unlock()
amdgpuNameCache.misses[deviceID] = struct{}{}
if revisionID != "" {
amdgpuNameCache.misses[cacheKeyForAmdgpu(deviceID, revisionID)] = struct{}{}
}
}
// getAmdGpuName attempts to get a descriptive GPU name.
// First tries product_name (rarely available), then looks up the PCI device ID.
// Falls back to showing the raw device ID if not found in the lookup table.
@@ -152,33 +278,24 @@ func getAmdGpuName(devicePath string) string {
// Read PCI device ID and look it up
if deviceID, err := os.ReadFile(filepath.Join(devicePath, "device")); err == nil {
id := strings.TrimPrefix(strings.ToLower(strings.TrimSpace(string(deviceID))), "0x")
if name, ok := getRadeonNames()[id]; ok {
return fmt.Sprintf("Radeon %s", name)
id := normalizeHexID(string(deviceID))
revision := ""
if revBytes, revErr := os.ReadFile(filepath.Join(devicePath, "revision")); revErr == nil {
revision = normalizeHexID(string(revBytes))
}
if name, found, done := getCachedAmdgpuName(id, revision); found {
return name
} else if !done {
if name, exact, ok := lookupAmdgpuNameInFile(id, revision, "/usr/share/libdrm/amdgpu.ids"); ok {
cacheAmdgpuName(id, revision, name, exact)
return normalizeAmdgpuName(name)
}
cacheMissingAmdgpuName(id, revision)
}
return fmt.Sprintf("AMD GPU (%s)", id)
}
return "AMD GPU"
}
// getRadeonNames returns the static lookup table that maps lowercase PCI
// device IDs (without "0x") to Radeon model names. The table is built once on
// first use and the same map is returned on every call.
// Device IDs from https://pci-ids.ucw.cz/read/PC/1002
var getRadeonNames = sync.OnceValue(func() map[string]string {
	entries := [...]struct{ id, model string }{
		{"7550", "RX 9070"},
		{"7590", "RX 9060 XT"},
		{"7551", "AI PRO R9700"},
		{"744c", "RX 7900"},
		{"1681", "680M"},
		{"7448", "PRO W7900"},
		{"745e", "PRO W7800"},
		{"7470", "PRO W7700"},
		{"73e3", "PRO W6600"},
		{"7422", "PRO W6400"},
		{"7341", "PRO W5500"},
	}
	table := make(map[string]string, len(entries))
	for _, e := range entries {
		table[e.id] = e.model
	}
	return table
})

264
agent/gpu_amd_linux_test.go Normal file
View File

@@ -0,0 +1,264 @@
//go:build linux
package agent
import (
"os"
"path/filepath"
"testing"
"github.com/henrygd/beszel/internal/entities/system"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestNormalizeHexID(t *testing.T) {
tests := []struct {
in string
want string
}{
{"0x1002", "1002"},
{"C2", "c2"},
{" 15BF ", "15bf"},
{"0x15bf", "15bf"},
{"", ""},
}
for _, tt := range tests {
subName := tt.in
if subName == "" {
subName = "empty_string"
}
t.Run(subName, func(t *testing.T) {
got := normalizeHexID(tt.in)
assert.Equal(t, tt.want, got)
})
}
}
func TestCacheKeyForAmdgpu(t *testing.T) {
tests := []struct {
deviceID string
revisionID string
want string
}{
{"1114", "c2", "1114:c2"},
{"15bf", "", "15bf"},
{"1506", "c1", "1506:c1"},
}
for _, tt := range tests {
got := cacheKeyForAmdgpu(tt.deviceID, tt.revisionID)
assert.Equal(t, tt.want, got)
}
}
// TestReadSysfsFloat covers a padded decimal, scientific notation, a missing
// file, and unparsable content.
func TestReadSysfsFloat(t *testing.T) {
	dir := t.TempDir()
	// writeFixture creates a file in dir and returns its full path.
	writeFixture := func(name, content string) string {
		p := filepath.Join(dir, name)
		require.NoError(t, os.WriteFile(p, []byte(content), 0o644))
		return p
	}

	// Surrounding whitespace and a trailing newline are tolerated.
	val, err := readSysfsFloat(writeFixture("val", " 42.5 \n"))
	require.NoError(t, err)
	assert.Equal(t, 42.5, val)

	// Integer and scientific
	val, err = readSysfsFloat(writeFixture("sci", "1e2"))
	require.NoError(t, err)
	assert.Equal(t, 100.0, val)

	// Missing file
	_, err = readSysfsFloat(filepath.Join(dir, "missing"))
	require.Error(t, err)

	// Invalid content
	_, err = readSysfsFloat(writeFixture("bad", "not a number"))
	require.Error(t, err)
}
// TestIsAmdGpu exercises isAmdGpu against a fake card directory whose
// device/vendor file holds an AMD ID, a non-AMD ID, and finally is absent.
func TestIsAmdGpu(t *testing.T) {
	cardDir := t.TempDir()
	deviceDir := filepath.Join(cardDir, "device")
	require.NoError(t, os.MkdirAll(deviceDir, 0o755))
	vendorFile := filepath.Join(deviceDir, "vendor")

	// AMD vendor 0x1002 -> true
	require.NoError(t, os.WriteFile(vendorFile, []byte("0x1002\n"), 0o644))
	assert.True(t, isAmdGpu(cardDir), "vendor 0x1002 should be AMD")

	// Non-AMD vendor -> false
	require.NoError(t, os.WriteFile(vendorFile, []byte("0x10de\n"), 0o644))
	assert.False(t, isAmdGpu(cardDir), "vendor 0x10de should not be AMD")

	// Missing vendor file -> false
	require.NoError(t, os.Remove(vendorFile))
	assert.False(t, isAmdGpu(cardDir), "missing vendor file should be false")
}
// TestAmdgpuNameCacheRoundTrip stores a hit and a miss in the shared name cache
// and reads both back. Keys are unique to this test so other tests that touch
// the cache are not affected.
func TestAmdgpuNameCacheRoundTrip(t *testing.T) {
	const (
		hitDevice   = "cachedev99"
		hitRevision = "00"
		missDevice  = "missedev99"
	)

	// An exact hit is stored normalized (" Graphics" suffix removed).
	cacheAmdgpuName(hitDevice, hitRevision, "AMD Test GPU 99 Graphics", true)
	exactName, exactFound, exactDone := getCachedAmdgpuName(hitDevice, hitRevision)
	assert.True(t, exactFound)
	assert.True(t, exactDone)
	assert.Equal(t, "AMD Test GPU 99", exactName)

	// Device-only key also stored
	deviceName, deviceFound, _ := getCachedAmdgpuName(hitDevice, "")
	assert.True(t, deviceFound)
	assert.Equal(t, "AMD Test GPU 99", deviceName)

	// Cache a miss
	cacheMissingAmdgpuName(missDevice, "ab")
	_, missFound, missDone := getCachedAmdgpuName(missDevice, "ab")
	assert.False(t, missFound)
	assert.True(t, missDone, "done should be true so caller skips file lookup")
}
// TestUpdateAmdGpuDataWithFakeSysfs builds a fake card0 sysfs tree in a temp
// directory and checks the values updateAmdGpuData stores, both with and
// without the GTT memory files present.
func TestUpdateAmdGpuDataWithFakeSysfs(t *testing.T) {
	scenarios := []struct {
		name            string
		writeGTT        bool
		wantMemoryUsed  float64
		wantMemoryTotal float64
	}{
		{"sums vram and gtt when gtt is present", true, bytesToMegabytes(1073741824 + 536870912), bytesToMegabytes(2147483648 + 4294967296)},
		{"falls back to vram when gtt is missing", false, bytesToMegabytes(1073741824), bytesToMegabytes(2147483648)},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			cardDir := filepath.Join(t.TempDir(), "card0")
			deviceDir := filepath.Join(cardDir, "device")
			hwmonDir := filepath.Join(deviceDir, "hwmon", "hwmon0")
			require.NoError(t, os.MkdirAll(hwmonDir, 0o755))

			// put writes one fake sysfs attribute file.
			put := func(dir, file, content string) {
				require.NoError(t, os.WriteFile(filepath.Join(dir, file), []byte(content), 0o644))
			}

			deviceFiles := [][2]string{
				{"vendor", "0x1002"},
				{"device", "0x1506"},
				{"revision", "0xc1"},
				{"gpu_busy_percent", "25"},
				{"mem_info_vram_used", "1073741824"},
				{"mem_info_vram_total", "2147483648"},
			}
			if sc.writeGTT {
				deviceFiles = append(deviceFiles,
					[2]string{"mem_info_gtt_used", "536870912"},
					[2]string{"mem_info_gtt_total", "4294967296"},
				)
			}
			for _, kv := range deviceFiles {
				put(deviceDir, kv[0], kv[1])
			}
			put(hwmonDir, "temp1_input", "45000")
			put(hwmonDir, "power1_input", "20000000")

			// Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path)
			cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true)

			manager := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)}
			require.True(t, manager.updateAmdGpuData(cardDir))

			gpu, present := manager.GpuDataMap["card0"]
			require.True(t, present)
			assert.Equal(t, "AMD Radeon 610M", gpu.Name)
			assert.Equal(t, 25.0, gpu.Usage)
			assert.Equal(t, sc.wantMemoryUsed, gpu.MemoryUsed)
			assert.Equal(t, sc.wantMemoryTotal, gpu.MemoryTotal)
			assert.Equal(t, 45.0, gpu.Temperature)
			assert.Equal(t, 20.0, gpu.Power)
			assert.Equal(t, 1.0, gpu.Count)
		})
	}
}
// TestLookupAmdgpuNameInFile runs lookupAmdgpuNameInFile against the
// test-data/amdgpu.ids fixture for exact, device-only, and unknown lookups.
func TestLookupAmdgpuNameInFile(t *testing.T) {
	fixture := filepath.Join("test-data", "amdgpu.ids")

	cases := []struct {
		name       string
		deviceID   string
		revisionID string
		wantName   string
		wantExact  bool
		wantFound  bool
	}{
		{"exact device and revision match", "1114", "c2", "AMD Radeon 860M Graphics", true, true},
		{"exact match 15BF revision 01 returns 760M", "15bf", "01", "AMD Radeon 760M Graphics", true, true},
		{"exact match 15BF revision 00 returns 780M", "15bf", "00", "AMD Radeon 780M Graphics", true, true},
		{"device-only match returns first entry for device", "1506", "", "AMD Radeon 610M", false, true},
		{"unknown device not found", "dead", "00", "", false, false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			name, exact, found := lookupAmdgpuNameInFile(tc.deviceID, tc.revisionID, fixture)
			assert.Equal(t, tc.wantName, name, "name")
			assert.Equal(t, tc.wantExact, exact, "exact")
			assert.Equal(t, tc.wantFound, found, "found")
		})
	}
}
// TestGetAmdGpuNameFromIdsFile checks the two building blocks getAmdGpuName
// relies on rather than getAmdGpuName itself: the ids path used there is the
// fixed /usr/share/libdrm/amdgpu.ids and cannot be injected, so this test
// calls lookupAmdgpuNameInFile on the test-data fixture and then verifies
// normalizeAmdgpuName on the result and on a " Graphics"-suffixed name.
func TestGetAmdGpuNameFromIdsFile(t *testing.T) {
	fixture := filepath.Join("test-data", "amdgpu.ids")

	product, isExact, ok := lookupAmdgpuNameInFile("1435", "ae", fixture)
	require.True(t, ok)
	require.True(t, isExact)
	assert.Equal(t, "AMD Custom GPU 0932", product)
	// A name without the suffix passes through normalization unchanged.
	assert.Equal(t, "AMD Custom GPU 0932", normalizeAmdgpuName(product))

	// " Graphics" suffix is trimmed by normalizeAmdgpuName
	const suffixed = "AMD Radeon 860M Graphics"
	assert.Equal(t, "AMD Radeon 860M", normalizeAmdgpuName(suffixed))
}

View File

@@ -13,21 +13,3 @@ func (c *nvmlCollector) init() error {
}
// start is a no-op in this stub implementation.
func (c *nvmlCollector) start() {}
// collect is a no-op in this stub implementation.
func (c *nvmlCollector) collect() {}
// openLibrary is a stub: it ignores name and always returns a zero handle
// together with an error stating that nvml is not supported on this platform.
func openLibrary(name string) (uintptr, error) {
return 0, fmt.Errorf("nvml not supported on this platform")
}
// getNVMLPath is a stub that always returns an empty path.
func getNVMLPath() string {
return ""
}
// hasSymbol is a stub: it ignores both arguments and always reports false.
func hasSymbol(lib uintptr, symbol string) bool {
return false
}
// isGPUActive is a stub: it ignores bdf and always reports the GPU as active.
func (c *nvmlCollector) isGPUActive(bdf string) bool {
return true
}

159
agent/gpu_nvtop.go Normal file
View File

@@ -0,0 +1,159 @@
package agent
import (
"encoding/json"
"io"
"log/slog"
"os/exec"
"strconv"
"strings"
"time"
"github.com/henrygd/beszel/internal/entities/system"
)
// nvtopSnapshot is one per-device entry of an nvtop JSON snapshot array.
// The metric fields are decoded as optional strings: a nil pointer means the
// key was absent or null, and consumers skip the corresponding update.
type nvtopSnapshot struct {
// DeviceName is the GPU name; entries with an empty name are skipped by updateNvtopSnapshots.
DeviceName string `json:"device_name"`
// Temp is the temperature string (e.g. "52C").
Temp *string `json:"temp"`
// PowerDraw is the power draw string (e.g. "31W").
PowerDraw *string `json:"power_draw"`
// GpuUtil is the utilization string (e.g. "37%").
GpuUtil *string `json:"gpu_util"`
// MemTotal is the total memory as a numeric string; consumers pass it through bytesToMegabytes.
MemTotal *string `json:"mem_total"`
// MemUsed is the used memory as a numeric string; consumers pass it through bytesToMegabytes.
MemUsed *string `json:"mem_used"`
}
// parseNvtopNumber converts an nvtop value such as "52C", "31W", "37%" or a
// plain number into a float64. Surrounding whitespace is trimmed, then at most
// one trailing "C", one "W" and one "%" are removed, in that order. Input that
// still fails to parse yields whatever strconv.ParseFloat returns alongside its
// error (0 for malformed text).
func parseNvtopNumber(raw string) float64 {
	text := strings.TrimSpace(raw)
	for _, unit := range [...]string{"C", "W", "%"} {
		text = strings.TrimSuffix(text, unit)
	}
	// The parse error is intentionally dropped; callers treat bad input as the returned value.
	parsed, _ := strconv.ParseFloat(text, 64)
	return parsed
}
// parseNvtopData decodes a complete nvtop JSON payload (an array of device
// snapshots) and applies it via updateNvtopSnapshots. It returns false when the
// payload is not valid JSON or holds no snapshots; otherwise it returns the
// result of updateNvtopSnapshots.
func (gm *GPUManager) parseNvtopData(output []byte) bool {
	var batch []nvtopSnapshot
	err := json.Unmarshal(output, &batch)
	if err != nil {
		return false
	}
	if len(batch) == 0 {
		return false
	}
	return gm.updateNvtopSnapshots(batch)
}
// updateNvtopSnapshots applies one decoded nvtop snapshot batch to GPU accumulators.
//
// Each named snapshot is assigned a slot in gm.GpuDataMap keyed "n<index>",
// where index is the snapshot's position in the batch. Temperature and memory
// values overwrite the stored ones, while usage and power are added to the
// stored values and Count is incremented. Snapshots with an empty device name
// are skipped. It returns true when at least one snapshot was applied.
// The manager lock is held for the whole update.
func (gm *GPUManager) updateNvtopSnapshots(snapshots []nvtopSnapshot) bool {
gm.Lock()
defer gm.Unlock()
valid := false
// usedIDs tracks slots already claimed in this batch so a slot found by name is not reused.
usedIDs := make(map[string]struct{}, len(snapshots))
for i, sample := range snapshots {
if sample.DeviceName == "" {
continue
}
indexID := "n" + strconv.Itoa(i)
id := indexID
// nvtop ordering can change, so prefer reusing an existing slot with matching device name.
// The search only runs when the positional slot exists under a different, non-empty name.
if existingByIndex, ok := gm.GpuDataMap[indexID]; ok && existingByIndex.Name != "" && existingByIndex.Name != sample.DeviceName {
for existingID, gpu := range gm.GpuDataMap {
// Only consider slots whose key starts with "n" (the prefix used for nvtop slots here).
if !strings.HasPrefix(existingID, "n") {
continue
}
if _, taken := usedIDs[existingID]; taken {
continue
}
if gpu.Name == sample.DeviceName {
id = existingID
break
}
}
// If no slot matched by name, id stays indexID and that slot is renamed below.
}
if _, ok := gm.GpuDataMap[id]; !ok {
gm.GpuDataMap[id] = &system.GPUData{Name: sample.DeviceName}
}
gpu := gm.GpuDataMap[id]
gpu.Name = sample.DeviceName
// Point-in-time values: replace the stored value.
if sample.Temp != nil {
gpu.Temperature = parseNvtopNumber(*sample.Temp)
}
if sample.MemUsed != nil {
gpu.MemoryUsed = bytesToMegabytes(parseNvtopNumber(*sample.MemUsed))
}
if sample.MemTotal != nil {
gpu.MemoryTotal = bytesToMegabytes(parseNvtopNumber(*sample.MemTotal))
}
// Accumulated values: summed across samples, with Count tracking how many were added.
if sample.GpuUtil != nil {
gpu.Usage += parseNvtopNumber(*sample.GpuUtil)
}
if sample.PowerDraw != nil {
gpu.Power += parseNvtopNumber(*sample.PowerDraw)
}
gpu.Count++
usedIDs[id] = struct{}{}
valid = true
}
return valid
}
// collectNvtopStats launches nvtop with "-lP -d <interval>" and decodes the
// stream of JSON snapshot arrays from its stdout until the stream ends or a
// decode error occurs.
//
// It returns nil when stdout reaches EOF after at least one batch was applied,
// errNoValidData when EOF arrives before any valid batch, and the underlying
// error for pipe/start failures or malformed output. On return the stdout pipe
// is closed, the process is killed if it has not exited, and it is waited on.
func (gm *GPUManager) collectNvtopStats(interval string) error {
	proc := exec.Command(nvtopCmd, "-lP", "-d", interval)
	out, err := proc.StdoutPipe()
	if err != nil {
		return err
	}
	if err = proc.Start(); err != nil {
		return err
	}
	defer func() {
		_ = out.Close()
		if state := proc.ProcessState; state == nil || !state.Exited() {
			_ = proc.Process.Kill()
		}
		_ = proc.Wait()
	}()

	sawValid := false
	dec := json.NewDecoder(out)
	for {
		var batch []nvtopSnapshot
		decodeErr := dec.Decode(&batch)
		switch {
		case decodeErr == nil:
			if gm.updateNvtopSnapshots(batch) {
				sawValid = true
			}
		case decodeErr == io.EOF:
			if sawValid {
				return nil
			}
			return errNoValidData
		default:
			return decodeErr
		}
	}
}
// startNvtopCollector runs nvtop collection in a background goroutine.
//
// When onFailure is non-nil, the first collection error is logged, onFailure
// is invoked, and the goroutine exits. When onFailure is nil, errors are
// logged and retried after retryWaitTime until more than maxFailureRetries
// failures have occurred, at which point the goroutine exits without a
// further log entry.
//
// NOTE(review): a nil error from collectNvtopStats restarts collection
// immediately with no delay, and the failure counter is never reset. Confirm
// this is intended for an nvtop that exits right after printing valid output.
func (gm *GPUManager) startNvtopCollector(interval string, onFailure func()) {
	go func() {
		failureCount := 0
		for {
			err := gm.collectNvtopStats(interval)
			if err == nil {
				continue
			}
			if onFailure != nil {
				slog.Warn("Error collecting GPU data via nvtop", "err", err)
				onFailure()
				return
			}
			failureCount++
			if failureCount > maxFailureRetries {
				return
			}
			slog.Warn("Error collecting GPU data via nvtop", "err", err)
			time.Sleep(retryWaitTime)
		}
	}()
}

View File

@@ -250,6 +250,100 @@ func TestParseAmdData(t *testing.T) {
}
}
// TestParseNvtopData feeds the test-data/nvtop.json fixture through
// parseNvtopData and checks the two resulting slots, "n0" and "n1".
func TestParseNvtopData(t *testing.T) {
	payload, readErr := os.ReadFile("test-data/nvtop.json")
	require.NoError(t, readErr)

	manager := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)}
	require.True(t, manager.parseNvtopData(payload))

	first, found := manager.GpuDataMap["n0"]
	require.True(t, found)
	assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", first.Name)
	assert.Equal(t, 48.0, first.Temperature)
	assert.Equal(t, 5.0, first.Usage)
	assert.Equal(t, 13.0, first.Power)
	assert.Equal(t, bytesToMegabytes(349372416), first.MemoryUsed)
	assert.Equal(t, bytesToMegabytes(4294967296), first.MemoryTotal)
	assert.Equal(t, 1.0, first.Count)

	second, found := manager.GpuDataMap["n1"]
	require.True(t, found)
	assert.Equal(t, "AMD Radeon 680M", second.Name)
	assert.Equal(t, 48.0, second.Temperature)
	assert.Equal(t, 12.0, second.Usage)
	assert.Equal(t, 9.0, second.Power)
	assert.Equal(t, bytesToMegabytes(1213784064), second.MemoryUsed)
	assert.Equal(t, bytesToMegabytes(16929173504), second.MemoryTotal)
	assert.Equal(t, 1.0, second.Count)
}
func TestUpdateNvtopSnapshotsKeepsDeviceAssociationWhenOrderChanges(t *testing.T) {
strPtr := func(s string) *string { return &s }
gm := &GPUManager{
GpuDataMap: make(map[string]*system.GPUData),
}
firstBatch := []nvtopSnapshot{
{
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
GpuUtil: strPtr("20%"),
PowerDraw: strPtr("10W"),
},
{
DeviceName: "AMD Radeon 680M",
GpuUtil: strPtr("30%"),
PowerDraw: strPtr("20W"),
},
}
secondBatchSwapped := []nvtopSnapshot{
{
DeviceName: "AMD Radeon 680M",
GpuUtil: strPtr("40%"),
PowerDraw: strPtr("25W"),
},
{
DeviceName: "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
GpuUtil: strPtr("50%"),
PowerDraw: strPtr("15W"),
},
}
require.True(t, gm.updateNvtopSnapshots(firstBatch))
require.True(t, gm.updateNvtopSnapshots(secondBatchSwapped))
nvidia := gm.GpuDataMap["n0"]
require.NotNil(t, nvidia)
assert.Equal(t, "NVIDIA GeForce RTX 3050 Ti Laptop GPU", nvidia.Name)
assert.Equal(t, 70.0, nvidia.Usage)
assert.Equal(t, 25.0, nvidia.Power)
assert.Equal(t, 2.0, nvidia.Count)
amd := gm.GpuDataMap["n1"]
require.NotNil(t, amd)
assert.Equal(t, "AMD Radeon 680M", amd.Name)
assert.Equal(t, 70.0, amd.Usage)
assert.Equal(t, 45.0, amd.Power)
assert.Equal(t, 2.0, amd.Count)
}
func TestParseCollectorPriority(t *testing.T) {
got := parseCollectorPriority(" nvml, nvidia-smi, intel_gpu_top, amd_sysfs, nvtop, rocm-smi, bad ")
want := []collectorSource{
collectorSourceNVML,
collectorSourceNvidiaSMI,
collectorSourceIntelGpuTop,
collectorSourceAmdSysfs,
collectorSourceNVTop,
collectorSourceRocmSMI,
}
assert.Equal(t, want, got)
}
func TestParseJetsonData(t *testing.T) {
tests := []struct {
name string
@@ -987,36 +1081,35 @@ func TestCalculateGPUAverage(t *testing.T) {
})
}
func TestDetectGPUs(t *testing.T) {
func TestGPUCapabilitiesAndLegacyPriority(t *testing.T) {
// Save original PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
// Set up temp dir with the commands
tempDir := t.TempDir()
os.Setenv("PATH", tempDir)
hasAmdSysfs := (&GPUManager{}).hasAmdSysfs()
tests := []struct {
name string
setupCommands func() error
setupCommands func(string) error
wantNvidiaSmi bool
wantRocmSmi bool
wantTegrastats bool
wantNvtop bool
wantErr bool
}{
{
name: "nvidia-smi not available",
setupCommands: func() error {
setupCommands: func(_ string) error {
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: false,
wantNvtop: false,
wantErr: true,
},
{
name: "nvidia-smi available",
setupCommands: func() error {
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "nvidia-smi")
script := `#!/bin/sh
echo "test"`
@@ -1028,29 +1121,14 @@ echo "test"`
wantNvidiaSmi: true,
wantTegrastats: false,
wantRocmSmi: false,
wantNvtop: false,
wantErr: false,
},
{
name: "rocm-smi available",
setupCommands: func() error {
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "rocm-smi")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: true,
wantRocmSmi: true,
wantTegrastats: false,
wantErr: false,
},
{
name: "tegrastats available",
setupCommands: func() error {
path := filepath.Join(tempDir, "tegrastats")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
@@ -1059,12 +1137,47 @@ echo "test"`
},
wantNvidiaSmi: false,
wantRocmSmi: true,
wantTegrastats: false,
wantNvtop: false,
wantErr: false,
},
{
name: "tegrastats available",
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "tegrastats")
script := `#!/bin/sh
echo "test"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: true,
wantNvtop: false,
wantErr: false,
},
{
name: "nvtop available",
setupCommands: func(tempDir string) error {
path := filepath.Join(tempDir, "nvtop")
script := `#!/bin/sh
echo "[]"`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
wantNvidiaSmi: false,
wantRocmSmi: false,
wantTegrastats: false,
wantNvtop: true,
wantErr: false,
},
{
name: "no gpu tools available",
setupCommands: func() error {
setupCommands: func(_ string) error {
os.Setenv("PATH", "")
return nil
},
@@ -1074,29 +1187,53 @@ echo "test"`
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if err := tt.setupCommands(); err != nil {
tempDir := t.TempDir()
os.Setenv("PATH", tempDir)
if err := tt.setupCommands(tempDir); err != nil {
t.Fatal(err)
}
gm := &GPUManager{}
err := gm.detectGPUs()
caps := gm.discoverGpuCapabilities()
var err error
if !hasAnyGpuCollector(caps) {
err = fmt.Errorf(noGPUFoundMsg)
}
priorities := gm.resolveLegacyCollectorPriority(caps)
hasPriority := func(source collectorSource) bool {
for _, s := range priorities {
if s == source {
return true
}
}
return false
}
gotNvidiaSmi := hasPriority(collectorSourceNvidiaSMI)
gotRocmSmi := hasPriority(collectorSourceRocmSMI)
gotTegrastats := caps.hasTegrastats
gotNvtop := caps.hasNvtop
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gm.nvidiaSmi, gm.rocmSmi, gm.tegrastats)
t.Logf("nvidiaSmi: %v, rocmSmi: %v, tegrastats: %v", gotNvidiaSmi, gotRocmSmi, gotTegrastats)
if tt.wantErr {
wantErr := tt.wantErr
if hasAmdSysfs && (tt.name == "nvidia-smi not available" || tt.name == "no gpu tools available") {
wantErr = false
}
if wantErr {
assert.Error(t, err)
return
}
assert.NoError(t, err)
assert.Equal(t, tt.wantNvidiaSmi, gm.nvidiaSmi)
assert.Equal(t, tt.wantRocmSmi, gm.rocmSmi)
assert.Equal(t, tt.wantTegrastats, gm.tegrastats)
assert.Equal(t, tt.wantNvidiaSmi, gotNvidiaSmi)
assert.Equal(t, tt.wantRocmSmi, gotRocmSmi)
assert.Equal(t, tt.wantTegrastats, gotTegrastats)
assert.Equal(t, tt.wantNvtop, gotNvtop)
})
}
}
func TestStartCollector(t *testing.T) {
func TestCollectorStartHelpers(t *testing.T) {
// Save original PATH
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
@@ -1181,6 +1318,27 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
},
},
},
{
name: "nvtop collector",
command: "nvtop",
setup: func(t *testing.T) error {
path := filepath.Join(dir, "nvtop")
script := `#!/bin/sh
echo '[{"device_name":"NVIDIA Test GPU","temp":"52C","power_draw":"31W","gpu_util":"37%","mem_total":"4294967296","mem_used":"536870912","processes":[]}]'`
if err := os.WriteFile(path, []byte(script), 0755); err != nil {
return err
}
return nil
},
validate: func(t *testing.T, gm *GPUManager) {
gpu, exists := gm.GpuDataMap["n0"]
assert.True(t, exists)
if exists {
assert.Equal(t, "NVIDIA Test GPU", gpu.Name)
assert.Equal(t, 52.0, gpu.Temperature)
}
},
},
}
for _, tt := range tests {
@@ -1193,13 +1351,157 @@ echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000m
GpuDataMap: make(map[string]*system.GPUData),
}
}
tt.gm.startCollector(tt.command)
switch tt.command {
case nvidiaSmiCmd:
tt.gm.startNvidiaSmiCollector("4")
case rocmSmiCmd:
tt.gm.startRocmSmiCollector(4300 * time.Millisecond)
case tegraStatsCmd:
tt.gm.startTegraStatsCollector("3700")
case nvtopCmd:
tt.gm.startNvtopCollector("30", nil)
default:
t.Fatalf("unknown test command %q", tt.command)
}
time.Sleep(50 * time.Millisecond) // Give collector time to run
tt.validate(t, tt.gm)
})
}
}
// TestNewGPUManagerPriorityNvtopFallback configures the collector priority
// "nvtop,nvidia-smi" with an nvtop that prints invalid JSON, and expects the
// data to come from the nvidia-smi script instead.
func TestNewGPUManagerPriorityNvtopFallback(t *testing.T) {
	dir := t.TempDir()
	// t.Setenv restores the previous value when the test finishes, replacing
	// the manual save/restore of PATH and surfacing any Setenv failure.
	t.Setenv("PATH", dir)
	t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvtop,nvidia-smi")

	nvtopPath := filepath.Join(dir, "nvtop")
	nvtopScript := `#!/bin/sh
echo 'not-json'`
	require.NoError(t, os.WriteFile(nvtopPath, []byte(nvtopScript), 0755))

	nvidiaPath := filepath.Join(dir, "nvidia-smi")
	nvidiaScript := `#!/bin/sh
echo "0, NVIDIA Priority GPU, 45, 512, 2048, 12, 25"`
	require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))

	gm, err := NewGPUManager()
	require.NoError(t, err)
	require.NotNil(t, gm)

	// Give the background collectors time to run.
	time.Sleep(150 * time.Millisecond)

	// Collectors may still be writing in the background; hold the manager
	// lock while inspecting the shared map.
	gm.Lock()
	defer gm.Unlock()
	gpu, ok := gm.GpuDataMap["0"]
	require.True(t, ok)
	assert.Equal(t, "Priority GPU", gpu.Name)
	assert.Equal(t, 45.0, gpu.Temperature)
}
// TestNewGPUManagerPriorityMixedCollectors configures "intel_gpu_top,rocm-smi"
// with fake scripts for both tools and expects data from both collectors
// (slot "i0" for Intel, the rocm-smi GUID "34756" for AMD).
func TestNewGPUManagerPriorityMixedCollectors(t *testing.T) {
	dir := t.TempDir()
	// t.Setenv restores the previous value when the test finishes, replacing
	// the manual save/restore of PATH and surfacing any Setenv failure.
	t.Setenv("PATH", dir)
	t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "intel_gpu_top,rocm-smi")

	intelPath := filepath.Join(dir, "intel_gpu_top")
	intelScript := `#!/bin/sh
echo "Freq MHz IRQ RC6 Power W IMC MiB/s RCS VCS"
echo " req act /s % gpu pkg rd wr % se wa % se wa"
echo "226 223 338 58 2.00 2.69 1820 965 0.00 0 0 0.00 0 0"
echo "189 187 412 67 1.80 2.45 1950 823 8.50 2 1 15.00 1 0"
`
	require.NoError(t, os.WriteFile(intelPath, []byte(intelScript), 0755))

	rocmPath := filepath.Join(dir, "rocm-smi")
	rocmScript := `#!/bin/sh
echo '{"card0": {"Temperature (Sensor edge) (C)": "49.0", "Current Socket Graphics Package Power (W)": "28.159", "GPU use (%)": "0", "VRAM Total Memory (B)": "536870912", "VRAM Total Used Memory (B)": "445550592", "Card Series": "Rembrandt [Radeon 680M]", "GUID": "34756"}}'
`
	require.NoError(t, os.WriteFile(rocmPath, []byte(rocmScript), 0755))

	gm, err := NewGPUManager()
	require.NoError(t, err)
	require.NotNil(t, gm)

	// Give the background collectors time to run.
	time.Sleep(150 * time.Millisecond)

	// Collectors may still be writing in the background; hold the manager
	// lock while inspecting the shared map.
	gm.Lock()
	defer gm.Unlock()
	_, intelOk := gm.GpuDataMap["i0"]
	_, amdOk := gm.GpuDataMap["34756"]
	assert.True(t, intelOk)
	assert.True(t, amdOk)
}
// TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi configures "nvml,nvidia-smi"
// with only a fake nvidia-smi on PATH and expects its data to be collected.
func TestNewGPUManagerPriorityNvmlFallbackToNvidiaSmi(t *testing.T) {
	dir := t.TempDir()
	// t.Setenv restores the previous value when the test finishes, replacing
	// the manual save/restore of PATH and surfacing any Setenv failure.
	t.Setenv("PATH", dir)
	t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvml,nvidia-smi")

	nvidiaPath := filepath.Join(dir, "nvidia-smi")
	nvidiaScript := `#!/bin/sh
echo "0, NVIDIA Fallback GPU, 41, 256, 1024, 8, 14"`
	require.NoError(t, os.WriteFile(nvidiaPath, []byte(nvidiaScript), 0755))

	gm, err := NewGPUManager()
	require.NoError(t, err)
	require.NotNil(t, gm)

	// Give the background collectors time to run.
	time.Sleep(150 * time.Millisecond)

	// Collectors may still be writing in the background; hold the manager
	// lock while inspecting the shared map.
	gm.Lock()
	defer gm.Unlock()
	gpu, ok := gm.GpuDataMap["0"]
	require.True(t, ok)
	assert.Equal(t, "Fallback GPU", gpu.Name)
}
func TestNewGPUManagerConfiguredCollectorsMustStart(t *testing.T) {
origPath := os.Getenv("PATH")
defer os.Setenv("PATH", origPath)
dir := t.TempDir()
os.Setenv("PATH", dir)
t.Run("configured valid collector unavailable", func(t *testing.T) {
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
})
t.Run("configured collector list has only unknown entries", func(t *testing.T) {
t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "bad,unknown")
gm, err := NewGPUManager()
require.Nil(t, gm)
require.Error(t, err)
assert.Contains(t, err.Error(), "no configured GPU collectors are available")
})
}
// TestNewGPUManagerJetsonIgnoresCollectorConfig puts only a fake tegrastats on
// PATH while the collector config asks for nvidia-smi, and expects
// NewGPUManager to succeed and collect the tegrastats data anyway.
func TestNewGPUManagerJetsonIgnoresCollectorConfig(t *testing.T) {
	dir := t.TempDir()
	// t.Setenv restores the previous value when the test finishes, replacing
	// the manual save/restore of PATH and surfacing any Setenv failure.
	t.Setenv("PATH", dir)
	t.Setenv("BESZEL_AGENT_GPU_COLLECTOR", "nvidia-smi")

	tegraPath := filepath.Join(dir, "tegrastats")
	tegraScript := `#!/bin/sh
echo "11-14-2024 22:54:33 RAM 1024/4096MB GR3D_FREQ 80% tj@70C VDD_GPU_SOC 1000mW"`
	require.NoError(t, os.WriteFile(tegraPath, []byte(tegraScript), 0755))

	gm, err := NewGPUManager()
	require.NoError(t, err)
	require.NotNil(t, gm)

	// Give the background collector time to run.
	time.Sleep(100 * time.Millisecond)

	// The collector may still be writing in the background; hold the manager
	// lock while inspecting the shared map.
	gm.Lock()
	defer gm.Unlock()
	gpu, ok := gm.GpuDataMap["0"]
	require.True(t, ok)
	assert.Equal(t, "GPU", gpu.Name)
}
// TestAccumulation tests the accumulation behavior for all three GPU types
func TestAccumulation(t *testing.T) {
type expectedGPUValues struct {

View File

@@ -28,7 +28,7 @@ type SmartManager struct {
SmartDevices []*DeviceInfo
refreshMutex sync.Mutex
lastScanTime time.Time
binPath string
smartctlPath string
excludedDevices map[string]struct{}
}
@@ -170,27 +170,35 @@ func (sm *SmartManager) ScanDevices(force bool) error {
configuredDevices = parsedDevices
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, sm.binPath, "--scan", "-j")
output, err := cmd.Output()
var (
scanErr error
scannedDevices []*DeviceInfo
hasValidScan bool
)
if err != nil {
scanErr = err
} else {
scannedDevices, hasValidScan = sm.parseScan(output)
if !hasValidScan {
scanErr = errNoValidSmartData
if sm.smartctlPath != "" {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, sm.smartctlPath, "--scan", "-j")
output, err := cmd.Output()
if err != nil {
scanErr = err
} else {
scannedDevices, hasValidScan = sm.parseScan(output)
if !hasValidScan {
scanErr = errNoValidSmartData
}
}
}
// Add eMMC devices (Linux only) by reading sysfs health fields. This does not
// require smartctl and does not scan the whole device.
if emmcDevices := scanEmmcDevices(); len(emmcDevices) > 0 {
scannedDevices = append(scannedDevices, emmcDevices...)
hasValidScan = true
}
finalDevices := mergeDeviceLists(currentDevices, scannedDevices, configuredDevices)
finalDevices = sm.filterExcludedDevices(finalDevices)
sm.updateSmartDevices(finalDevices)
@@ -442,6 +450,18 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
return errNoValidSmartData
}
// eMMC health is not exposed via SMART on Linux, but the kernel provides
// wear / EOL indicators via sysfs. Prefer that path when available.
if deviceInfo != nil {
if ok, err := sm.collectEmmcHealth(deviceInfo); ok {
return err
}
}
if sm.smartctlPath == "" {
return errNoValidSmartData
}
// slog.Info("collecting SMART data", "device", deviceInfo.Name, "type", deviceInfo.Type, "has_existing_data", sm.hasDataForDevice(deviceInfo.Name))
// Check if we have any existing data for this device
@@ -452,7 +472,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
// Try with -n standby first if we have existing data
args := sm.smartctlArgs(deviceInfo, hasExistingData)
cmd := exec.CommandContext(ctx, sm.binPath, args...)
cmd := exec.CommandContext(ctx, sm.smartctlPath, args...)
output, err := cmd.CombinedOutput()
// Check if device is in standby (exit status 2)
@@ -465,7 +485,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
ctx2, cancel2 := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel2()
args = sm.smartctlArgs(deviceInfo, false)
cmd = exec.CommandContext(ctx2, sm.binPath, args...)
cmd = exec.CommandContext(ctx2, sm.smartctlPath, args...)
output, err = cmd.CombinedOutput()
}
@@ -482,7 +502,7 @@ func (sm *SmartManager) CollectSmart(deviceInfo *DeviceInfo) error {
ctx3, cancel3 := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel3()
args = sm.smartctlArgs(deviceInfo, false)
cmd = exec.CommandContext(ctx3, sm.binPath, args...)
cmd = exec.CommandContext(ctx3, sm.smartctlPath, args...)
output, err = cmd.CombinedOutput()
hasValidData = sm.parseSmartOutput(deviceInfo, output)
@@ -1123,10 +1143,15 @@ func NewSmartManager() (*SmartManager, error) {
}
sm.refreshExcludedDevices()
path, err := sm.detectSmartctl()
slog.Debug("smartctl", "path", path, "err", err)
if err != nil {
// Keep the previous fail-fast behavior unless this Linux host exposes
// eMMC health via sysfs, in which case smartctl is optional.
if runtime.GOOS == "linux" && len(scanEmmcDevices()) > 0 {
return sm, nil
}
return nil, err
}
slog.Debug("smartctl", "path", path)
sm.binPath = path
sm.smartctlPath = path
return sm, nil
}

700
agent/test-data/amdgpu.ids Normal file
View File

@@ -0,0 +1,700 @@
# List of AMDGPU IDs
#
# Syntax:
# device_id, revision_id, product_name <-- single tab after comma
1.0.0
1114, C2, AMD Radeon 860M Graphics
1114, C3, AMD Radeon 840M Graphics
1114, D2, AMD Radeon 860M Graphics
1114, D3, AMD Radeon 840M Graphics
1309, 00, AMD Radeon R7 Graphics
130A, 00, AMD Radeon R6 Graphics
130B, 00, AMD Radeon R4 Graphics
130C, 00, AMD Radeon R7 Graphics
130D, 00, AMD Radeon R6 Graphics
130E, 00, AMD Radeon R5 Graphics
130F, 00, AMD Radeon R7 Graphics
130F, D4, AMD Radeon R7 Graphics
130F, D5, AMD Radeon R7 Graphics
130F, D6, AMD Radeon R7 Graphics
130F, D7, AMD Radeon R7 Graphics
1313, 00, AMD Radeon R7 Graphics
1313, D4, AMD Radeon R7 Graphics
1313, D5, AMD Radeon R7 Graphics
1313, D6, AMD Radeon R7 Graphics
1315, 00, AMD Radeon R5 Graphics
1315, D4, AMD Radeon R5 Graphics
1315, D5, AMD Radeon R5 Graphics
1315, D6, AMD Radeon R5 Graphics
1315, D7, AMD Radeon R5 Graphics
1316, 00, AMD Radeon R5 Graphics
1318, 00, AMD Radeon R5 Graphics
131B, 00, AMD Radeon R4 Graphics
131C, 00, AMD Radeon R7 Graphics
131D, 00, AMD Radeon R6 Graphics
1435, AE, AMD Custom GPU 0932
1506, C1, AMD Radeon 610M
1506, C2, AMD Radeon 610M
1506, C3, AMD Radeon 610M
1506, C4, AMD Radeon 610M
150E, C1, AMD Radeon 890M Graphics
150E, C4, AMD Radeon 890M Graphics
150E, C5, AMD Radeon 890M Graphics
150E, C6, AMD Radeon 890M Graphics
150E, D1, AMD Radeon 890M Graphics
150E, D2, AMD Radeon 890M Graphics
150E, D3, AMD Radeon 890M Graphics
1586, C1, Radeon 8060S Graphics
1586, C2, Radeon 8050S Graphics
1586, C4, Radeon 8050S Graphics
1586, D1, Radeon 8060S Graphics
1586, D2, Radeon 8050S Graphics
1586, D4, Radeon 8050S Graphics
1586, D5, Radeon 8040S Graphics
15BF, 00, AMD Radeon 780M Graphics
15BF, 01, AMD Radeon 760M Graphics
15BF, 02, AMD Radeon 780M Graphics
15BF, 03, AMD Radeon 760M Graphics
15BF, C1, AMD Radeon 780M Graphics
15BF, C2, AMD Radeon 780M Graphics
15BF, C3, AMD Radeon 760M Graphics
15BF, C4, AMD Radeon 780M Graphics
15BF, C5, AMD Radeon 740M Graphics
15BF, C6, AMD Radeon 780M Graphics
15BF, C7, AMD Radeon 780M Graphics
15BF, C8, AMD Radeon 760M Graphics
15BF, C9, AMD Radeon 780M Graphics
15BF, CA, AMD Radeon 740M Graphics
15BF, CB, AMD Radeon 760M Graphics
15BF, CC, AMD Radeon 740M Graphics
15BF, CD, AMD Radeon 760M Graphics
15BF, CF, AMD Radeon 780M Graphics
15BF, D0, AMD Radeon 780M Graphics
15BF, D1, AMD Radeon 780M Graphics
15BF, D2, AMD Radeon 780M Graphics
15BF, D3, AMD Radeon 780M Graphics
15BF, D4, AMD Radeon 780M Graphics
15BF, D5, AMD Radeon 760M Graphics
15BF, D6, AMD Radeon 760M Graphics
15BF, D7, AMD Radeon 780M Graphics
15BF, D8, AMD Radeon 740M Graphics
15BF, D9, AMD Radeon 780M Graphics
15BF, DA, AMD Radeon 780M Graphics
15BF, DB, AMD Radeon 760M Graphics
15BF, DC, AMD Radeon 760M Graphics
15BF, DD, AMD Radeon 780M Graphics
15BF, DE, AMD Radeon 740M Graphics
15BF, DF, AMD Radeon 760M Graphics
15BF, F0, AMD Radeon 760M Graphics
15C8, C1, AMD Radeon 740M Graphics
15C8, C2, AMD Radeon 740M Graphics
15C8, C3, AMD Radeon 740M Graphics
15C8, C4, AMD Radeon 740M Graphics
15C8, D1, AMD Radeon 740M Graphics
15C8, D2, AMD Radeon 740M Graphics
15C8, D3, AMD Radeon 740M Graphics
15C8, D4, AMD Radeon 740M Graphics
15D8, 00, AMD Radeon RX Vega 8 Graphics WS
15D8, 91, AMD Radeon Vega 3 Graphics
15D8, 91, AMD Ryzen Embedded R1606G with Radeon Vega Gfx
15D8, 92, AMD Radeon Vega 3 Graphics
15D8, 92, AMD Ryzen Embedded R1505G with Radeon Vega Gfx
15D8, 93, AMD Radeon Vega 1 Graphics
15D8, A1, AMD Radeon Vega 10 Graphics
15D8, A2, AMD Radeon Vega 8 Graphics
15D8, A3, AMD Radeon Vega 6 Graphics
15D8, A4, AMD Radeon Vega 3 Graphics
15D8, B1, AMD Radeon Vega 10 Graphics
15D8, B2, AMD Radeon Vega 8 Graphics
15D8, B3, AMD Radeon Vega 6 Graphics
15D8, B4, AMD Radeon Vega 3 Graphics
15D8, C1, AMD Radeon Vega 10 Graphics
15D8, C2, AMD Radeon Vega 8 Graphics
15D8, C3, AMD Radeon Vega 6 Graphics
15D8, C4, AMD Radeon Vega 3 Graphics
15D8, C5, AMD Radeon Vega 3 Graphics
15D8, C8, AMD Radeon Vega 11 Graphics
15D8, C9, AMD Radeon Vega 8 Graphics
15D8, CA, AMD Radeon Vega 11 Graphics
15D8, CB, AMD Radeon Vega 8 Graphics
15D8, CC, AMD Radeon Vega 3 Graphics
15D8, CE, AMD Radeon Vega 3 Graphics
15D8, CF, AMD Ryzen Embedded R1305G with Radeon Vega Gfx
15D8, D1, AMD Radeon Vega 10 Graphics
15D8, D2, AMD Radeon Vega 8 Graphics
15D8, D3, AMD Radeon Vega 6 Graphics
15D8, D4, AMD Radeon Vega 3 Graphics
15D8, D8, AMD Radeon Vega 11 Graphics
15D8, D9, AMD Radeon Vega 8 Graphics
15D8, DA, AMD Radeon Vega 11 Graphics
15D8, DB, AMD Radeon Vega 3 Graphics
15D8, DB, AMD Radeon Vega 8 Graphics
15D8, DC, AMD Radeon Vega 3 Graphics
15D8, DD, AMD Radeon Vega 3 Graphics
15D8, DE, AMD Radeon Vega 3 Graphics
15D8, DF, AMD Radeon Vega 3 Graphics
15D8, E3, AMD Radeon Vega 3 Graphics
15D8, E4, AMD Ryzen Embedded R1102G with Radeon Vega Gfx
15DD, 81, AMD Ryzen Embedded V1807B with Radeon Vega Gfx
15DD, 82, AMD Ryzen Embedded V1756B with Radeon Vega Gfx
15DD, 83, AMD Ryzen Embedded V1605B with Radeon Vega Gfx
15DD, 84, AMD Radeon Vega 6 Graphics
15DD, 85, AMD Ryzen Embedded V1202B with Radeon Vega Gfx
15DD, 86, AMD Radeon Vega 11 Graphics
15DD, 88, AMD Radeon Vega 8 Graphics
15DD, C1, AMD Radeon Vega 11 Graphics
15DD, C2, AMD Radeon Vega 8 Graphics
15DD, C3, AMD Radeon Vega 3 / 10 Graphics
15DD, C4, AMD Radeon Vega 8 Graphics
15DD, C5, AMD Radeon Vega 3 Graphics
15DD, C6, AMD Radeon Vega 11 Graphics
15DD, C8, AMD Radeon Vega 8 Graphics
15DD, C9, AMD Radeon Vega 11 Graphics
15DD, CA, AMD Radeon Vega 8 Graphics
15DD, CB, AMD Radeon Vega 3 Graphics
15DD, CC, AMD Radeon Vega 6 Graphics
15DD, CE, AMD Radeon Vega 3 Graphics
15DD, CF, AMD Radeon Vega 3 Graphics
15DD, D0, AMD Radeon Vega 10 Graphics
15DD, D1, AMD Radeon Vega 8 Graphics
15DD, D3, AMD Radeon Vega 11 Graphics
15DD, D5, AMD Radeon Vega 8 Graphics
15DD, D6, AMD Radeon Vega 11 Graphics
15DD, D7, AMD Radeon Vega 8 Graphics
15DD, D8, AMD Radeon Vega 3 Graphics
15DD, D9, AMD Radeon Vega 6 Graphics
15DD, E1, AMD Radeon Vega 3 Graphics
15DD, E2, AMD Radeon Vega 3 Graphics
163F, AE, AMD Custom GPU 0405
163F, E1, AMD Custom GPU 0405
164E, D8, AMD Radeon 610M
164E, D9, AMD Radeon 610M
164E, DA, AMD Radeon 610M
164E, DB, AMD Radeon 610M
164E, DC, AMD Radeon 610M
1681, 06, AMD Radeon 680M
1681, 07, AMD Radeon 660M
1681, 0A, AMD Radeon 680M
1681, 0B, AMD Radeon 660M
1681, C7, AMD Radeon 680M
1681, C8, AMD Radeon 680M
1681, C9, AMD Radeon 660M
1900, 01, AMD Radeon 780M Graphics
1900, 02, AMD Radeon 760M Graphics
1900, 03, AMD Radeon 780M Graphics
1900, 04, AMD Radeon 760M Graphics
1900, 05, AMD Radeon 780M Graphics
1900, 06, AMD Radeon 780M Graphics
1900, 07, AMD Radeon 760M Graphics
1900, B0, AMD Radeon 780M Graphics
1900, B1, AMD Radeon 780M Graphics
1900, B2, AMD Radeon 780M Graphics
1900, B3, AMD Radeon 780M Graphics
1900, B4, AMD Radeon 780M Graphics
1900, B5, AMD Radeon 780M Graphics
1900, B6, AMD Radeon 780M Graphics
1900, B7, AMD Radeon 760M Graphics
1900, B8, AMD Radeon 760M Graphics
1900, B9, AMD Radeon 780M Graphics
1900, BA, AMD Radeon 780M Graphics
1900, BB, AMD Radeon 780M Graphics
1900, C0, AMD Radeon 780M Graphics
1900, C1, AMD Radeon 760M Graphics
1900, C2, AMD Radeon 780M Graphics
1900, C3, AMD Radeon 760M Graphics
1900, C4, AMD Radeon 780M Graphics
1900, C5, AMD Radeon 780M Graphics
1900, C6, AMD Radeon 760M Graphics
1900, C7, AMD Radeon 780M Graphics
1900, C8, AMD Radeon 760M Graphics
1900, C9, AMD Radeon 780M Graphics
1900, CA, AMD Radeon 760M Graphics
1900, CB, AMD Radeon 780M Graphics
1900, CC, AMD Radeon 780M Graphics
1900, CD, AMD Radeon 760M Graphics
1900, CE, AMD Radeon 780M Graphics
1900, CF, AMD Radeon 760M Graphics
1900, D0, AMD Radeon 780M Graphics
1900, D1, AMD Radeon 760M Graphics
1900, D2, AMD Radeon 780M Graphics
1900, D3, AMD Radeon 760M Graphics
1900, D4, AMD Radeon 780M Graphics
1900, D5, AMD Radeon 780M Graphics
1900, D6, AMD Radeon 760M Graphics
1900, D7, AMD Radeon 780M Graphics
1900, D8, AMD Radeon 760M Graphics
1900, D9, AMD Radeon 780M Graphics
1900, DA, AMD Radeon 760M Graphics
1900, DB, AMD Radeon 780M Graphics
1900, DC, AMD Radeon 780M Graphics
1900, DD, AMD Radeon 760M Graphics
1900, DE, AMD Radeon 780M Graphics
1900, DF, AMD Radeon 760M Graphics
1900, F0, AMD Radeon 780M Graphics
1900, F1, AMD Radeon 780M Graphics
1900, F2, AMD Radeon 780M Graphics
1901, C1, AMD Radeon 740M Graphics
1901, C2, AMD Radeon 740M Graphics
1901, C3, AMD Radeon 740M Graphics
1901, C6, AMD Radeon 740M Graphics
1901, C7, AMD Radeon 740M Graphics
1901, C8, AMD Radeon 740M Graphics
1901, C9, AMD Radeon 740M Graphics
1901, CA, AMD Radeon 740M Graphics
1901, D1, AMD Radeon 740M Graphics
1901, D2, AMD Radeon 740M Graphics
1901, D3, AMD Radeon 740M Graphics
1901, D4, AMD Radeon 740M Graphics
1901, D5, AMD Radeon 740M Graphics
1901, D6, AMD Radeon 740M Graphics
1901, D7, AMD Radeon 740M Graphics
1901, D8, AMD Radeon 740M Graphics
6600, 00, AMD Radeon HD 8600 / 8700M
6600, 81, AMD Radeon R7 M370
6601, 00, AMD Radeon HD 8500M / 8700M
6604, 00, AMD Radeon R7 M265 Series
6604, 81, AMD Radeon R7 M350
6605, 00, AMD Radeon R7 M260 Series
6605, 81, AMD Radeon R7 M340
6606, 00, AMD Radeon HD 8790M
6607, 00, AMD Radeon R5 M240
6608, 00, AMD FirePro W2100
6610, 00, AMD Radeon R7 200 Series
6610, 81, AMD Radeon R7 350
6610, 83, AMD Radeon R5 340
6610, 87, AMD Radeon R7 200 Series
6611, 00, AMD Radeon R7 200 Series
6611, 87, AMD Radeon R7 200 Series
6613, 00, AMD Radeon R7 200 Series
6617, 00, AMD Radeon R7 240 Series
6617, 87, AMD Radeon R7 200 Series
6617, C7, AMD Radeon R7 240 Series
6640, 00, AMD Radeon HD 8950
6640, 80, AMD Radeon R9 M380
6646, 00, AMD Radeon R9 M280X
6646, 80, AMD Radeon R9 M385
6646, 80, AMD Radeon R9 M470X
6647, 00, AMD Radeon R9 M200X Series
6647, 80, AMD Radeon R9 M380
6649, 00, AMD FirePro W5100
6658, 00, AMD Radeon R7 200 Series
665C, 00, AMD Radeon HD 7700 Series
665D, 00, AMD Radeon R7 200 Series
665F, 81, AMD Radeon R7 360 Series
6660, 00, AMD Radeon HD 8600M Series
6660, 81, AMD Radeon R5 M335
6660, 83, AMD Radeon R5 M330
6663, 00, AMD Radeon HD 8500M Series
6663, 83, AMD Radeon R5 M320
6664, 00, AMD Radeon R5 M200 Series
6665, 00, AMD Radeon R5 M230 Series
6665, 83, AMD Radeon R5 M320
6665, C3, AMD Radeon R5 M435
6666, 00, AMD Radeon R5 M200 Series
6667, 00, AMD Radeon R5 M200 Series
666F, 00, AMD Radeon HD 8500M
66A1, 02, AMD Instinct MI60 / MI50
66A1, 06, AMD Radeon Pro VII
66AF, C1, AMD Radeon VII
6780, 00, AMD FirePro W9000
6784, 00, ATI FirePro V (FireGL V) Graphics Adapter
6788, 00, ATI FirePro V (FireGL V) Graphics Adapter
678A, 00, AMD FirePro W8000
6798, 00, AMD Radeon R9 200 / HD 7900 Series
6799, 00, AMD Radeon HD 7900 Series
679A, 00, AMD Radeon HD 7900 Series
679B, 00, AMD Radeon HD 7900 Series
679E, 00, AMD Radeon HD 7800 Series
67A0, 00, AMD Radeon FirePro W9100
67A1, 00, AMD Radeon FirePro W8100
67B0, 00, AMD Radeon R9 200 Series
67B0, 80, AMD Radeon R9 390 Series
67B1, 00, AMD Radeon R9 200 Series
67B1, 80, AMD Radeon R9 390 Series
67B9, 00, AMD Radeon R9 200 Series
67C0, 00, AMD Radeon Pro WX 7100 Graphics
67C0, 80, AMD Radeon E9550
67C2, 01, AMD Radeon Pro V7350x2
67C2, 02, AMD Radeon Pro V7300X
67C4, 00, AMD Radeon Pro WX 7100 Graphics
67C4, 80, AMD Radeon E9560 / E9565 Graphics
67C7, 00, AMD Radeon Pro WX 5100 Graphics
67C7, 80, AMD Radeon E9390 Graphics
67D0, 01, AMD Radeon Pro V7350x2
67D0, 02, AMD Radeon Pro V7300X
67DF, C0, AMD Radeon Pro 580X
67DF, C1, AMD Radeon RX 580 Series
67DF, C2, AMD Radeon RX 570 Series
67DF, C3, AMD Radeon RX 580 Series
67DF, C4, AMD Radeon RX 480 Graphics
67DF, C5, AMD Radeon RX 470 Graphics
67DF, C6, AMD Radeon RX 570 Series
67DF, C7, AMD Radeon RX 480 Graphics
67DF, CF, AMD Radeon RX 470 Graphics
67DF, D7, AMD Radeon RX 470 Graphics
67DF, E0, AMD Radeon RX 470 Series
67DF, E1, AMD Radeon RX 590 Series
67DF, E3, AMD Radeon RX Series
67DF, E7, AMD Radeon RX 580 Series
67DF, EB, AMD Radeon Pro 580X
67DF, EF, AMD Radeon RX 570 Series
67DF, F7, AMD Radeon RX P30PH
67DF, FF, AMD Radeon RX 470 Series
67E0, 00, AMD Radeon Pro WX Series
67E3, 00, AMD Radeon Pro WX 4100
67E8, 00, AMD Radeon Pro WX Series
67E8, 01, AMD Radeon Pro WX Series
67E8, 80, AMD Radeon E9260 Graphics
67EB, 00, AMD Radeon Pro V5300X
67EF, C0, AMD Radeon RX Graphics
67EF, C1, AMD Radeon RX 460 Graphics
67EF, C2, AMD Radeon Pro Series
67EF, C3, AMD Radeon RX Series
67EF, C5, AMD Radeon RX 460 Graphics
67EF, C7, AMD Radeon RX Graphics
67EF, CF, AMD Radeon RX 460 Graphics
67EF, E0, AMD Radeon RX 560 Series
67EF, E1, AMD Radeon RX Series
67EF, E2, AMD Radeon RX 560X
67EF, E3, AMD Radeon RX Series
67EF, E5, AMD Radeon RX 560 Series
67EF, E7, AMD Radeon RX 560 Series
67EF, EF, AMD Radeon 550 Series
67EF, FF, AMD Radeon RX 460 Graphics
67FF, C0, AMD Radeon Pro 465
67FF, C1, AMD Radeon RX 560 Series
67FF, CF, AMD Radeon RX 560 Series
67FF, EF, AMD Radeon RX 560 Series
67FF, FF, AMD Radeon RX 550 Series
6800, 00, AMD Radeon HD 7970M
6801, 00, AMD Radeon HD 8970M
6806, 00, AMD Radeon R9 M290X
6808, 00, AMD FirePro W7000
6808, 00, ATI FirePro V (FireGL V) Graphics Adapter
6809, 00, ATI FirePro W5000
6810, 00, AMD Radeon R9 200 Series
6810, 81, AMD Radeon R9 370 Series
6811, 00, AMD Radeon R9 200 Series
6811, 81, AMD Radeon R7 370 Series
6818, 00, AMD Radeon HD 7800 Series
6819, 00, AMD Radeon HD 7800 Series
6820, 00, AMD Radeon R9 M275X
6820, 81, AMD Radeon R9 M375
6820, 83, AMD Radeon R9 M375X
6821, 00, AMD Radeon R9 M200X Series
6821, 83, AMD Radeon R9 M370X
6821, 87, AMD Radeon R7 M380
6822, 00, AMD Radeon E8860
6823, 00, AMD Radeon R9 M200X Series
6825, 00, AMD Radeon HD 7800M Series
6826, 00, AMD Radeon HD 7700M Series
6827, 00, AMD Radeon HD 7800M Series
6828, 00, AMD FirePro W600
682B, 00, AMD Radeon HD 8800M Series
682B, 87, AMD Radeon R9 M360
682C, 00, AMD FirePro W4100
682D, 00, AMD Radeon HD 7700M Series
682F, 00, AMD Radeon HD 7700M Series
6830, 00, AMD Radeon 7800M Series
6831, 00, AMD Radeon 7700M Series
6835, 00, AMD Radeon R7 Series / HD 9000 Series
6837, 00, AMD Radeon HD 7700 Series
683D, 00, AMD Radeon HD 7700 Series
683F, 00, AMD Radeon HD 7700 Series
684C, 00, ATI FirePro V (FireGL V) Graphics Adapter
6860, 00, AMD Radeon Instinct MI25
6860, 01, AMD Radeon Instinct MI25
6860, 02, AMD Radeon Instinct MI25
6860, 03, AMD Radeon Pro V340
6860, 04, AMD Radeon Instinct MI25x2
6860, 07, AMD Radeon Pro V320
6861, 00, AMD Radeon Pro WX 9100
6862, 00, AMD Radeon Pro SSG
6863, 00, AMD Radeon Vega Frontier Edition
6864, 03, AMD Radeon Pro V340
6864, 04, AMD Radeon Instinct MI25x2
6864, 05, AMD Radeon Pro V340
6868, 00, AMD Radeon Pro WX 8200
686C, 00, AMD Radeon Instinct MI25 MxGPU
686C, 01, AMD Radeon Instinct MI25 MxGPU
686C, 02, AMD Radeon Instinct MI25 MxGPU
686C, 03, AMD Radeon Pro V340 MxGPU
686C, 04, AMD Radeon Instinct MI25x2 MxGPU
686C, 05, AMD Radeon Pro V340L MxGPU
686C, 06, AMD Radeon Instinct MI25 MxGPU
687F, 01, AMD Radeon RX Vega
687F, C0, AMD Radeon RX Vega
687F, C1, AMD Radeon RX Vega
687F, C3, AMD Radeon RX Vega
687F, C7, AMD Radeon RX Vega
6900, 00, AMD Radeon R7 M260
6900, 81, AMD Radeon R7 M360
6900, 83, AMD Radeon R7 M340
6900, C1, AMD Radeon R5 M465 Series
6900, C3, AMD Radeon R5 M445 Series
6900, D1, AMD Radeon 530 Series
6900, D3, AMD Radeon 530 Series
6901, 00, AMD Radeon R5 M255
6902, 00, AMD Radeon Series
6907, 00, AMD Radeon R5 M255
6907, 87, AMD Radeon R5 M315
6920, 00, AMD Radeon R9 M395X
6920, 01, AMD Radeon R9 M390X
6921, 00, AMD Radeon R9 M390X
6929, 00, AMD FirePro S7150
6929, 01, AMD FirePro S7100X
692B, 00, AMD FirePro W7100
6938, 00, AMD Radeon R9 200 Series
6938, F0, AMD Radeon R9 200 Series
6938, F1, AMD Radeon R9 380 Series
6939, 00, AMD Radeon R9 200 Series
6939, F0, AMD Radeon R9 200 Series
6939, F1, AMD Radeon R9 380 Series
694C, C0, AMD Radeon RX Vega M GH Graphics
694E, C0, AMD Radeon RX Vega M GL Graphics
6980, 00, AMD Radeon Pro WX 3100
6981, 00, AMD Radeon Pro WX 3200 Series
6981, 01, AMD Radeon Pro WX 3200 Series
6981, 10, AMD Radeon Pro WX 3200 Series
6985, 00, AMD Radeon Pro WX 3100
6986, 00, AMD Radeon Pro WX 2100
6987, 80, AMD Embedded Radeon E9171
6987, C0, AMD Radeon 550X Series
6987, C1, AMD Radeon RX 640
6987, C3, AMD Radeon 540X Series
6987, C7, AMD Radeon 540
6995, 00, AMD Radeon Pro WX 2100
6997, 00, AMD Radeon Pro WX 2100
699F, 81, AMD Embedded Radeon E9170 Series
699F, C0, AMD Radeon 500 Series
699F, C1, AMD Radeon 540 Series
699F, C3, AMD Radeon 500 Series
699F, C7, AMD Radeon RX 550 / 550 Series
699F, C9, AMD Radeon 540
6FDF, E7, AMD Radeon RX 590 GME
6FDF, EF, AMD Radeon RX 580 2048SP
7300, C1, AMD FirePro S9300 x2
7300, C8, AMD Radeon R9 Fury Series
7300, C9, AMD Radeon Pro Duo
7300, CA, AMD Radeon R9 Fury Series
7300, CB, AMD Radeon R9 Fury Series
7312, 00, AMD Radeon Pro W5700
731E, C6, AMD Radeon RX 5700XTB
731E, C7, AMD Radeon RX 5700B
731F, C0, AMD Radeon RX 5700 XT 50th Anniversary
731F, C1, AMD Radeon RX 5700 XT
731F, C2, AMD Radeon RX 5600M
731F, C3, AMD Radeon RX 5700M
731F, C4, AMD Radeon RX 5700
731F, C5, AMD Radeon RX 5700 XT
731F, CA, AMD Radeon RX 5600 XT
731F, CB, AMD Radeon RX 5600 OEM
7340, C1, AMD Radeon RX 5500M
7340, C3, AMD Radeon RX 5300M
7340, C5, AMD Radeon RX 5500 XT
7340, C7, AMD Radeon RX 5500
7340, C9, AMD Radeon RX 5500XTB
7340, CF, AMD Radeon RX 5300
7341, 00, AMD Radeon Pro W5500
7347, 00, AMD Radeon Pro W5500M
7360, 41, AMD Radeon Pro 5600M
7360, C3, AMD Radeon Pro V520
7362, C1, AMD Radeon Pro V540
7362, C3, AMD Radeon Pro V520
738C, 01, AMD Instinct MI100
73A1, 00, AMD Radeon Pro V620
73A3, 00, AMD Radeon Pro W6800
73A5, C0, AMD Radeon RX 6950 XT
73AE, 00, AMD Radeon Pro V620 MxGPU
73AF, C0, AMD Radeon RX 6900 XT
73BF, C0, AMD Radeon RX 6900 XT
73BF, C1, AMD Radeon RX 6800 XT
73BF, C3, AMD Radeon RX 6800
73DF, C0, AMD Radeon RX 6750 XT
73DF, C1, AMD Radeon RX 6700 XT
73DF, C2, AMD Radeon RX 6800M
73DF, C3, AMD Radeon RX 6800M
73DF, C5, AMD Radeon RX 6700 XT
73DF, CF, AMD Radeon RX 6700M
73DF, D5, AMD Radeon RX 6750 GRE 12GB
73DF, D7, AMD TDC-235
73DF, DF, AMD Radeon RX 6700
73DF, E5, AMD Radeon RX 6750 GRE 12GB
73DF, FF, AMD Radeon RX 6700
73E0, 00, AMD Radeon RX 6600M
73E1, 00, AMD Radeon Pro W6600M
73E3, 00, AMD Radeon Pro W6600
73EF, C0, AMD Radeon RX 6800S
73EF, C1, AMD Radeon RX 6650 XT
73EF, C2, AMD Radeon RX 6700S
73EF, C3, AMD Radeon RX 6650M
73EF, C4, AMD Radeon RX 6650M XT
73FF, C1, AMD Radeon RX 6600 XT
73FF, C3, AMD Radeon RX 6600M
73FF, C7, AMD Radeon RX 6600
73FF, CB, AMD Radeon RX 6600S
73FF, CF, AMD Radeon RX 6600 LE
73FF, DF, AMD Radeon RX 6750 GRE 10GB
7408, 00, AMD Instinct MI250X
740C, 01, AMD Instinct MI250X / MI250
740F, 02, AMD Instinct MI210
7421, 00, AMD Radeon Pro W6500M
7422, 00, AMD Radeon Pro W6400
7423, 00, AMD Radeon Pro W6300M
7423, 01, AMD Radeon Pro W6300
7424, 00, AMD Radeon RX 6300
743F, C1, AMD Radeon RX 6500 XT
743F, C3, AMD Radeon RX 6500
743F, C3, AMD Radeon RX 6500M
743F, C7, AMD Radeon RX 6400
743F, C8, AMD Radeon RX 6500M
743F, CC, AMD Radeon 6550S
743F, CE, AMD Radeon RX 6450M
743F, CF, AMD Radeon RX 6300M
743F, D3, AMD Radeon RX 6550M
743F, D7, AMD Radeon RX 6400
7448, 00, AMD Radeon Pro W7900
7449, 00, AMD Radeon Pro W7800 48GB
744A, 00, AMD Radeon Pro W7900 Dual Slot
744B, 00, AMD Radeon Pro W7900D
744C, C8, AMD Radeon RX 7900 XTX
744C, CC, AMD Radeon RX 7900 XT
744C, CE, AMD Radeon RX 7900 GRE
744C, CF, AMD Radeon RX 7900M
745E, CC, AMD Radeon Pro W7800
7460, 00, AMD Radeon Pro V710
7461, 00, AMD Radeon Pro V710 MxGPU
7470, 00, AMD Radeon Pro W7700
747E, C8, AMD Radeon RX 7800 XT
747E, D8, AMD Radeon RX 7800M
747E, DB, AMD Radeon RX 7700
747E, FF, AMD Radeon RX 7700 XT
7480, 00, AMD Radeon Pro W7600
7480, C0, AMD Radeon RX 7600 XT
7480, C1, AMD Radeon RX 7700S
7480, C2, AMD Radeon RX 7650 GRE
7480, C3, AMD Radeon RX 7600S
7480, C7, AMD Radeon RX 7600M XT
7480, CF, AMD Radeon RX 7600
7481, C7, AMD Steam Machine
7483, CF, AMD Radeon RX 7600M
7489, 00, AMD Radeon Pro W7500
7499, 00, AMD Radeon Pro W7400
7499, C0, AMD Radeon RX 7400
7499, C1, AMD Radeon RX 7300
74A0, 00, AMD Instinct MI300A
74A1, 00, AMD Instinct MI300X
74A2, 00, AMD Instinct MI308X
74A5, 00, AMD Instinct MI325X
74A8, 00, AMD Instinct MI308X HF
74A9, 00, AMD Instinct MI300X HF
74B5, 00, AMD Instinct MI300X VF
74B6, 00, AMD Instinct MI308X
74BD, 00, AMD Instinct MI300X HF
7550, C0, AMD Radeon RX 9070 XT
7550, C2, AMD Radeon RX 9070 GRE
7550, C3, AMD Radeon RX 9070
7551, C0, AMD Radeon AI PRO R9700
7590, C0, AMD Radeon RX 9060 XT
7590, C7, AMD Radeon RX 9060
75A0, C0, AMD Instinct MI350X
75A3, C0, AMD Instinct MI355X
75B0, C0, AMD Instinct MI350X VF
75B3, C0, AMD Instinct MI355X VF
9830, 00, AMD Radeon HD 8400 / R3 Series
9831, 00, AMD Radeon HD 8400E
9832, 00, AMD Radeon HD 8330
9833, 00, AMD Radeon HD 8330E
9834, 00, AMD Radeon HD 8210
9835, 00, AMD Radeon HD 8210E
9836, 00, AMD Radeon HD 8200 / R3 Series
9837, 00, AMD Radeon HD 8280E
9838, 00, AMD Radeon HD 8200 / R3 series
9839, 00, AMD Radeon HD 8180
983D, 00, AMD Radeon HD 8250
9850, 00, AMD Radeon R3 Graphics
9850, 03, AMD Radeon R3 Graphics
9850, 40, AMD Radeon R2 Graphics
9850, 45, AMD Radeon R3 Graphics
9851, 00, AMD Radeon R4 Graphics
9851, 01, AMD Radeon R5E Graphics
9851, 05, AMD Radeon R5 Graphics
9851, 06, AMD Radeon R5E Graphics
9851, 40, AMD Radeon R4 Graphics
9851, 45, AMD Radeon R5 Graphics
9852, 00, AMD Radeon R2 Graphics
9852, 40, AMD Radeon E1 Graphics
9853, 00, AMD Radeon R2 Graphics
9853, 01, AMD Radeon R4E Graphics
9853, 03, AMD Radeon R2 Graphics
9853, 05, AMD Radeon R1E Graphics
9853, 06, AMD Radeon R1E Graphics
9853, 07, AMD Radeon R1E Graphics
9853, 08, AMD Radeon R1E Graphics
9853, 40, AMD Radeon R2 Graphics
9854, 00, AMD Radeon R3 Graphics
9854, 01, AMD Radeon R3E Graphics
9854, 02, AMD Radeon R3 Graphics
9854, 05, AMD Radeon R2 Graphics
9854, 06, AMD Radeon R4 Graphics
9854, 07, AMD Radeon R3 Graphics
9855, 02, AMD Radeon R6 Graphics
9855, 05, AMD Radeon R4 Graphics
9856, 00, AMD Radeon R2 Graphics
9856, 01, AMD Radeon R2E Graphics
9856, 02, AMD Radeon R2 Graphics
9856, 05, AMD Radeon R1E Graphics
9856, 06, AMD Radeon R2 Graphics
9856, 07, AMD Radeon R1E Graphics
9856, 08, AMD Radeon R1E Graphics
9856, 13, AMD Radeon R1E Graphics
9874, 81, AMD Radeon R6 Graphics
9874, 84, AMD Radeon R7 Graphics
9874, 85, AMD Radeon R6 Graphics
9874, 87, AMD Radeon R5 Graphics
9874, 88, AMD Radeon R7E Graphics
9874, 89, AMD Radeon R6E Graphics
9874, C4, AMD Radeon R7 Graphics
9874, C5, AMD Radeon R6 Graphics
9874, C6, AMD Radeon R6 Graphics
9874, C7, AMD Radeon R5 Graphics
9874, C8, AMD Radeon R7 Graphics
9874, C9, AMD Radeon R7 Graphics
9874, CA, AMD Radeon R5 Graphics
9874, CB, AMD Radeon R5 Graphics
9874, CC, AMD Radeon R7 Graphics
9874, CD, AMD Radeon R7 Graphics
9874, CE, AMD Radeon R5 Graphics
9874, E1, AMD Radeon R7 Graphics
9874, E2, AMD Radeon R7 Graphics
9874, E3, AMD Radeon R7 Graphics
9874, E4, AMD Radeon R7 Graphics
9874, E5, AMD Radeon R5 Graphics
9874, E6, AMD Radeon R5 Graphics
98E4, 80, AMD Radeon R5E Graphics
98E4, 81, AMD Radeon R4E Graphics
98E4, 83, AMD Radeon R2E Graphics
98E4, 84, AMD Radeon R2E Graphics
98E4, 86, AMD Radeon R1E Graphics
98E4, C0, AMD Radeon R4 Graphics
98E4, C1, AMD Radeon R5 Graphics
98E4, C2, AMD Radeon R4 Graphics
98E4, C4, AMD Radeon R5 Graphics
98E4, C6, AMD Radeon R5 Graphics
98E4, C8, AMD Radeon R4 Graphics
98E4, C9, AMD Radeon R4 Graphics
98E4, CA, AMD Radeon R5 Graphics
98E4, D0, AMD Radeon R2 Graphics
98E4, D1, AMD Radeon R2 Graphics
98E4, D2, AMD Radeon R2 Graphics
98E4, D4, AMD Radeon R2 Graphics
98E4, D9, AMD Radeon R5 Graphics
98E4, DA, AMD Radeon R5 Graphics
98E4, DB, AMD Radeon R3 Graphics
98E4, E1, AMD Radeon R3 Graphics
98E4, E2, AMD Radeon R3 Graphics
98E4, E9, AMD Radeon R4 Graphics
98E4, EA, AMD Radeon R4 Graphics
98E4, EB, AMD Radeon R3 Graphics
98E4, EB, AMD Radeon R4 Graphics

View File

@@ -0,0 +1,34 @@
[
{
"device_name": "NVIDIA GeForce RTX 3050 Ti Laptop GPU",
"gpu_clock": "1485MHz",
"mem_clock": "6001MHz",
"temp": "48C",
"fan_speed": null,
"power_draw": "13W",
"gpu_util": "5%",
"encode": "0%",
"decode": "0%",
"mem_util": "8%",
"mem_total": "4294967296",
"mem_used": "349372416",
"mem_free": "3945594880",
"processes" : []
},
{
"device_name": "AMD Radeon 680M",
"gpu_clock": "2200MHz",
"mem_clock": "2400MHz",
"temp": "48C",
"fan_speed": "CPU Fan",
"power_draw": "9W",
"gpu_util": "12%",
"encode": null,
"decode": "0%",
"mem_util": "7%",
"mem_total": "16929173504",
"mem_used": "1213784064",
"mem_free": "15715389440",
"processes" : []
}
]

View File

@@ -2,18 +2,18 @@ package alerts
import (
"fmt"
"strings"
"github.com/pocketbase/pocketbase/core"
)
// handleSmartDeviceAlert sends alerts when a SMART device state changes from PASSED to FAILED.
// handleSmartDeviceAlert sends alerts when a SMART device state worsens into WARNING/FAILED.
// This is automatic and does not require user opt-in.
func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
oldState := e.Record.Original().GetString("state")
newState := e.Record.GetString("state")
// Only alert when transitioning from PASSED to FAILED
if oldState != "PASSED" || newState != "FAILED" {
if !shouldSendSmartDeviceAlert(oldState, newState) {
return e.Next()
}
@@ -32,14 +32,15 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
systemName := systemRecord.GetString("name")
deviceName := e.Record.GetString("name")
model := e.Record.GetString("model")
statusLabel := smartStateLabel(newState)
// Build alert message
title := fmt.Sprintf("SMART failure on %s: %s \U0001F534", systemName, deviceName)
title := fmt.Sprintf("SMART %s on %s: %s %s", statusLabel, systemName, deviceName, smartStateEmoji(newState))
var message string
if model != "" {
message = fmt.Sprintf("Disk %s (%s) SMART status changed to FAILED", deviceName, model)
message = fmt.Sprintf("Disk %s (%s) SMART status changed to %s", deviceName, model, newState)
} else {
message = fmt.Sprintf("Disk %s SMART status changed to FAILED", deviceName)
message = fmt.Sprintf("Disk %s SMART status changed to %s", deviceName, newState)
}
// Get users associated with the system
@@ -65,3 +66,42 @@ func (am *AlertManager) handleSmartDeviceAlert(e *core.RecordEvent) error {
return e.Next()
}
// shouldSendSmartDeviceAlert reports whether a SMART state change from
// oldState to newState should produce an alert.
//
// An alert is sent only when the previous state is a recognized one
// (severity of at least 1) and the new state ranks strictly worse. Unknown
// previous states, unknown new states, unchanged states, and recoveries
// never alert.
func shouldSendSmartDeviceAlert(oldState, newState string) bool {
	previous := smartStateSeverity(oldState)
	current := smartStateSeverity(newState)
	if previous < 1 {
		// The previous state was not recognized, so there is no trustworthy
		// baseline to compare against.
		return false
	}
	// Only a strictly worse state counts; equal or better is ignored.
	return current > previous
}
// smartStateSeverity maps a SMART state string to a numeric rank so that
// states can be compared: "PASSED" is 1, "WARNING" is 2 and "FAILED" is 3.
// Any other value (matching is case-sensitive) yields 0.
func smartStateSeverity(state string) int {
	// Recognized states ordered from healthiest to worst; the rank of a
	// state is its position in this list plus one.
	ordered := [...]string{"PASSED", "WARNING", "FAILED"}
	for idx, known := range ordered {
		if state == known {
			return idx + 1
		}
	}
	return 0
}
// smartStateEmoji returns the status emoji placed in the alert title:
// U+1F7E0 (orange circle) for "WARNING" and U+1F534 (red circle) for every
// other state.
func smartStateEmoji(state string) string {
	if state == "WARNING" {
		return "\U0001F7E0"
	}
	return "\U0001F534"
}
// smartStateLabel returns the lowercase word used for a state in the alert
// title: "failure" for "FAILED", otherwise the state itself lowercased.
func smartStateLabel(state string) string {
	if state == "FAILED" {
		return "failure"
	}
	label := strings.ToLower(state)
	return label
}

View File

@@ -58,6 +58,74 @@ func TestSmartDeviceAlert(t *testing.T) {
assert.Contains(t, lastMessage.Text, "FAILED")
}
// TestSmartDeviceAlertPassedToWarning verifies that changing a smart_devices
// record's state from PASSED to WARNING sends exactly one email whose subject
// contains "SMART warning on test-system" and whose text contains "WARNING".
func TestSmartDeviceAlertPassedToWarning(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a system that lists the test user in its "users" field.
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
})
assert.NoError(t, err)
// Create the device in the PASSED starting state.
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
"system": system.Id,
"name": "/dev/mmcblk0",
"model": "eMMC",
"state": "PASSED",
})
assert.NoError(t, err)
// Re-fetch the record before updating it.
// NOTE(review): presumably so the record's original state reflects the stored PASSED value — confirm.
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
assert.NoError(t, err)
// Worsen the state: PASSED -> WARNING.
smartDevice.Set("state", "WARNING")
err = hub.Save(smartDevice)
assert.NoError(t, err)
// NOTE(review): the short sleep presumably gives alert delivery time to complete — confirm.
time.Sleep(50 * time.Millisecond)
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed to WARNING")
lastMessage := hub.TestMailer.LastMessage()
assert.Contains(t, lastMessage.Subject, "SMART warning on test-system")
assert.Contains(t, lastMessage.Text, "WARNING")
}
// TestSmartDeviceAlertWarningToFailed verifies that changing a smart_devices
// record's state from WARNING to FAILED sends exactly one email whose subject
// contains "SMART failure on test-system" and whose text contains "FAILED".
func TestSmartDeviceAlertWarningToFailed(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
// Create a system that lists the test user in its "users" field.
system, err := beszelTests.CreateRecord(hub, "systems", map[string]any{
"name": "test-system",
"users": []string{user.Id},
"host": "127.0.0.1",
})
assert.NoError(t, err)
// Create the device already in the degraded WARNING state.
smartDevice, err := beszelTests.CreateRecord(hub, "smart_devices", map[string]any{
"system": system.Id,
"name": "/dev/mmcblk0",
"model": "eMMC",
"state": "WARNING",
})
assert.NoError(t, err)
// Re-fetch the record before updating it.
// NOTE(review): presumably so the record's original state reflects the stored WARNING value — confirm.
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
assert.NoError(t, err)
// Worsen the state: WARNING -> FAILED.
smartDevice.Set("state", "FAILED")
err = hub.Save(smartDevice)
assert.NoError(t, err)
// NOTE(review): the short sleep presumably gives alert delivery time to complete — confirm.
time.Sleep(50 * time.Millisecond)
assert.EqualValues(t, 1, hub.TestMailer.TotalSend(), "should have 1 email sent after state changed from WARNING to FAILED")
lastMessage := hub.TestMailer.LastMessage()
assert.Contains(t, lastMessage.Subject, "SMART failure on test-system")
assert.Contains(t, lastMessage.Text, "FAILED")
}
func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
hub, user := beszelTests.GetHubWithUser(t)
defer hub.Cleanup()
@@ -83,7 +151,8 @@ func TestSmartDeviceAlertNoAlertOnNonPassedToFailed(t *testing.T) {
smartDevice, err = hub.FindRecordById("smart_devices", smartDevice.Id)
assert.NoError(t, err)
// Update the state from UNKNOWN to FAILED - should NOT trigger alert
// Update the state from UNKNOWN to FAILED - should NOT trigger alert.
// We only alert from known healthy/degraded states.
smartDevice.Set("state", "FAILED")
err = hub.Save(smartDevice)
assert.NoError(t, err)

View File

@@ -23,6 +23,9 @@ COPY --from=builder /agent /agent
# this is so we don't need to create the /tmp directory in the scratch container
COPY --from=builder /tmp /tmp
# AMD GPU name lookup table (on Linux the agent reads /usr/share/libdrm/amdgpu.ids to resolve GPU product names)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
# Ensure data persistence across container recreations
VOLUME ["/var/lib/beszel-agent"]

View File

@@ -20,6 +20,9 @@ RUN rm -rf /tmp/*
FROM alpine:3.23
COPY --from=builder /agent /agent
# AMD GPU name lookup table (on Linux the agent reads /usr/share/libdrm/amdgpu.ids to resolve GPU product names)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
RUN apk add --no-cache smartmontools
# Ensure data persistence across container recreations

View File

@@ -37,6 +37,9 @@ RUN apt-get update && apt-get install -y \
FROM nvidia/cuda:12.2.2-base-ubuntu22.04
COPY --from=builder /agent /agent
# AMD GPU name lookup table (the agent reads /usr/share/libdrm/amdgpu.ids to resolve GPU product names, e.g. on hybrid laptops)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
# Copy smartmontools binaries and config files
COPY --from=smartmontools-builder /usr/sbin/smartctl /usr/sbin/smartctl

View File

@@ -54,36 +54,34 @@ export default function ContainersTable({ systemId }: { systemId?: string }) {
fields: "id,name,image,cpu,memory,net,health,status,system,updated",
filter: systemId ? pb.filter("system={:system}", { system: systemId }) : undefined,
})
.then(
({ items }) => {
if (items.length === 0) {
setData((curItems) => {
if (systemId) {
return curItems?.filter((item) => item.system !== systemId) ?? []
}
return []
})
return
}
.then(({ items }) => {
if (items.length === 0) {
setData((curItems) => {
const lastUpdated = Math.max(items[0].updated, items.at(-1)?.updated ?? 0)
const containerIds = new Set()
const newItems = []
for (const item of items) {
if (Math.abs(lastUpdated - item.updated) < 70_000) {
containerIds.add(item.id)
newItems.push(item)
}
if (systemId) {
return curItems?.filter((item) => item.system !== systemId) ?? []
}
for (const item of curItems ?? []) {
if (!containerIds.has(item.id) && lastUpdated - item.updated < 70_000) {
newItems.push(item)
}
}
return newItems
return []
})
return
}
)
setData((curItems) => {
const lastUpdated = Math.max(items[0].updated, items.at(-1)?.updated ?? 0)
const containerIds = new Set()
const newItems = []
for (const item of items) {
if (Math.abs(lastUpdated - item.updated) < 70_000) {
containerIds.add(item.id)
newItems.push(item)
}
}
for (const item of curItems ?? []) {
if (!containerIds.has(item.id) && lastUpdated - item.updated < 70_000) {
newItems.push(item)
}
}
return newItems
})
})
}
// initial load
@@ -285,7 +283,7 @@ async function getInfoHtml(container: ContainerRecord): Promise<string> {
])
try {
info = JSON.stringify(JSON.parse(info), null, 2)
} catch (_) { }
} catch (_) {}
return info ? highlighter.codeToHtml(info, { lang: "json", theme: syntaxTheme }) : t`No results.`
} catch (error) {
console.error(error)
@@ -342,12 +340,12 @@ function ContainerSheet({
setLogsDisplay("")
setInfoDisplay("")
if (!container) return
; (async () => {
const [logsHtml, infoHtml] = await Promise.all([getLogsHtml(container), getInfoHtml(container)])
setLogsDisplay(logsHtml)
setInfoDisplay(infoHtml)
setTimeout(scrollLogsToBottom, 20)
})()
;(async () => {
const [logsHtml, infoHtml] = await Promise.all([getLogsHtml(container), getInfoHtml(container)])
setLogsDisplay(logsHtml)
setInfoDisplay(infoHtml)
setTimeout(scrollLogsToBottom, 20)
})()
}, [container])
return (
@@ -473,7 +471,7 @@ const ContainerTableRow = memo(function ContainerTableRow({
{row.getVisibleCells().map((cell) => (
<TableCell
key={cell.id}
className="py-0"
className="py-0 ps-4.5"
style={{
height: virtualRow.size,
}}

View File

@@ -19,7 +19,7 @@ import { FreeBsdIcon, TuxIcon, WebSocketIcon, WindowsIcon } from "@/components/u
import { Separator } from "@/components/ui/separator"
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"
import { ConnectionType, connectionTypeLabels, Os, SystemStatus } from "@/lib/enums"
import { cn, formatBytes, getHostDisplayValue, secondsToString, toFixedFloat } from "@/lib/utils"
import { cn, formatBytes, getHostDisplayValue, secondsToUptimeString, toFixedFloat } from "@/lib/utils"
import type { ChartData, SystemDetailsRecord, SystemRecord } from "@/types"
export default function InfoBar({
@@ -77,14 +77,6 @@ export default function InfoBar({
},
}
let uptime: string
if (system.info.u < 3600) {
uptime = secondsToString(system.info.u, "minute")
} else if (system.info.u < 360000) {
uptime = secondsToString(system.info.u, "hour")
} else {
uptime = secondsToString(system.info.u, "day")
}
const info = [
{ value: getHostDisplayValue(system), Icon: GlobeIcon },
{
@@ -94,7 +86,7 @@ export default function InfoBar({
// hide if hostname is same as host or name
hide: hostname === system.host || hostname === system.name,
},
{ value: uptime, Icon: ClockArrowUp, label: t`Uptime`, hide: !system.info.u },
{ value: secondsToUptimeString(system.info.u), Icon: ClockArrowUp, label: t`Uptime`, hide: !system.info.u },
osInfo[os],
{
value: cpuModel,

View File

@@ -174,8 +174,8 @@ export const columns: ColumnDef<SmartDeviceRecord>[] = [
<HeaderButton column={column} name={t({ message: "Power On", comment: "Power On Time" })} Icon={Clock} />
),
cell: ({ getValue }) => {
const hours = (getValue() ?? 0) as number
if (!hours && hours !== 0) {
const hours = getValue() as number | undefined
if (hours == null) {
return <div className="text-sm text-muted-foreground ms-1.5">N/A</div>
}
const seconds = hours * 3600
@@ -195,7 +195,7 @@ export const columns: ColumnDef<SmartDeviceRecord>[] = [
),
cell: ({ getValue }) => {
const cycles = getValue() as number | undefined
if (!cycles && cycles !== 0) {
if (cycles == null) {
return <div className="text-muted-foreground ms-1.5">N/A</div>
}
return <span className="ms-1.5">{cycles.toLocaleString()}</span>
@@ -206,7 +206,11 @@ export const columns: ColumnDef<SmartDeviceRecord>[] = [
invertSorting: true,
header: ({ column }) => <HeaderButton column={column} name={t`Temp`} Icon={ThermometerIcon} />,
cell: ({ getValue }) => {
const { value, unit } = formatTemperature(getValue() as number)
const temp = getValue() as number | null | undefined
if (!temp) {
return <div className="text-muted-foreground ms-1.5">N/A</div>
}
const { value, unit } = formatTemperature(temp)
return <span className="ms-1.5">{`${value} ${unit}`}</span>
},
},
@@ -304,41 +308,41 @@ export default function DisksTable({ systemId }: { systemId?: string }) {
? { fields: SMART_DEVICE_FIELDS, filter: pb.filter("system = {:system}", { system: systemId }) }
: { fields: SMART_DEVICE_FIELDS }
; (async () => {
try {
unsubscribe = await pb.collection("smart_devices").subscribe(
"*",
(event) => {
const record = event.record as SmartDeviceRecord
setSmartDevices((currentDevices) => {
const devices = currentDevices ?? []
const matchesSystemScope = !systemId || record.system === systemId
;(async () => {
try {
unsubscribe = await pb.collection("smart_devices").subscribe(
"*",
(event) => {
const record = event.record as SmartDeviceRecord
setSmartDevices((currentDevices) => {
const devices = currentDevices ?? []
const matchesSystemScope = !systemId || record.system === systemId
if (event.action === "delete") {
return devices.filter((device) => device.id !== record.id)
}
if (event.action === "delete") {
return devices.filter((device) => device.id !== record.id)
}
if (!matchesSystemScope) {
// Record moved out of scope; ensure it disappears locally.
return devices.filter((device) => device.id !== record.id)
}
if (!matchesSystemScope) {
// Record moved out of scope; ensure it disappears locally.
return devices.filter((device) => device.id !== record.id)
}
const existingIndex = devices.findIndex((device) => device.id === record.id)
if (existingIndex === -1) {
return [record, ...devices]
}
const existingIndex = devices.findIndex((device) => device.id === record.id)
if (existingIndex === -1) {
return [record, ...devices]
}
const next = [...devices]
next[existingIndex] = record
return next
})
},
pbOptions
)
} catch (error) {
console.error("Failed to subscribe to SMART device updates:", error)
}
})()
const next = [...devices]
next[existingIndex] = record
return next
})
},
pbOptions
)
} catch (error) {
console.error("Failed to subscribe to SMART device updates:", error)
}
})()
return () => {
unsubscribe?.()

View File

@@ -35,7 +35,7 @@ import {
formatTemperature,
getMeterState,
parseSemVer,
secondsToString,
secondsToUptimeString,
} from "@/lib/utils"
import { batteryStateTranslations } from "@/lib/i18n"
import type { SystemRecord } from "@/types"
@@ -154,11 +154,7 @@ export function SystemsTableColumns(viewMode: "table" | "grid"): ColumnDef<Syste
{name}
</Link>
</span>
<Link
href={linkUrl}
className="inset-0 absolute size-full"
aria-label={name}
></Link>
<Link href={linkUrl} className="inset-0 absolute size-full" aria-label={name}></Link>
</>
)
},
@@ -382,20 +378,13 @@ export function SystemsTableColumns(viewMode: "table" | "grid"): ColumnDef<Syste
size: 50,
Icon: ClockArrowUp,
header: sortableHeader,
hideSort: true,
cell(info) {
const uptime = info.getValue() as number
if (!uptime) {
return null
}
let formatted: string
if (uptime < 3600) {
formatted = secondsToString(uptime, "minute")
} else if (uptime < 360000) {
formatted = secondsToString(uptime, "hour")
} else {
formatted = secondsToString(uptime, "day")
}
return <span className="tabular-nums whitespace-nowrap">{formatted}</span>
return <span className="tabular-nums whitespace-nowrap">{secondsToUptimeString(uptime)}</span>
},
},
{
@@ -479,9 +468,9 @@ function TableCellWithMeter(info: CellContext<SystemRecord, unknown>) {
const meterClass = cn(
"h-full",
(info.row.original.status !== SystemStatus.Up && STATUS_COLORS.paused) ||
(threshold === MeterState.Good && STATUS_COLORS.up) ||
(threshold === MeterState.Warn && STATUS_COLORS.pending) ||
STATUS_COLORS.down
(threshold === MeterState.Good && STATUS_COLORS.up) ||
(threshold === MeterState.Warn && STATUS_COLORS.pending) ||
STATUS_COLORS.down
)
return (
<div className="flex gap-2 items-center tabular-nums tracking-tight w-full">
@@ -593,7 +582,7 @@ export function IndicatorDot({ system, className }: { system: SystemRecord; clas
return (
<span
className={cn("shrink-0 size-2 rounded-full", className)}
// style={{ marginBottom: "-1px" }}
// style={{ marginBottom: "-1px" }}
/>
)
}

View File

@@ -434,7 +434,7 @@ const SystemTableRow = memo(
width: cell.column.getSize(),
height: virtualRow.size,
}}
className="py-0"
className="py-0 ps-4.5"
>
{flexRender(cell.column.columnDef.cell, cell.getContext())}
</TableCell>

View File

@@ -465,4 +465,15 @@ export function secondsToString(seconds: number, unit: "hour" | "minute" | "day"
case "day":
return plural(count, { one: `${countString} day`, other: `${countString} days` })
}
}
/**
 * Format an uptime given in seconds as a string, picking the unit by magnitude:
 * minutes below 3600 seconds (one hour), hours below 360000 seconds (100 hours),
 * and days otherwise. Formatting itself is delegated to `secondsToString`.
 */
export function secondsToUptimeString(seconds: number): string {
	// Thresholds are checked smallest-first, so the first match selects the unit.
	const unit = seconds < 3600 ? "minute" : seconds < 360000 ? "hour" : "day"
	return secondsToString(seconds, unit)
}

View File

@@ -51,7 +51,7 @@ The [quick start guide](https://beszel.dev/guide/getting-started) and other docu
- **GPU usage / power draw** - Nvidia, AMD, and Intel.
- **Battery** - Host system battery charge.
- **Containers** - Status and metrics of all running Docker / Podman containers.
- **S.M.A.R.T.** - Host system disk health.
- **S.M.A.R.T.** - Host system disk health (includes eMMC wear/EOL via Linux sysfs when available).
## Help and discussion