Compare commits

..

2 Commits
nvtop ... main

Author SHA1 Message Date
henrygd
283fa9d5c2 include GTT memory in AMD GPU metrics (#1569) 2026-02-13 20:06:37 -05:00
henrygd
7d6c0caafc add amdgpu.ids to docker images (#1569) 2026-02-13 19:55:02 -05:00
5 changed files with 78 additions and 32 deletions

View File

@@ -103,8 +103,17 @@ func (gm *GPUManager) updateAmdGpuData(cardPath string) bool {
// Read all sysfs values first (no lock needed - these can be slow)
usage, usageErr := readSysfsFloat(filepath.Join(devicePath, "gpu_busy_percent"))
memUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
memTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
vramUsed, memUsedErr := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_used"))
vramTotal, _ := readSysfsFloat(filepath.Join(devicePath, "mem_info_vram_total"))
memUsed := vramUsed
memTotal := vramTotal
// if gtt is present, add it to the memory used and total (https://github.com/henrygd/beszel/issues/1569#issuecomment-3837640484)
if gttUsed, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_used")); err == nil && gttUsed > 0 {
if gttTotal, err := readSysfsFloat(filepath.Join(devicePath, "mem_info_gtt_total")); err == nil {
memUsed += gttUsed
memTotal += gttTotal
}
}
var temp, power float64
hwmons, _ := filepath.Glob(filepath.Join(devicePath, "hwmon/hwmon*"))

View File

@@ -119,40 +119,68 @@ func TestAmdgpuNameCacheRoundTrip(t *testing.T) {
}
func TestUpdateAmdGpuDataWithFakeSysfs(t *testing.T) {
dir := t.TempDir()
cardPath := filepath.Join(dir, "card0")
devicePath := filepath.Join(cardPath, "device")
hwmonPath := filepath.Join(devicePath, "hwmon", "hwmon0")
require.NoError(t, os.MkdirAll(hwmonPath, 0o755))
write := func(name, content string) {
require.NoError(t, os.WriteFile(filepath.Join(devicePath, name), []byte(content), 0o644))
tests := []struct {
name string
writeGTT bool
wantMemoryUsed float64
wantMemoryTotal float64
}{
{
name: "sums vram and gtt when gtt is present",
writeGTT: true,
wantMemoryUsed: bytesToMegabytes(1073741824 + 536870912),
wantMemoryTotal: bytesToMegabytes(2147483648 + 4294967296),
},
{
name: "falls back to vram when gtt is missing",
writeGTT: false,
wantMemoryUsed: bytesToMegabytes(1073741824),
wantMemoryTotal: bytesToMegabytes(2147483648),
},
}
write("vendor", "0x1002")
write("device", "0x1506")
write("revision", "0xc1")
write("gpu_busy_percent", "25")
write("mem_info_vram_used", "1073741824")
write("mem_info_vram_total", "2147483648")
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("45000"), 0o644))
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "power1_input"), []byte("20000000"), 0o644))
// Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path)
cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
dir := t.TempDir()
cardPath := filepath.Join(dir, "card0")
devicePath := filepath.Join(cardPath, "device")
hwmonPath := filepath.Join(devicePath, "hwmon", "hwmon0")
require.NoError(t, os.MkdirAll(hwmonPath, 0o755))
gm := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)}
ok := gm.updateAmdGpuData(cardPath)
require.True(t, ok)
write := func(name, content string) {
require.NoError(t, os.WriteFile(filepath.Join(devicePath, name), []byte(content), 0o644))
}
write("vendor", "0x1002")
write("device", "0x1506")
write("revision", "0xc1")
write("gpu_busy_percent", "25")
write("mem_info_vram_used", "1073741824")
write("mem_info_vram_total", "2147483648")
if tt.writeGTT {
write("mem_info_gtt_used", "536870912")
write("mem_info_gtt_total", "4294967296")
}
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("45000"), 0o644))
require.NoError(t, os.WriteFile(filepath.Join(hwmonPath, "power1_input"), []byte("20000000"), 0o644))
gpu, ok := gm.GpuDataMap["card0"]
require.True(t, ok)
assert.Equal(t, "AMD Radeon 610M", gpu.Name)
assert.Equal(t, 25.0, gpu.Usage)
assert.Equal(t, bytesToMegabytes(1073741824), gpu.MemoryUsed)
assert.Equal(t, bytesToMegabytes(2147483648), gpu.MemoryTotal)
assert.Equal(t, 45.0, gpu.Temperature)
assert.Equal(t, 20.0, gpu.Power)
assert.Equal(t, 1.0, gpu.Count)
// Pre-cache name so getAmdGpuName returns a known value (it uses system amdgpu.ids path)
cacheAmdgpuName("1506", "c1", "AMD Radeon 610M Graphics", true)
gm := &GPUManager{GpuDataMap: make(map[string]*system.GPUData)}
ok := gm.updateAmdGpuData(cardPath)
require.True(t, ok)
gpu, ok := gm.GpuDataMap["card0"]
require.True(t, ok)
assert.Equal(t, "AMD Radeon 610M", gpu.Name)
assert.Equal(t, 25.0, gpu.Usage)
assert.Equal(t, tt.wantMemoryUsed, gpu.MemoryUsed)
assert.Equal(t, tt.wantMemoryTotal, gpu.MemoryTotal)
assert.Equal(t, 45.0, gpu.Temperature)
assert.Equal(t, 20.0, gpu.Power)
assert.Equal(t, 1.0, gpu.Count)
})
}
}
func TestLookupAmdgpuNameInFile(t *testing.T) {

View File

@@ -23,6 +23,9 @@ COPY --from=builder /agent /agent
# this is so we don't need to create the /tmp directory in the scratch container
COPY --from=builder /tmp /tmp
# AMD GPU name lookup (used by agent on Linux when /usr/share/libdrm/amdgpu.ids is read)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
# Ensure data persistence across container recreations
VOLUME ["/var/lib/beszel-agent"]

View File

@@ -20,6 +20,9 @@ RUN rm -rf /tmp/*
FROM alpine:3.23
COPY --from=builder /agent /agent
# AMD GPU name lookup (used by agent on Linux when /usr/share/libdrm/amdgpu.ids is read)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
RUN apk add --no-cache smartmontools
# Ensure data persistence across container recreations

View File

@@ -37,6 +37,9 @@ RUN apt-get update && apt-get install -y \
FROM nvidia/cuda:12.2.2-base-ubuntu22.04
COPY --from=builder /agent /agent
# AMD GPU name lookup (used by agent on hybrid laptops when /usr/share/libdrm/amdgpu.ids is read)
COPY --from=builder /app/agent/test-data/amdgpu.ids /usr/share/libdrm/amdgpu.ids
# Copy smartmontools binaries and config files
COPY --from=smartmontools-builder /usr/sbin/smartctl /usr/sbin/smartctl