Compare commits

..

1 Commit
nvml ... main

Author SHA1 Message Date
henrygd
2bd85e04fc add experimental nvml gpu collector (#1522) 2025-12-21 17:10:42 -05:00
2 changed files with 8 additions and 7 deletions

View File

@@ -72,7 +72,7 @@ type nvmlCollector struct {
}
func (c *nvmlCollector) init() error {
-slog.Info("NVML: Initializing")
+slog.Debug("NVML: Initializing")
libPath := getNVMLPath()
lib, err := openLibrary(libPath)
@@ -159,7 +159,7 @@ func (c *nvmlCollector) collect() {
gpu := c.gm.GpuDataMap[id]
if bdf != "" && !c.isGPUActive(bdf) {
-slog.Info("NVML: GPU is suspended, skipping", "bdf", bdf)
+slog.Debug("NVML: GPU is suspended, skipping", "bdf", bdf)
gpu.Temperature = 0
gpu.MemoryUsed = 0
continue
@@ -168,13 +168,13 @@ func (c *nvmlCollector) collect() {
// Utilization
var utilization nvmlUtilization
if ret := nvmlDeviceGetUtilizationRates(device, &utilization); ret != nvmlReturn(nvmlSuccess) {
-slog.Info("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
+slog.Debug("NVML: Utilization failed (GPU likely suspended)", "bdf", bdf, "ret", ret)
gpu.Temperature = 0
gpu.MemoryUsed = 0
continue
}
-slog.Info("NVML: Collecting data for GPU", "bdf", bdf)
+slog.Debug("NVML: Collecting data for GPU", "bdf", bdf)
// Temperature
var temp uint32
@@ -205,6 +205,6 @@ func (c *nvmlCollector) collect() {
gpu.Usage += float64(utilization.Gpu)
gpu.Power += float64(power) / 1000.0
gpu.Count++
-slog.Info("NVML: Collected data", "gpu", gpu)
+slog.Debug("NVML: Collected data", "gpu", gpu)
}
}

View File

@@ -29,12 +29,12 @@ func (c *nvmlCollector) isGPUActive(bdf string) bool {
statusPath := filepath.Join("/sys/bus/pci/devices", bdf, "power/runtime_status")
status, err := os.ReadFile(statusPath)
if err != nil {
-slog.Info("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
+slog.Debug("NVML: Can't read runtime_status", "bdf", bdf, "err", err)
return true // Assume active if we can't read status
}
statusStr := strings.TrimSpace(string(status))
if statusStr != "active" && statusStr != "resuming" {
-slog.Info("NVML: GPU is not active", "bdf", bdf, "status", statusStr)
+slog.Debug("NVML: GPU not active", "bdf", bdf, "status", statusStr)
return false
}
@@ -47,6 +47,7 @@ func (c *nvmlCollector) isGPUActive(bdf string) bool {
if err == nil {
pstateStr := strings.TrimSpace(string(pstate))
if pstateStr != "D0" {
+slog.Debug("NVML: GPU not in D0 state", "bdf", bdf, "pstate", pstateStr)
return false
}
}