Compare commits

...

1 Commits

Author SHA1 Message Date
Yorick
14f7480915 support xpu-smi for intel stats (#755) 2025-11-14 10:58:28 -05:00

View File

@@ -4,6 +4,7 @@ import (
"beszel/internal/entities/system"
"bufio"
"bytes"
"encoding/csv"
"encoding/json"
"fmt"
"os/exec"
@@ -21,11 +22,13 @@ const (
nvidiaSmiCmd = "nvidia-smi"
rocmSmiCmd = "rocm-smi"
tegraStatsCmd = "tegrastats"
xpuSmiCmd = "xpu-smi"
// Polling intervals
nvidiaSmiInterval = "4" // in seconds
tegraStatsInterval = "3700" // in milliseconds
rocmSmiInterval = 4300 * time.Millisecond
xpuSmiInterval = 4
// Command retry and timeout constants
retryWaitTime = 5 * time.Second
@@ -41,10 +44,11 @@ const (
// GPUManager manages data collection for GPUs. It tracks which vendor tools
// were detected (nvidia-smi, rocm-smi, tegrastats, xpu-smi) and holds the most
// recently parsed per-GPU data.
//
// NOTE(review): the field list below appears twice because this view
// interleaves the removed and the added lines of the diff; the second group
// (which adds intelXpuSmi) looks like the post-change definition — confirm
// against the actual file.
type GPUManager struct {
// Embedded mutex; parseIntelData takes it before touching GpuDataMap.
sync.Mutex
nvidiaSmi bool
rocmSmi bool
tegrastats bool
GpuDataMap map[string]*system.GPUData
// Detection flags consulted by detectGPUs: detection succeeds when at least
// one of them is true.
nvidiaSmi bool
rocmSmi bool
tegrastats bool
// intelXpuSmi is set by detectGPUs when the xpu-smi binary is found on PATH.
intelXpuSmi bool
// GpuDataMap holds parsed GPU data keyed by a string id; the Intel parser
// stores its single aggregated entry under "0".
GpuDataMap map[string]*system.GPUData
}
// RocmSmiJson represents the JSON structure of rocm-smi output
@@ -160,6 +164,59 @@ func (gm *GPUManager) getJetsonParser() func(output []byte) bool {
}
}
// parseIntelData parses CSV output from `xpu-smi dump` and stores the result
// in GpuDataMap under key "0", replacing any previous entry.
//
// The expected columns are, in order: Timestamp, DeviceId, GPU Power (W),
// GPU Frequency (MHz), GPU Memory Utilization (%), GPU Memory Used (MiB).
// Power and frequency are summed over all data rows; memory values come from
// the last row that supplied both memory columns.
//
// It returns false when the output is not valid CSV and true otherwise.
// Individual fields that cannot be parsed as numbers are logged and skipped.
//
// NOTE(review): the DeviceId column is parsed but not used, so rows from every
// device are aggregated into the single "0" entry, and the frequency column
// is accumulated into Usage — confirm both are intended.
func (gm *GPUManager) parseIntelData(output []byte) bool {
	gm.Lock()
	defer gm.Unlock()

	reader := csv.NewReader(bytes.NewReader(output))
	records, err := reader.ReadAll()
	if err != nil {
		slog.Warn("Failed to parse Intel GPU data", "err", err)
		return false
	}

	header := []string{"Timestamp", "DeviceId", "GPU Power (W)", "GPU Frequency (MHz)", "GPU Memory Utilization (%)", "GPU Memory Used (MiB)"}

	// Writing to a nil map panics; make the parser safe on a zero-value manager.
	if gm.GpuDataMap == nil {
		gm.GpuDataMap = make(map[string]*system.GPUData)
	}
	gpuData := &system.GPUData{Name: "GPU"}
	gm.GpuDataMap["0"] = gpuData

	for _, record := range records {
		// Rows are indexed by header position below, so a row with fewer
		// columns than expected would otherwise panic.
		if len(record) < len(header) {
			slog.Warn("Skipping short Intel GPU record", "record", record)
			continue
		}

		// Compare trimmed fields: values are trimmed before parsing, so the
		// header must be recognised even when columns are padded with spaces.
		isHeader := len(record) == len(header)
		for i := 0; isHeader && i < len(header); i++ {
			isHeader = strings.TrimSpace(record[i]) == header[i]
		}
		if isHeader {
			slog.Debug("Skipping header", "header", record)
			continue
		}

		var memoryUtilization, memoryUsed float64
		var haveUtilization, haveUsed bool
		for i, field := range header {
			if field == "Timestamp" {
				continue
			}
			stripped := strings.TrimSpace(record[i])
			value, err := strconv.ParseFloat(stripped, 64)
			if err != nil {
				slog.Warn("Failed to parse field", "field", field, "value", stripped, "err", err)
				continue
			}
			switch field {
			case "GPU Power (W)":
				gpuData.Power += value
			case "GPU Frequency (MHz)":
				gpuData.Usage += value
			case "GPU Memory Utilization (%)":
				memoryUtilization, haveUtilization = value, true
			case "GPU Memory Used (MiB)":
				memoryUsed, haveUsed = value, true
			}
		}

		if haveUtilization && haveUsed {
			gpuData.MemoryUsed = memoryUsed
			// Total is derived from used/utilization; skip it at 0% so an
			// idle GPU does not produce NaN or +Inf.
			if memoryUtilization > 0 {
				gpuData.MemoryTotal = (memoryUsed / memoryUtilization) * 100 // convert to total memory
			}
		}
	}

	gpuData.Count++
	return true
}
// parseNvidiaData parses the output of nvidia-smi and updates the GPUData map
func (gm *GPUManager) parseNvidiaData(output []byte) bool {
gm.Lock()
@@ -278,10 +335,14 @@ func (gm *GPUManager) detectGPUs() error {
gm.tegrastats = true
gm.nvidiaSmi = false
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats {
fmt.Println("Looking for gpus")
if _, err := exec.LookPath(xpuSmiCmd); err == nil {
gm.intelXpuSmi = true
}
if gm.nvidiaSmi || gm.rocmSmi || gm.tegrastats || gm.intelXpuSmi {
return nil
}
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, or tegrastats")
return fmt.Errorf("no GPU found - install nvidia-smi, rocm-smi, intel_gpu_top, or tegrastats")
}
// startCollector starts the appropriate GPU data collector based on the command
@@ -318,6 +379,10 @@ func (gm *GPUManager) startCollector(command string) {
time.Sleep(rocmSmiInterval)
}
}()
case xpuSmiCmd:
collector.cmdArgs = []string{"dump", "-d", "-1", "-m", "1,2,5,18", "-i", strconv.Itoa(xpuSmiInterval)}
collector.parse = gm.parseIntelData
go collector.start()
}
}
@@ -338,6 +403,9 @@ func NewGPUManager() (*GPUManager, error) {
if gm.tegrastats {
gm.startCollector(tegraStatsCmd)
}
if gm.intelXpuSmi {
gm.startCollector(xpuSmiCmd)
}
return &gm, nil
}