Compare commits

...

2 Commits

Author SHA1 Message Date
Sven van Ginkel
fe30f99695 [Feature] Add detailed CPU metrics (User, System, IOWait, Steal) with per-core monitoring (#1356)
* Add user, system io wait

* add per cpu core

* add total
2025-10-31 17:11:22 -04:00
henrygd
85ac2e5e9a update env var name to EXCLUDE_CONTAINERS #1352 2025-10-30 19:30:01 -04:00
9 changed files with 318 additions and 32 deletions

View File

@@ -8,6 +8,7 @@ import (
)
var lastCpuTimes = make(map[uint16]cpu.TimesStat)
var lastPerCoreCpuTimes = make(map[uint16][]cpu.TimesStat)
// init initializes the CPU monitoring by storing the initial CPU times
// for the default 60-second cache interval.
@@ -15,6 +16,18 @@ func init() {
if times, err := cpu.Times(false); err == nil {
lastCpuTimes[60000] = times[0]
}
if perCoreTimes, err := cpu.Times(true); err == nil {
lastPerCoreCpuTimes[60000] = perCoreTimes
}
}
// CpuMetrics contains detailed CPU usage breakdown.
// Every field is a percentage of the total CPU time that elapsed between two
// samples; getCpuMetrics clamps each value to the 0-100 range.
type CpuMetrics struct {
// Total is the share of elapsed CPU time that was busy.
Total float64
// User is the share of elapsed CPU time spent in user mode.
User float64
// System is the share of elapsed CPU time spent in system (kernel) mode.
System float64
// Iowait is the share of elapsed CPU time spent waiting on I/O.
Iowait float64
// Steal is the share of elapsed CPU time reported as steal time.
Steal float64
}
// getCpuPercent calculates the CPU usage percentage using cached previous measurements.
@@ -34,6 +47,92 @@ func getCpuPercent(cacheTimeMs uint16) (float64, error) {
return delta, nil
}
// getCpuMetrics calculates detailed CPU usage metrics using cached previous measurements.
// It returns percentages for total, user, system, iowait, and steal time. Each value is
// relative to the total CPU time elapsed since the previous sample stored for
// cacheTimeMs and is clamped to the 0-100 range.
//
// The sample taken by this call always becomes the new baseline for cacheTimeMs,
// matching getPerCoreCpuMetrics, so a counter that moved backwards cannot pin a
// stale baseline and keep later calls returning zero.
//
// A zero CpuMetrics with a nil error is returned when no CPU time elapsed between
// the two samples. If cpu.Times fails or yields no entries, a zero CpuMetrics is
// returned together with the error from cpu.Times (which may be nil when the
// slice was merely empty).
func getCpuMetrics(cacheTimeMs uint16) (CpuMetrics, error) {
	times, err := cpu.Times(false)
	if err != nil || len(times) == 0 {
		return CpuMetrics{}, err
	}
	t2 := times[0]

	// The first time an interval is seen, fall back to the 60000 ms baseline.
	t1, ok := lastCpuTimes[cacheTimeMs]
	if !ok {
		t1 = lastCpuTimes[60000]
	}
	// Advance the baseline unconditionally, before any early return below.
	lastCpuTimes[cacheTimeMs] = t2

	t1All, t1Busy := getAllBusy(t1)
	t2All, t2Busy := getAllBusy(t2)

	totalDelta := t2All - t1All
	if totalDelta <= 0 {
		// Nothing elapsed (or counters went backwards): no meaningful ratio.
		return CpuMetrics{}, nil
	}

	return CpuMetrics{
		Total:  clampPercent((t2Busy - t1Busy) / totalDelta * 100),
		User:   clampPercent((t2.User - t1.User) / totalDelta * 100),
		System: clampPercent((t2.System - t1.System) / totalDelta * 100),
		Iowait: clampPercent((t2.Iowait - t1.Iowait) / totalDelta * 100),
		Steal:  clampPercent((t2.Steal - t1.Steal) / totalDelta * 100),
	}, nil
}
// clampPercent limits value to the closed range [0, 100].
// The lower bound is applied first, then the upper bound.
func clampPercent(value float64) float64 {
	floored := math.Max(value, 0)
	return math.Min(floored, 100)
}
// getPerCoreCpuMetrics calculates per-core CPU usage metrics.
// Returns a map keyed by the CPU label reported by cpu.Times (e.g. "cpu0", "cpu1")
// whose value is an array of [user, system, iowait, steal] percentages. Each
// percentage is relative to that core's total time elapsed since the previous
// sample stored for cacheTimeMs, clamped to the 0-100 range.
//
// The samples taken by this call always become the new baseline for cacheTimeMs.
// A core is omitted from the result when it has no counterpart in the previous
// sample, when the previous entry at the same index carries a different CPU
// label (the core list changed between samples), or when no time elapsed for it.
//
// If cpu.Times fails or yields no entries, a nil map is returned together with
// the error from cpu.Times (which may be nil when the slice was merely empty).
func getPerCoreCpuMetrics(cacheTimeMs uint16) (map[string][4]float64, error) {
	perCoreTimes, err := cpu.Times(true)
	if err != nil || len(perCoreTimes) == 0 {
		return nil, err
	}

	// The first time an interval is seen, fall back to the 60000 ms baseline.
	lastTimes, ok := lastPerCoreCpuTimes[cacheTimeMs]
	if !ok {
		lastTimes = lastPerCoreCpuTimes[60000]
	}
	lastPerCoreCpuTimes[cacheTimeMs] = perCoreTimes

	result := make(map[string][4]float64, len(perCoreTimes))

	// Calculate metrics for each core
	for i, t2 := range perCoreTimes {
		if i >= len(lastTimes) {
			break
		}
		t1 := lastTimes[i]
		// Entries are paired by index; skip the pair if it no longer refers to
		// the same core, otherwise the deltas would mix two cores' counters.
		if t1.CPU != t2.CPU {
			continue
		}

		t1All, _ := getAllBusy(t1)
		t2All, _ := getAllBusy(t2)

		totalDelta := t2All - t1All
		if totalDelta <= 0 {
			continue
		}

		// Store as [user, system, iowait, steal]
		result[t2.CPU] = [4]float64{
			clampPercent((t2.User - t1.User) / totalDelta * 100),
			clampPercent((t2.System - t1.System) / totalDelta * 100),
			clampPercent((t2.Iowait - t1.Iowait) / totalDelta * 100),
			clampPercent((t2.Steal - t1.Steal) / totalDelta * 100),
		}
	}

	return result, nil
}
// calculateBusy calculates the CPU busy percentage between two time points.
// It computes the ratio of busy time to total time elapsed between t1 and t2,
// returning a percentage clamped between 0 and 100.

View File

@@ -54,7 +54,7 @@ type dockerManager struct {
buf *bytes.Buffer // Buffer to store and read response bodies
decoder *json.Decoder // Reusable JSON decoder that reads from buf
apiStats *container.ApiStats // Reusable API stats object
containerExclude []string // Patterns to exclude containers by name (supports wildcards)
excludeContainers []string // Patterns to exclude containers by name
// Cache-time-aware tracking for CPU stats (similar to cpu.go)
// Maps cache time intervals to container-specific CPU usage tracking
@@ -96,13 +96,12 @@ func (d *dockerManager) dequeue() {
}
}
// shouldExcludeContainer checks if a container name matches any exclusion pattern using path.Match
// shouldExcludeContainer checks if a container name matches any exclusion pattern
func (dm *dockerManager) shouldExcludeContainer(name string) bool {
if len(dm.containerExclude) == 0 {
if len(dm.excludeContainers) == 0 {
return false
}
for _, pattern := range dm.containerExclude {
// Use path.Match for wildcard support
for _, pattern := range dm.excludeContainers {
if match, _ := path.Match(pattern, name); match {
return true
}
@@ -138,15 +137,9 @@ func (dm *dockerManager) getDockerStats(cacheTimeMs uint16) ([]*container.Stats,
for _, ctr := range dm.apiContainerList {
ctr.IdShort = ctr.Id[:12]
// Extract container name and check if it should be excluded
name := ctr.Names[0]
if len(name) > 0 && name[0] == '/' {
name = name[1:]
}
// Skip this container if it matches the exclusion pattern
if dm.shouldExcludeContainer(name) {
slog.Debug("Excluding container", "name", name, "patterns", dm.containerExclude)
if dm.shouldExcludeContainer(ctr.Names[0][1:]) {
slog.Debug("Excluding container", "name", ctr.Names[0][1:])
continue
}
@@ -532,20 +525,17 @@ func newDockerManager(a *Agent) *dockerManager {
userAgent: "Docker-Client/",
}
// Read container exclusion patterns from environment variable (comma-separated, supports wildcards)
var containerExclude []string
if excludeStr, set := GetEnv("CONTAINER_EXCLUDE"); set && excludeStr != "" {
// Split by comma and trim whitespace
parts := strings.Split(excludeStr, ",")
for _, part := range parts {
// Read container exclusion patterns from environment variable
var excludeContainers []string
if excludeStr, set := GetEnv("EXCLUDE_CONTAINERS"); set && excludeStr != "" {
parts := strings.SplitSeq(excludeStr, ",")
for part := range parts {
trimmed := strings.TrimSpace(part)
if trimmed != "" {
containerExclude = append(containerExclude, trimmed)
excludeContainers = append(excludeContainers, trimmed)
}
}
if len(containerExclude) > 0 {
slog.Info("Container exclusion patterns set", "patterns", containerExclude)
}
slog.Info("EXCLUDE_CONTAINERS", "patterns", excludeContainers)
}
manager := &dockerManager{
@@ -557,7 +547,7 @@ func newDockerManager(a *Agent) *dockerManager {
sem: make(chan struct{}, 5),
apiContainerList: []*container.ApiInfo{},
apiStats: &container.ApiStats{},
containerExclude: containerExclude,
excludeContainers: excludeContainers,
// Initialize cache-time-aware tracking structures
lastCpuContainer: make(map[uint16]map[string]uint64),

View File

@@ -1196,7 +1196,7 @@ func TestShouldExcludeContainer(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
dm := &dockerManager{
containerExclude: tt.patterns,
excludeContainers: tt.patterns,
}
result := dm.shouldExcludeContainer(tt.containerName)
assert.Equal(t, tt.expected, result)

View File

@@ -83,12 +83,21 @@ func (a *Agent) getSystemStats(cacheTimeMs uint16) system.Stats {
systemStats.Battery[1] = batteryState
}
// cpu percent
cpuPercent, err := getCpuPercent(cacheTimeMs)
// cpu metrics
cpuMetrics, err := getCpuMetrics(cacheTimeMs)
if err == nil {
systemStats.Cpu = twoDecimals(cpuPercent)
systemStats.Cpu = twoDecimals(cpuMetrics.Total)
systemStats.CpuUser = twoDecimals(cpuMetrics.User)
systemStats.CpuSystem = twoDecimals(cpuMetrics.System)
systemStats.CpuIowait = twoDecimals(cpuMetrics.Iowait)
systemStats.CpuSteal = twoDecimals(cpuMetrics.Steal)
} else {
slog.Error("Error getting cpu percent", "err", err)
slog.Error("Error getting cpu metrics", "err", err)
}
// per-core cpu metrics
if perCoreCpuMetrics, err := getPerCoreCpuMetrics(cacheTimeMs); err == nil && len(perCoreCpuMetrics) > 0 {
systemStats.CpuCores = perCoreCpuMetrics
}
// load average

View File

@@ -11,7 +11,12 @@ import (
type Stats struct {
Cpu float64 `json:"cpu" cbor:"0,keyasint"`
MaxCpu float64 `json:"cpum,omitempty" cbor:"1,keyasint,omitempty"`
Mem float64 `json:"m" cbor:"2,keyasint"`
CpuUser float64 `json:"cpuu,omitempty" cbor:"33,keyasint,omitempty"`
CpuSystem float64 `json:"cpus,omitempty" cbor:"34,keyasint,omitempty"`
CpuIowait float64 `json:"cpui,omitempty" cbor:"35,keyasint,omitempty"`
CpuSteal float64 `json:"cpust,omitempty" cbor:"36,keyasint,omitempty"`
CpuCores map[string][4]float64 `json:"cpuc,omitempty" cbor:"37,keyasint,omitempty"` // [user, system, iowait, steal] per core
Mem float64 `json:"m" cbor:"2,keyasint"`
MemUsed float64 `json:"mu" cbor:"3,keyasint"`
MemPct float64 `json:"mp" cbor:"4,keyasint"`
MemBuffCache float64 `json:"mb" cbor:"5,keyasint"`

View File

@@ -118,6 +118,28 @@ export function useNetworkInterfaces(interfaces: SystemStats["ni"]) {
dataKey: ({ stats }: SystemStatsRecord) => stats?.ni?.[key]?.[index],
color: `hsl(${220 + (((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`,
opacity: 0.3,
}))
},
}
}
// Assures consistent colors for CPU cores
export function useCpuCores(cores: SystemStats["cpuc"]) {
const keys = Object.keys(cores ?? {})
// Sort cores by name (cpu0, cpu1, cpu2, etc.)
const sortedKeys = keys.sort((a, b) => {
const numA = Number.parseInt(a.replace("cpu", ""))
const numB = Number.parseInt(b.replace("cpu", ""))
return numA - numB
})
return {
length: sortedKeys.length,
data: (index = 0) => {
return sortedKeys.map((key) => ({
label: key,
dataKey: ({ stats }: SystemStatsRecord) => stats?.cpuc?.[key]?.[index],
color: `hsl(${(((sortedKeys.indexOf(key) * 360) / sortedKeys.length) % 360)}, 70%, 50%)`,
opacity: 0.3,
}))
},

View File

@@ -73,6 +73,7 @@ import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from ".
import { Separator } from "../ui/separator"
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../ui/tooltip"
import NetworkSheet from "./system/network-sheet"
import CpuCoresSheet from "./system/cpu-cores-sheet"
import LineChartDefault from "../charts/line-chart"
@@ -585,18 +586,49 @@ export default memo(function SystemDetail({ id }: { id: string }) {
grid={grid}
title={t`CPU Usage`}
description={t`Average system-wide CPU utilization`}
cornerEl={maxValSelect}
cornerEl={
<>
{maxValSelect}
<CpuCoresSheet chartData={chartData} dataEmpty={dataEmpty} grid={grid} maxValues={maxValues} />
</>
}
legend={true}
>
<AreaChartDefault
chartData={chartData}
maxToggled={maxValues}
legend={true}
dataPoints={[
{
label: t`CPU Usage`,
label: t`Total`,
dataKey: ({ stats }) => (showMax ? stats?.cpum : stats?.cpu),
color: 1,
opacity: 0.4,
},
{
label: t`User`,
dataKey: ({ stats }) => stats?.cpuu,
color: 2,
opacity: 0.3,
},
{
label: t`System`,
dataKey: ({ stats }) => stats?.cpus,
color: 3,
opacity: 0.3,
},
{
label: t`IOWait`,
dataKey: ({ stats }) => stats?.cpui,
color: 4,
opacity: 0.3,
},
{
label: t`Steal`,
dataKey: ({ stats }) => stats?.cpust,
color: 5,
opacity: 0.3,
},
]}
tickFormatter={(val) => `${toFixedFloat(val, 2)}%`}
contentFormatter={({ value }) => `${decimalString(value)}%`}

View File

@@ -0,0 +1,119 @@
import { t } from "@lingui/core/macro"
import { MoreHorizontalIcon } from "lucide-react"
import { memo, useRef, useState } from "react"
import AreaChartDefault from "@/components/charts/area-chart"
import ChartTimeSelect from "@/components/charts/chart-time-select"
import { Button } from "@/components/ui/button"
import { Sheet, SheetContent, SheetTrigger } from "@/components/ui/sheet"
import { DialogTitle } from "@/components/ui/dialog"
import { decimalString, toFixedFloat } from "@/lib/utils"
import type { ChartData, SystemStatsRecord } from "@/types"
import { ChartCard } from "../system"
/**
 * Button plus sheet that shows one CPU usage chart per core.
 *
 * The list of cores is taken from the `cpuc` field of the most recent stats
 * record in `chartData.systemStats`. When that record has no per-core data the
 * component renders nothing, so no trigger button appears.
 *
 * @param chartData - chart data passed through to every per-core chart
 * @param dataEmpty - forwarded to each ChartCard as `empty`
 * @param grid - forwarded to each ChartCard as `grid`
 * @param maxValues - forwarded to each AreaChartDefault as `maxToggled`
 */
export default memo(function CpuCoresSheet({
chartData,
dataEmpty,
grid,
maxValues,
}: {
chartData: ChartData
dataEmpty: boolean
grid: boolean
maxValues: boolean
}) {
const [cpuCoresOpen, setCpuCoresOpen] = useState(false)
// Latches to true the first time the sheet is opened and never resets, so the
// sheet content below is not rendered until it has been opened at least once.
const hasOpened = useRef(false)
if (cpuCoresOpen && !hasOpened.current) {
hasOpened.current = true
}
// Get list of CPU cores from the latest stats
const cpuCoresData = chartData.systemStats.at(-1)?.stats?.cpuc ?? {}
// Order cores numerically by the number left after removing the "cpu" prefix
const coreNames = Object.keys(cpuCoresData).sort((a, b) => {
const numA = Number.parseInt(a.replace("cpu", ""))
const numB = Number.parseInt(b.replace("cpu", ""))
return numA - numB
})
// No per-core data in the latest record: render nothing
if (coreNames.length === 0) {
return null
}
return (
<Sheet open={cpuCoresOpen} onOpenChange={setCpuCoresOpen}>
<DialogTitle className="sr-only">{t`Per-core CPU usage`}</DialogTitle>
<SheetTrigger asChild>
<Button
title={t`View per-core CPU`}
variant="outline"
size="icon"
className="shrink-0 max-sm:absolute max-sm:top-3 max-sm:end-3"
>
<MoreHorizontalIcon />
</Button>
</SheetTrigger>
{hasOpened.current && (
<SheetContent aria-describedby={undefined} className="overflow-auto w-200 !max-w-full p-4 sm:p-6">
<ChartTimeSelect className="w-[calc(100%-2em)]" agentVersion={chartData.agentVersion} />
{coreNames.map((coreName) => (
<ChartCard
key={coreName}
empty={dataEmpty}
grid={grid}
title={coreName.toUpperCase()}
description={t`CPU usage breakdown for ${coreName}`}
legend={true}
className="min-h-auto"
>
<AreaChartDefault
chartData={chartData}
maxToggled={maxValues}
legend={true}
dataPoints={[
{
label: t`Total`,
dataKey: ({ stats }: SystemStatsRecord) => {
const core = stats?.cpuc?.[coreName]
if (!core) return undefined
// Sum all metrics: user + system + iowait + steal
// NOTE(review): only these four categories are summed, so this may
// differ from the system-wide total shown elsewhere - confirm intended.
return core[0] + core[1] + core[2] + core[3]
},
color: 1,
opacity: 0.4,
},
{
label: t`User`,
// index 0 of the per-core tuple
dataKey: ({ stats }: SystemStatsRecord) => stats?.cpuc?.[coreName]?.[0],
color: 2,
opacity: 0.3,
},
{
label: t`System`,
// index 1 of the per-core tuple
dataKey: ({ stats }: SystemStatsRecord) => stats?.cpuc?.[coreName]?.[1],
color: 3,
opacity: 0.3,
},
{
label: t`IOWait`,
// index 2 of the per-core tuple
dataKey: ({ stats }: SystemStatsRecord) => stats?.cpuc?.[coreName]?.[2],
color: 4,
opacity: 0.3,
},
{
label: t`Steal`,
// index 3 of the per-core tuple
dataKey: ({ stats }: SystemStatsRecord) => stats?.cpuc?.[coreName]?.[3],
color: 5,
opacity: 0.3,
},
]}
tickFormatter={(val) => `${toFixedFloat(val, 2)}%`}
contentFormatter={({ value }) => `${decimalString(value)}%`}
/>
</ChartCard>
))}
</SheetContent>
)}
</Sheet>
)
})

View File

@@ -84,6 +84,16 @@ export interface SystemStats {
cpu: number
/** peak cpu */
cpum?: number
/** cpu user percent */
cpuu?: number
/** cpu system percent */
cpus?: number
/** cpu iowait percent */
cpui?: number
/** cpu steal percent */
cpust?: number
/** per-core cpu metrics [user, system, iowait, steal] */
cpuc?: Record<string, [number, number, number, number]>
// TODO: remove these in future release in favor of la
/** load average 1 minute */
l1?: number