Multi-GPU Collection System:
- Add modular GPU collector architecture in collectors/gpu/
- Support AMD (amdgpu), NVIDIA (nvidia-smi), and Intel (i915/xe) GPUs
- GPU Manager auto-detects and aggregates all vendor collectors
- Backward-compatible JSON output for existing frontend

Operational Modes:
- Standalone mode (default): single-host monitoring, no database
- Server mode: multi-device with database, auth, agents (WIP)
- Agent mode: lightweight reporter to central server (WIP)
- Mode selection via TYTO_MODE env var or config.yaml

Configuration Updates:
- Add server config (gRPC port, mTLS settings, registration)
- Add agent config (ID, server URL, TLS certificates)
- Add database config (SQLite/PostgreSQL support)
- Support TYTO_* prefixed environment variables

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
169 lines
4.1 KiB
Go
169 lines
4.1 KiB
Go
package gpu
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// AMDCollector collects metrics from AMD GPUs using the amdgpu driver.
|
|
type AMDCollector struct {
|
|
sysPath string
|
|
cards []amdCard
|
|
}
|
|
|
|
// amdCard represents a single AMD GPU detected in the system.
type amdCard struct {
	// cardPath is the card's "device" directory
	// (<sysPath>/class/drm/<cardN>/device).
	cardPath string
	// hwmonPath is the first entry found under <cardPath>/hwmon, or "" when
	// that directory is missing or empty.
	hwmonPath string
	// name is the PCI_ID value taken from the device's uevent file, or ""
	// when the file could not be read or has no PCI_ID line.
	name string
}
|
|
|
|
// NewAMDCollector creates a collector for AMD GPUs.
|
|
func NewAMDCollector(sysPath string) *AMDCollector {
|
|
return &AMDCollector{
|
|
sysPath: sysPath,
|
|
cards: make([]amdCard, 0),
|
|
}
|
|
}
|
|
|
|
func (c *AMDCollector) Vendor() Vendor {
|
|
return VendorAMD
|
|
}
|
|
|
|
// Detect finds all AMD GPUs and returns their count.
|
|
func (c *AMDCollector) Detect() int {
|
|
c.cards = c.cards[:0] // Reset
|
|
|
|
drmPath := filepath.Join(c.sysPath, "class/drm")
|
|
entries, err := os.ReadDir(drmPath)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
name := entry.Name()
|
|
// Look for card directories (card0, card1, ...) but not render nodes
|
|
if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") {
|
|
continue
|
|
}
|
|
|
|
devicePath := filepath.Join(drmPath, name, "device")
|
|
|
|
// Check if this is an AMD GPU by looking at the driver
|
|
driverLink, err := os.Readlink(filepath.Join(devicePath, "driver"))
|
|
if err != nil || !strings.Contains(driverLink, "amdgpu") {
|
|
continue
|
|
}
|
|
|
|
card := amdCard{cardPath: devicePath}
|
|
|
|
// Find hwmon path
|
|
hwmonDir := filepath.Join(devicePath, "hwmon")
|
|
hwmonEntries, err := os.ReadDir(hwmonDir)
|
|
if err == nil && len(hwmonEntries) > 0 {
|
|
card.hwmonPath = filepath.Join(hwmonDir, hwmonEntries[0].Name())
|
|
}
|
|
|
|
// Try to get GPU name from uevent
|
|
ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent"))
|
|
if err == nil {
|
|
for _, line := range strings.Split(string(ueventData), "\n") {
|
|
if strings.HasPrefix(line, "PCI_ID=") {
|
|
card.name = strings.TrimPrefix(line, "PCI_ID=")
|
|
}
|
|
}
|
|
}
|
|
|
|
c.cards = append(c.cards, card)
|
|
}
|
|
|
|
return len(c.cards)
|
|
}
|
|
|
|
// Collect gathers metrics for all detected AMD GPUs.
|
|
func (c *AMDCollector) Collect() ([]GPUInfo, error) {
|
|
gpus := make([]GPUInfo, 0, len(c.cards))
|
|
|
|
for i, card := range c.cards {
|
|
info := GPUInfo{
|
|
Index: i,
|
|
Name: card.name,
|
|
Vendor: VendorAMD,
|
|
Driver: "amdgpu",
|
|
}
|
|
|
|
// GPU utilization
|
|
if val, err := readInt(filepath.Join(card.cardPath, "gpu_busy_percent")); err == nil {
|
|
info.Utilization = val
|
|
}
|
|
|
|
// VRAM usage
|
|
if val, err := readUint64(filepath.Join(card.cardPath, "mem_info_vram_used")); err == nil {
|
|
info.MemoryUsed = val
|
|
}
|
|
if val, err := readUint64(filepath.Join(card.cardPath, "mem_info_vram_total")); err == nil {
|
|
info.MemoryTotal = val
|
|
}
|
|
|
|
// Temperature from hwmon (millidegrees Celsius)
|
|
if card.hwmonPath != "" {
|
|
if val, err := readInt(filepath.Join(card.hwmonPath, "temp1_input")); err == nil {
|
|
info.Temperature = float64(val) / 1000.0
|
|
}
|
|
|
|
// Fan speed (RPM)
|
|
if val, err := readInt(filepath.Join(card.hwmonPath, "fan1_input")); err == nil {
|
|
info.FanRPM = val
|
|
}
|
|
|
|
// Power usage (microwatts to watts)
|
|
if val, err := readInt(filepath.Join(card.hwmonPath, "power1_average")); err == nil {
|
|
info.PowerWatts = float64(val) / 1000000.0
|
|
}
|
|
}
|
|
|
|
// Clock speeds from pp_dpm_sclk and pp_dpm_mclk
|
|
info.ClockCore = parseCurrentClock(filepath.Join(card.cardPath, "pp_dpm_sclk"))
|
|
info.ClockMemory = parseCurrentClock(filepath.Join(card.cardPath, "pp_dpm_mclk"))
|
|
|
|
gpus = append(gpus, info)
|
|
}
|
|
|
|
return gpus, nil
|
|
}
|
|
|
|
// parseCurrentClock reads AMD DPM clock files and extracts the current frequency.
//
// The file at path is expected to contain one level per line, such as
// "1: 1311Mhz *", where a trailing "*" marks the active level. The number in
// the second whitespace-separated field of the first parseable "*" line is
// returned, with an "Mhz" or "MHz" suffix stripped. It returns 0 when the
// file cannot be read or no active line yields a valid integer.
func parseCurrentClock(path string) int {
	data, err := os.ReadFile(path)
	if err != nil {
		return 0
	}

	// Parse lines like "1: 1311Mhz *" where * indicates current
	for _, line := range strings.Split(string(data), "\n") {
		line = strings.TrimSpace(line)
		if !strings.HasSuffix(line, "*") {
			continue
		}

		// Drop the marker, then split into "<level>:" and "<freq><unit>".
		parts := strings.Fields(strings.TrimSuffix(line, "*"))
		if len(parts) < 2 {
			continue
		}

		// Both spellings of the unit are accepted.
		freqStr := strings.TrimSuffix(parts[1], "Mhz")
		freqStr = strings.TrimSuffix(freqStr, "MHz")

		// A malformed active line is skipped so a later one can still match.
		if freq, err := strconv.Atoi(freqStr); err == nil {
			return freq
		}
	}

	return 0
}
|