Files
tyto/backend/internal/collectors/gpu/amd.go
vikingowl a0a947094d feat: add multi-GPU support and operational modes
Multi-GPU Collection System:
- Add modular GPU collector architecture in collectors/gpu/
- Support AMD (amdgpu), NVIDIA (nvidia-smi), and Intel (i915/xe) GPUs
- GPU Manager auto-detects and aggregates all vendor collectors
- Backward-compatible JSON output for existing frontend

Operational Modes:
- Standalone mode (default): single-host monitoring, no database
- Server mode: multi-device with database, auth, agents (WIP)
- Agent mode: lightweight reporter to central server (WIP)
- Mode selection via TYTO_MODE env var or config.yaml

Configuration Updates:
- Add server config (gRPC port, mTLS settings, registration)
- Add agent config (ID, server URL, TLS certificates)
- Add database config (SQLite/PostgreSQL support)
- Support TYTO_* prefixed environment variables

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 07:21:50 +01:00

169 lines
4.1 KiB
Go

package gpu
import (
"os"
"path/filepath"
"strconv"
"strings"
)
// AMDCollector collects metrics from AMD GPUs using the amdgpu driver.
// Detect populates the card list; Collect then reads metrics for each card.
type AMDCollector struct {
// sysPath is the base directory that Detect joins with "class/drm" to
// locate GPU card entries.
sysPath string
// cards holds the GPUs found by the most recent Detect call.
cards []amdCard
}
// amdCard represents a single AMD GPU detected in the system.
type amdCard struct {
// cardPath is the card's "device" directory (class/drm/<card>/device
// under the collector's sysPath); Collect reads metric files from it.
cardPath string
// hwmonPath is the first directory found under cardPath/hwmon, or ""
// when no hwmon directory could be read.
hwmonPath string
// name is the PCI_ID value taken from the device's uevent file, or ""
// when it could not be read.
name string
}
// NewAMDCollector builds an AMDCollector that looks for GPUs below sysPath.
// The card list starts out empty (but non-nil); nothing is scanned until
// Detect is called.
func NewAMDCollector(sysPath string) *AMDCollector {
	collector := new(AMDCollector)
	collector.sysPath = sysPath
	collector.cards = []amdCard{}
	return collector
}
// Vendor reports the GPU vendor handled by this collector, which is always
// VendorAMD.
func (c *AMDCollector) Vendor() Vendor {
return VendorAMD
}
// Detect finds all AMD GPUs and returns their count.
//
// It scans <sysPath>/class/drm for card directories whose device is bound to
// the amdgpu driver. Every call discards the previously detected cards and
// rebuilds the list from scratch, so it can be called again to re-scan.
// A missing or unreadable DRM directory yields 0.
//
// For each matching card it records the device directory, the first entry of
// the device's hwmon directory (if any), and the PCI_ID value from the
// device's uevent file (if readable). Missing hwmon or uevent data is not an
// error; the corresponding field is simply left empty.
func (c *AMDCollector) Detect() int {
	c.cards = c.cards[:0] // Reset, keeping the backing array for reuse.

	drmPath := filepath.Join(c.sysPath, "class/drm")
	entries, err := os.ReadDir(drmPath)
	if err != nil {
		return 0
	}

	for _, entry := range entries {
		name := entry.Name()
		// Keep card directories (card0, card1, ...) and skip per-connector
		// entries such as card0-DP-1, which contain a dash.
		if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") {
			continue
		}

		devicePath := filepath.Join(drmPath, name, "device")

		// The "driver" symlink points at the bound driver's directory, so its
		// last path element is the driver name. Compare that element exactly:
		// a substring match on the whole target would also accept unrelated
		// drivers whose link path merely contains "amdgpu" somewhere.
		driverLink, err := os.Readlink(filepath.Join(devicePath, "driver"))
		if err != nil || filepath.Base(driverLink) != "amdgpu" {
			continue
		}

		card := amdCard{cardPath: devicePath}

		// Find hwmon path: use the first entry under device/hwmon.
		hwmonDir := filepath.Join(devicePath, "hwmon")
		if hwmonEntries, err := os.ReadDir(hwmonDir); err == nil && len(hwmonEntries) > 0 {
			card.hwmonPath = filepath.Join(hwmonDir, hwmonEntries[0].Name())
		}

		// Try to get the GPU name from uevent; the PCI_ID value is used as
		// the name.
		if ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent")); err == nil {
			for _, line := range strings.Split(string(ueventData), "\n") {
				if strings.HasPrefix(line, "PCI_ID=") {
					card.name = strings.TrimPrefix(line, "PCI_ID=")
					break // Nothing else in the file is used.
				}
			}
		}

		c.cards = append(c.cards, card)
	}

	return len(c.cards)
}
// Collect gathers metrics for all detected AMD GPUs.
//
// It returns one GPUInfo per card found by the most recent Detect call, in
// detection order, with Index set to the card's position in that list.
// Collection is best-effort: a metric whose file is missing or unreadable is
// left at its zero value instead of failing the call, so the returned error
// is always nil.
func (c *AMDCollector) Collect() ([]GPUInfo, error) {
	gpus := make([]GPUInfo, 0, len(c.cards))

	for i, card := range c.cards {
		info := GPUInfo{
			Index:  i,
			Name:   card.name,
			Vendor: VendorAMD,
			Driver: "amdgpu",
		}

		// GPU utilization
		if val, err := readInt(filepath.Join(card.cardPath, "gpu_busy_percent")); err == nil {
			info.Utilization = val
		}

		// VRAM usage
		if val, err := readUint64(filepath.Join(card.cardPath, "mem_info_vram_used")); err == nil {
			info.MemoryUsed = val
		}
		if val, err := readUint64(filepath.Join(card.cardPath, "mem_info_vram_total")); err == nil {
			info.MemoryTotal = val
		}

		// Sensor readings are only available when Detect found a hwmon directory.
		if card.hwmonPath != "" {
			// Temperature from hwmon (millidegrees Celsius)
			if val, err := readInt(filepath.Join(card.hwmonPath, "temp1_input")); err == nil {
				info.Temperature = float64(val) / 1000.0
			}

			// Fan speed (RPM)
			if val, err := readInt(filepath.Join(card.hwmonPath, "fan1_input")); err == nil {
				info.FanRPM = val
			}

			// Power usage (microwatts to watts). Prefer the averaged reading,
			// but fall back to the instantaneous power1_input attribute: some
			// GPU/kernel combinations expose only that one, and without the
			// fallback they would always report 0 W.
			for _, attr := range []string{"power1_average", "power1_input"} {
				if val, err := readInt(filepath.Join(card.hwmonPath, attr)); err == nil {
					info.PowerWatts = float64(val) / 1000000.0
					break
				}
			}
		}

		// Clock speeds from pp_dpm_sclk and pp_dpm_mclk (0 when unavailable)
		info.ClockCore = parseCurrentClock(filepath.Join(card.cardPath, "pp_dpm_sclk"))
		info.ClockMemory = parseCurrentClock(filepath.Join(card.cardPath, "pp_dpm_mclk"))

		gpus = append(gpus, info)
	}

	return gpus, nil
}
// parseCurrentClock returns the active frequency listed in an AMD DPM clock
// file such as pp_dpm_sclk.
//
// The file holds one power state per line, e.g. "1: 1311Mhz *", and the
// trailing "*" marks the state currently in use. The number from the first
// starred line that parses is returned, with its "Mhz"/"MHz" suffix removed.
// The result is 0 when the file cannot be read or no starred line yields a
// valid integer.
func parseCurrentClock(path string) int {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return 0
	}

	states := strings.Split(string(raw), "\n")
	for idx := 0; idx < len(states); idx++ {
		state := strings.TrimSpace(states[idx])
		if strings.HasSuffix(state, "*") {
			// Drop the marker, then split into "<index>:" and "<freq><unit>".
			fields := strings.Fields(strings.TrimSuffix(state, "*"))
			if len(fields) >= 2 {
				digits := strings.TrimSuffix(strings.TrimSuffix(fields[1], "Mhz"), "MHz")
				mhz, convErr := strconv.Atoi(digits)
				if convErr == nil {
					return mhz
				}
			}
		}
	}

	return 0
}