Multi-GPU Collection System: - Add modular GPU collector architecture in collectors/gpu/ - Support AMD (amdgpu), NVIDIA (nvidia-smi), and Intel (i915/xe) GPUs - GPU Manager auto-detects and aggregates all vendor collectors - Backward-compatible JSON output for existing frontend Operational Modes: - Standalone mode (default): single-host monitoring, no database - Server mode: multi-device with database, auth, agents (WIP) - Agent mode: lightweight reporter to central server (WIP) - Mode selection via TYTO_MODE env var or config.yaml Configuration Updates: - Add server config (gRPC port, mTLS settings, registration) - Add agent config (ID, server URL, TLS certificates) - Add database config (SQLite/PostgreSQL support) - Support TYTO_* prefixed environment variables 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
146 lines
3.6 KiB
Go
146 lines
3.6 KiB
Go
package gpu
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// IntelCollector collects metrics from Intel GPUs (integrated and discrete).
|
|
// Uses the i915 driver sysfs interface.
|
|
type IntelCollector struct {
|
|
sysPath string
|
|
cards []intelCard
|
|
}
|
|
|
|
// intelCard describes one detected Intel GPU.
type intelCard struct {
	// cardPath is the directory the card's metric files are read from.
	cardPath string
	// hwmonPath is the card's hwmon directory; empty when none was found.
	hwmonPath string
	// name is the identifier reported as the GPU's name.
	name string
	// driver is the bound kernel driver: i915, or xe (the newer driver).
	driver string
}
|
|
|
|
// NewIntelCollector creates a collector for Intel GPUs.
|
|
func NewIntelCollector(sysPath string) *IntelCollector {
|
|
return &IntelCollector{
|
|
sysPath: sysPath,
|
|
cards: make([]intelCard, 0),
|
|
}
|
|
}
|
|
|
|
func (c *IntelCollector) Vendor() Vendor {
|
|
return VendorIntel
|
|
}
|
|
|
|
// Detect finds all Intel GPUs and returns their count.
|
|
func (c *IntelCollector) Detect() int {
|
|
c.cards = c.cards[:0]
|
|
|
|
drmPath := filepath.Join(c.sysPath, "class/drm")
|
|
entries, err := os.ReadDir(drmPath)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
name := entry.Name()
|
|
// Look for card directories, skip render nodes
|
|
if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") {
|
|
continue
|
|
}
|
|
|
|
devicePath := filepath.Join(drmPath, name, "device")
|
|
|
|
// Check driver - Intel uses i915 or xe (newer driver)
|
|
driverLink, err := os.Readlink(filepath.Join(devicePath, "driver"))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
driverName := filepath.Base(driverLink)
|
|
if driverName != "i915" && driverName != "xe" {
|
|
continue
|
|
}
|
|
|
|
card := intelCard{
|
|
cardPath: devicePath,
|
|
driver: driverName,
|
|
}
|
|
|
|
// Find hwmon path
|
|
hwmonDir := filepath.Join(devicePath, "hwmon")
|
|
hwmonEntries, err := os.ReadDir(hwmonDir)
|
|
if err == nil && len(hwmonEntries) > 0 {
|
|
card.hwmonPath = filepath.Join(hwmonDir, hwmonEntries[0].Name())
|
|
}
|
|
|
|
// Get GPU name from uevent
|
|
ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent"))
|
|
if err == nil {
|
|
for _, line := range strings.Split(string(ueventData), "\n") {
|
|
if strings.HasPrefix(line, "PCI_ID=") {
|
|
card.name = strings.TrimPrefix(line, "PCI_ID=")
|
|
}
|
|
}
|
|
}
|
|
|
|
c.cards = append(c.cards, card)
|
|
}
|
|
|
|
return len(c.cards)
|
|
}
|
|
|
|
// Collect gathers metrics for all detected Intel GPUs.
|
|
func (c *IntelCollector) Collect() ([]GPUInfo, error) {
|
|
gpus := make([]GPUInfo, 0, len(c.cards))
|
|
|
|
for i, card := range c.cards {
|
|
info := GPUInfo{
|
|
Index: i,
|
|
Name: card.name,
|
|
Vendor: VendorIntel,
|
|
Driver: card.driver,
|
|
}
|
|
|
|
// Intel GPU utilization via i915 perf or debugfs
|
|
// Try reading from sysfs if available
|
|
if val, err := readInt(filepath.Join(card.cardPath, "gt_cur_freq_mhz")); err == nil {
|
|
info.ClockCore = val
|
|
}
|
|
if val, err := readInt(filepath.Join(card.cardPath, "gt_max_freq_mhz")); err == nil {
|
|
// Estimate utilization based on frequency ratio
|
|
if val > 0 && info.ClockCore > 0 {
|
|
info.Utilization = (info.ClockCore * 100) / val
|
|
}
|
|
}
|
|
|
|
// Temperature from hwmon
|
|
if card.hwmonPath != "" {
|
|
if val, err := readInt(filepath.Join(card.hwmonPath, "temp1_input")); err == nil {
|
|
info.Temperature = float64(val) / 1000.0
|
|
}
|
|
|
|
// Power (microwatts to watts) - Intel uses energy counter, need to compute
|
|
if val, err := readInt(filepath.Join(card.hwmonPath, "power1_average")); err == nil {
|
|
info.PowerWatts = float64(val) / 1000000.0
|
|
}
|
|
}
|
|
|
|
// Intel discrete GPUs have VRAM, integrated use system RAM
|
|
// Try to read local memory info for discrete GPUs
|
|
if val, err := readUint64(filepath.Join(card.cardPath, "lmem_total_bytes")); err == nil {
|
|
info.MemoryTotal = val
|
|
}
|
|
if val, err := readUint64(filepath.Join(card.cardPath, "lmem_avail_bytes")); err == nil {
|
|
if info.MemoryTotal > 0 {
|
|
info.MemoryUsed = info.MemoryTotal - val
|
|
}
|
|
}
|
|
|
|
gpus = append(gpus, info)
|
|
}
|
|
|
|
return gpus, nil
|
|
}
|