Multi-GPU Collection System: - Add modular GPU collector architecture in collectors/gpu/ - Support AMD (amdgpu), NVIDIA (nvidia-smi), and Intel (i915/xe) GPUs - GPU Manager auto-detects and aggregates all vendor collectors - Backward-compatible JSON output for existing frontend Operational Modes: - Standalone mode (default): single-host monitoring, no database - Server mode: multi-device with database, auth, agents (WIP) - Agent mode: lightweight reporter to central server (WIP) - Mode selection via TYTO_MODE env var or config.yaml Configuration Updates: - Add server config (gRPC port, mTLS settings, registration) - Add agent config (ID, server URL, TLS certificates) - Add database config (SQLite/PostgreSQL support) - Support TYTO_* prefixed environment variables 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
250 lines
5.7 KiB
Go
250 lines
5.7 KiB
Go
package gpu
|
|
|
|
import (
|
|
"bytes"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// NVIDIACollector collects metrics from NVIDIA GPUs using nvidia-smi.
// This is the fallback collector that works without CGO or NVML.
type NVIDIACollector struct {
	// sysPath is the root directory under which "class/drm" is scanned
	// during sysfs-based detection.
	sysPath string

	// available is set by Detect once at least one GPU has been found;
	// Collect returns nothing while it is false.
	available bool

	// gpuCount is the number of GPUs reported by the detection method that
	// succeeded.
	gpuCount int

	// nvidiaSmi is the path to the nvidia-smi binary. When it is empty,
	// Collect uses the sysfs fallback instead of running the command.
	nvidiaSmi string

	// devicePaths holds the /dev/nvidia* nodes or the sysfs device
	// directories recorded by the fallback detection methods.
	devicePaths []string
}
|
|
|
|
// NewNVIDIACollector creates a collector for NVIDIA GPUs.
|
|
func NewNVIDIACollector(sysPath string) *NVIDIACollector {
|
|
return &NVIDIACollector{
|
|
sysPath: sysPath,
|
|
devicePaths: make([]string, 0),
|
|
}
|
|
}
|
|
|
|
func (c *NVIDIACollector) Vendor() Vendor {
|
|
return VendorNVIDIA
|
|
}
|
|
|
|
// Detect finds all NVIDIA GPUs using multiple detection methods.
|
|
func (c *NVIDIACollector) Detect() int {
|
|
// Method 1: Check for nvidia-smi
|
|
c.nvidiaSmi = c.findNvidiaSmi()
|
|
if c.nvidiaSmi != "" {
|
|
count := c.countGPUsViaSmi()
|
|
if count > 0 {
|
|
c.available = true
|
|
c.gpuCount = count
|
|
return count
|
|
}
|
|
}
|
|
|
|
// Method 2: Check /dev/nvidia* devices
|
|
matches, err := filepath.Glob("/dev/nvidia[0-9]*")
|
|
if err == nil && len(matches) > 0 {
|
|
c.available = true
|
|
c.gpuCount = len(matches)
|
|
c.devicePaths = matches
|
|
return len(matches)
|
|
}
|
|
|
|
// Method 3: Check sysfs for nvidia driver
|
|
count := c.detectViaSysfs()
|
|
if count > 0 {
|
|
c.available = true
|
|
c.gpuCount = count
|
|
return count
|
|
}
|
|
|
|
return 0
|
|
}
|
|
|
|
func (c *NVIDIACollector) findNvidiaSmi() string {
|
|
// Check common locations
|
|
paths := []string{
|
|
"/usr/bin/nvidia-smi",
|
|
"/usr/local/bin/nvidia-smi",
|
|
"/opt/nvidia/bin/nvidia-smi",
|
|
}
|
|
|
|
for _, p := range paths {
|
|
if _, err := os.Stat(p); err == nil {
|
|
return p
|
|
}
|
|
}
|
|
|
|
// Try PATH lookup
|
|
path, err := exec.LookPath("nvidia-smi")
|
|
if err == nil {
|
|
return path
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func (c *NVIDIACollector) countGPUsViaSmi() int {
|
|
cmd := exec.Command(c.nvidiaSmi, "--query-gpu=count", "--format=csv,noheader,nounits")
|
|
var out bytes.Buffer
|
|
cmd.Stdout = &out
|
|
cmd.Stderr = nil
|
|
|
|
if err := cmd.Run(); err != nil {
|
|
return 0
|
|
}
|
|
|
|
lines := strings.Split(strings.TrimSpace(out.String()), "\n")
|
|
return len(lines)
|
|
}
|
|
|
|
func (c *NVIDIACollector) detectViaSysfs() int {
|
|
drmPath := filepath.Join(c.sysPath, "class/drm")
|
|
entries, err := os.ReadDir(drmPath)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
|
|
count := 0
|
|
for _, entry := range entries {
|
|
name := entry.Name()
|
|
if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") {
|
|
continue
|
|
}
|
|
|
|
devicePath := filepath.Join(drmPath, name, "device")
|
|
driverLink, err := os.Readlink(filepath.Join(devicePath, "driver"))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
if strings.Contains(driverLink, "nvidia") {
|
|
c.devicePaths = append(c.devicePaths, devicePath)
|
|
count++
|
|
}
|
|
}
|
|
|
|
return count
|
|
}
|
|
|
|
// Collect gathers metrics for all detected NVIDIA GPUs.
|
|
func (c *NVIDIACollector) Collect() ([]GPUInfo, error) {
|
|
if !c.available {
|
|
return nil, nil
|
|
}
|
|
|
|
// Prefer nvidia-smi for detailed metrics
|
|
if c.nvidiaSmi != "" {
|
|
return c.collectViaSmi()
|
|
}
|
|
|
|
// Fallback to basic sysfs info
|
|
return c.collectViaSysfs()
|
|
}
|
|
|
|
func (c *NVIDIACollector) collectViaSmi() ([]GPUInfo, error) {
|
|
// Query all relevant metrics in one call for efficiency
|
|
cmd := exec.Command(c.nvidiaSmi,
|
|
"--query-gpu=index,name,driver_version,utilization.gpu,memory.used,memory.total,temperature.gpu,fan.speed,power.draw,clocks.gr,clocks.mem",
|
|
"--format=csv,noheader,nounits")
|
|
|
|
var out bytes.Buffer
|
|
cmd.Stdout = &out
|
|
cmd.Stderr = nil
|
|
|
|
if err := cmd.Run(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
gpus := make([]GPUInfo, 0, c.gpuCount)
|
|
lines := strings.Split(strings.TrimSpace(out.String()), "\n")
|
|
|
|
for _, line := range lines {
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
fields := strings.Split(line, ", ")
|
|
if len(fields) < 11 {
|
|
continue
|
|
}
|
|
|
|
info := GPUInfo{
|
|
Vendor: VendorNVIDIA,
|
|
}
|
|
|
|
// Parse each field
|
|
if idx, err := strconv.Atoi(strings.TrimSpace(fields[0])); err == nil {
|
|
info.Index = idx
|
|
}
|
|
info.Name = strings.TrimSpace(fields[1])
|
|
info.Driver = strings.TrimSpace(fields[2])
|
|
|
|
if util, err := strconv.Atoi(strings.TrimSpace(fields[3])); err == nil {
|
|
info.Utilization = util
|
|
}
|
|
|
|
// Memory in MiB from nvidia-smi, convert to bytes
|
|
if memUsed, err := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64); err == nil {
|
|
info.MemoryUsed = uint64(memUsed * 1024 * 1024)
|
|
}
|
|
if memTotal, err := strconv.ParseFloat(strings.TrimSpace(fields[5]), 64); err == nil {
|
|
info.MemoryTotal = uint64(memTotal * 1024 * 1024)
|
|
}
|
|
|
|
if temp, err := strconv.ParseFloat(strings.TrimSpace(fields[6]), 64); err == nil {
|
|
info.Temperature = temp
|
|
}
|
|
|
|
// Fan speed is a percentage, but we report RPM if available
|
|
// nvidia-smi reports percentage, not RPM - skip or convert
|
|
if fan, err := strconv.Atoi(strings.TrimSpace(fields[7])); err == nil {
|
|
info.FanRPM = fan // Actually percentage, but keep field for consistency
|
|
}
|
|
|
|
if power, err := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64); err == nil {
|
|
info.PowerWatts = power
|
|
}
|
|
|
|
if clockCore, err := strconv.Atoi(strings.TrimSpace(fields[9])); err == nil {
|
|
info.ClockCore = clockCore
|
|
}
|
|
if clockMem, err := strconv.Atoi(strings.TrimSpace(fields[10])); err == nil {
|
|
info.ClockMemory = clockMem
|
|
}
|
|
|
|
gpus = append(gpus, info)
|
|
}
|
|
|
|
return gpus, nil
|
|
}
|
|
|
|
func (c *NVIDIACollector) collectViaSysfs() ([]GPUInfo, error) {
|
|
gpus := make([]GPUInfo, 0, len(c.devicePaths))
|
|
|
|
for i, devicePath := range c.devicePaths {
|
|
info := GPUInfo{
|
|
Index: i,
|
|
Vendor: VendorNVIDIA,
|
|
Driver: "nvidia",
|
|
}
|
|
|
|
// Try to get name from uevent
|
|
ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent"))
|
|
if err == nil {
|
|
for _, line := range strings.Split(string(ueventData), "\n") {
|
|
if strings.HasPrefix(line, "PCI_ID=") {
|
|
info.Name = strings.TrimPrefix(line, "PCI_ID=")
|
|
}
|
|
}
|
|
}
|
|
|
|
gpus = append(gpus, info)
|
|
}
|
|
|
|
return gpus, nil
|
|
}
|