Files
tyto/backend/internal/collectors/gpu/nvidia.go
vikingowl a0a947094d feat: add multi-GPU support and operational modes
Multi-GPU Collection System:
- Add modular GPU collector architecture in collectors/gpu/
- Support AMD (amdgpu), NVIDIA (nvidia-smi), and Intel (i915/xe) GPUs
- GPU Manager auto-detects and aggregates all vendor collectors
- Backward-compatible JSON output for existing frontend

Operational Modes:
- Standalone mode (default): single-host monitoring, no database
- Server mode: multi-device with database, auth, agents (WIP)
- Agent mode: lightweight reporter to central server (WIP)
- Mode selection via TYTO_MODE env var or config.yaml

Configuration Updates:
- Add server config (gRPC port, mTLS settings, registration)
- Add agent config (ID, server URL, TLS certificates)
- Add database config (SQLite/PostgreSQL support)
- Support TYTO_* prefixed environment variables

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 07:21:50 +01:00

250 lines
5.7 KiB
Go

package gpu
import (
"bytes"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
// NVIDIACollector gathers metrics for NVIDIA GPUs by running the nvidia-smi
// command-line tool, so it needs neither CGO nor NVML bindings. When no
// nvidia-smi path is known, Collect reports only the basic information that
// can be read from the recorded device paths.
type NVIDIACollector struct {
	// sysPath is the sysfs root; "class/drm" is resolved beneath it.
	sysPath string
	// nvidiaSmi is the path to the nvidia-smi binary, or "" if none was found.
	nvidiaSmi string
	// devicePaths holds the device locations recorded by the detection
	// methods that do not rely on nvidia-smi.
	devicePaths []string
	// gpuCount is the GPU count recorded by Detect.
	gpuCount int
	// available is set by Detect once at least one GPU has been found.
	available bool
}
// NewNVIDIACollector returns a collector that resolves sysfs entries beneath
// sysPath. The collector starts with an empty device list and reports no GPUs
// until Detect has found some.
func NewNVIDIACollector(sysPath string) *NVIDIACollector {
	collector := new(NVIDIACollector)
	collector.sysPath = sysPath
	collector.devicePaths = []string{}
	return collector
}
// Vendor reports the GPU vendor handled by this collector, which is always
// VendorNVIDIA.
func (*NVIDIACollector) Vendor() Vendor {
	return VendorNVIDIA
}
// Detect probes for NVIDIA GPUs and returns how many were found, or 0 if
// there are none.
//
// Three methods are tried in order; the first that reports at least one GPU
// wins:
//  1. nvidia-smi, located by findNvidiaSmi and asked for the GPU count;
//  2. device nodes matching /dev/nvidia[0-9]*;
//  3. the sysfs scan performed by detectViaSysfs.
//
// Every call starts from a clean slate, so repeated detection neither
// accumulates duplicate device paths nor keeps reporting GPUs from an earlier
// run. The nvidia-smi path is only retained when nvidia-smi actually reported
// GPUs; otherwise Collect would keep invoking a tool that is known not to work
// instead of using the fallback data.
func (c *NVIDIACollector) Detect() int {
	// Forget the outcome of any previous run before probing again.
	c.available = false
	c.gpuCount = 0
	c.nvidiaSmi = ""
	c.devicePaths = nil

	// Method 1: nvidia-smi.
	if smi := c.findNvidiaSmi(); smi != "" {
		c.nvidiaSmi = smi // countGPUsViaSmi reads the path from the collector
		if count := c.countGPUsViaSmi(); count > 0 {
			c.available = true
			c.gpuCount = count
			return count
		}
		// The binary exists but cannot enumerate GPUs: drop it so that
		// Collect uses the fallback instead of failing on every call.
		c.nvidiaSmi = ""
	}

	// Method 2: /dev/nvidia* device nodes. The matched nodes are recorded
	// as the device paths for this run.
	if matches, err := filepath.Glob("/dev/nvidia[0-9]*"); err == nil && len(matches) > 0 {
		c.available = true
		c.gpuCount = len(matches)
		c.devicePaths = matches
		return len(matches)
	}

	// Method 3: sysfs scan for cards bound to an nvidia driver.
	if count := c.detectViaSysfs(); count > 0 {
		c.available = true
		c.gpuCount = count
		return count
	}

	return 0
}
// findNvidiaSmi returns the path of an nvidia-smi executable, or "" if none
// can be found.
//
// A few well-known install locations are checked first, then $PATH via
// exec.LookPath. A well-known location only counts when it is not a directory
// and has at least one execute bit set; a bare existence check would return a
// stray directory or non-executable file, which then fails to run and hides a
// working binary reachable through $PATH.
func (c *NVIDIACollector) findNvidiaSmi() string {
	candidates := []string{
		"/usr/bin/nvidia-smi",
		"/usr/local/bin/nvidia-smi",
		"/opt/nvidia/bin/nvidia-smi",
	}
	for _, p := range candidates {
		info, err := os.Stat(p)
		if err != nil || info.IsDir() || info.Mode()&0111 == 0 {
			continue
		}
		return p
	}

	// Nothing usable at the fixed locations: fall back to a $PATH lookup.
	if p, err := exec.LookPath("nvidia-smi"); err == nil {
		return p
	}
	return ""
}
// countGPUsViaSmi runs nvidia-smi with a GPU-count query and returns the
// number of GPUs it lists.
//
// Each non-blank line of output is treated as one GPU. The result is 0 when
// the command fails or prints nothing. Blank lines are skipped explicitly
// because splitting an empty string still yields one (empty) element, which
// would otherwise be counted as a phantom GPU.
func (c *NVIDIACollector) countGPUsViaSmi() int {
	cmd := exec.Command(c.nvidiaSmi, "--query-gpu=count", "--format=csv,noheader,nounits")
	var out bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = nil // diagnostics are discarded; failure is signalled by returning 0
	if err := cmd.Run(); err != nil {
		return 0
	}

	count := 0
	for _, line := range strings.Split(out.String(), "\n") {
		if strings.TrimSpace(line) != "" {
			count++
		}
	}
	return count
}
// detectViaSysfs scans <sysPath>/class/drm for cards bound to an nvidia driver,
// stores their device directories in c.devicePaths and returns how many were
// found. It returns 0 if the directory cannot be read.
//
// Only entries whose name starts with "card" and contains no "-" are
// considered (in sysfs the dashed names are typically connector entries such
// as card0-DP-1). A card counts as NVIDIA when the target of its
// device/driver symlink contains "nvidia".
//
// c.devicePaths is replaced rather than appended to, so running detection
// more than once does not list the same GPU several times.
func (c *NVIDIACollector) detectViaSysfs() int {
	// Start from an empty list: results of earlier scans must not leak in.
	c.devicePaths = nil

	drmPath := filepath.Join(c.sysPath, "class/drm")
	entries, err := os.ReadDir(drmPath)
	if err != nil {
		return 0
	}

	var found []string
	for _, entry := range entries {
		name := entry.Name()
		if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") {
			continue
		}

		devicePath := filepath.Join(drmPath, name, "device")
		driverLink, err := os.Readlink(filepath.Join(devicePath, "driver"))
		if err != nil {
			// No driver bound (or not a symlink): not a usable card.
			continue
		}
		if strings.Contains(driverLink, "nvidia") {
			found = append(found, devicePath)
		}
	}

	c.devicePaths = found
	return len(found)
}
// Collect returns metrics for the NVIDIA GPUs found by Detect.
//
// It returns (nil, nil) when no GPU is available. Otherwise it queries
// nvidia-smi if a path to it is known, and falls back to the basic
// information available through sysfs if not.
func (c *NVIDIACollector) Collect() ([]GPUInfo, error) {
	switch {
	case !c.available:
		// Nothing detected: not an error, just nothing to report.
		return nil, nil
	case c.nvidiaSmi != "":
		// nvidia-smi provides the detailed metrics.
		return c.collectViaSmi()
	default:
		return c.collectViaSysfs()
	}
}
// collectViaSmi runs nvidia-smi once with a CSV query covering every metric
// and converts each output row into a GPUInfo.
//
// If the command fails its error is returned as is. Rows that are empty or
// have fewer columns than requested are skipped. A column that cannot be
// parsed as a number leaves the corresponding field at zero.
func (c *NVIDIACollector) collectViaSmi() ([]GPUInfo, error) {
	// Column positions, in the order requested by the query string below.
	const (
		colIndex = iota
		colName
		colDriver
		colUtilization
		colMemUsed
		colMemTotal
		colTemperature
		colFan
		colPower
		colClockCore
		colClockMem
		colCount // number of columns a usable row must have
	)

	intOrZero := func(s string) int {
		v, err := strconv.Atoi(s)
		if err != nil {
			return 0
		}
		return v
	}
	floatOrZero := func(s string) float64 {
		v, err := strconv.ParseFloat(s, 64)
		if err != nil {
			return 0
		}
		return v
	}

	var stdout bytes.Buffer
	smi := exec.Command(c.nvidiaSmi,
		"--query-gpu=index,name,driver_version,utilization.gpu,memory.used,memory.total,temperature.gpu,fan.speed,power.draw,clocks.gr,clocks.mem",
		"--format=csv,noheader,nounits")
	smi.Stdout = &stdout
	smi.Stderr = nil
	if err := smi.Run(); err != nil {
		return nil, err
	}

	result := make([]GPUInfo, 0, c.gpuCount)
	for _, row := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
		if row == "" {
			continue
		}
		cols := strings.Split(row, ", ")
		if len(cols) < colCount {
			continue
		}
		for i := range cols {
			cols[i] = strings.TrimSpace(cols[i])
		}

		result = append(result, GPUInfo{
			Vendor:      VendorNVIDIA,
			Index:       intOrZero(cols[colIndex]),
			Name:        cols[colName],
			Driver:      cols[colDriver],
			Utilization: intOrZero(cols[colUtilization]),
			// Memory values are treated as MiB and converted to bytes.
			MemoryUsed:  uint64(floatOrZero(cols[colMemUsed]) * 1024 * 1024),
			MemoryTotal: uint64(floatOrZero(cols[colMemTotal]) * 1024 * 1024),
			Temperature: floatOrZero(cols[colTemperature]),
			// NOTE: fan.speed is a percentage, not RPM; it is stored in
			// FanRPM only so the field is populated consistently.
			FanRPM:      intOrZero(cols[colFan]),
			PowerWatts:  floatOrZero(cols[colPower]),
			ClockCore:   intOrZero(cols[colClockCore]),
			ClockMemory: intOrZero(cols[colClockMem]),
		})
	}
	return result, nil
}
// collectViaSysfs builds a minimal GPUInfo for every entry in c.devicePaths.
//
// Entries are numbered in slice order and carry the fixed driver name
// "nvidia". Name is taken from the PCI_ID line of the device's uevent file
// when that file can be read (the last such line wins) and stays empty
// otherwise. The returned error is always nil.
func (c *NVIDIACollector) collectViaSysfs() ([]GPUInfo, error) {
	const pciIDKey = "PCI_ID="

	result := make([]GPUInfo, len(c.devicePaths))
	for idx := range c.devicePaths {
		gpu := &result[idx]
		gpu.Index = idx
		gpu.Vendor = VendorNVIDIA
		gpu.Driver = "nvidia"

		raw, err := os.ReadFile(filepath.Join(c.devicePaths[idx], "uevent"))
		if err != nil {
			// No readable uevent file: keep the entry, just without a name.
			continue
		}
		for _, kv := range strings.Split(string(raw), "\n") {
			if strings.HasPrefix(kv, pciIDKey) {
				gpu.Name = kv[len(pciIDKey):]
			}
		}
	}
	return result, nil
}