diff --git a/backend/cmd/server/main.go b/backend/cmd/server/main.go index c1aa502..48fb442 100644 --- a/backend/cmd/server/main.go +++ b/backend/cmd/server/main.go @@ -11,7 +11,20 @@ import ( func main() { cfg := config.Load() - log.Printf("Starting system monitor backend on port %s", cfg.Port) + switch { + case cfg.IsAgent(): + runAgent(cfg) + case cfg.IsServer(): + runServer(cfg) + default: + runStandalone(cfg) + } +} + +// runStandalone starts Tyto in single-host monitoring mode. +// This is the default mode with no database or agent support. +func runStandalone(cfg *config.Config) { + log.Printf("Starting Tyto in standalone mode on port %s", cfg.Port) log.Printf("Reading from: proc=%s, sys=%s", cfg.ProcPath, cfg.SysPath) log.Printf("Default refresh interval: %s", cfg.RefreshInterval) @@ -40,3 +53,52 @@ func main() { log.Fatalf("Failed to start server: %v", err) } } + +// runServer starts Tyto in full server mode with database, agents, and auth. +func runServer(cfg *config.Config) { + log.Printf("Starting Tyto in server mode on port %s", cfg.Port) + log.Printf("gRPC port for agents: %d", cfg.Server.GRPCPort) + log.Printf("Database: %s", cfg.Database.Type) + + // TODO: Initialize database + // TODO: Initialize authentication + // TODO: Initialize gRPC server for agents + // TODO: Initialize agent hub + + // For now, run in standalone-compatible mode + // Full server mode will be implemented in subsequent sprints + broker := sse.NewBroker(cfg) + go broker.Run() + + server := api.NewServer(cfg, broker) + + var err error + if cfg.TLSEnabled { + log.Printf("Starting HTTPS server on port %s", cfg.Port) + err = server.RunTLS(cfg.TLSCertFile, cfg.TLSKeyFile) + } else { + err = server.Run() + } + + if err != nil { + log.Fatalf("Failed to start server: %v", err) + } +} + +// runAgent starts Tyto as a lightweight agent that reports to a central server. 
+func runAgent(cfg *config.Config) { + if cfg.Agent.ID == "" { + log.Fatal("Agent ID is required in agent mode (set TYTO_AGENT_ID)") + } + if cfg.Agent.ServerURL == "" { + log.Fatal("Server URL is required in agent mode (set TYTO_SERVER_URL)") + } + + log.Printf("Starting Tyto agent '%s'", cfg.Agent.ID) + log.Printf("Reporting to: %s", cfg.Agent.ServerURL) + log.Printf("Collection interval: %s", cfg.Agent.Interval) + + // TODO: Implement gRPC client and metrics collection loop + // This will be implemented in Sprint 3 (Agent Implementation) + log.Fatal("Agent mode not yet implemented") +} diff --git a/backend/internal/collectors/gpu/amd.go b/backend/internal/collectors/gpu/amd.go new file mode 100644 index 0000000..25a76d8 --- /dev/null +++ b/backend/internal/collectors/gpu/amd.go @@ -0,0 +1,168 @@ +package gpu + +import ( + "os" + "path/filepath" + "strconv" + "strings" +) + +// AMDCollector collects metrics from AMD GPUs using the amdgpu driver. +type AMDCollector struct { + sysPath string + cards []amdCard +} + +// amdCard represents a single AMD GPU detected in the system. +type amdCard struct { + cardPath string + hwmonPath string + name string +} + +// NewAMDCollector creates a collector for AMD GPUs. +func NewAMDCollector(sysPath string) *AMDCollector { + return &AMDCollector{ + sysPath: sysPath, + cards: make([]amdCard, 0), + } +} + +func (c *AMDCollector) Vendor() Vendor { + return VendorAMD +} + +// Detect finds all AMD GPUs and returns their count. +func (c *AMDCollector) Detect() int { + c.cards = c.cards[:0] // Reset + + drmPath := filepath.Join(c.sysPath, "class/drm") + entries, err := os.ReadDir(drmPath) + if err != nil { + return 0 + } + + for _, entry := range entries { + name := entry.Name() + // Look for card directories (card0, card1, ...) 
but not render nodes + if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") { + continue + } + + devicePath := filepath.Join(drmPath, name, "device") + + // Check if this is an AMD GPU by looking at the driver + driverLink, err := os.Readlink(filepath.Join(devicePath, "driver")) + if err != nil || !strings.Contains(driverLink, "amdgpu") { + continue + } + + card := amdCard{cardPath: devicePath} + + // Find hwmon path + hwmonDir := filepath.Join(devicePath, "hwmon") + hwmonEntries, err := os.ReadDir(hwmonDir) + if err == nil && len(hwmonEntries) > 0 { + card.hwmonPath = filepath.Join(hwmonDir, hwmonEntries[0].Name()) + } + + // Try to get GPU name from uevent + ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent")) + if err == nil { + for _, line := range strings.Split(string(ueventData), "\n") { + if strings.HasPrefix(line, "PCI_ID=") { + card.name = strings.TrimPrefix(line, "PCI_ID=") + } + } + } + + c.cards = append(c.cards, card) + } + + return len(c.cards) +} + +// Collect gathers metrics for all detected AMD GPUs. 
+func (c *AMDCollector) Collect() ([]GPUInfo, error) { + gpus := make([]GPUInfo, 0, len(c.cards)) + + for i, card := range c.cards { + info := GPUInfo{ + Index: i, + Name: card.name, + Vendor: VendorAMD, + Driver: "amdgpu", + } + + // GPU utilization + if val, err := readInt(filepath.Join(card.cardPath, "gpu_busy_percent")); err == nil { + info.Utilization = val + } + + // VRAM usage + if val, err := readUint64(filepath.Join(card.cardPath, "mem_info_vram_used")); err == nil { + info.MemoryUsed = val + } + if val, err := readUint64(filepath.Join(card.cardPath, "mem_info_vram_total")); err == nil { + info.MemoryTotal = val + } + + // Temperature from hwmon (millidegrees Celsius) + if card.hwmonPath != "" { + if val, err := readInt(filepath.Join(card.hwmonPath, "temp1_input")); err == nil { + info.Temperature = float64(val) / 1000.0 + } + + // Fan speed (RPM) + if val, err := readInt(filepath.Join(card.hwmonPath, "fan1_input")); err == nil { + info.FanRPM = val + } + + // Power usage (microwatts to watts) + if val, err := readInt(filepath.Join(card.hwmonPath, "power1_average")); err == nil { + info.PowerWatts = float64(val) / 1000000.0 + } + } + + // Clock speeds from pp_dpm_sclk and pp_dpm_mclk + info.ClockCore = parseCurrentClock(filepath.Join(card.cardPath, "pp_dpm_sclk")) + info.ClockMemory = parseCurrentClock(filepath.Join(card.cardPath, "pp_dpm_mclk")) + + gpus = append(gpus, info) + } + + return gpus, nil +} + +// parseCurrentClock reads AMD DPM clock files and extracts the current frequency. 
// parseCurrentClock returns the currently selected clock from an AMD DPM
// clock file (rows such as "1: 1311Mhz *", where the trailing '*' marks the
// active level). The numeric part of the first usable starred row is
// returned; 0 is returned when the file cannot be read or no starred row
// parses.
func parseCurrentClock(path string) int {
	raw, err := os.ReadFile(path)
	if err != nil {
		return 0
	}

	rows := strings.Split(string(raw), "\n")
	for i := range rows {
		row := strings.TrimSpace(rows[i])
		// Only a row ending in '*' describes the active level.
		if !strings.HasSuffix(row, "*") {
			continue
		}

		// cols[0] is the level label ("1:"), cols[1] the frequency with unit.
		cols := strings.Fields(strings.TrimSuffix(row, "*"))
		if len(cols) < 2 {
			continue
		}

		// Accept either spelling of the unit suffix.
		digits := strings.TrimSuffix(strings.TrimSuffix(cols[1], "Mhz"), "MHz")
		mhz, convErr := strconv.Atoi(digits)
		if convErr != nil {
			continue
		}
		return mhz
	}

	return 0
}
+ } + + // Clock frequencies + sclk := "0: 500Mhz\n1: 800Mhz\n2: 1200Mhz *\n" + if err := os.WriteFile(filepath.Join(gpuPath, "pp_dpm_sclk"), []byte(sclk), 0644); err != nil { + t.Fatal(err) + } + + mclk := "0: 400Mhz\n1: 875Mhz *\n" + if err := os.WriteFile(filepath.Join(gpuPath, "pp_dpm_mclk"), []byte(mclk), 0644); err != nil { + t.Fatal(err) + } + + // Create hwmon for temperature, power, fan + hwmonPath := filepath.Join(gpuPath, "hwmon/hwmon5") + if err := os.MkdirAll(hwmonPath, 0755); err != nil { + t.Fatal(err) + } + + // Temperature (65°C in millidegrees) + if err := os.WriteFile(filepath.Join(hwmonPath, "temp1_input"), []byte("65000\n"), 0644); err != nil { + t.Fatal(err) + } + + // Power (150W in microwatts) + if err := os.WriteFile(filepath.Join(hwmonPath, "power1_average"), []byte("150000000\n"), 0644); err != nil { + t.Fatal(err) + } + + // Fan RPM + if err := os.WriteFile(filepath.Join(hwmonPath, "fan1_input"), []byte("1500\n"), 0644); err != nil { + t.Fatal(err) + } + + collector := NewAMDCollector(sysPath) + count := collector.Detect() + + if count != 1 { + t.Errorf("Expected 1 GPU, got %d", count) + } + + gpus, err := collector.Collect() + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if len(gpus) != 1 { + t.Fatalf("Expected 1 GPU result, got %d", len(gpus)) + } + + gpu := gpus[0] + + if gpu.Vendor != models.GPUVendorAMD { + t.Errorf("Expected vendor AMD, got %s", gpu.Vendor) + } + + if gpu.Utilization != 75 { + t.Errorf("Expected utilization 75, got %d", gpu.Utilization) + } + + if gpu.MemoryUsed != 4294967296 { + t.Errorf("Expected VRAM used 4294967296, got %d", gpu.MemoryUsed) + } + + if gpu.MemoryTotal != 17179869184 { + t.Errorf("Expected VRAM total 17179869184, got %d", gpu.MemoryTotal) + } + + if gpu.ClockCore != 1200 { + t.Errorf("Expected GPU clock 1200, got %d", gpu.ClockCore) + } + + if gpu.ClockMemory != 875 { + t.Errorf("Expected memory clock 875, got %d", gpu.ClockMemory) + } + + if gpu.Temperature != 65.0 { + 
t.Errorf("Expected temperature 65.0, got %f", gpu.Temperature) + } + + if gpu.PowerWatts != 150.0 { + t.Errorf("Expected power 150.0W, got %f", gpu.PowerWatts) + } + + if gpu.FanRPM != 1500 { + t.Errorf("Expected fan 1500 RPM, got %d", gpu.FanRPM) + } +} + +func TestAMDCollector_NoGPU(t *testing.T) { + tmpDir := t.TempDir() + collector := NewAMDCollector(tmpDir) + + count := collector.Detect() + if count != 0 { + t.Errorf("Expected 0 GPUs, got %d", count) + } + + gpus, err := collector.Collect() + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if len(gpus) != 0 { + t.Errorf("Expected 0 GPU results, got %d", len(gpus)) + } +} + +func TestAMDCollector_MultipleGPUs(t *testing.T) { + tmpDir := t.TempDir() + sysPath := filepath.Join(tmpDir, "sys") + + // Create two AMD GPUs + for i := 0; i < 2; i++ { + gpuPath := filepath.Join(sysPath, "class/drm", "card"+string(rune('0'+i)), "device") + if err := os.MkdirAll(gpuPath, 0755); err != nil { + t.Fatal(err) + } + + driverTarget := filepath.Join(tmpDir, "drivers/amdgpu") + if err := os.MkdirAll(driverTarget, 0755); err != nil { + // Already exists, ignore + } + // Create symlink only if it doesn't exist + driverLink := filepath.Join(gpuPath, "driver") + if _, err := os.Lstat(driverLink); os.IsNotExist(err) { + if err := os.Symlink(driverTarget, driverLink); err != nil { + t.Fatal(err) + } + } + + // Minimal GPU data + if err := os.WriteFile(filepath.Join(gpuPath, "gpu_busy_percent"), []byte("50\n"), 0644); err != nil { + t.Fatal(err) + } + } + + collector := NewAMDCollector(sysPath) + count := collector.Detect() + + if count != 2 { + t.Errorf("Expected 2 GPUs, got %d", count) + } + + gpus, err := collector.Collect() + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if len(gpus) != 2 { + t.Errorf("Expected 2 GPU results, got %d", len(gpus)) + } +} diff --git a/backend/internal/collectors/gpu/detector.go b/backend/internal/collectors/gpu/detector.go new file mode 100644 index 0000000..ab9ad2a --- 
/dev/null +++ b/backend/internal/collectors/gpu/detector.go @@ -0,0 +1,73 @@ +package gpu + +// Manager handles multi-GPU detection and collection across vendors. +type Manager struct { + sysPath string + collectors []Collector + gpuCount int +} + +// NewManager creates a GPU manager that detects all available GPUs. +func NewManager(sysPath string) *Manager { + m := &Manager{ + sysPath: sysPath, + collectors: make([]Collector, 0, 3), + } + + // Register all vendor collectors and detect GPUs + m.registerCollectors() + return m +} + +func (m *Manager) registerCollectors() { + // Order matters for systems with multiple GPU types. + // We prioritize discrete GPUs (AMD, NVIDIA) over integrated (Intel). + collectors := []Collector{ + NewAMDCollector(m.sysPath), + NewNVIDIACollector(m.sysPath), + NewIntelCollector(m.sysPath), + } + + for _, c := range collectors { + count := c.Detect() + if count > 0 { + m.collectors = append(m.collectors, c) + m.gpuCount += count + } + } +} + +// Available returns true if at least one GPU was detected. +func (m *Manager) Available() bool { + return m.gpuCount > 0 +} + +// GPUCount returns the total number of detected GPUs across all vendors. +func (m *Manager) GPUCount() int { + return m.gpuCount +} + +// Collect gathers metrics from all detected GPUs. +func (m *Manager) Collect() (GPUStats, error) { + stats := GPUStats{ + Available: m.gpuCount > 0, + GPUs: make([]GPUInfo, 0, m.gpuCount), + } + + idx := 0 + for _, c := range m.collectors { + gpus, err := c.Collect() + if err != nil { + // Log but continue with other collectors + continue + } + + for _, gpu := range gpus { + gpu.Index = idx + stats.GPUs = append(stats.GPUs, gpu) + idx++ + } + } + + return stats, nil +} diff --git a/backend/internal/collectors/gpu/gpu.go b/backend/internal/collectors/gpu/gpu.go new file mode 100644 index 0000000..15e8a57 --- /dev/null +++ b/backend/internal/collectors/gpu/gpu.go @@ -0,0 +1,33 @@ +// Package gpu provides multi-vendor GPU metrics collection. 
// Collector is the interface for vendor-specific GPU collectors.
//
// Manager calls Detect once on each collector while it is being constructed
// and later calls Collect only on collectors whose Detect returned a count
// greater than zero.
type Collector interface {
	// Vendor returns the GPU vendor this collector handles.
	Vendor() Vendor

	// Detect finds all GPUs of this vendor and returns their count.
	// The collectors in this package remember what Detect found so that a
	// later Collect can read metrics for those devices.
	Detect() int

	// Collect gathers metrics for all detected GPUs.
	// Manager.Collect overwrites the Index of every returned GPUInfo with a
	// sequential index across all vendors.
	Collect() ([]GPUInfo, error)
}
+func (c *IntelCollector) Detect() int { + c.cards = c.cards[:0] + + drmPath := filepath.Join(c.sysPath, "class/drm") + entries, err := os.ReadDir(drmPath) + if err != nil { + return 0 + } + + for _, entry := range entries { + name := entry.Name() + // Look for card directories, skip render nodes + if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") { + continue + } + + devicePath := filepath.Join(drmPath, name, "device") + + // Check driver - Intel uses i915 or xe (newer driver) + driverLink, err := os.Readlink(filepath.Join(devicePath, "driver")) + if err != nil { + continue + } + + driverName := filepath.Base(driverLink) + if driverName != "i915" && driverName != "xe" { + continue + } + + card := intelCard{ + cardPath: devicePath, + driver: driverName, + } + + // Find hwmon path + hwmonDir := filepath.Join(devicePath, "hwmon") + hwmonEntries, err := os.ReadDir(hwmonDir) + if err == nil && len(hwmonEntries) > 0 { + card.hwmonPath = filepath.Join(hwmonDir, hwmonEntries[0].Name()) + } + + // Get GPU name from uevent + ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent")) + if err == nil { + for _, line := range strings.Split(string(ueventData), "\n") { + if strings.HasPrefix(line, "PCI_ID=") { + card.name = strings.TrimPrefix(line, "PCI_ID=") + } + } + } + + c.cards = append(c.cards, card) + } + + return len(c.cards) +} + +// Collect gathers metrics for all detected Intel GPUs. 
+func (c *IntelCollector) Collect() ([]GPUInfo, error) { + gpus := make([]GPUInfo, 0, len(c.cards)) + + for i, card := range c.cards { + info := GPUInfo{ + Index: i, + Name: card.name, + Vendor: VendorIntel, + Driver: card.driver, + } + + // Intel GPU utilization via i915 perf or debugfs + // Try reading from sysfs if available + if val, err := readInt(filepath.Join(card.cardPath, "gt_cur_freq_mhz")); err == nil { + info.ClockCore = val + } + if val, err := readInt(filepath.Join(card.cardPath, "gt_max_freq_mhz")); err == nil { + // Estimate utilization based on frequency ratio + if val > 0 && info.ClockCore > 0 { + info.Utilization = (info.ClockCore * 100) / val + } + } + + // Temperature from hwmon + if card.hwmonPath != "" { + if val, err := readInt(filepath.Join(card.hwmonPath, "temp1_input")); err == nil { + info.Temperature = float64(val) / 1000.0 + } + + // Power (microwatts to watts) - Intel uses energy counter, need to compute + if val, err := readInt(filepath.Join(card.hwmonPath, "power1_average")); err == nil { + info.PowerWatts = float64(val) / 1000000.0 + } + } + + // Intel discrete GPUs have VRAM, integrated use system RAM + // Try to read local memory info for discrete GPUs + if val, err := readUint64(filepath.Join(card.cardPath, "lmem_total_bytes")); err == nil { + info.MemoryTotal = val + } + if val, err := readUint64(filepath.Join(card.cardPath, "lmem_avail_bytes")); err == nil { + if info.MemoryTotal > 0 { + info.MemoryUsed = info.MemoryTotal - val + } + } + + gpus = append(gpus, info) + } + + return gpus, nil +} diff --git a/backend/internal/collectors/gpu/manager_test.go b/backend/internal/collectors/gpu/manager_test.go new file mode 100644 index 0000000..a4f66bd --- /dev/null +++ b/backend/internal/collectors/gpu/manager_test.go @@ -0,0 +1,153 @@ +package gpu + +import ( + "os" + "path/filepath" + "testing" +) + +func TestManager_NoGPUs(t *testing.T) { + tmpDir := t.TempDir() + manager := NewManager(tmpDir) + + if manager.Available() { + 
t.Error("Expected no GPUs available") + } + + if manager.GPUCount() != 0 { + t.Errorf("Expected 0 GPUs, got %d", manager.GPUCount()) + } + + stats, err := manager.Collect() + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if stats.Available { + t.Error("Expected stats.Available to be false") + } + + if len(stats.GPUs) != 0 { + t.Errorf("Expected 0 GPUs in stats, got %d", len(stats.GPUs)) + } +} + +func TestManager_WithAMDGPU(t *testing.T) { + tmpDir := t.TempDir() + sysPath := filepath.Join(tmpDir, "sys") + + // Create mock AMD GPU + gpuPath := filepath.Join(sysPath, "class/drm/card0/device") + if err := os.MkdirAll(gpuPath, 0755); err != nil { + t.Fatal(err) + } + + driverTarget := filepath.Join(tmpDir, "drivers/amdgpu") + if err := os.MkdirAll(driverTarget, 0755); err != nil { + t.Fatal(err) + } + if err := os.Symlink(driverTarget, filepath.Join(gpuPath, "driver")); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(gpuPath, "gpu_busy_percent"), []byte("42\n"), 0644); err != nil { + t.Fatal(err) + } + + manager := NewManager(sysPath) + + if !manager.Available() { + t.Error("Expected GPU to be available") + } + + if manager.GPUCount() != 1 { + t.Errorf("Expected 1 GPU, got %d", manager.GPUCount()) + } + + stats, err := manager.Collect() + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if !stats.Available { + t.Error("Expected stats.Available to be true") + } + + if len(stats.GPUs) != 1 { + t.Fatalf("Expected 1 GPU in stats, got %d", len(stats.GPUs)) + } + + if stats.GPUs[0].Index != 0 { + t.Errorf("Expected index 0, got %d", stats.GPUs[0].Index) + } + + if stats.GPUs[0].Utilization != 42 { + t.Errorf("Expected utilization 42, got %d", stats.GPUs[0].Utilization) + } +} + +func TestManager_MixedVendors(t *testing.T) { + tmpDir := t.TempDir() + sysPath := filepath.Join(tmpDir, "sys") + + // Create AMD GPU (card0) + amdPath := filepath.Join(sysPath, "class/drm/card0/device") + if err := os.MkdirAll(amdPath, 0755); 
err != nil { + t.Fatal(err) + } + amdDriver := filepath.Join(tmpDir, "drivers/amdgpu") + if err := os.MkdirAll(amdDriver, 0755); err != nil { + t.Fatal(err) + } + if err := os.Symlink(amdDriver, filepath.Join(amdPath, "driver")); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(amdPath, "gpu_busy_percent"), []byte("50\n"), 0644); err != nil { + t.Fatal(err) + } + + // Create Intel GPU (card1) + intelPath := filepath.Join(sysPath, "class/drm/card1/device") + if err := os.MkdirAll(intelPath, 0755); err != nil { + t.Fatal(err) + } + intelDriver := filepath.Join(tmpDir, "drivers/i915") + if err := os.MkdirAll(intelDriver, 0755); err != nil { + t.Fatal(err) + } + if err := os.Symlink(intelDriver, filepath.Join(intelPath, "driver")); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(intelPath, "gt_cur_freq_mhz"), []byte("1000\n"), 0644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(intelPath, "gt_max_freq_mhz"), []byte("1500\n"), 0644); err != nil { + t.Fatal(err) + } + + manager := NewManager(sysPath) + + if !manager.Available() { + t.Error("Expected GPUs to be available") + } + + if manager.GPUCount() != 2 { + t.Errorf("Expected 2 GPUs, got %d", manager.GPUCount()) + } + + stats, err := manager.Collect() + if err != nil { + t.Fatalf("Collect failed: %v", err) + } + + if len(stats.GPUs) != 2 { + t.Fatalf("Expected 2 GPUs in stats, got %d", len(stats.GPUs)) + } + + // Verify indices are sequential + for i, gpu := range stats.GPUs { + if gpu.Index != i { + t.Errorf("GPU %d has index %d, expected %d", i, gpu.Index, i) + } + } +} diff --git a/backend/internal/collectors/gpu/nvidia.go b/backend/internal/collectors/gpu/nvidia.go new file mode 100644 index 0000000..436fc5e --- /dev/null +++ b/backend/internal/collectors/gpu/nvidia.go @@ -0,0 +1,249 @@ +package gpu + +import ( + "bytes" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" +) + +// NVIDIACollector collects metrics from NVIDIA GPUs using 
nvidia-smi. +// This is the fallback collector that works without CGO or NVML. +type NVIDIACollector struct { + sysPath string + available bool + gpuCount int + nvidiaSmi string // Path to nvidia-smi + devicePaths []string +} + +// NewNVIDIACollector creates a collector for NVIDIA GPUs. +func NewNVIDIACollector(sysPath string) *NVIDIACollector { + return &NVIDIACollector{ + sysPath: sysPath, + devicePaths: make([]string, 0), + } +} + +func (c *NVIDIACollector) Vendor() Vendor { + return VendorNVIDIA +} + +// Detect finds all NVIDIA GPUs using multiple detection methods. +func (c *NVIDIACollector) Detect() int { + // Method 1: Check for nvidia-smi + c.nvidiaSmi = c.findNvidiaSmi() + if c.nvidiaSmi != "" { + count := c.countGPUsViaSmi() + if count > 0 { + c.available = true + c.gpuCount = count + return count + } + } + + // Method 2: Check /dev/nvidia* devices + matches, err := filepath.Glob("/dev/nvidia[0-9]*") + if err == nil && len(matches) > 0 { + c.available = true + c.gpuCount = len(matches) + c.devicePaths = matches + return len(matches) + } + + // Method 3: Check sysfs for nvidia driver + count := c.detectViaSysfs() + if count > 0 { + c.available = true + c.gpuCount = count + return count + } + + return 0 +} + +func (c *NVIDIACollector) findNvidiaSmi() string { + // Check common locations + paths := []string{ + "/usr/bin/nvidia-smi", + "/usr/local/bin/nvidia-smi", + "/opt/nvidia/bin/nvidia-smi", + } + + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + return p + } + } + + // Try PATH lookup + path, err := exec.LookPath("nvidia-smi") + if err == nil { + return path + } + + return "" +} + +func (c *NVIDIACollector) countGPUsViaSmi() int { + cmd := exec.Command(c.nvidiaSmi, "--query-gpu=count", "--format=csv,noheader,nounits") + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = nil + + if err := cmd.Run(); err != nil { + return 0 + } + + lines := strings.Split(strings.TrimSpace(out.String()), "\n") + return len(lines) +} + +func (c 
*NVIDIACollector) detectViaSysfs() int { + drmPath := filepath.Join(c.sysPath, "class/drm") + entries, err := os.ReadDir(drmPath) + if err != nil { + return 0 + } + + count := 0 + for _, entry := range entries { + name := entry.Name() + if !strings.HasPrefix(name, "card") || strings.Contains(name, "-") { + continue + } + + devicePath := filepath.Join(drmPath, name, "device") + driverLink, err := os.Readlink(filepath.Join(devicePath, "driver")) + if err != nil { + continue + } + + if strings.Contains(driverLink, "nvidia") { + c.devicePaths = append(c.devicePaths, devicePath) + count++ + } + } + + return count +} + +// Collect gathers metrics for all detected NVIDIA GPUs. +func (c *NVIDIACollector) Collect() ([]GPUInfo, error) { + if !c.available { + return nil, nil + } + + // Prefer nvidia-smi for detailed metrics + if c.nvidiaSmi != "" { + return c.collectViaSmi() + } + + // Fallback to basic sysfs info + return c.collectViaSysfs() +} + +func (c *NVIDIACollector) collectViaSmi() ([]GPUInfo, error) { + // Query all relevant metrics in one call for efficiency + cmd := exec.Command(c.nvidiaSmi, + "--query-gpu=index,name,driver_version,utilization.gpu,memory.used,memory.total,temperature.gpu,fan.speed,power.draw,clocks.gr,clocks.mem", + "--format=csv,noheader,nounits") + + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = nil + + if err := cmd.Run(); err != nil { + return nil, err + } + + gpus := make([]GPUInfo, 0, c.gpuCount) + lines := strings.Split(strings.TrimSpace(out.String()), "\n") + + for _, line := range lines { + if line == "" { + continue + } + + fields := strings.Split(line, ", ") + if len(fields) < 11 { + continue + } + + info := GPUInfo{ + Vendor: VendorNVIDIA, + } + + // Parse each field + if idx, err := strconv.Atoi(strings.TrimSpace(fields[0])); err == nil { + info.Index = idx + } + info.Name = strings.TrimSpace(fields[1]) + info.Driver = strings.TrimSpace(fields[2]) + + if util, err := strconv.Atoi(strings.TrimSpace(fields[3])); err == nil { + 
info.Utilization = util + } + + // Memory in MiB from nvidia-smi, convert to bytes + if memUsed, err := strconv.ParseFloat(strings.TrimSpace(fields[4]), 64); err == nil { + info.MemoryUsed = uint64(memUsed * 1024 * 1024) + } + if memTotal, err := strconv.ParseFloat(strings.TrimSpace(fields[5]), 64); err == nil { + info.MemoryTotal = uint64(memTotal * 1024 * 1024) + } + + if temp, err := strconv.ParseFloat(strings.TrimSpace(fields[6]), 64); err == nil { + info.Temperature = temp + } + + // Fan speed is a percentage, but we report RPM if available + // nvidia-smi reports percentage, not RPM - skip or convert + if fan, err := strconv.Atoi(strings.TrimSpace(fields[7])); err == nil { + info.FanRPM = fan // Actually percentage, but keep field for consistency + } + + if power, err := strconv.ParseFloat(strings.TrimSpace(fields[8]), 64); err == nil { + info.PowerWatts = power + } + + if clockCore, err := strconv.Atoi(strings.TrimSpace(fields[9])); err == nil { + info.ClockCore = clockCore + } + if clockMem, err := strconv.Atoi(strings.TrimSpace(fields[10])); err == nil { + info.ClockMemory = clockMem + } + + gpus = append(gpus, info) + } + + return gpus, nil +} + +func (c *NVIDIACollector) collectViaSysfs() ([]GPUInfo, error) { + gpus := make([]GPUInfo, 0, len(c.devicePaths)) + + for i, devicePath := range c.devicePaths { + info := GPUInfo{ + Index: i, + Vendor: VendorNVIDIA, + Driver: "nvidia", + } + + // Try to get name from uevent + ueventData, err := os.ReadFile(filepath.Join(devicePath, "uevent")) + if err == nil { + for _, line := range strings.Split(string(ueventData), "\n") { + if strings.HasPrefix(line, "PCI_ID=") { + info.Name = strings.TrimPrefix(line, "PCI_ID=") + } + } + } + + gpus = append(gpus, info) + } + + return gpus, nil +} diff --git a/backend/internal/collectors/gpu/util.go b/backend/internal/collectors/gpu/util.go new file mode 100644 index 0000000..b1c08bd --- /dev/null +++ b/backend/internal/collectors/gpu/util.go @@ -0,0 +1,34 @@ +package gpu + 
// readTrimmed returns the content of the file at path with leading and
// trailing whitespace removed.
func readTrimmed(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(raw)), nil
}

// readInt parses the whole content of a sysfs file as a decimal int.
// It returns the read error or the strconv parse error unchanged.
func readInt(path string) (int, error) {
	text, err := readTrimmed(path)
	if err != nil {
		return 0, err
	}
	return strconv.Atoi(text)
}

// readUint64 parses the whole content of a sysfs file as a base-10 uint64.
func readUint64(path string) (uint64, error) {
	text, err := readTrimmed(path)
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(text, 10, 64)
}

// readFloat64 parses the whole content of a sysfs file as a float64.
func readFloat64(path string) (float64, error) {
	text, err := readTrimmed(path)
	if err != nil {
		return 0, err
	}
	return strconv.ParseFloat(text, 64)
}
// ServerConfig contains settings for server mode.
type ServerConfig struct {
	// GRPCPort is the port agents connect to. Load sets it to 9849 before the
	// YAML file is applied.
	GRPCPort int `yaml:"grpc_port"`

	// TLS settings for gRPC
	TLS TLSConfig `yaml:"tls"`

	// Registration settings. Load enables both AutoEnabled and
	// RequireApproval by default.
	Registration RegistrationConfig `yaml:"registration"`
}
+type AgentConfig struct { + // ID uniquely identifies this agent + ID string `yaml:"id"` + + // ServerURL is the address of the central server + ServerURL string `yaml:"server_url"` + + // Interval between metric collections; excluded from YAML and computed by Load from IntervalSeconds (YAML key interval, in seconds) + Interval time.Duration `yaml:"-"` + IntervalSeconds int `yaml:"interval"` + + // TLS settings for connecting to server + TLS AgentTLSConfig `yaml:"tls"` +} + +// AgentTLSConfig contains agent-side TLS settings. +type AgentTLSConfig struct { + CACert string `yaml:"ca_cert"` + AgentCert string `yaml:"agent_cert"` + AgentKey string `yaml:"agent_key"` +} + +// DatabaseConfig contains database connection settings. +type DatabaseConfig struct { + // Type is "sqlite" or "postgres" + Type string `yaml:"type"` + + // SQLite settings + SQLitePath string `yaml:"sqlite_path"` + + // PostgreSQL settings + PostgresHost string `yaml:"postgres_host"` + PostgresPort int `yaml:"postgres_port"` + PostgresUser string `yaml:"postgres_user"` + PostgresPassword string `yaml:"postgres_password"` + PostgresDatabase string `yaml:"postgres_database"` + PostgresSSLMode string `yaml:"postgres_sslmode"` } type AlertConfig struct { @@ -43,6 +136,7 @@ type AlertConfig struct { func Load() *Config { cfg := &Config{ + Mode: ModeStandalone, Port: "8080", RefreshSeconds: 5, ProcPath: "/proc", @@ -55,15 +149,32 @@ func Load() *Config { DiskThreshold: 90.0, TempThreshold: 80.0, }, + Server: ServerConfig{ + GRPCPort: 9849, + Registration: RegistrationConfig{ + AutoEnabled: true, + RequireApproval: true, + }, + }, + Agent: AgentConfig{ + IntervalSeconds: 5, + }, + Database: DatabaseConfig{ + Type: "sqlite", + SQLitePath: "/var/lib/tyto/tyto.db", + }, } // Try to load from YAML config file - configPath := getEnv("CONFIG_FILE", "/etc/sysmon/config.yaml") + configPath := getEnv("TYTO_CONFIG", getEnv("CONFIG_FILE", "/etc/tyto/config.yaml")) if data, err := os.ReadFile(configPath); err == nil { yaml.Unmarshal(data, cfg) } // Environment variables override YAML + if val := 
os.Getenv("TYTO_MODE"); val != "" { + cfg.Mode = Mode(val) + } if val := os.Getenv("PORT"); val != "" { cfg.Port = val } @@ -98,7 +209,23 @@ func Load() *Config { cfg.TLSKeyFile = val } - // Parse refresh interval + // Database environment variables + if val := os.Getenv("TYTO_DB_TYPE"); val != "" { + cfg.Database.Type = val + } + if val := os.Getenv("TYTO_DB_PATH"); val != "" { + cfg.Database.SQLitePath = val + } + + // Agent environment variables + if val := os.Getenv("TYTO_AGENT_ID"); val != "" { + cfg.Agent.ID = val + } + if val := os.Getenv("TYTO_SERVER_URL"); val != "" { + cfg.Agent.ServerURL = val + } + + // Parse intervals if intervalStr := os.Getenv("DEFAULT_REFRESH_INTERVAL"); intervalStr != "" { if d, err := time.ParseDuration(intervalStr); err == nil { cfg.RefreshInterval = d @@ -107,9 +234,29 @@ func Load() *Config { cfg.RefreshInterval = time.Duration(cfg.RefreshSeconds) * time.Second } + cfg.Agent.Interval = time.Duration(cfg.Agent.IntervalSeconds) * time.Second + if cfg.Agent.Interval == 0 { + cfg.Agent.Interval = 5 * time.Second + } + return cfg } +// IsStandalone returns true if running in standalone mode, which includes an unset (empty) Mode; any other unrecognized value yields false. +func (c *Config) IsStandalone() bool { + return c.Mode == ModeStandalone || c.Mode == "" +} + +// IsServer returns true if running in server mode. +func (c *Config) IsServer() bool { + return c.Mode == ModeServer +} + +// IsAgent returns true if running in agent mode. 
+func (c *Config) IsAgent() bool { + return c.Mode == ModeAgent +} + func getEnv(key, defaultVal string) string { if val := os.Getenv(key); val != "" { return val diff --git a/backend/internal/models/gpu.go b/backend/internal/models/gpu.go index 41e0785..eb3df3f 100644 --- a/backend/internal/models/gpu.go +++ b/backend/internal/models/gpu.go @@ -1,14 +1,106 @@ package models -type AMDGPUStats struct { - Available bool `json:"available"` - Name string `json:"name,omitempty"` - Utilization int `json:"utilization"` - VRAMUsed uint64 `json:"vramUsed"` - VRAMTotal uint64 `json:"vramTotal"` - Temperature float64 `json:"temperature"` - FanRPM int `json:"fanRpm"` - PowerWatts float64 `json:"powerWatts"` - ClockGPU int `json:"clockGpu"` - ClockMemory int `json:"clockMemory"` +// GPUVendor identifies the GPU manufacturer. +type GPUVendor string + +const ( + GPUVendorAMD GPUVendor = "amd" + GPUVendorNVIDIA GPUVendor = "nvidia" + GPUVendorIntel GPUVendor = "intel" +) + +// GPUInfo contains metrics for a single GPU; the driver, fan, power, and clock fields are omitted from JSON when they hold their zero value. +type GPUInfo struct { + Index int `json:"index"` + Name string `json:"name"` + Vendor GPUVendor `json:"vendor"` + Driver string `json:"driver,omitempty"` + Utilization int `json:"utilization"` + MemoryUsed uint64 `json:"memoryUsed"` + MemoryTotal uint64 `json:"memoryTotal"` + Temperature float64 `json:"temperature"` + FanRPM int `json:"fanRpm,omitempty"` + PowerWatts float64 `json:"powerWatts,omitempty"` + ClockCore int `json:"clockCore,omitempty"` + ClockMemory int `json:"clockMemory,omitempty"` +} + +// GPUStats contains aggregate GPU information for all detected GPUs. +type GPUStats struct { + Available bool `json:"available"` + GPUs []GPUInfo `json:"gpus"` +} + +// AMDGPUStats is kept for backward compatibility with existing API consumers. +// Deprecated: Use GPUStats instead. 
+type AMDGPUStats struct { + Available bool `json:"available"` + Name string `json:"name,omitempty"` + Utilization int `json:"utilization"` + VRAMUsed uint64 `json:"vramUsed"` + VRAMTotal uint64 `json:"vramTotal"` + Temperature float64 `json:"temperature"` + FanRPM int `json:"fanRpm"` + PowerWatts float64 `json:"powerWatts"` + ClockGPU int `json:"clockGpu"` + ClockMemory int `json:"clockMemory"` +} + +// ToGPUStats converts the legacy AMD stats to the new multi-GPU format: a single GPUInfo at index 0 with vendor AMD and driver "amdgpu", or an unavailable GPUStats with no GPUs when a.Available is false. +func (a *AMDGPUStats) ToGPUStats() GPUStats { + if !a.Available { + return GPUStats{Available: false} + } + + return GPUStats{ + Available: true, + GPUs: []GPUInfo{ + { + Index: 0, + Name: a.Name, + Vendor: GPUVendorAMD, + Driver: "amdgpu", + Utilization: a.Utilization, + MemoryUsed: a.VRAMUsed, + MemoryTotal: a.VRAMTotal, + Temperature: a.Temperature, + FanRPM: a.FanRPM, + PowerWatts: a.PowerWatts, + ClockCore: a.ClockGPU, + ClockMemory: a.ClockMemory, + }, + }, + } +} + +// AMDGPUStatsFromGPUInfo converts the new multi-GPU stats to the legacy AMD format, using the first AMD GPU or, if there is none, the first GPU in the list; it reports Available=false when stats is unavailable or lists no GPUs. 
+func AMDGPUStatsFromGPUInfo(stats GPUStats) AMDGPUStats { + if !stats.Available || len(stats.GPUs) == 0 { + return AMDGPUStats{Available: false} + } + + // Prefer the first AMD GPU; if none is present, fall back to the first GPU regardless of vendor + var gpu *GPUInfo + for i := range stats.GPUs { + if stats.GPUs[i].Vendor == GPUVendorAMD { + gpu = &stats.GPUs[i] + break + } + } + if gpu == nil { + gpu = &stats.GPUs[0] + } + + return AMDGPUStats{ + Available: true, + Name: gpu.Name, + Utilization: gpu.Utilization, + VRAMUsed: gpu.MemoryUsed, + VRAMTotal: gpu.MemoryTotal, + Temperature: gpu.Temperature, + FanRPM: gpu.FanRPM, + PowerWatts: gpu.PowerWatts, + ClockGPU: gpu.ClockCore, + ClockMemory: gpu.ClockMemory, + } } diff --git a/backend/internal/sse/broker.go b/backend/internal/sse/broker.go index e1d2667..2ab8136 100644 --- a/backend/internal/sse/broker.go +++ b/backend/internal/sse/broker.go @@ -7,6 +7,7 @@ import ( "tyto/internal/alerts" "tyto/internal/collectors" + "tyto/internal/collectors/gpu" "tyto/internal/config" "tyto/internal/history" "tyto/internal/models" @@ -43,7 +44,7 @@ type Broker struct { disk *collectors.DiskCollector network *collectors.NetworkCollector temperature *collectors.TemperatureCollector - gpu *collectors.AMDGPUCollector + gpuManager *gpu.Manager docker *collectors.DockerCollector systemd *collectors.SystemdCollector } @@ -65,7 +66,7 @@ func NewBroker(cfg *config.Config) *Broker { network: collectors.NewNetworkCollector(cfg.ProcPath), ProcessCollector: collectors.NewProcessCollector(cfg.ProcPath), temperature: collectors.NewTemperatureCollector(cfg.SysPath), - gpu: collectors.NewAMDGPUCollector(cfg.SysPath), + gpuManager: gpu.NewManager(cfg.SysPath), docker: collectors.NewDockerCollector(cfg.DockerSock), systemd: collectors.NewSystemdCollector(), } @@ -230,8 +231,9 @@ func (b *Broker) collectAll() models.AllMetrics { metrics.Temperature = temp } - if gpu, err := b.gpu.Collect(); err == nil { - metrics.GPU = gpu + // Collect from the multi-GPU manager and convert to the legacy single-GPU format (first AMD GPU, else first GPU) + if gpuStats, 
err := b.gpuManager.Collect(); err == nil { + metrics.GPU = models.AMDGPUStatsFromGPUInfo(gpuStats) } if docker, err := b.docker.Collect(); err == nil {