// Package agent implements a lightweight Tyto agent that collects metrics // and reports them to a central server via gRPC. package agent import ( "context" "encoding/json" "log" "sync" "time" "tyto/internal/collectors" "tyto/internal/collectors/gpu" "tyto/internal/config" "tyto/internal/models" pb "tyto/internal/proto" ) // Agent collects metrics and reports to a central server. type Agent struct { config *config.Config client *Client // Collectors system *collectors.SystemCollector cpu *collectors.CPUCollector memory *collectors.MemoryCollector disk *collectors.DiskCollector network *collectors.NetworkCollector process *collectors.ProcessCollector temperature *collectors.TemperatureCollector gpuManager *gpu.Manager docker *collectors.DockerCollector systemd *collectors.SystemdCollector // Pooling for memory efficiency metricsPool sync.Pool // Control stopCh chan struct{} wg sync.WaitGroup } // New creates a new agent with the given configuration. func New(cfg *config.Config) *Agent { a := &Agent{ config: cfg, stopCh: make(chan struct{}), metricsPool: sync.Pool{ New: func() interface{} { return &models.AllMetrics{} }, }, } // Initialize collectors a.initCollectors() return a } func (a *Agent) initCollectors() { cfg := a.config a.system = collectors.NewSystemCollector(cfg.ProcPath) a.cpu = collectors.NewCPUCollector(cfg.ProcPath, cfg.SysPath) a.memory = collectors.NewMemoryCollector(cfg.ProcPath) a.disk = collectors.NewDiskCollector(cfg.ProcPath, cfg.MtabPath) a.network = collectors.NewNetworkCollector(cfg.ProcPath) a.process = collectors.NewProcessCollector(cfg.ProcPath) a.temperature = collectors.NewTemperatureCollector(cfg.SysPath) a.gpuManager = gpu.NewManager(cfg.SysPath) a.docker = collectors.NewDockerCollector(cfg.DockerSock) a.systemd = collectors.NewSystemdCollector() } // Run starts the agent's main loop. func (a *Agent) Run(ctx context.Context) error { log.Printf("Agent %s starting...", a.config.Agent.ID) // Create gRPC client client, err := NewClient(a.config) if err != nil { return err } a.client = client // Set up reconnection callback to re-register a.client.SetOnReconnect(func() { log.Println("Reconnected to server, re-registering...") if err := a.register(context.Background()); err != nil { log.Printf("Re-registration failed: %v", err) } }) // Connect to server with retry if err := a.client.ConnectWithRetry(ctx); err != nil { return err } defer a.client.Close() // Register with server if err := a.register(ctx); err != nil { log.Printf("Registration failed: %v", err) // Continue anyway, server might accept unregistered agents } // Start collection loop return a.runLoop(ctx) } func (a *Agent) register(ctx context.Context) error { info := a.collectAgentInfo() return a.client.Register(ctx, info) } func (a *Agent) collectAgentInfo() *pb.AgentInfo { sysInfo, _ := a.system.Collect() capabilities := []string{} if a.gpuManager.Available() { capabilities = append(capabilities, "gpu") } // Check docker availability if dockerStats, err := a.docker.Collect(); err == nil && dockerStats.Available { capabilities = append(capabilities, "docker") } // Check systemd availability if systemdStats, err := a.systemd.Collect(); err == nil && systemdStats.Available { capabilities = append(capabilities, "systemd") } return &pb.AgentInfo{ AgentId: a.config.Agent.ID, Hostname: sysInfo.Hostname, Os: sysInfo.OS, Architecture: sysInfo.Architecture, Version: "1.0.0", // TODO: Use build version Capabilities: capabilities, } } func (a *Agent) runLoop(ctx context.Context) error { ticker := time.NewTicker(a.config.Agent.Interval) defer ticker.Stop() log.Printf("Starting collection loop (interval: %s)", a.config.Agent.Interval) for { select { case <-ctx.Done(): log.Println("Agent stopping (context cancelled)") return ctx.Err() case <-a.stopCh: log.Println("Agent stopping (stop signal)") return nil case <-ticker.C: if err := a.collectAndSend(ctx); err != nil { log.Printf("Collection/send error: %v", err) // Don't return, keep trying } } } } func (a *Agent) collectAndSend(ctx context.Context) error { // Get metrics struct from pool metrics := a.metricsPool.Get().(*models.AllMetrics) defer a.metricsPool.Put(metrics) // Reset metrics *metrics = models.AllMetrics{ Timestamp: time.Now(), } // Collect all metrics a.collect(metrics) // Send to server err := a.client.SendMetrics(ctx, a.config.Agent.ID, metrics) if err != nil { // Check if we need to reconnect if !a.client.IsConnected() { log.Println("Connection lost, attempting reconnection...") a.wg.Add(1) go func() { defer a.wg.Done() if reconnErr := a.client.Reconnect(ctx); reconnErr != nil { log.Printf("Reconnection failed: %v", reconnErr) } }() } return err } return nil } func (a *Agent) collect(m *models.AllMetrics) { // System info if sys, err := a.system.Collect(); err == nil { m.System = sys } // CPU if cpu, err := a.cpu.Collect(); err == nil { m.CPU = cpu } // Memory if mem, err := a.memory.Collect(); err == nil { m.Memory = mem } // Disk if disk, err := a.disk.Collect(); err == nil { m.Disk = disk } // Network if net, err := a.network.Collect(); err == nil { m.Network = net } // Processes if proc, err := a.process.Collect(); err == nil { m.Processes = proc } // Temperature if temp, err := a.temperature.Collect(); err == nil { m.Temperature = temp } // GPU if gpuStats, err := a.gpuManager.Collect(); err == nil { m.GPU = models.AMDGPUStatsFromGPUInfo(gpuStats) } // Docker if docker, err := a.docker.Collect(); err == nil { m.Docker = docker } // Systemd if systemd, err := a.systemd.Collect(); err == nil { m.Systemd = systemd } } // Stop signals the agent to stop and waits for cleanup. func (a *Agent) Stop() { close(a.stopCh) a.wg.Wait() } // SerializeMetrics converts metrics to JSON bytes. func SerializeMetrics(m *models.AllMetrics) ([]byte, error) { return json.Marshal(m) }