Files
tyto/backend/internal/agent/agent.go
vikingowl 5e781c0e04 feat: implement lightweight agent with gRPC and mTLS support
Agent Package (internal/agent/):
- Agent struct with all collectors and memory-efficient pooling
- Run loop with configurable collection interval
- Graceful shutdown with context cancellation
- Auto-reconnection callback for re-registration

gRPC Client (internal/agent/client.go):
- mTLS support with CA, agent cert, and key
- Bidirectional streaming for metrics
- Heartbeat fallback when streaming fails
- Exponential backoff with jitter for reconnection
- Concurrent reconnection handling with mutex

Protocol Buffers (proto/tyto.proto):
- AgentService with Stream, Register, Heartbeat RPCs
- MetricsReport with summary fields for aggregation
- ConfigUpdate and Command messages for server control
- RegisterStatus enum for registration workflow

CLI Integration (cmd/tyto/main.go):
- Full agent subcommand with flag parsing
- Support for --id, --server, --interval, --ca-cert, etc.
- Environment variable overrides (TYTO_AGENT_*)
- Signal handling for graceful shutdown

Build System (Makefile):
- Cross-compilation for linux/amd64, arm64, armv7
- Stripped binaries with version info
- Proto generation target
- Test and coverage targets

Config Updates:
- DefaultConfig() and LoadFromPath() functions
- Agent config properly parsed from YAML

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 07:42:44 +01:00

263 lines
6.1 KiB
Go

// Package agent implements a lightweight Tyto agent that collects metrics
// and reports them to a central server via gRPC.
package agent
import (
"context"
"encoding/json"
"log"
"sync"
"time"
"tyto/internal/collectors"
"tyto/internal/collectors/gpu"
"tyto/internal/config"
"tyto/internal/models"
pb "tyto/internal/proto"
)
// Agent gathers metrics from the local collectors and reports them to a
// central server through its gRPC client.
type Agent struct {
	// config is the configuration the agent was created with.
	config *config.Config
	// client is the server connection; it is assigned by Run.
	client *Client

	// One collector per metric source. All of them are created by
	// initCollectors and queried on every collection cycle.
	system      *collectors.SystemCollector
	cpu         *collectors.CPUCollector
	memory      *collectors.MemoryCollector
	disk        *collectors.DiskCollector
	network     *collectors.NetworkCollector
	process     *collectors.ProcessCollector
	temperature *collectors.TemperatureCollector
	gpuManager  *gpu.Manager
	docker      *collectors.DockerCollector
	systemd     *collectors.SystemdCollector

	// metricsPool recycles *models.AllMetrics values between collection
	// cycles.
	metricsPool sync.Pool

	// stopCh is closed by Stop to make the run loop return.
	stopCh chan struct{}
	// wg tracks the background reconnection goroutines so Stop can wait for
	// them.
	wg sync.WaitGroup
}
// New builds an Agent from cfg.
//
// The returned agent has its stop channel allocated, a metrics pool that
// hands out empty *models.AllMetrics values, and every collector created.
// The gRPC client is not created here; Run does that.
func New(cfg *config.Config) *Agent {
	ag := new(Agent)
	ag.config = cfg
	ag.stopCh = make(chan struct{})
	ag.metricsPool = sync.Pool{
		New: func() interface{} {
			return new(models.AllMetrics)
		},
	}

	ag.initCollectors()
	return ag
}
// initCollectors creates every collector the agent uses, taking the proc,
// sys, mtab and Docker socket locations from the agent's configuration.
func (a *Agent) initCollectors() {
	procPath := a.config.ProcPath
	sysPath := a.config.SysPath

	a.system = collectors.NewSystemCollector(procPath)
	a.cpu = collectors.NewCPUCollector(procPath, sysPath)
	a.memory = collectors.NewMemoryCollector(procPath)
	a.disk = collectors.NewDiskCollector(procPath, a.config.MtabPath)
	a.network = collectors.NewNetworkCollector(procPath)
	a.process = collectors.NewProcessCollector(procPath)
	a.temperature = collectors.NewTemperatureCollector(sysPath)
	a.gpuManager = gpu.NewManager(sysPath)
	a.docker = collectors.NewDockerCollector(a.config.DockerSock)
	a.systemd = collectors.NewSystemdCollector()
}
// Run starts the agent's main loop.
//
// It creates the gRPC client from the agent's configuration, installs a
// reconnect callback that re-registers the agent, calls ConnectWithRetry,
// attempts an initial registration, and then runs the collection loop. A
// failed registration is logged but does not stop the agent. The client is
// closed when Run returns.
//
// Run returns the error from client creation or from ConnectWithRetry if
// either fails; otherwise it returns whatever runLoop returns.
func (a *Agent) Run(ctx context.Context) error {
	log.Printf("Agent %s starting...", a.config.Agent.ID)

	// Create gRPC client
	client, err := NewClient(a.config)
	if err != nil {
		return err
	}
	a.client = client

	// Set up reconnection callback to re-register. The callback uses Run's
	// ctx rather than context.Background() so that a re-registration RPC is
	// abandoned once the agent is shutting down instead of running with a
	// context that can never be cancelled.
	a.client.SetOnReconnect(func() {
		log.Println("Reconnected to server, re-registering...")
		if err := a.register(ctx); err != nil {
			log.Printf("Re-registration failed: %v", err)
		}
	})

	// Connect to server with retry
	if err := a.client.ConnectWithRetry(ctx); err != nil {
		return err
	}
	defer a.client.Close()

	// Register with server
	if err := a.register(ctx); err != nil {
		log.Printf("Registration failed: %v", err)
		// Continue anyway, server might accept unregistered agents
	}

	// Start collection loop
	return a.runLoop(ctx)
}
// register sends the agent's current AgentInfo to the server through the
// client and returns the client's result.
func (a *Agent) register(ctx context.Context) error {
	return a.client.Register(ctx, a.collectAgentInfo())
}
// collectAgentInfo builds the AgentInfo message used for registration.
//
// Hostname, OS and architecture come from the system collector. The
// capability list gets "gpu" when the GPU manager reports itself available,
// and "docker" / "systemd" when the respective collector succeeds and marks
// its result as available.
func (a *Agent) collectAgentInfo() *pb.AgentInfo {
	sysInfo, err := a.system.Collect()
	if err != nil {
		// Registration still goes ahead with whatever the collector
		// returned, but the failure is logged instead of being dropped.
		// NOTE(review): if Collect can return a nil pointer together with an
		// error, the field reads below would panic — confirm its contract.
		log.Printf("Collecting system info for registration failed: %v", err)
	}

	capabilities := []string{}
	if a.gpuManager.Available() {
		capabilities = append(capabilities, "gpu")
	}

	// Check docker availability
	if dockerStats, err := a.docker.Collect(); err == nil && dockerStats.Available {
		capabilities = append(capabilities, "docker")
	}

	// Check systemd availability
	if systemdStats, err := a.systemd.Collect(); err == nil && systemdStats.Available {
		capabilities = append(capabilities, "systemd")
	}

	return &pb.AgentInfo{
		AgentId:      a.config.Agent.ID,
		Hostname:     sysInfo.Hostname,
		Os:           sysInfo.OS,
		Architecture: sysInfo.Architecture,
		Version:      "1.0.0", // TODO: Use build version
		Capabilities: capabilities,
	}
}
// invalidIntervalError reports a collection interval that cannot be used to
// drive the collection ticker.
type invalidIntervalError struct {
	interval time.Duration
}

// Error implements the error interface.
func (e invalidIntervalError) Error() string {
	return "agent: collection interval must be positive, got " + e.interval.String()
}

// runLoop collects and sends metrics once per configured interval.
//
// It returns ctx.Err() when ctx is cancelled and nil when stopCh is closed.
// An error from a single collect/send cycle is logged and does not end the
// loop. A zero or negative interval is rejected with an error up front,
// because time.NewTicker panics on a non-positive duration.
func (a *Agent) runLoop(ctx context.Context) error {
	interval := a.config.Agent.Interval
	if interval <= 0 {
		return invalidIntervalError{interval: interval}
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	log.Printf("Starting collection loop (interval: %s)", interval)

	for {
		select {
		case <-ctx.Done():
			log.Println("Agent stopping (context cancelled)")
			return ctx.Err()
		case <-a.stopCh:
			log.Println("Agent stopping (stop signal)")
			return nil
		case <-ticker.C:
			if err := a.collectAndSend(ctx); err != nil {
				log.Printf("Collection/send error: %v", err)
				// Don't return, keep trying
			}
		}
	}
}
// collectAndSend performs one collection cycle.
//
// It takes an AllMetrics value from the pool, resets it to carry only the
// current timestamp, fills it via collect and sends it through the client.
// The value goes back to the pool when the function returns.
//
// When sending fails and the client reports that it is no longer connected,
// a reconnection attempt is started in a background goroutine tracked by
// a.wg. The send error is returned in either case.
//
// NOTE(review): returning the value to the pool on exit assumes SendMetrics
// does not keep the pointer after it returns — confirm against the client.
func (a *Agent) collectAndSend(ctx context.Context) error {
	snapshot := a.metricsPool.Get().(*models.AllMetrics)
	defer a.metricsPool.Put(snapshot)

	// Start from a clean value so nothing from the previous cycle leaks in.
	*snapshot = models.AllMetrics{Timestamp: time.Now()}
	a.collect(snapshot)

	sendErr := a.client.SendMetrics(ctx, a.config.Agent.ID, snapshot)
	if sendErr == nil {
		return nil
	}
	if a.client.IsConnected() {
		// Still connected: report the failure without reconnecting.
		return sendErr
	}

	log.Println("Connection lost, attempting reconnection...")
	a.wg.Add(1)
	go func() {
		defer a.wg.Done()
		if err := a.client.Reconnect(ctx); err != nil {
			log.Printf("Reconnection failed: %v", err)
		}
	}()
	return sendErr
}
// collect queries every collector in turn and stores each successful result
// in the matching field of m. A collector that returns an error is skipped,
// leaving its field in m untouched. GPU results are converted with
// models.AMDGPUStatsFromGPUInfo before being stored.
func (a *Agent) collect(m *models.AllMetrics) {
	if sysInfo, err := a.system.Collect(); err == nil {
		m.System = sysInfo
	}
	if cpuStats, err := a.cpu.Collect(); err == nil {
		m.CPU = cpuStats
	}
	if memStats, err := a.memory.Collect(); err == nil {
		m.Memory = memStats
	}
	if diskStats, err := a.disk.Collect(); err == nil {
		m.Disk = diskStats
	}
	if netStats, err := a.network.Collect(); err == nil {
		m.Network = netStats
	}
	if procStats, err := a.process.Collect(); err == nil {
		m.Processes = procStats
	}
	if tempStats, err := a.temperature.Collect(); err == nil {
		m.Temperature = tempStats
	}
	if gpuInfo, err := a.gpuManager.Collect(); err == nil {
		m.GPU = models.AMDGPUStatsFromGPUInfo(gpuInfo)
	}
	if dockerStats, err := a.docker.Collect(); err == nil {
		m.Docker = dockerStats
	}
	if systemdStats, err := a.systemd.Collect(); err == nil {
		m.Systemd = systemdStats
	}
}
// Stop signals the agent to stop and waits for cleanup.
//
// It closes stopCh, which makes the run loop return, and then blocks until
// every goroutine tracked by the agent's WaitGroup has finished. Calling
// Stop again after an earlier call has returned is safe: the channel is only
// closed once. Stop is not safe to call from several goroutines at the same
// time.
func (a *Agent) Stop() {
	select {
	case <-a.stopCh:
		// Already closed by an earlier Stop; closing a closed channel would
		// panic.
	default:
		close(a.stopCh)
	}
	a.wg.Wait()
}
// SerializeMetrics encodes m as JSON using encoding/json and returns the
// encoded bytes, or a nil slice and the marshalling error on failure.
func SerializeMetrics(m *models.AllMetrics) ([]byte, error) {
	encoded, err := json.Marshal(m)
	if err != nil {
		return nil, err
	}
	return encoded, nil
}