Agent Package (internal/agent/): - Agent struct with all collectors and memory-efficient pooling - Run loop with configurable collection interval - Graceful shutdown with context cancellation - Auto-reconnection callback for re-registration gRPC Client (internal/agent/client.go): - mTLS support with CA, agent cert, and key - Bidirectional streaming for metrics - Heartbeat fallback when streaming fails - Exponential backoff with jitter for reconnection - Concurrent reconnection handling with mutex Protocol Buffers (proto/tyto.proto): - AgentService with Stream, Register, Heartbeat RPCs - MetricsReport with summary fields for aggregation - ConfigUpdate and Command messages for server control - RegisterStatus enum for registration workflow CLI Integration (cmd/tyto/main.go): - Full agent subcommand with flag parsing - Support for --id, --server, --interval, --ca-cert, etc. - Environment variable overrides (TYTO_AGENT_*) - Signal handling for graceful shutdown Build System (Makefile): - Cross-compilation for linux/amd64, arm64, armv7 - Stripped binaries with version info - Proto generation target - Test and coverage targets Config Updates: - DefaultConfig() and LoadFromPath() functions - Agent config properly parsed from YAML 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
263 lines
6.1 KiB
Go
263 lines
6.1 KiB
Go
// Package agent implements a lightweight Tyto agent that collects metrics
|
|
// and reports them to a central server via gRPC.
|
|
package agent
|
|
|
|
import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"sync"
	"time"

	"tyto/internal/collectors"
	"tyto/internal/collectors/gpu"
	"tyto/internal/config"
	"tyto/internal/models"
	pb "tyto/internal/proto"
)
|
|
|
|
// Agent collects metrics and reports to a central server.
type Agent struct {
	config *config.Config
	client *Client // gRPC client; created in Run, nil before that

	// Collectors, one per metric domain; wired up by initCollectors.
	system      *collectors.SystemCollector
	cpu         *collectors.CPUCollector
	memory      *collectors.MemoryCollector
	disk        *collectors.DiskCollector
	network     *collectors.NetworkCollector
	process     *collectors.ProcessCollector
	temperature *collectors.TemperatureCollector
	gpuManager  *gpu.Manager
	docker      *collectors.DockerCollector
	systemd     *collectors.SystemdCollector

	// metricsPool recycles *models.AllMetrics values between collection
	// ticks so each cycle does not allocate a fresh snapshot struct.
	metricsPool sync.Pool

	// Control
	stopCh chan struct{}  // closed by Stop to end runLoop
	wg     sync.WaitGroup // tracks background goroutines (reconnection attempts)
}
|
|
|
|
// New creates a new agent with the given configuration.
|
|
func New(cfg *config.Config) *Agent {
|
|
a := &Agent{
|
|
config: cfg,
|
|
stopCh: make(chan struct{}),
|
|
metricsPool: sync.Pool{
|
|
New: func() interface{} {
|
|
return &models.AllMetrics{}
|
|
},
|
|
},
|
|
}
|
|
|
|
// Initialize collectors
|
|
a.initCollectors()
|
|
|
|
return a
|
|
}
|
|
|
|
func (a *Agent) initCollectors() {
|
|
cfg := a.config
|
|
|
|
a.system = collectors.NewSystemCollector(cfg.ProcPath)
|
|
a.cpu = collectors.NewCPUCollector(cfg.ProcPath, cfg.SysPath)
|
|
a.memory = collectors.NewMemoryCollector(cfg.ProcPath)
|
|
a.disk = collectors.NewDiskCollector(cfg.ProcPath, cfg.MtabPath)
|
|
a.network = collectors.NewNetworkCollector(cfg.ProcPath)
|
|
a.process = collectors.NewProcessCollector(cfg.ProcPath)
|
|
a.temperature = collectors.NewTemperatureCollector(cfg.SysPath)
|
|
a.gpuManager = gpu.NewManager(cfg.SysPath)
|
|
a.docker = collectors.NewDockerCollector(cfg.DockerSock)
|
|
a.systemd = collectors.NewSystemdCollector()
|
|
}
|
|
|
|
// Run starts the agent's main loop.
|
|
func (a *Agent) Run(ctx context.Context) error {
|
|
log.Printf("Agent %s starting...", a.config.Agent.ID)
|
|
|
|
// Create gRPC client
|
|
client, err := NewClient(a.config)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
a.client = client
|
|
|
|
// Set up reconnection callback to re-register
|
|
a.client.SetOnReconnect(func() {
|
|
log.Println("Reconnected to server, re-registering...")
|
|
if err := a.register(context.Background()); err != nil {
|
|
log.Printf("Re-registration failed: %v", err)
|
|
}
|
|
})
|
|
|
|
// Connect to server with retry
|
|
if err := a.client.ConnectWithRetry(ctx); err != nil {
|
|
return err
|
|
}
|
|
defer a.client.Close()
|
|
|
|
// Register with server
|
|
if err := a.register(ctx); err != nil {
|
|
log.Printf("Registration failed: %v", err)
|
|
// Continue anyway, server might accept unregistered agents
|
|
}
|
|
|
|
// Start collection loop
|
|
return a.runLoop(ctx)
|
|
}
|
|
|
|
func (a *Agent) register(ctx context.Context) error {
|
|
info := a.collectAgentInfo()
|
|
return a.client.Register(ctx, info)
|
|
}
|
|
|
|
func (a *Agent) collectAgentInfo() *pb.AgentInfo {
|
|
sysInfo, _ := a.system.Collect()
|
|
|
|
capabilities := []string{}
|
|
if a.gpuManager.Available() {
|
|
capabilities = append(capabilities, "gpu")
|
|
}
|
|
// Check docker availability
|
|
if dockerStats, err := a.docker.Collect(); err == nil && dockerStats.Available {
|
|
capabilities = append(capabilities, "docker")
|
|
}
|
|
// Check systemd availability
|
|
if systemdStats, err := a.systemd.Collect(); err == nil && systemdStats.Available {
|
|
capabilities = append(capabilities, "systemd")
|
|
}
|
|
|
|
return &pb.AgentInfo{
|
|
AgentId: a.config.Agent.ID,
|
|
Hostname: sysInfo.Hostname,
|
|
Os: sysInfo.OS,
|
|
Architecture: sysInfo.Architecture,
|
|
Version: "1.0.0", // TODO: Use build version
|
|
Capabilities: capabilities,
|
|
}
|
|
}
|
|
|
|
func (a *Agent) runLoop(ctx context.Context) error {
|
|
ticker := time.NewTicker(a.config.Agent.Interval)
|
|
defer ticker.Stop()
|
|
|
|
log.Printf("Starting collection loop (interval: %s)", a.config.Agent.Interval)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
log.Println("Agent stopping (context cancelled)")
|
|
return ctx.Err()
|
|
|
|
case <-a.stopCh:
|
|
log.Println("Agent stopping (stop signal)")
|
|
return nil
|
|
|
|
case <-ticker.C:
|
|
if err := a.collectAndSend(ctx); err != nil {
|
|
log.Printf("Collection/send error: %v", err)
|
|
// Don't return, keep trying
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (a *Agent) collectAndSend(ctx context.Context) error {
|
|
// Get metrics struct from pool
|
|
metrics := a.metricsPool.Get().(*models.AllMetrics)
|
|
defer a.metricsPool.Put(metrics)
|
|
|
|
// Reset metrics
|
|
*metrics = models.AllMetrics{
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
// Collect all metrics
|
|
a.collect(metrics)
|
|
|
|
// Send to server
|
|
err := a.client.SendMetrics(ctx, a.config.Agent.ID, metrics)
|
|
if err != nil {
|
|
// Check if we need to reconnect
|
|
if !a.client.IsConnected() {
|
|
log.Println("Connection lost, attempting reconnection...")
|
|
a.wg.Add(1)
|
|
go func() {
|
|
defer a.wg.Done()
|
|
if reconnErr := a.client.Reconnect(ctx); reconnErr != nil {
|
|
log.Printf("Reconnection failed: %v", reconnErr)
|
|
}
|
|
}()
|
|
}
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (a *Agent) collect(m *models.AllMetrics) {
|
|
// System info
|
|
if sys, err := a.system.Collect(); err == nil {
|
|
m.System = sys
|
|
}
|
|
|
|
// CPU
|
|
if cpu, err := a.cpu.Collect(); err == nil {
|
|
m.CPU = cpu
|
|
}
|
|
|
|
// Memory
|
|
if mem, err := a.memory.Collect(); err == nil {
|
|
m.Memory = mem
|
|
}
|
|
|
|
// Disk
|
|
if disk, err := a.disk.Collect(); err == nil {
|
|
m.Disk = disk
|
|
}
|
|
|
|
// Network
|
|
if net, err := a.network.Collect(); err == nil {
|
|
m.Network = net
|
|
}
|
|
|
|
// Processes
|
|
if proc, err := a.process.Collect(); err == nil {
|
|
m.Processes = proc
|
|
}
|
|
|
|
// Temperature
|
|
if temp, err := a.temperature.Collect(); err == nil {
|
|
m.Temperature = temp
|
|
}
|
|
|
|
// GPU
|
|
if gpuStats, err := a.gpuManager.Collect(); err == nil {
|
|
m.GPU = models.AMDGPUStatsFromGPUInfo(gpuStats)
|
|
}
|
|
|
|
// Docker
|
|
if docker, err := a.docker.Collect(); err == nil {
|
|
m.Docker = docker
|
|
}
|
|
|
|
// Systemd
|
|
if systemd, err := a.systemd.Collect(); err == nil {
|
|
m.Systemd = systemd
|
|
}
|
|
}
|
|
|
|
// Stop signals the agent to stop and waits for cleanup.
//
// It closes stopCh (which ends runLoop) and then blocks until all
// background goroutines (reconnection attempts) have finished.
// NOTE(review): closing an already-closed channel panics, so Stop must be
// called at most once; consider guarding with sync.Once if multiple
// callers are possible.
func (a *Agent) Stop() {
	close(a.stopCh)
	a.wg.Wait()
}
|
|
|
|
// SerializeMetrics converts metrics to JSON bytes.
|
|
func SerializeMetrics(m *models.AllMetrics) ([]byte, error) {
|
|
return json.Marshal(m)
|
|
}
|