Files
gnoma/internal/security/sanitize.go
vikingowl 33dec722b8 feat: add security firewall with secret scanning and incognito mode
internal/security/ — core security layer baked into gnoma:
- Secret scanner: gitleaks-derived regex patterns (Anthropic, OpenAI,
  AWS, GitHub, GitLab, Slack, Stripe, private keys, DB URLs, generic
  secrets) + Shannon entropy detection for unknown formats
- Redactor: replaces matched secrets with [REDACTED], merges
  overlapping ranges, preserves surrounding context
- Unicode sanitizer: NFKC normalization, strips Cf/Co categories,
  tag characters (ASCII smuggling), zero-width chars, RTL overrides
- Incognito mode: suppresses persistence, learning, content logging
- Firewall: wraps engine, scans outgoing messages + system prompt +
  tool results before they reach the provider

Wired into engine and CLI. 21 security tests.
2026-04-03 14:07:50 +02:00

58 lines
1.3 KiB
Go

package security
import (
"strings"
"unicode"
"golang.org/x/text/unicode/norm"
)
// SanitizeUnicode removes potentially dangerous invisible Unicode characters
// from s to guard against ASCII smuggling and hidden prompt injection.
//
// The input is first NFKC-normalized, then every rune for which shouldStrip
// reports true is dropped: format (Cf) and private-use (Co) characters plus
// a few explicitly listed ranges. Removing a rune can bring a base character
// and a combining mark together (for example "e", U+200D, U+0301), which
// would leave the text un-normalized, so the result is normalized once more
// whenever anything was removed.
func SanitizeUnicode(s string) string {
	// Step 1: NFKC normalization folds compatibility forms before filtering.
	normalized := norm.NFKC.String(s)

	// Step 2: drop every rune the strip policy rejects.
	var b strings.Builder
	b.Grow(len(normalized))
	stripped := false
	for _, r := range normalized {
		if shouldStrip(r) {
			stripped = true
			continue
		}
		b.WriteRune(r)
	}
	out := b.String()

	// Step 3: restore NFKC form if stripping made new neighbours adjacent.
	if stripped {
		out = norm.NFKC.String(out)
	}
	return out
}
// shouldStrip reports whether r must be removed from sanitized text.
//
// Printable ASCII (U+0020–U+007E) and the whitespace controls \n, \t and \r
// are always kept. Beyond those, a rune is stripped when it is:
//   - a format character (category Cf), such as zero-width and bidi controls;
//   - a private-use character (category Co);
//   - in the Tags block U+E0000–U+E007F (used for ASCII smuggling);
//   - in the Specials range U+FFF0–U+FFFD (interlinear annotation, etc.);
//   - a Unicode noncharacter: U+FDD0–U+FDEF or the last two code points of
//     any plane (U+FFFE, U+FFFF, U+1FFFE, ...). These are permanently
//     unassigned and have no place in interchanged text.
//
// Every other rune, including the remaining control characters, is kept.
func shouldStrip(r rune) bool {
	switch {
	case r >= 0x20 && r <= 0x7E:
		return false // ASCII printable
	case r == '\n' || r == '\t' || r == '\r':
		return false // common whitespace
	case unicode.Is(unicode.Cf, r):
		return true // invisible formatting
	case unicode.Is(unicode.Co, r):
		return true // private use: unregistered characters
	case r >= 0xE0000 && r <= 0xE007F:
		return true // Unicode Tag characters (ASCII smuggling)
	case r >= 0xFFF0 && r <= 0xFFFD:
		return true // Specials (interlinear annotation, etc.)
	case r >= 0xFDD0 && r <= 0xFDEF:
		return true // noncharacters in the Arabic Presentation Forms-A block
	case r >= 0 && r&0xFFFE == 0xFFFE:
		return true // noncharacters U+nFFFE / U+nFFFF at the end of each plane
	}
	return false
}