vessel/backend/internal/api/fetcher.go
vikingowl ddce578833 feat: implement cross-chat RAG for project conversations
- Add embedding-based chat indexing for project conversations
- Chunk long messages (1500 chars with 200 overlap) for better coverage
- Index messages when leaving a conversation (background)
- Search indexed chat history with semantic similarity
- Show other project conversations with message count and summary status
- Include relevant chat snippets in project context for LLM
- Fix chunker infinite loop bug near end of text (sketched below)
- Fix curl encoding error with explicit Accept-Encoding header
- Add document previews to project knowledge base context
- Lower RAG threshold to 0.2 and increase topK to 10 for better recall
2026-01-07 18:06:49 +01:00
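
The chunking changes called out above live in the indexing code, not in fetcher.go. As a rough, hypothetical sketch of what fixed-size chunking with overlap and an end-of-text guard can look like (the 1500/200 values come from the bullet points; the function name and everything else are assumptions):

func chunkText(text string, chunkSize, overlap int) []string {
	// Hypothetical sketch, e.g. chunkSize = 1500, overlap = 200.
	var chunks []string
	for start := 0; start < len(text); {
		end := start + chunkSize
		if end >= len(text) {
			// Final chunk: take the rest and stop, rather than stepping
			// back by the overlap and looping forever near the end of text.
			chunks = append(chunks, text[start:])
			break
		}
		chunks = append(chunks, text[start:end])
		start = end - overlap // requires chunkSize > overlap to keep advancing
	}
	return chunks
}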

703 lines · 19 KiB · Go

package api
import (
"bytes"
"context"
"fmt"
"io"
"log"
"net/http"
"net/http/cookiejar"
"os/exec"
"regexp"
"strings"
"sync"
"time"
"github.com/chromedp/chromedp"
)
// FetchMethod represents the method used to fetch URLs
type FetchMethod string
const (
FetchMethodCurl FetchMethod = "curl"
FetchMethodWget FetchMethod = "wget"
FetchMethodChrome FetchMethod = "chrome"
FetchMethodNative FetchMethod = "native"
)
// FetchResult contains the result of a URL fetch
type FetchResult struct {
Content string
ContentType string
FinalURL string
StatusCode int
Method FetchMethod
Truncated bool // True if content was truncated due to MaxLength
OriginalSize int // Original size before truncation (0 if not truncated)
}
// FetchOptions configures the fetch behavior
type FetchOptions struct {
MaxLength int
Timeout time.Duration
UserAgent string
Headers map[string]string
FollowRedirects bool
// ForceHeadless forces the headless browser even when curl/wget would succeed
ForceHeadless bool
// WaitForSelector waits for a specific CSS selector before capturing content
WaitForSelector string
// WaitTime is additional time to wait for JS to render (default 2s for headless)
WaitTime time.Duration
}
// DefaultFetchOptions returns sensible defaults
func DefaultFetchOptions() FetchOptions {
return FetchOptions{
MaxLength: 500000, // 500KB
Timeout: 30 * time.Second,
UserAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
Headers: make(map[string]string),
FollowRedirects: true,
WaitTime: 2 * time.Second,
}
}
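// A caller targeting a JS-heavy page might start from these defaults and
// override a few fields (illustrative sketch; the selector is hypothetical):
//
//	opts := DefaultFetchOptions()
//	opts.ForceHeadless = true         // skip curl/wget, go straight to headless Chrome
//	opts.WaitForSelector = "#content" // wait for this element before capturing
//	opts.WaitTime = 5 * time.Second   // extra time for client-side rendering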
// Fetcher provides URL fetching with multiple backend support
type Fetcher struct {
curlPath string
wgetPath string
wgetIsBusyBox bool // BusyBox wget has limited options
chromePath string
httpClient *http.Client
method FetchMethod
hasChrome bool
mu sync.RWMutex
// chromedp allocator context (reused for efficiency)
allocCtx context.Context
allocCancel context.CancelFunc
}
var (
globalFetcher *Fetcher
fetcherOnce sync.Once
)
// GetFetcher returns the singleton Fetcher instance
func GetFetcher() *Fetcher {
fetcherOnce.Do(func() {
globalFetcher = NewFetcher()
})
return globalFetcher
}
// NewFetcher creates a new Fetcher, detecting available tools
func NewFetcher() *Fetcher {
f := &Fetcher{}
f.detectTools()
f.initHTTPClient()
f.initChromeDp()
return f
}
// detectTools checks which external tools are available
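// Preference order for the primary method is curl, then wget, then the native
// Go HTTP client; Chrome is recorded separately as an optional upgrade for
// JS-heavy pages.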
func (f *Fetcher) detectTools() {
f.mu.Lock()
defer f.mu.Unlock()
// Check for curl
if path, err := exec.LookPath("curl"); err == nil {
f.curlPath = path
f.method = FetchMethodCurl
}
// Check for wget
if path, err := exec.LookPath("wget"); err == nil {
f.wgetPath = path
// Check if it's BusyBox wget (has limited options)
versionCmd := exec.Command(path, "--version")
versionOut, _ := versionCmd.CombinedOutput()
f.wgetIsBusyBox = strings.Contains(string(versionOut), "BusyBox")
if f.wgetIsBusyBox {
log.Printf("[Fetcher] Found BusyBox wget (limited options)")
}
if f.method == "" {
f.method = FetchMethodWget
}
}
// Check for Chrome/Chromium (for headless browser support)
chromePaths := []string{
"google-chrome",
"google-chrome-stable",
"chromium",
"chromium-browser",
"/usr/bin/google-chrome",
"/usr/bin/chromium",
"/usr/bin/chromium-browser",
"/snap/bin/chromium",
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
}
for _, p := range chromePaths {
if path, err := exec.LookPath(p); err == nil {
f.chromePath = path
f.hasChrome = true
log.Printf("[Fetcher] Found Chrome at: %s", path)
break
}
}
// Fall back to native if nothing else available
if f.method == "" {
f.method = FetchMethodNative
}
}
// initHTTPClient sets up the native Go HTTP client with cookie support
func (f *Fetcher) initHTTPClient() {
jar, _ := cookiejar.New(nil)
f.httpClient = &http.Client{
Jar: jar,
Timeout: 30 * time.Second,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
}
// initChromeDp initializes the chromedp allocator if Chrome is available
func (f *Fetcher) initChromeDp() {
if !f.hasChrome {
return
}
// Create a persistent allocator context for reuse
opts := append(chromedp.DefaultExecAllocatorOptions[:],
chromedp.Flag("headless", true),
chromedp.Flag("disable-gpu", true),
chromedp.Flag("no-sandbox", true),
chromedp.Flag("disable-dev-shm-usage", true),
chromedp.Flag("disable-extensions", true),
chromedp.Flag("disable-background-networking", true),
chromedp.Flag("disable-sync", true),
chromedp.Flag("disable-translate", true),
chromedp.Flag("mute-audio", true),
chromedp.Flag("hide-scrollbars", true),
chromedp.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"),
)
if f.chromePath != "" {
opts = append(opts, chromedp.ExecPath(f.chromePath))
}
f.allocCtx, f.allocCancel = chromedp.NewExecAllocator(context.Background(), opts...)
log.Printf("[Fetcher] Chrome headless browser initialized")
}
// Close cleans up resources
func (f *Fetcher) Close() {
if f.allocCancel != nil {
f.allocCancel()
}
}
// Method returns the current primary fetch method being used
func (f *Fetcher) Method() FetchMethod {
f.mu.RLock()
defer f.mu.RUnlock()
return f.method
}
// HasChrome returns whether headless Chrome is available
func (f *Fetcher) HasChrome() bool {
f.mu.RLock()
defer f.mu.RUnlock()
return f.hasChrome
}
// Fetch fetches a URL using the best available method
// For most sites, uses curl/wget. Falls back to headless browser for JS-heavy sites.
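//
// Typical usage (illustrative; error handling elided):
//
//	res, err := GetFetcher().Fetch(ctx, "https://example.com", DefaultFetchOptions())
//	if err == nil {
//		log.Printf("got %d bytes via %s (status %d)", len(res.Content), res.Method, res.StatusCode)
//	}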
func (f *Fetcher) Fetch(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
// If force headless is set and Chrome is available, use it directly
if opts.ForceHeadless && f.hasChrome {
return f.fetchWithChrome(ctx, url, opts)
}
// Try fast methods first
result, err := f.fetchFast(ctx, url, opts)
if err != nil {
return nil, err
}
// Check if content looks like a JS-rendered page that needs headless browser
if f.hasChrome && f.isJSRenderedPage(result.Content) {
log.Printf("[Fetcher] Content appears to be JS-rendered, trying headless browser for: %s", url)
headlessResult, headlessErr := f.fetchWithChrome(ctx, url, opts)
if headlessErr == nil && len(headlessResult.Content) > len(result.Content) {
return headlessResult, nil
}
// If headless failed or got less content, return original
if headlessErr != nil {
log.Printf("[Fetcher] Headless browser failed: %v, using original content", headlessErr)
}
}
return result, nil
}
// fetchFast tries curl, wget, or native HTTP in order
func (f *Fetcher) fetchFast(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
curlPath := f.curlPath
wgetPath := f.wgetPath
method := f.method
f.mu.RUnlock()
switch method {
case FetchMethodCurl:
return f.fetchWithCurl(ctx, url, curlPath, opts)
case FetchMethodWget:
return f.fetchWithWget(ctx, url, wgetPath, opts)
default:
return f.fetchNative(ctx, url, opts)
}
}
// isJSRenderedPage checks if the content appears to be a JS-rendered page
// that hasn't actually rendered its content yet
func (f *Fetcher) isJSRenderedPage(content string) bool {
// Very short content usually means the page still needs JS to render
if len(strings.TrimSpace(content)) < 500 {
return true
}
// Common patterns indicating JS-only rendering
jsPatterns := []string{
`<div id="root"></div>`,
`<div id="app"></div>`,
`<div id="__next"></div>`,
`<div id="__nuxt"></div>`,
`noscript`,
`"Loading..."`,
`"loading..."`,
`window.__INITIAL_STATE__`,
`window.__NUXT__`,
`window.__NEXT_DATA__`,
}
contentLower := strings.ToLower(content)
for _, pattern := range jsPatterns {
if strings.Contains(contentLower, strings.ToLower(pattern)) {
// Found JS pattern, but also check if there's substantial content
// Extract text content (very rough)
textContent := stripHTMLTags(content)
if len(strings.TrimSpace(textContent)) < 1000 {
return true
}
}
}
// Check for common documentation sites that need JS
jsHeavySites := []string{
"docs.rs",
"reactjs.org",
"vuejs.org",
"angular.io",
"nextjs.org",
"vercel.com",
"netlify.com",
}
for _, site := range jsHeavySites {
if strings.Contains(content, site) {
textContent := stripHTMLTags(content)
if len(strings.TrimSpace(textContent)) < 2000 {
return true
}
}
}
return false
}
// stripHTMLTags removes HTML tags from content (rough extraction)
func stripHTMLTags(content string) string {
// Remove script and style tags with their content
scriptRe := regexp.MustCompile(`(?is)<script[^>]*>.*?</script>`)
content = scriptRe.ReplaceAllString(content, "")
styleRe := regexp.MustCompile(`(?is)<style[^>]*>.*?</style>`)
content = styleRe.ReplaceAllString(content, "")
// Remove all remaining tags
tagRe := regexp.MustCompile(`<[^>]*>`)
content = tagRe.ReplaceAllString(content, " ")
// Collapse whitespace
spaceRe := regexp.MustCompile(`\s+`)
content = spaceRe.ReplaceAllString(content, " ")
return strings.TrimSpace(content)
}
// fetchWithChrome uses headless Chrome to fetch and render the page
func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
if !f.hasChrome || f.allocCtx == nil {
return nil, fmt.Errorf("headless Chrome not available")
}
// Create a timeout context
timeout := opts.Timeout
if timeout == 0 {
timeout = 30 * time.Second
}
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
// Create a new browser context from the allocator
browserCtx, browserCancel := chromedp.NewContext(f.allocCtx)
defer browserCancel()
var content string
var finalURL string
// Wait time for JS to render
waitTime := opts.WaitTime
if waitTime == 0 {
waitTime = 2 * time.Second
}
// Build the actions
actions := []chromedp.Action{
chromedp.Navigate(url),
}
// Wait for specific selector if provided
if opts.WaitForSelector != "" {
actions = append(actions, chromedp.WaitVisible(opts.WaitForSelector, chromedp.ByQuery))
} else {
// Default: wait for body to be visible and give JS time to render
actions = append(actions,
chromedp.WaitVisible("body", chromedp.ByQuery),
chromedp.Sleep(waitTime),
)
}
// Get the final URL and content
actions = append(actions,
chromedp.Location(&finalURL),
chromedp.OuterHTML("html", &content, chromedp.ByQuery),
)
// Execute
if err := chromedp.Run(browserCtx, actions...); err != nil {
return nil, fmt.Errorf("chromedp failed: %w", err)
}
// Truncate if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: "text/html",
FinalURL: finalURL,
StatusCode: 200,
Method: FetchMethodChrome,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchWithCurl uses curl to fetch the URL
func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string, opts FetchOptions) (*FetchResult, error) {
args := []string{
"-sS", // Silent but show errors
"-L", // Follow redirects
"--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"-A", opts.UserAgent, // User agent
"-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata
"--compressed", // Automatically decompress responses
}
// Add custom headers
for key, value := range opts.Headers {
args = append(args, "-H", fmt.Sprintf("%s: %s", key, value))
}
// Add common headers for better compatibility
// Override Accept-Encoding to only include widely-supported formats
// This prevents errors when servers return zstd/br that curl may not support
args = append(args,
"-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"-H", "Accept-Language: en-US,en;q=0.5",
"-H", "Accept-Encoding: gzip, deflate, identity",
"-H", "DNT: 1",
"-H", "Connection: keep-alive",
"-H", "Upgrade-Insecure-Requests: 1",
)
args = append(args, url)
cmd := exec.CommandContext(ctx, curlPath, args...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
// Check if it's a context cancellation
if ctx.Err() != nil {
return nil, ctx.Err()
}
return nil, fmt.Errorf("curl failed: %s - %s", err.Error(), stderr.String())
}
output := stdout.String()
// Parse the output - content and metadata are separated by ---CURL_INFO---
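// With the -w format above, the raw curl output looks roughly like (illustrative):
//
//	<html>...page body...</html>
//	---CURL_INFO---
//	text/html; charset=utf-8
//	https://example.com/
//	200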
parts := strings.Split(output, "\n---CURL_INFO---\n")
if len(parts) != 2 {
return nil, fmt.Errorf("unexpected curl output format")
}
content := parts[0]
metaLines := strings.Split(strings.TrimSpace(parts[1]), "\n")
if len(metaLines) < 3 {
return nil, fmt.Errorf("incomplete curl metadata")
}
contentType := metaLines[0]
finalURL := metaLines[1]
statusCode := 200
fmt.Sscanf(metaLines[2], "%d", &statusCode)
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: content,
ContentType: contentType,
FinalURL: finalURL,
StatusCode: statusCode,
Method: FetchMethodCurl,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchWithWget uses wget to fetch the URL
func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
isBusyBox := f.wgetIsBusyBox
f.mu.RUnlock()
var args []string
if isBusyBox {
// BusyBox wget has limited options - use short flags only
args = []string{
"-q", // Quiet
"-O", "-", // Output to stdout
"-T", fmt.Sprintf("%d", int(opts.Timeout.Seconds())), // Timeout
"-U", opts.UserAgent, // User agent
}
// BusyBox wget doesn't support custom headers or max-redirect
} else {
// GNU wget supports full options
args = []string{
"-q", // Quiet
"-O", "-", // Output to stdout
"--timeout", fmt.Sprintf("%d", int(opts.Timeout.Seconds())),
"--user-agent", opts.UserAgent,
"--max-redirect", "10", // Follow up to 10 redirects
"--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"--header", "Accept-Language: en-US,en;q=0.5",
}
// Add custom headers (GNU wget only)
for key, value := range opts.Headers {
args = append(args, "--header", fmt.Sprintf("%s: %s", key, value))
}
}
args = append(args, url)
cmd := exec.CommandContext(ctx, wgetPath, args...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if ctx.Err() != nil {
return nil, ctx.Err()
}
return nil, fmt.Errorf("wget failed: %s - %s", err.Error(), stderr.String())
}
content := stdout.String()
// Truncate content if needed
var truncated bool
var originalSize int
if len(content) > opts.MaxLength {
originalSize = len(content)
content = content[:opts.MaxLength]
truncated = true
}
// wget doesn't easily provide metadata, so we use defaults
return &FetchResult{
Content: content,
ContentType: "text/html", // Assume HTML (wget doesn't easily give us this)
FinalURL: url, // wget doesn't easily give us the final URL
StatusCode: 200,
Method: FetchMethodWget,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// fetchNative uses Go's native http.Client with enhanced capabilities
func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
// Create request with context
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
}
// Set headers
req.Header.Set("User-Agent", opts.UserAgent)
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
req.Header.Set("Accept-Encoding", "gzip, deflate")
req.Header.Set("DNT", "1")
req.Header.Set("Connection", "keep-alive")
req.Header.Set("Upgrade-Insecure-Requests", "1")
// Add custom headers
for key, value := range opts.Headers {
req.Header.Set(key, value)
}
// Create a client with custom timeout
client := &http.Client{
Jar: f.httpClient.Jar,
Timeout: opts.Timeout,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
if !opts.FollowRedirects {
return http.ErrUseLastResponse
}
if len(via) >= 10 {
return fmt.Errorf("too many redirects")
}
return nil
},
}
// Execute request
resp, err := client.Do(req)
if err != nil {
return nil, fmt.Errorf("request failed: %w", err)
}
defer resp.Body.Close()
// Read body with limit + 1 byte to detect truncation
body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1))
if err != nil {
return nil, fmt.Errorf("failed to read response: %w", err)
}
var truncated bool
var originalSize int
if len(body) > opts.MaxLength {
originalSize = len(body) // Note: the limited reader stops at MaxLength+1, so this is not the true original size
body = body[:opts.MaxLength]
truncated = true
}
return &FetchResult{
Content: string(body),
ContentType: resp.Header.Get("Content-Type"),
FinalURL: resp.Request.URL.String(),
StatusCode: resp.StatusCode,
Method: FetchMethodNative,
Truncated: truncated,
OriginalSize: originalSize,
}, nil
}
// FetchWithHeadless explicitly uses headless browser (for API use)
func (f *Fetcher) FetchWithHeadless(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
if !f.hasChrome {
return nil, fmt.Errorf("headless Chrome not available - Chrome/Chromium not found")
}
return f.fetchWithChrome(ctx, url, opts)
}
// TryFetchWithFallback attempts to fetch using all available methods
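// Methods are tried in order: curl, wget, the native HTTP client, and finally
// headless Chrome; the last error is returned if all of them fail.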
func (f *Fetcher) TryFetchWithFallback(ctx context.Context, url string, opts FetchOptions) (*FetchResult, error) {
f.mu.RLock()
curlPath := f.curlPath
wgetPath := f.wgetPath
hasChrome := f.hasChrome
f.mu.RUnlock()
var lastErr error
// Try curl first if available
if curlPath != "" {
result, err := f.fetchWithCurl(ctx, url, curlPath, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("curl: %w", err)
}
// Try wget if available
if wgetPath != "" {
result, err := f.fetchWithWget(ctx, url, wgetPath, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("wget: %w", err)
}
// Try native HTTP
result, err := f.fetchNative(ctx, url, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("native: %w", err)
// Last resort: try headless Chrome
if hasChrome {
result, err := f.fetchWithChrome(ctx, url, opts)
if err == nil {
return result, nil
}
lastErr = fmt.Errorf("chrome: %w", err)
}
return nil, fmt.Errorf("all fetch methods failed: %v", lastErr)
}