diff --git a/backend/internal/api/fetcher.go b/backend/internal/api/fetcher.go index 46b98f2..4c589e8 100644 --- a/backend/internal/api/fetcher.go +++ b/backend/internal/api/fetcher.go @@ -29,11 +29,13 @@ const ( // FetchResult contains the result of a URL fetch type FetchResult struct { - Content string - ContentType string - FinalURL string - StatusCode int - Method FetchMethod + Content string + ContentType string + FinalURL string + StatusCode int + Method FetchMethod + Truncated bool // True if content was truncated due to MaxLength + OriginalSize int // Size seen before truncation (0 if not truncated); the native fetcher stops reading at MaxLength+1 bytes, so there it is only a lower bound } // FetchOptions configures the fetch behavior @@ -401,16 +403,22 @@ func (f *Fetcher) fetchWithChrome(ctx context.Context, url string, opts FetchOpt } // Truncate if needed + var truncated bool + var originalSize int if len(content) > opts.MaxLength { + originalSize = len(content) content = content[:opts.MaxLength] + truncated = true } return &FetchResult{ - Content: content, - ContentType: "text/html", - FinalURL: finalURL, - StatusCode: 200, - Method: FetchMethodChrome, + Content: content, + ContentType: "text/html", + FinalURL: finalURL, + StatusCode: 200, + Method: FetchMethodChrome, + Truncated: truncated, + OriginalSize: originalSize, }, nil } @@ -420,7 +428,6 @@ func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string "-sS", // Silent but show errors "-L", // Follow redirects "--max-time", fmt.Sprintf("%d", int(opts.Timeout.Seconds())), - "--max-filesize", fmt.Sprintf("%d", opts.MaxLength), "-A", opts.UserAgent, // User agent "-w", "\n---CURL_INFO---\n%{content_type}\n%{url_effective}\n%{http_code}", // Output metadata "--compressed", // Accept compressed responses @@ -476,16 +483,22 @@ func (f *Fetcher) fetchWithCurl(ctx context.Context, url string, curlPath string fmt.Sscanf(metaLines[2], "%d", &statusCode) // Truncate content if needed + var truncated bool + var originalSize int if len(content) > opts.MaxLength { + 
originalSize = len(content) content = content[:opts.MaxLength] + truncated = true } return &FetchResult{ - Content: content, - ContentType: contentType, - FinalURL: finalURL, - StatusCode: statusCode, - Method: FetchMethodCurl, + Content: content, + ContentType: contentType, + FinalURL: finalURL, + StatusCode: statusCode, + Method: FetchMethodCurl, + Truncated: truncated, + OriginalSize: originalSize, }, nil } @@ -541,17 +554,23 @@ func (f *Fetcher) fetchWithWget(ctx context.Context, url string, wgetPath string content := stdout.String() // Truncate content if needed + var truncated bool + var originalSize int if len(content) > opts.MaxLength { + originalSize = len(content) content = content[:opts.MaxLength] + truncated = true } // wget doesn't easily provide metadata, so we use defaults return &FetchResult{ - Content: content, - ContentType: "text/html", // Assume HTML (wget doesn't easily give us this) - FinalURL: url, // wget doesn't easily give us the final URL - StatusCode: 200, - Method: FetchMethodWget, + Content: content, + ContentType: "text/html", // Assume HTML (wget doesn't easily give us this) + FinalURL: url, // wget doesn't easily give us the final URL + StatusCode: 200, + Method: FetchMethodWget, + Truncated: truncated, + OriginalSize: originalSize, }, nil } @@ -599,18 +618,28 @@ func (f *Fetcher) fetchNative(ctx context.Context, url string, opts FetchOptions } defer resp.Body.Close() - // Read body with limit - body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength))) + // Read body with limit + 1 byte to detect truncation + body, err := io.ReadAll(io.LimitReader(resp.Body, int64(opts.MaxLength)+1)) if err != nil { return nil, fmt.Errorf("failed to read response: %w", err) } + var truncated bool + var originalSize int + if len(body) > opts.MaxLength { + originalSize = len(body) // Note: the reader is capped at MaxLength+1 bytes, so this is a lower bound, not the true original size + body = body[:opts.MaxLength] + truncated = true + } + return &FetchResult{ - Content: string(body), - 
ContentType: resp.Header.Get("Content-Type"), - FinalURL: resp.Request.URL.String(), - StatusCode: resp.StatusCode, - Method: FetchMethodNative, + Content: string(body), + ContentType: resp.Header.Get("Content-Type"), + FinalURL: resp.Request.URL.String(), + StatusCode: resp.StatusCode, + Method: FetchMethodNative, + Truncated: truncated, + OriginalSize: originalSize, }, nil } diff --git a/backend/internal/api/proxy.go b/backend/internal/api/proxy.go index 3cb576b..3e594bc 100644 --- a/backend/internal/api/proxy.go +++ b/backend/internal/api/proxy.go @@ -12,6 +12,7 @@ import ( type URLFetchRequest struct { URL string `json:"url" binding:"required"` MaxLength int `json:"maxLength"` + Timeout int `json:"timeout"` // Timeout in seconds } // URLFetchProxyHandler returns a handler that fetches URLs for the frontend @@ -42,10 +43,16 @@ func URLFetchProxyHandler() gin.HandlerFunc { // Set up fetch options opts := DefaultFetchOptions() - opts.Timeout = 30 * time.Second - // Set max length (default 500KB) - if req.MaxLength > 0 && req.MaxLength <= 500000 { + // Set timeout (default 30s; values outside 1-120s fall back to the default rather than being clamped) + if req.Timeout > 0 && req.Timeout <= 120 { + opts.Timeout = time.Duration(req.Timeout) * time.Second + } else { + opts.Timeout = 30 * time.Second + } + + // Set max length (default 500KB, max 2MB; out-of-range values keep the default rather than being clamped) + if req.MaxLength > 0 && req.MaxLength <= 2000000 { opts.MaxLength = req.MaxLength } @@ -66,13 +73,22 @@ func URLFetchProxyHandler() gin.HandlerFunc { } // Return the content - c.JSON(http.StatusOK, gin.H{ + response := gin.H{ "content": result.Content, "contentType": result.ContentType, "url": result.FinalURL, "status": result.StatusCode, "fetchMethod": string(result.Method), - }) + } + + // Include truncation info if content was truncated + if result.Truncated { + response["truncated"] = true + response["originalSize"] = result.OriginalSize + response["returnedSize"] = len(result.Content) + } + + c.JSON(http.StatusOK, response) } } diff --git a/frontend/src/lib/tools/builtin.ts 
b/frontend/src/lib/tools/builtin.ts index 49371a6..b044fae 100644 --- a/frontend/src/lib/tools/builtin.ts +++ b/frontend/src/lib/tools/builtin.ts @@ -291,13 +291,14 @@ interface FetchUrlArgs { url: string; extract?: 'text' | 'title' | 'links' | 'all'; maxLength?: number; + timeout?: number; } const fetchUrlDefinition: ToolDefinition = { type: 'function', function: { name: 'fetch_url', - description: 'Fetches and reads content from a specific URL. Use after web_search to read full content from a result URL, or when user provides a URL directly.', + description: 'Fetches and reads content from a URL. If content is truncated, you can retry with a larger maxLength. Use after web_search to read full content, or when user provides a URL directly.', parameters: { type: 'object', properties: { @@ -312,7 +313,11 @@ const fetchUrlDefinition: ToolDefinition = { }, maxLength: { type: 'number', - description: 'Max text length (default: 5000)' + description: 'Max content length in bytes. Start with 50000, increase to 200000 or 500000 if truncated. Max: 2000000' + }, + timeout: { + type: 'number', + description: 'Request timeout in seconds (default: 30, max: 120). Increase for slow sites.' 
} }, required: ['url'] @@ -320,21 +325,35 @@ const fetchUrlDefinition: ToolDefinition = { } }; +interface ProxyFetchResult { + html: string; + finalUrl: string; + truncated?: boolean; + originalSize?: number; + returnedSize?: number; +} + /** * Try to fetch URL via backend proxy first (bypasses CORS), fall back to direct fetch */ -async function fetchViaProxy(url: string, maxLength: number): Promise<{ html: string; finalUrl: string } | { error: string }> { +async function fetchViaProxy(url: string, maxLength: number, timeout: number): Promise { // Try backend proxy first try { const proxyResponse = await fetch('/api/v1/proxy/fetch', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ url, maxLength }) + body: JSON.stringify({ url, maxLength, timeout }) }); if (proxyResponse.ok) { const data = await proxyResponse.json(); - return { html: data.content, finalUrl: data.url }; + return { + html: data.content, + finalUrl: data.url, + truncated: data.truncated, + originalSize: data.originalSize, + returnedSize: data.returnedSize + }; } // If proxy returns an error, extract it @@ -380,7 +399,7 @@ async function fetchViaProxy(url: string, maxLength: number): Promise<{ html: st } const fetchUrlHandler: BuiltinToolHandler = async (args) => { - const { url, extract = 'text', maxLength = 5000 } = args; + const { url, extract = 'text', maxLength = 50000, timeout = 30 } = args; try { const parsedUrl = new URL(url); @@ -389,12 +408,12 @@ const fetchUrlHandler: BuiltinToolHandler = async (args) => { } // Fetch via proxy or direct - const result = await fetchViaProxy(url, maxLength); + const result = await fetchViaProxy(url, maxLength, timeout); if ('error' in result) { return result; } - const { html, finalUrl } = result; + const { html, finalUrl, truncated, originalSize, returnedSize } = result; const titleMatch = html.match(/]*>([^<]+)<\/title>/i); const title = titleMatch ? 
titleMatch[1].trim() : null; @@ -419,21 +438,34 @@ const fetchUrlHandler: BuiltinToolHandler = async (args) => { })).filter(link => link.url && !link.url.startsWith('#')); if (extract === 'links') { - return links; + return truncated + ? { links, warning: `Content was truncated (${returnedSize ?? maxLength} bytes). Some links may be missing.` } + : links; } const text = stripHtml(html).substring(0, maxLength); + // Build response with truncation info + const buildResponse = (data: unknown) => { + if (!truncated) return data; + const suggestedSize = originalSize ? Math.min(originalSize * 2, 2000000) : maxLength * 2; + return { + ...(typeof data === 'object' ? data : { content: data }), + _truncated: true, + _hint: `Content truncated to ${returnedSize ?? maxLength} bytes. ${originalSize ? `Original was ${originalSize} bytes. ` : ''}Retry with larger maxLength (e.g., ${suggestedSize}) to get full content.` + }; + }; + if (extract === 'text') { - return text; + return buildResponse(text); } - return { + return buildResponse({ title, text, links: links.slice(0, 20), url: finalUrl - }; + }); } catch (error) { return { error: `Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}` }; }