Files
vessel/frontend/src/lib/services/fileAnalyzer.ts
vikingowl 26b58fbd50 feat: improve file attachment handling with processing indicator
- Add "Processing X files..." indicator in chat while handling attachments
- Indicator transitions to "Analyzing X files..." for large files needing LLM summarization
- Reuse streaming message for seamless transition to LLM response
- Add FileAnalyzer service for large file summarization with 10s timeout
- Skip analysis for borderline files (within 20% of 8K threshold)
- Read up to 50KB from original file for analysis (not just truncated content)
- Remove base64 blobs from JSON before analysis to reduce prompt size
- Add AttachmentDisplay component for showing file badges on messages
- Persist attachments to IndexedDB with message references
- Add chat state methods: setStreamContent, removeMessage
- Clean up debug logging
2026-01-04 00:35:33 +01:00

408 lines
12 KiB
TypeScript

/**
* File Analyzer Service
*
* Spawns a separate Ollama request to analyze/summarize large files
* before adding them to the main conversation context.
* This keeps the main context clean for conversation while still
* allowing the model to understand file contents.
*/
import { ollamaClient } from '$lib/ollama';
import type { FileAttachment } from '$lib/types/attachment.js';
import { ANALYSIS_THRESHOLD, MAX_EXTRACTED_CONTENT } from '$lib/types/attachment.js';
import { formatFileSize } from '$lib/utils/file-processor.js';
// ============================================================================
// Types
// ============================================================================
/**
 * Outcome of running one attachment through the analyzer.
 */
export interface AnalysisResult {
  /** True when the attachment's stored text is passed through unchanged (no analysis was needed). */
  useOriginal: boolean;

  /** Text to put in the message: the stored text, a generated summary, or a truncated fallback. */
  content: string;

  /** Summary returned by the analysis request; present only when analysis succeeded. */
  summary?: string;

  /** Length of the attachment's stored text, in characters. */
  originalLength: number;

  /** True only when the analysis request completed and supplied `content`. */
  analyzed: boolean;

  /** Failure description; present when analysis was attempted but failed or timed out. */
  error?: string;
}
/**
 * Tunable settings for a FileAnalyzer instance.
 */
export interface FileAnalyzerConfig {
  /** Per-file-type size thresholds (in bytes). */
  thresholds: Record<'text' | 'pdf' | 'json', number>;

  /** How long to wait for the analysis request before giving up (ms). */
  timeout: number;

  /** Upper bound on the number of tokens the analysis response may contain. */
  maxResponseTokens: number;
}
// ============================================================================
// Default Configuration
// ============================================================================
/** Settings used when a FileAnalyzer is constructed without overrides. */
const DEFAULT_CONFIG: FileAnalyzerConfig = {
  thresholds: {
    // General text: 500KB
    text: 1024 * 500,
    // PDFs: 1MB
    pdf: 1024 ** 2,
    // JSON is dense data, so it gets the smallest limit: 300KB
    json: 1024 * 300,
  },
  // 10 seconds - fail fast and fall back to truncated content
  timeout: 10 * 1000,
  // Keep summaries very concise for speed
  maxResponseTokens: 256,
};

/** Upper bound on how much content is read from a file for analysis (50KB). */
const MAX_ANALYSIS_CONTENT = 1024 * 50;

/** Content within this fraction above the threshold skips analysis (20%). */
const BORDERLINE_THRESHOLD_PERCENT = 20 / 100;
// ============================================================================
// Content Reading
// ============================================================================
/**
 * Load the text that will be sent to the analysis request.
 *
 * Prefers re-reading the original file, since the stored `textContent` may already
 * have been truncated, and falls back to `textContent` when no file is attached or
 * the read fails.
 *
 * PDFs are never re-read: `text()` on the original file would decode the raw PDF
 * binary as UTF-8, so the stored `textContent` is used for them instead.
 *
 * @param attachment Attachment whose content should be analyzed
 * @returns At most MAX_ANALYSIS_CONTENT characters (empty string if no text is available)
 */
async function readFullContentForAnalysis(attachment: FileAttachment): Promise<string> {
  const ext = attachment.filename.split('.').pop()?.toLowerCase();
  const isPdf = attachment.mimeType.toLowerCase() === 'application/pdf' || ext === 'pdf';

  if (attachment.originalFile && !isPdf) {
    try {
      const text = await attachment.originalFile.text();
      return text.slice(0, MAX_ANALYSIS_CONTENT);
    } catch (err) {
      // Best effort: a failed read is not fatal, the stored text is used below.
      console.warn('[FileAnalyzer] Failed to read original file:', err);
    }
  }

  // Apply the same cap to the fallback so the prompt size stays bounded on every path.
  return (attachment.textContent ?? '').slice(0, MAX_ANALYSIS_CONTENT);
}
// ============================================================================
// Content Cleaning
// ============================================================================
/**
 * Remove base64 blobs and large binary data from JSON content,
 * replacing each with a short descriptive placeholder string.
 *
 * Two shapes are recognized:
 * - data URLs: "data:<mime>;base64,<payload>"
 * - bare strings of 100 or more characters drawn only from the base64 alphabet
 *
 * The content handed to this function may have been cut off at a size limit, which
 * can leave the last blob without its closing quote. Such an unterminated blob at
 * the very end of the input is replaced too; otherwise the largest blobs (the ones
 * that cross the cut) would be exactly the ones left in the prompt.
 *
 * @param content Raw JSON text (possibly truncated)
 * @returns The cleaned text and the number of blobs that were replaced
 */
function cleanJsonForAnalysis(content: string): { cleaned: string; blobsRemoved: number } {
  let blobsRemoved = 0;

  // Placeholder for a data URL; keeps the MIME type when one can be extracted.
  const dataUrlPlaceholder = (match: string): string => {
    blobsRemoved++;
    const mime = /data:([^;]+);/.exec(match)?.[1] ?? 'binary';
    return `"[BLOB: ${mime} data removed]"`;
  };

  // Placeholder for a bare base64 string with no MIME information.
  const rawBlobPlaceholder = (): string => {
    blobsRemoved++;
    return '"[BLOB: large binary data removed]"';
  };

  const cleaned = content
    // Complete data URLs
    .replace(/"data:[^"]*;base64,[A-Za-z0-9+/=]+"/g, dataUrlPlaceholder)
    // Data URL cut off at the end of the input (no closing quote)
    .replace(/"data:[^"]*;base64,[A-Za-z0-9+/=]*$/, dataUrlPlaceholder)
    // Remaining complete long base64 strings
    .replace(/"[A-Za-z0-9+/=]{100,}"/g, rawBlobPlaceholder)
    // Long base64 string cut off at the end of the input (no closing quote)
    .replace(/"[A-Za-z0-9+/=]{100,}$/, rawBlobPlaceholder);

  return { cleaned, blobsRemoved };
}
// ============================================================================
// Analysis Prompts
// ============================================================================
/**
 * Build the summarization prompt for one attachment.
 *
 * For JSON files (by MIME type or `.json` extension) base64 blobs are stripped from
 * the content first, and a note stating how many were removed is added to the prompt.
 * The filename is escaped before being placed in the `<file name="...">` attribute,
 * so a name containing quotes or angle brackets cannot break the wrapper tag.
 *
 * @param attachment The file attachment metadata
 * @param rawContent The content to analyze (as read from the original file)
 * @returns The complete prompt text, ending with a "Summary:" cue
 */
function buildAnalysisPrompt(attachment: FileAttachment, rawContent: string): string {
  const fileTypeHint = getFileTypeHint(attachment);
  const ext = attachment.filename.split('.').pop()?.toLowerCase();
  const mime = attachment.mimeType.toLowerCase();

  let content = rawContent;
  let blobNote = '';

  // For JSON files, remove blobs to reduce size
  if (mime === 'application/json' || ext === 'json') {
    const { cleaned, blobsRemoved } = cleanJsonForAnalysis(content);
    content = cleaned;
    if (blobsRemoved > 0) {
      blobNote = `\n(Note: ${blobsRemoved} binary blob(s) were removed from the JSON for analysis)`;
    }
  }

  return `Summarize this ${fileTypeHint} in 2-3 sentences. Focus on: what it is, key data/content, and structure.${blobNote}
<file name="${escapeXmlAttr(attachment.filename)}">
${content}
</file>
Summary:`;
}
/**
 * Describe the attachment's file type in a few human-readable words.
 *
 * JSON and PDF are recognized by MIME type or extension (JSON takes precedence);
 * every other category is decided by the lowercased filename extension alone.
 * Anything unrecognized is described as a plain text file.
 */
function getFileTypeHint(attachment: FileAttachment): string {
  const extension = attachment.filename.split('.').pop()?.toLowerCase();
  const mimeType = attachment.mimeType.toLowerCase();

  if (mimeType === 'application/json' || extension === 'json') return 'JSON data file';
  if (mimeType === 'application/pdf' || extension === 'pdf') return 'PDF document';

  switch (extension) {
    case 'md':
    case 'markdown':
      return 'Markdown document';

    case 'js':
    case 'ts':
    case 'jsx':
    case 'tsx':
    case 'py':
    case 'go':
    case 'rs':
    case 'java':
    case 'c':
    case 'cpp':
      // Source files are labelled with their upper-cased extension.
      return `${extension.toUpperCase()} source code file`;

    case 'yaml':
    case 'yml':
    case 'toml':
    case 'ini':
    case 'cfg':
    case 'conf':
      return 'configuration file';

    case 'csv':
      return 'CSV data file';

    case 'xml':
      return 'XML document';

    case 'html':
    case 'htm':
      return 'HTML document';

    default:
      return 'text file';
  }
}
// ============================================================================
// File Analyzer Class
// ============================================================================
/**
 * Decides whether an attachment is large enough to need summarizing and, if so,
 * issues a separate Ollama generate request (bounded by a timeout) to produce the
 * summary. Analysis failures never propagate to the caller: they are reported in the
 * returned AnalysisResult together with a truncated-content fallback.
 */
export class FileAnalyzer {
  private readonly config: FileAnalyzerConfig;

  /**
   * @param config Overrides merged shallowly over DEFAULT_CONFIG; a supplied
   *   `thresholds` object replaces the default one as a whole
   */
  constructor(config: Partial<FileAnalyzerConfig> = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Get the configured size threshold for a given attachment type.
   *
   * Note: nothing in this class calls this method; `shouldAnalyze` decides using
   * ANALYSIS_THRESHOLD instead of these per-type thresholds.
   */
  private getThreshold(attachment: FileAttachment): number {
    const ext = attachment.filename.split('.').pop()?.toLowerCase();
    const mime = attachment.mimeType.toLowerCase();

    // JSON files are dense, use lower threshold
    if (mime === 'application/json' || ext === 'json') {
      return this.config.thresholds.json;
    }

    // PDFs can be larger
    if (mime === 'application/pdf' || ext === 'pdf') {
      return this.config.thresholds.pdf;
    }

    // Default text threshold
    return this.config.thresholds.text;
  }

  /**
   * Check whether a file should be analyzed, based on the length of its stored text.
   *
   * @returns false when the text is at or below ANALYSIS_THRESHOLD, or is borderline
   *   (no more than 20% above it); true otherwise
   */
  shouldAnalyze(attachment: FileAttachment): boolean {
    const contentLength = attachment.textContent?.length || 0;

    // Below threshold - no analysis needed
    if (contentLength <= ANALYSIS_THRESHOLD) {
      return false;
    }

    // Borderline files are small enough to just use directly
    const borderlineLimit = ANALYSIS_THRESHOLD * (1 + BORDERLINE_THRESHOLD_PERCENT);
    return contentLength > borderlineLimit;
  }

  /**
   * Analyze a file attachment if needed.
   *
   * - Small or borderline files: the stored text is returned unchanged.
   * - Large files: a summary is requested, racing against `config.timeout`.
   * - If the request fails, times out, or yields an empty summary: the first
   *   ANALYSIS_THRESHOLD characters of the stored text are returned with an
   *   explanatory note appended, and `error` is set.
   *
   * @param attachment Attachment to analyze
   * @param model Name of the model to use for the analysis request
   * @returns The result describing which content to use; this method does not reject
   *   on analysis failure
   */
  async analyzeIfNeeded(
    attachment: FileAttachment,
    model: string
  ): Promise<AnalysisResult> {
    const contentLength = attachment.textContent?.length || 0;

    // Small files or borderline: use original content
    if (!this.shouldAnalyze(attachment)) {
      return {
        useOriginal: true,
        content: attachment.textContent || '',
        originalLength: contentLength,
        analyzed: false,
      };
    }

    // Large files: spawn analysis agent with timeout
    const startTime = Date.now();
    let timeoutId: ReturnType<typeof setTimeout> | undefined;

    try {
      const timeout = new Promise<never>((_, reject) => {
        timeoutId = setTimeout(() => reject(new Error('Analysis timeout')), this.config.timeout);
      });

      // Race between analysis and timeout. Losing the race does not cancel the
      // underlying request; its eventual result is simply ignored.
      const summary = await Promise.race([
        this.spawnAnalysisAgent(attachment, model),
        timeout,
      ]);

      return {
        useOriginal: false,
        content: summary,
        summary,
        originalLength: contentLength,
        analyzed: true,
      };
    } catch (error) {
      const elapsed = Date.now() - startTime;
      const isTimeout = error instanceof Error && error.message === 'Analysis timeout';
      const logMessage = `[FileAnalyzer] Analysis ${isTimeout ? 'TIMEOUT' : 'FAILED'} for ${attachment.filename} after ${elapsed}ms`;

      if (isTimeout) {
        console.warn(logMessage);
      } else {
        // Include the cause so unexpected failures can be diagnosed.
        console.warn(logMessage, error);
      }

      // Fallback: use truncated content (faster than waiting for slow analysis)
      const truncated = attachment.textContent?.slice(0, ANALYSIS_THRESHOLD) || '';
      const reason = isTimeout ? 'timed out' : 'failed';

      return {
        useOriginal: false,
        content: truncated + `\n\n[Analysis ${reason} - showing first ${formatFileSize(ANALYSIS_THRESHOLD)} of ${formatFileSize(contentLength)}]`,
        originalLength: contentLength,
        analyzed: false,
        error: error instanceof Error ? error.message : 'Analysis failed',
      };
    } finally {
      // Don't leave the timer pending once the race has been decided.
      if (timeoutId !== undefined) {
        clearTimeout(timeoutId);
      }
    }
  }

  /**
   * Issue a separate Ollama generate request to summarize the file.
   *
   * @returns The trimmed summary text
   * @throws Error when the model returns an empty (or whitespace-only) response, so
   *   that the caller falls back to truncated content instead of an empty summary;
   *   also rejects with whatever the content read or the Ollama request rejects with
   */
  private async spawnAnalysisAgent(
    attachment: FileAttachment,
    model: string
  ): Promise<string> {
    // Read full content from original file if available
    const fullContent = await readFullContentForAnalysis(attachment);
    const prompt = buildAnalysisPrompt(attachment, fullContent);

    // Use generate for a simple completion
    const response = await ollamaClient.generate({
      model,
      prompt,
      options: {
        temperature: 0.3, // Lower temperature for consistent summaries
        num_predict: this.config.maxResponseTokens,
      }
    });

    const summary = response.response.trim();
    if (summary === '') {
      throw new Error('Analysis returned an empty summary');
    }
    return summary;
  }
}
// ============================================================================
// Singleton Instance
// ============================================================================
/**
 * Shared FileAnalyzer instance, constructed without overrides so it runs with the
 * default configuration.
 */
export const fileAnalyzer = new FileAnalyzer();
// ============================================================================
// Batch Analysis Helper
// ============================================================================
/**
 * Analyze multiple files, a batch at a time.
 *
 * Files are processed in consecutive batches of `maxConcurrent`; all files in a batch
 * are analyzed in parallel and the next batch starts once the whole batch has settled.
 *
 * @param files Files to analyze
 * @param model Model to use
 * @param maxConcurrent Maximum parallel analyses (default 2). Values below 1 and NaN
 *   are treated as 1; fractional values are rounded down.
 * @returns Map from file id to that file's analysis result
 */
export async function analyzeFilesInBatches(
  files: FileAttachment[],
  model: string,
  maxConcurrent: number = 2
): Promise<Map<string, AnalysisResult>> {
  const results = new Map<string, AnalysisResult>();

  // The loop advances by the batch size, so it must be a whole number >= 1:
  // 0 or a negative step would never terminate, and NaN would skip every file.
  const batchSize = maxConcurrent >= 1 ? Math.floor(maxConcurrent) : 1;

  for (let i = 0; i < files.length; i += batchSize) {
    const batch = files.slice(i, i + batchSize);
    const batchResults = await Promise.all(
      batch.map(file => fileAnalyzer.analyzeIfNeeded(file, model))
    );

    batch.forEach((file, idx) => {
      results.set(file.id, batchResults[idx]);
    });
  }

  return results;
}
// ============================================================================
// Utility Functions
// ============================================================================
/**
 * Render an attachment and its analysis result as a `<file>` block for a message.
 *
 * When the result carries a summary from a successful analysis, the block is marked
 * `analyzed="true"` and contains a summary section; otherwise the block contains
 * `result.content` directly.
 */
export function formatAnalyzedAttachment(
  attachment: FileAttachment,
  result: AnalysisResult
): string {
  const nameAttr = escapeXmlAttr(attachment.filename);
  const sizeAttr = formatFileSize(attachment.size);

  // No usable summary: emit the content as-is.
  if (!(result.analyzed && result.summary)) {
    return `<file name="${nameAttr}" size="${sizeAttr}">\n${result.content}\n</file>`;
  }

  const originalSize = formatFileSize(result.originalLength);
  const openTag = `<file name="${nameAttr}" size="${sizeAttr}" analyzed="true">`;
  const heading = `## Summary (original: ${originalSize} chars)`;

  return `${openTag}\n${heading}\n${result.summary}\n</file>`;
}
/**
 * Make a string safe to embed in an XML attribute value by replacing
 * `&`, `"`, `'`, `<` and `>` with their entity references.
 */
function escapeXmlAttr(str: string): string {
  return str.replace(/[&"'<>]/g, (ch) => {
    switch (ch) {
      case '&':
        return '&amp;';
      case '"':
        return '&quot;';
      case "'":
        return '&apos;';
      case '<':
        return '&lt;';
      default:
        // Only '>' remains in the character class.
        return '&gt;';
    }
  });
}