- Add "Processing X files..." indicator in chat while handling attachments - Indicator transitions to "Analyzing X files..." for large files needing LLM summarization - Reuse streaming message for seamless transition to LLM response - Add FileAnalyzer service for large file summarization with 10s timeout - Skip analysis for borderline files (within 20% of 8K threshold) - Read up to 50KB from original file for analysis (not just truncated content) - Remove base64 blobs from JSON before analysis to reduce prompt size - Add AttachmentDisplay component for showing file badges on messages - Persist attachments to IndexedDB with message references - Add chat state methods: setStreamContent, removeMessage - Clean up debug logging
408 lines
12 KiB
TypeScript
408 lines
12 KiB
TypeScript
/**
 * File Analyzer Service
 *
 * Spawns a separate Ollama request to analyze/summarize large files
 * before adding them to the main conversation context.
 * This keeps the main context clean for conversation while still
 * allowing the model to understand file contents.
 */
|
|
|
|
import { ollamaClient } from '$lib/ollama';
|
|
import type { FileAttachment } from '$lib/types/attachment.js';
|
|
import { ANALYSIS_THRESHOLD, MAX_EXTRACTED_CONTENT } from '$lib/types/attachment.js';
|
|
import { formatFileSize } from '$lib/utils/file-processor.js';
|
|
|
|
// ============================================================================
|
|
// Types
|
|
// ============================================================================
|
|
|
|
/**
 * Outcome of running one attachment through the analyzer.
 */
export interface AnalysisResult {
  /** True when the attachment's own text is used as-is (small or borderline file) */
  useOriginal: boolean;
  /** Text to place in the message: the original text, the generated summary, or a truncated fallback */
  content: string;
  /** Summary produced by the analysis request (only set when analysis succeeded) */
  summary?: string;
  /** Length of the attachment's text content, in characters */
  originalLength: number;
  /** True when the summary came from the analysis request */
  analyzed: boolean;
  /** Error message when analysis failed or timed out */
  error?: string;
}
|
|
|
|
/**
 * Tunable settings for a FileAnalyzer instance.
 */
export interface FileAnalyzerConfig {
  /** Size thresholds for different file types (in bytes) */
  thresholds: {
    text: number;
    pdf: number;
    json: number;
  };
  /** Timeout for the analysis request, in milliseconds */
  timeout: number;
  /** Maximum number of tokens the analysis response may contain */
  maxResponseTokens: number;
}
|
|
|
|
// ============================================================================
|
|
// Default Configuration
|
|
// ============================================================================
|
|
|
|
/** Settings used by a FileAnalyzer when the caller supplies no overrides. */
const DEFAULT_CONFIG: FileAnalyzerConfig = {
  thresholds: {
    text: 500 * 1024, // 500KB for general text
    pdf: 1024 * 1024, // 1MB for PDFs
    json: 300 * 1024, // 300KB for JSON (dense data)
  },
  timeout: 10000, // 10 seconds - fail fast, fall back to truncated
  maxResponseTokens: 256, // Keep summaries very concise for speed
};
|
|
|
|
/** Maximum content to read from file for analysis (50KB) */
const MAX_ANALYSIS_CONTENT = 50 * 1024;

/** Content that exceeds the analysis threshold by no more than this fraction skips analysis */
const BORDERLINE_THRESHOLD_PERCENT = 0.2; // 20%
|
|
|
|
// ============================================================================
|
|
// Content Reading
|
|
// ============================================================================
|
|
|
|
/**
 * Obtain the text that will be sent to the analysis request.
 *
 * When the attachment carries its original file, that file is read as text
 * and capped at MAX_ANALYSIS_CONTENT characters. PDFs are the exception: the
 * raw bytes of a PDF are not readable text, so the stored `textContent` is
 * used for them instead.
 *
 * @param attachment The attachment whose content should be analyzed
 * @returns The text to analyze. If the original file is absent, is a PDF, or
 *   cannot be read, the stored `textContent` is returned as-is (not capped
 *   here), or an empty string when there is none.
 */
async function readFullContentForAnalysis(attachment: FileAttachment): Promise<string> {
  const ext = attachment.filename.split('.').pop()?.toLowerCase();
  const isPdf = attachment.mimeType.toLowerCase() === 'application/pdf' || ext === 'pdf';

  // Decoding a binary PDF as text yields garbage, so only read text-like originals
  // NOTE(review): assumes textContent holds the extracted PDF text - confirm against the file processor
  if (attachment.originalFile && !isPdf) {
    try {
      const text = await attachment.originalFile.text();
      // Limit to max analysis content
      return text.length > MAX_ANALYSIS_CONTENT ? text.slice(0, MAX_ANALYSIS_CONTENT) : text;
    } catch (err) {
      // Best effort: a failed read falls through to the stored text below
      console.warn('[FileAnalyzer] Failed to read original file:', err);
    }
  }

  // Fall back to stored textContent
  return attachment.textContent || '';
}
|
|
|
|
// ============================================================================
|
|
// Content Cleaning
|
|
// ============================================================================
|
|
|
|
/**
 * Remove base64 blobs and large binary data from JSON content,
 * replacing each with a short descriptive placeholder.
 *
 * Three cases are handled:
 *  - quoted `data:...;base64,...` URLs,
 *  - quoted strings of 100+ characters drawn only from the base64 alphabet,
 *  - a trailing blob whose closing quote is missing because the content was
 *    cut off mid-string (e.g. by an upstream length cap).
 *
 * @param content JSON text, possibly truncated
 * @returns The cleaned text and the number of blobs that were replaced
 */
function cleanJsonForAnalysis(content: string): { cleaned: string; blobsRemoved: number } {
  let blobsRemoved = 0;

  const dataUrlPattern = /"data:[^"]*;base64,[A-Za-z0-9+/=]+"/g;
  const longBase64Pattern = /"[A-Za-z0-9+/=]{100,}"/g;
  // Applied only to the text after the last quote, so it never contains a quote itself
  const truncatedTailPattern = /^(?:data:[^"]*;base64,)?[A-Za-z0-9+/=]{100,}$/;

  // Extract the mime type from a data URL if possible
  const describeMime = (dataUrl: string): string => dataUrl.match(/data:([^;]+);/)?.[1] ?? 'binary';

  // Replace data URLs
  let cleaned = content.replace(dataUrlPattern, (match) => {
    blobsRemoved++;
    return `"[BLOB: ${describeMime(match)} data removed]"`;
  });

  // Replace remaining long base64 strings
  cleaned = cleaned.replace(longBase64Pattern, () => {
    blobsRemoved++;
    return '"[BLOB: large binary data removed]"';
  });

  // A blob cut off by truncation has no closing quote, so neither pattern above sees it
  const lastQuote = cleaned.lastIndexOf('"');
  if (lastQuote !== -1) {
    const tail = cleaned.slice(lastQuote + 1);
    if (truncatedTailPattern.test(tail)) {
      blobsRemoved++;
      const label = tail.startsWith('data:') ? `${describeMime(tail)} data` : 'large binary data';
      cleaned = `${cleaned.slice(0, lastQuote)}"[BLOB: truncated ${label} removed]"`;
    }
  }

  return { cleaned, blobsRemoved };
}
|
|
|
|
// ============================================================================
|
|
// Analysis Prompts
|
|
// ============================================================================
|
|
|
|
/**
 * Build the summarization prompt for one attachment.
 *
 * JSON attachments (by mime type or `.json` extension) have base64 blobs
 * stripped first, and the prompt notes how many were removed. The filename is
 * escaped before being placed in the `<file name="...">` attribute so that
 * quotes or angle brackets in it cannot break the tag.
 *
 * @param attachment The file attachment metadata
 * @param rawContent The full content to analyze (from original file)
 * @returns The complete prompt text, ending with `Summary:`
 */
function buildAnalysisPrompt(attachment: FileAttachment, rawContent: string): string {
  let content = rawContent;
  const fileTypeHint = getFileTypeHint(attachment);
  let blobNote = '';

  // For JSON files, remove blobs to reduce size
  const ext = attachment.filename.split('.').pop()?.toLowerCase();
  const mime = attachment.mimeType.toLowerCase();
  if (mime === 'application/json' || ext === 'json') {
    const { cleaned, blobsRemoved } = cleanJsonForAnalysis(content);
    content = cleaned;
    if (blobsRemoved > 0) {
      blobNote = `\n(Note: ${blobsRemoved} binary blob(s) were removed from the JSON for analysis)`;
    }
  }

  return `Summarize this ${fileTypeHint} in 2-3 sentences. Focus on: what it is, key data/content, and structure.${blobNote}

<file name="${escapeXmlAttr(attachment.filename)}">
${content}
</file>

Summary:`;
}
|
|
|
|
/**
 * Get a human-readable hint about the file type, for use in the prompt.
 *
 * JSON and PDF are recognized by mime type or extension; every other
 * category is recognized by extension only. Anything unrecognized is
 * described as a plain text file.
 *
 * @param attachment The file attachment metadata
 * @returns A short noun phrase such as `JSON data file` or `TS source code file`
 */
function getFileTypeHint(attachment: FileAttachment): string {
  // split() always yields at least one element, so the fallback is only for the type checker
  const ext = attachment.filename.split('.').pop()?.toLowerCase() ?? '';
  const mime = attachment.mimeType.toLowerCase();

  const sourceExtensions = ['js', 'ts', 'jsx', 'tsx', 'py', 'go', 'rs', 'java', 'c', 'cpp'];
  const configExtensions = ['yaml', 'yml', 'toml', 'ini', 'cfg', 'conf'];

  if (mime === 'application/json' || ext === 'json') return 'JSON data file';
  if (mime === 'application/pdf' || ext === 'pdf') return 'PDF document';
  if (ext === 'md' || ext === 'markdown') return 'Markdown document';
  if (sourceExtensions.includes(ext)) return `${ext.toUpperCase()} source code file`;
  if (configExtensions.includes(ext)) return 'configuration file';
  if (ext === 'csv') return 'CSV data file';
  if (ext === 'xml') return 'XML document';
  if (ext === 'html' || ext === 'htm') return 'HTML document';

  return 'text file';
}
|
|
|
|
// ============================================================================
|
|
// File Analyzer Class
|
|
// ============================================================================
|
|
|
|
/**
 * Decides whether an attachment's text is large enough to need summarizing
 * and, if so, requests a short summary from the model with a time limit.
 * When the request fails or times out, a truncated copy of the text is
 * returned instead.
 */
export class FileAnalyzer {
  private config: FileAnalyzerConfig;

  /**
   * @param config Overrides applied on top of the defaults. `thresholds` is
   *   merged key by key, so a partial object keeps the remaining defaults.
   */
  constructor(config: Partial<FileAnalyzerConfig> = {}) {
    this.config = {
      ...DEFAULT_CONFIG,
      ...config,
      // Merge per key so an override can never leave a threshold undefined,
      // and so instances never share the defaults' nested object
      thresholds: { ...DEFAULT_CONFIG.thresholds, ...config.thresholds },
    };
  }

  /**
   * Get the size threshold for a given attachment type.
   *
   * NOTE(review): nothing in this class calls this method; shouldAnalyze
   * compares against ANALYSIS_THRESHOLD instead - confirm whether it is still needed.
   */
  private getThreshold(attachment: FileAttachment): number {
    const ext = attachment.filename.split('.').pop()?.toLowerCase();
    const mime = attachment.mimeType.toLowerCase();

    // JSON files are dense, use lower threshold
    if (mime === 'application/json' || ext === 'json') {
      return this.config.thresholds.json;
    }

    // PDFs can be larger
    if (mime === 'application/pdf' || ext === 'pdf') {
      return this.config.thresholds.pdf;
    }

    // Default text threshold
    return this.config.thresholds.text;
  }

  /**
   * Check if a file should be analyzed, based on the length of its text content.
   * Files at or below ANALYSIS_THRESHOLD, and borderline files no more than
   * 20% above it, are not analyzed.
   *
   * @param attachment The attachment to check
   * @returns True when the text is long enough to warrant a summary
   */
  shouldAnalyze(attachment: FileAttachment): boolean {
    const contentLength = attachment.textContent?.length || 0;

    // Below threshold - no analysis needed
    if (contentLength <= ANALYSIS_THRESHOLD) {
      return false;
    }

    // Borderline files are small enough to just use directly
    const borderlineLimit = ANALYSIS_THRESHOLD * (1 + BORDERLINE_THRESHOLD_PERCENT);
    return contentLength > borderlineLimit;
  }

  /**
   * Analyze a file attachment if needed.
   *
   * @param attachment The attachment to process
   * @param model Name of the model to send the analysis request to
   * @returns The original text for small or borderline files, a summary for
   *   large files, or truncated text plus an explanatory note (with `error`
   *   set) when analysis failed or exceeded the configured timeout. This
   *   method does not reject for analysis failures.
   */
  async analyzeIfNeeded(
    attachment: FileAttachment,
    model: string
  ): Promise<AnalysisResult> {
    const contentLength = attachment.textContent?.length || 0;

    // Small files or borderline: use original content
    if (!this.shouldAnalyze(attachment)) {
      return {
        useOriginal: true,
        content: attachment.textContent || '',
        originalLength: contentLength,
        analyzed: false,
      };
    }

    // Large files: run the analysis request against a timeout
    const startTime = Date.now();
    let timer: ReturnType<typeof setTimeout> | undefined;

    try {
      const summary = await Promise.race([
        this.spawnAnalysisAgent(attachment, model),
        new Promise<never>((_, reject) => {
          timer = setTimeout(() => reject(new Error('Analysis timeout')), this.config.timeout);
        }),
      ]);

      return {
        useOriginal: false,
        content: summary,
        summary,
        originalLength: contentLength,
        analyzed: true,
      };
    } catch (error) {
      const elapsed = Date.now() - startTime;
      const isTimeout = error instanceof Error && error.message === 'Analysis timeout';
      console.warn(`[FileAnalyzer] Analysis ${isTimeout ? 'TIMEOUT' : 'FAILED'} for ${attachment.filename} after ${elapsed}ms`);

      // Fallback: use truncated content (faster than waiting for slow analysis)
      const truncated = attachment.textContent?.slice(0, ANALYSIS_THRESHOLD) || '';
      const reason = isTimeout ? 'timed out' : 'failed';
      return {
        useOriginal: false,
        content: truncated + `\n\n[Analysis ${reason} - showing first ${formatFileSize(ANALYSIS_THRESHOLD)} of ${formatFileSize(contentLength)}]`,
        originalLength: contentLength,
        analyzed: false,
        error: error instanceof Error ? error.message : 'Analysis failed',
      };
    } finally {
      // Without this the timer stays pending for the full timeout after every completed analysis
      clearTimeout(timer);
    }
  }

  /**
   * Send a separate generate request that asks the model to summarize the file.
   *
   * @param attachment The attachment to summarize
   * @param model Name of the model to use
   * @returns The trimmed response text
   */
  private async spawnAnalysisAgent(
    attachment: FileAttachment,
    model: string
  ): Promise<string> {
    // Read full content from original file if available
    const fullContent = await readFullContentForAnalysis(attachment);
    const prompt = buildAnalysisPrompt(attachment, fullContent);

    // Use generate for a simple completion
    const response = await ollamaClient.generate({
      model,
      prompt,
      options: {
        temperature: 0.3, // Lower temperature for consistent summaries
        num_predict: this.config.maxResponseTokens,
      },
    });

    return response.response.trim();
  }
}
|
|
|
|
// ============================================================================
|
|
// Singleton Instance
|
|
// ============================================================================
|
|
|
|
/** Shared module-level FileAnalyzer instance, constructed with no configuration overrides. */
export const fileAnalyzer = new FileAnalyzer();
|
|
|
|
// ============================================================================
|
|
// Batch Analysis Helper
|
|
// ============================================================================
|
|
|
|
/**
 * Analyze multiple files in consecutive batches.
 *
 * Files within one batch are analyzed in parallel; the next batch starts only
 * after every file in the current one has finished.
 *
 * @param files Files to analyze
 * @param model Model to use
 * @param maxConcurrent Maximum parallel analyses (default 2). Values below 1
 *   and NaN are treated as 1; fractional values are rounded down.
 * @returns A map from each file's `id` to its analysis result
 */
export async function analyzeFilesInBatches(
  files: FileAttachment[],
  model: string,
  maxConcurrent: number = 2
): Promise<Map<string, AnalysisResult>> {
  const results = new Map<string, AnalysisResult>();

  // A step of 0 or less would never advance the loop, and NaN would end it
  // after an empty first batch, silently dropping every file
  const batchSize = Number.isNaN(maxConcurrent) ? 1 : Math.max(1, Math.floor(maxConcurrent));

  for (let start = 0; start < files.length; start += batchSize) {
    const batch = files.slice(start, start + batchSize);
    const batchResults = await Promise.all(
      batch.map((file) => fileAnalyzer.analyzeIfNeeded(file, model))
    );

    batch.forEach((file, idx) => {
      results.set(file.id, batchResults[idx]);
    });
  }

  return results;
}
|
|
|
|
// ============================================================================
|
|
// Utility Functions
|
|
// ============================================================================
|
|
|
|
/**
 * Format an analyzed attachment for inclusion in a message.
 *
 * The result is a `<file>` element carrying the escaped filename and the
 * formatted file size. When a summary is available the element is marked
 * `analyzed="true"` and contains the summary under a heading; otherwise it
 * contains `result.content` directly.
 *
 * @param attachment The attachment the result belongs to
 * @param result The analysis result for that attachment
 * @returns The formatted `<file>` block
 */
export function formatAnalyzedAttachment(
  attachment: FileAttachment,
  result: AnalysisResult
): string {
  const tagAttrs = `name="${escapeXmlAttr(attachment.filename)}" size="${formatFileSize(attachment.size)}"`;

  if (result.analyzed && result.summary) {
    return `<file ${tagAttrs} analyzed="true">
## Summary (original: ${formatFileSize(result.originalLength)} chars)
${result.summary}
</file>`;
  }

  // Not analyzed, use content directly
  return `<file ${tagAttrs}>
${result.content}
</file>`;
}
|
|
|
|
/**
 * Escape special characters for XML attribute values.
 *
 * `&` is replaced first so that the ampersands introduced by the later
 * replacements are not escaped a second time.
 *
 * @param str Raw attribute value
 * @returns The value with `&`, `"`, `'`, `<` and `>` replaced by XML entities
 */
function escapeXmlAttr(str: string): string {
  return str
    .replace(/&/g, '&amp;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}
|