// vessel/frontend/src/lib/services/fileAnalyzer.ts

/**
 * File Analyzer Service
 *
 * Spawns a separate Ollama request to analyze/summarize large files
 * before adding them to the main conversation context.
 * This keeps the main context clean for conversation while still
 * allowing the model to understand file contents.
 */

import { ollamaClient } from '$lib/ollama';
import type { FileAttachment } from '$lib/types/attachment.js';
import { ANALYSIS_THRESHOLD, MAX_EXTRACTED_CONTENT } from '$lib/types/attachment.js';
import { formatFileSize } from '$lib/utils/file-processor.js';

// ============================================================================
// Types
// ============================================================================

/**
 * Outcome of running one attachment through the analyzer.
 *
 * Three shapes are produced by `FileAnalyzer.analyzeIfNeeded`:
 * - analysis skipped: `useOriginal: true`, `analyzed: false`, `content` is the stored text;
 * - analysis succeeded: `useOriginal: false`, `analyzed: true`, `content` and `summary` hold the summary;
 * - analysis failed or timed out: `useOriginal: false`, `analyzed: false`, `error` is set and
 *   `content` is the truncated text followed by a bracketed notice.
 */
export interface AnalysisResult {
	/** True only when analysis was skipped and the stored text is used unchanged */
	useOriginal: boolean;
	/** The content to use in the message (original text, summary, or truncated fallback) */
	content: string;
	/** Summary generated by the analysis agent (present only when analysis succeeded) */
	summary?: string;
	/** Length of the attachment's stored text content in characters (0 when it has none) */
	originalLength: number;
	/** Whether the sub-agent produced the content (false for skipped and for failed analyses) */
	analyzed: boolean;
	/** Error message if analysis failed or timed out */
	error?: string;
}

/**
 * Tunables for `FileAnalyzer`. A partial object may be passed to the
 * constructor; missing top-level keys come from `DEFAULT_CONFIG`.
 */
export interface FileAnalyzerConfig {
	/**
	 * Size thresholds for different file types (in bytes).
	 * NOTE(review): within this file these are only read by the private
	 * `getThreshold` helper, which `shouldAnalyze` does not call.
	 */
	thresholds: {
		text: number;
		pdf: number;
		json: number;
	};
	/** Timeout for analysis request (ms); on expiry the truncated fallback is used */
	timeout: number;
	/** Maximum tokens for analysis response (passed to the model as `num_predict`) */
	maxResponseTokens: number;
}

// ============================================================================
// Default Configuration
// ============================================================================

/**
 * Baseline configuration. The `FileAnalyzer` constructor spreads this object
 * first and the caller's partial config second, so the merge is shallow:
 * a caller-supplied `thresholds` object replaces this one as a whole.
 */
const DEFAULT_CONFIG: FileAnalyzerConfig = {
	thresholds: {
		text: 500 * 1024,    // 500KB for general text
		pdf: 1024 * 1024,    // 1MB for PDFs
		json: 300 * 1024,    // 300KB for JSON (dense data)
	},
	timeout: 10000,          // 10 seconds - fail fast, fall back to truncated
	maxResponseTokens: 256,  // Keep summaries very concise for speed
};

/**
 * Maximum content to read from file for analysis (50 * 1024).
 * It is compared against a string's `length`, so the cap is counted in
 * characters (UTF-16 code units), not bytes.
 */
const MAX_ANALYSIS_CONTENT = 50 * 1024;

/**
 * Borderline margin, expressed as a fraction (0.2 = 20%).
 * Content up to ANALYSIS_THRESHOLD * (1 + this value) characters skips analysis.
 */
const BORDERLINE_THRESHOLD_PERCENT = 0.2; // 20%

// ============================================================================
// Content Reading
// ============================================================================

/**
 * Load the text that will be sent to the analysis model.
 *
 * Prefers the attachment's original file when it is still attached and falls
 * back to the stored `textContent` when there is no file or reading it fails.
 * The result is capped at MAX_ANALYSIS_CONTENT characters on BOTH paths, so a
 * large stored text can never produce an oversized analysis prompt.
 *
 * NOTE(review): `originalFile.text()` decodes the raw file as text. For binary
 * formats such as PDF that is presumably not the extracted text — confirm
 * whether such attachments carry `originalFile` when they reach this function.
 *
 * @param attachment The attachment to read from
 * @returns At most MAX_ANALYSIS_CONTENT characters of text ('' when nothing is available)
 */
async function readFullContentForAnalysis(attachment: FileAttachment): Promise<string> {
	let fileText: string | undefined;

	// If we have the original file, read from it
	if (attachment.originalFile) {
		try {
			fileText = await attachment.originalFile.text();
		} catch (err) {
			// Best effort: a failed read is logged and the stored text is used instead
			console.warn('[FileAnalyzer] Failed to read original file:', err);
		}
	}

	// `??` (not `||`) so a successfully read empty file stays empty instead of
	// silently switching to the stored text
	const source = fileText ?? attachment.textContent ?? '';

	return source.length > MAX_ANALYSIS_CONTENT ? source.slice(0, MAX_ANALYSIS_CONTENT) : source;
}

// ============================================================================
// Content Cleaning
// ============================================================================

/**
 * Strip embedded binary payloads out of JSON text so the analysis prompt
 * stays small. Two kinds of quoted string values are swapped for short
 * placeholders:
 *  1. base64 data URLs (`"data:<mime>;base64,<payload>"`) — the placeholder keeps the mime type;
 *  2. any other quoted run of 100+ base64-alphabet characters.
 *
 * @param content Raw JSON text (it is treated as text, never parsed)
 * @returns The cleaned text and how many values were replaced
 */
function cleanJsonForAnalysis(content: string): { cleaned: string; blobsRemoved: number } {
	const DATA_URL_RE = /"data:[^"]*;base64,[A-Za-z0-9+/=]+"/g;
	const RAW_BASE64_RE = /"[A-Za-z0-9+/=]{100,}"/g;

	let removedCount = 0;

	// Pass 1: data URLs, keeping the mime type when one is present
	const withoutDataUrls = content.replace(DATA_URL_RE, (dataUrl) => {
		removedCount += 1;
		const mimeType = /data:([^;]+);/.exec(dataUrl)?.[1] ?? 'binary';
		return `"[BLOB: ${mimeType} data removed]"`;
	});

	// Pass 2: whatever long bare base64 strings are still left
	const withoutBlobs = withoutDataUrls.replace(RAW_BASE64_RE, () => {
		removedCount += 1;
		return '"[BLOB: large binary data removed]"';
	});

	return { cleaned: withoutBlobs, blobsRemoved: removedCount };
}

// ============================================================================
// Analysis Prompts
// ============================================================================

/**
 * Assemble the summarisation prompt sent to the analysis model.
 *
 * JSON attachments (detected by mime type or `.json` extension) are first run
 * through `cleanJsonForAnalysis`; when that removes anything, a note saying
 * how many blobs were dropped is appended to the instruction line.
 *
 * NOTE(review): the filename is interpolated into the `name="…"` attribute
 * as-is, whereas `formatAnalyzedAttachment` escapes it — confirm whether this
 * prompt should do the same.
 *
 * @param attachment The file attachment metadata
 * @param rawContent The content to analyze (already size-limited by the caller)
 * @returns The complete prompt text, ending in `Summary:`
 */
function buildAnalysisPrompt(attachment: FileAttachment, rawContent: string): string {
	const typeLabel = getFileTypeHint(attachment);

	const extension = attachment.filename.split('.').pop()?.toLowerCase();
	const isJson = attachment.mimeType.toLowerCase() === 'application/json' || extension === 'json';

	let body = rawContent;
	let removalNote = '';

	if (isJson) {
		// Only JSON gets blob-stripping; every other type is passed through untouched
		const stripped = cleanJsonForAnalysis(rawContent);
		body = stripped.cleaned;
		if (stripped.blobsRemoved > 0) {
			removalNote = `\n(Note: ${stripped.blobsRemoved} binary blob(s) were removed from the JSON for analysis)`;
		}
	}

	return `Summarize this ${typeLabel} in 2-3 sentences. Focus on: what it is, key data/content, and structure.${removalNote}

<file name="${attachment.filename}">
${body}
</file>

Summary:`;
}

/**
 * Produce the short human-readable label used in the analysis prompt
 * ("JSON data file", "PDF document", "TS source code file", …).
 *
 * JSON and PDF are recognised by mime type OR extension and are checked first;
 * every other label is chosen from the lower-cased text after the last `.` in
 * the filename. Anything unrecognised is a plain "text file".
 */
function getFileTypeHint(attachment: FileAttachment): string {
	const extension = attachment.filename.split('.').pop()?.toLowerCase();
	const mimeType = attachment.mimeType.toLowerCase();

	// Mime-aware checks take priority over the extension-only table below
	if (extension === 'json' || mimeType === 'application/json') return 'JSON data file';
	if (extension === 'pdf' || mimeType === 'application/pdf') return 'PDF document';

	switch (extension) {
		case 'md':
		case 'markdown':
			return 'Markdown document';

		case 'js':
		case 'ts':
		case 'jsx':
		case 'tsx':
		case 'py':
		case 'go':
		case 'rs':
		case 'java':
		case 'c':
		case 'cpp':
			return `${extension.toUpperCase()} source code file`;

		case 'yaml':
		case 'yml':
		case 'toml':
		case 'ini':
		case 'cfg':
		case 'conf':
			return 'configuration file';

		case 'csv':
			return 'CSV data file';

		case 'xml':
			return 'XML document';

		case 'html':
		case 'htm':
			return 'HTML document';

		default:
			return 'text file';
	}
}

// ============================================================================
// File Analyzer Class
// ============================================================================

/**
 * Decides whether an attachment's text is large enough to be summarised by a
 * separate model request, runs that request under a timeout, and falls back
 * to truncated text when the request fails, times out or returns nothing.
 */
export class FileAnalyzer {
	private config: FileAnalyzerConfig;

	/**
	 * @param config Overrides for DEFAULT_CONFIG. The merge is shallow: a
	 *   supplied `thresholds` object replaces the default one as a whole.
	 */
	constructor(config: Partial<FileAnalyzerConfig> = {}) {
		this.config = { ...DEFAULT_CONFIG, ...config };
	}

	/**
	 * Get the configured size threshold for a given attachment type
	 * (JSON and PDF by mime type or extension, everything else as text).
	 *
	 * NOTE(review): no other method of this class calls this helper;
	 * `shouldAnalyze` compares against ANALYSIS_THRESHOLD instead.
	 */
	private getThreshold(attachment: FileAttachment): number {
		const ext = attachment.filename.split('.').pop()?.toLowerCase();
		const mime = attachment.mimeType.toLowerCase();

		// JSON files are dense, use lower threshold
		if (mime === 'application/json' || ext === 'json') {
			return this.config.thresholds.json;
		}

		// PDFs can be larger
		if (mime === 'application/pdf' || ext === 'pdf') {
			return this.config.thresholds.pdf;
		}

		// Default text threshold
		return this.config.thresholds.text;
	}

	/**
	 * Check if a file should be analyzed (based on stored text length).
	 *
	 * Returns false for content at or below ANALYSIS_THRESHOLD characters and
	 * for "borderline" content up to BORDERLINE_THRESHOLD_PERCENT above it;
	 * only content beyond that margin is analyzed.
	 */
	shouldAnalyze(attachment: FileAttachment): boolean {
		const contentLength = attachment.textContent?.length || 0;

		// Below threshold - no analysis needed
		if (contentLength <= ANALYSIS_THRESHOLD) {
			return false;
		}

		// Borderline files are small enough to just use directly
		const borderlineLimit = ANALYSIS_THRESHOLD * (1 + BORDERLINE_THRESHOLD_PERCENT);
		return contentLength > borderlineLimit;
	}

	/**
	 * Analyze a file attachment if needed.
	 *
	 * Never rejects because of the analysis request itself: failures and
	 * timeouts are reported through `error` on the returned result, together
	 * with the first ANALYSIS_THRESHOLD characters of the stored text.
	 *
	 * @param attachment The attachment to inspect
	 * @param model Name of the model used for the summarisation request
	 * @returns The original content (small files), a summary (large files),
	 *   or a truncated fallback with `error` set
	 */
	async analyzeIfNeeded(
		attachment: FileAttachment,
		model: string
	): Promise<AnalysisResult> {
		const contentLength = attachment.textContent?.length || 0;

		// Small files or borderline: use original content
		if (!this.shouldAnalyze(attachment)) {
			return {
				useOriginal: true,
				content: attachment.textContent || '',
				originalLength: contentLength,
				analyzed: false,
			};
		}

		// Large files: spawn analysis agent with timeout
		const startTime = Date.now();
		let timer: ReturnType<typeof setTimeout> | undefined;
		let timedOut = false;

		try {
			const timeout = new Promise<never>((_, reject) => {
				timer = setTimeout(() => {
					timedOut = true;
					reject(new Error('Analysis timeout'));
				}, this.config.timeout);
			});

			// Race between analysis and timeout. The losing request is not
			// cancelled (no abort signal is passed to the client); its eventual
			// outcome is simply ignored.
			const summary = await Promise.race([
				this.spawnAnalysisAgent(attachment, model),
				timeout,
			]);

			return {
				useOriginal: false,
				content: summary,
				summary,
				originalLength: contentLength,
				analyzed: true,
			};
		} catch (error) {
			const elapsed = Date.now() - startTime;
			// Use the flag rather than matching the message text, so an unrelated
			// error that happens to carry the same message is not misreported
			const isTimeout = timedOut;
			console.warn(`[FileAnalyzer] Analysis ${isTimeout ? 'TIMEOUT' : 'FAILED'} for ${attachment.filename} after ${elapsed}ms`);

			// Fallback: use truncated content (faster than waiting for slow analysis)
			const truncated = attachment.textContent?.slice(0, ANALYSIS_THRESHOLD) || '';
			const reason = isTimeout ? 'timed out' : 'failed';
			return {
				useOriginal: false,
				content: truncated + `\n\n[Analysis ${reason} - showing first ${formatFileSize(ANALYSIS_THRESHOLD)} of ${formatFileSize(contentLength)}]`,
				originalLength: contentLength,
				analyzed: false,
				error: error instanceof Error ? error.message : 'Analysis failed',
			};
		} finally {
			// Always release the timer; otherwise it stays pending for the full
			// timeout after every analysis that finishes early
			clearTimeout(timer);
		}
	}

	/**
	 * Spawn a separate Ollama request to analyze the file.
	 *
	 * @returns The trimmed, non-empty summary text
	 * @throws Error when the model returns an empty or whitespace-only reply,
	 *   so the caller uses the truncated fallback instead of empty content
	 */
	private async spawnAnalysisAgent(
		attachment: FileAttachment,
		model: string
	): Promise<string> {
		// Read full content from original file if available
		const fullContent = await readFullContentForAnalysis(attachment);
		const prompt = buildAnalysisPrompt(attachment, fullContent);

		// Use generate for a simple completion
		const response = await ollamaClient.generate({
			model,
			prompt,
			options: {
				temperature: 0.3,  // Lower temperature for consistent summaries
				num_predict: this.config.maxResponseTokens,
			}
		});

		const summary = response.response.trim();
		if (summary.length === 0) {
			throw new Error('Analysis returned an empty summary');
		}
		return summary;
	}
}

// ============================================================================
// Singleton Instance
// ============================================================================

/** Shared analyzer built with the default configuration; `analyzeFilesInBatches` runs every file through it. */
export const fileAnalyzer = new FileAnalyzer();

// ============================================================================
// Batch Analysis Helper
// ============================================================================

/**
 * Analyze multiple files with a concurrency limit.
 *
 * Files are processed in consecutive batches; all files of a batch run in
 * parallel and the next batch starts only when the whole batch has settled.
 * Results are keyed by `file.id`, so two files with the same id share one
 * entry (the later one wins).
 *
 * @param files Files to analyze
 * @param model Model to use
 * @param maxConcurrent Maximum parallel analyses (default 2). Values below 1
 *   and NaN are treated as 1 and fractions are rounded down, so the loop
 *   always advances; `Infinity` processes everything in one batch.
 * @returns Map from attachment id to its analysis result, in input order
 */
export async function analyzeFilesInBatches(
	files: FileAttachment[],
	model: string,
	maxConcurrent: number = 2
): Promise<Map<string, AnalysisResult>> {
	const results = new Map<string, AnalysisResult>();

	// A step of 0 or NaN would never advance the loop and a negative step would
	// move it backwards, so clamp the batch size to a whole number >= 1.
	const requested = Math.floor(maxConcurrent);
	const batchSize = requested >= 1 ? requested : 1;

	// Process in batches of batchSize
	for (let start = 0; start < files.length; start += batchSize) {
		const batch = files.slice(start, start + batchSize);

		// Pair each result with its id up front instead of re-indexing afterwards
		const entries = await Promise.all(
			batch.map(async (file) => [file.id, await fileAnalyzer.analyzeIfNeeded(file, model)] as const)
		);

		for (const [id, result] of entries) {
			results.set(id, result);
		}
	}

	return results;
}

// ============================================================================
// Utility Functions
// ============================================================================

/**
 * Format an analyzed attachment for inclusion in a message.
 *
 * Produces a `<file …>` wrapper whose `name` attribute is XML-escaped and
 * whose `size` attribute is the formatted attachment size. When the result
 * carries a non-empty summary from a successful analysis, the body is a
 * "## Summary" section that states the original length in characters and the
 * tag is marked `analyzed="true"`; otherwise the body is `result.content`.
 *
 * NOTE(review): the body text is inserted verbatim; content containing
 * `</file>` would close the wrapper early — confirm whether that matters for
 * the consumer.
 *
 * @param attachment The attachment being rendered (filename and size are used)
 * @param result The analysis outcome for that attachment
 * @returns The wrapper text to embed in the message
 */
export function formatAnalyzedAttachment(
	attachment: FileAttachment,
	result: AnalysisResult
): string {
	const name = escapeXmlAttr(attachment.filename);
	const size = formatFileSize(attachment.size);

	if (result.analyzed && result.summary) {
		// originalLength is a character count, so print it as one; running it
		// through the byte-size formatter produced labels like "60 KB chars"
		return `<file name="${name}" size="${size}" analyzed="true">
## Summary (original: ${result.originalLength} chars)
${result.summary}
</file>`;
	}

	// Not analyzed, use content directly
	return `<file name="${name}" size="${size}">
${result.content}
</file>`;
}

/**
 * Make a string safe to place inside a quoted XML attribute by replacing
 * `&`, `"`, `'`, `<` and `>` with their named entities. Existing entities are
 * not recognised: an `&` that is already part of one is escaped again.
 */
function escapeXmlAttr(str: string): string {
	const entityFor: Record<string, string> = {
		'&': '&amp;',
		'"': '&quot;',
		"'": '&apos;',
		'<': '&lt;',
		'>': '&gt;',
	};

	// One pass over the input; each special character maps to exactly one entity
	return str.replace(/[&"'<>]/g, (ch) => entityFor[ch] ?? ch);
}