- Add embedding-based chat indexing for project conversations
- Chunk long messages (1500 chars with 200 overlap) for better coverage
- Index messages in the background when leaving a conversation
- Search indexed chat history with semantic similarity
- Show other project conversations with message count and summary status
- Include relevant chat snippets in the project context sent to the LLM
- Fix chunker infinite-loop bug near the end of the text
- Fix curl encoding error with an explicit Accept-Encoding header
- Add document previews to project knowledge base context
- Lower RAG threshold to 0.2 and raise topK to 10 for better recall

/**
 * Vector store for knowledge base documents
 *
 * Stores document chunks with embeddings in IndexedDB
 * and provides similarity search for RAG retrieval.
 */

import { db, type StoredDocument, type StoredChunk } from '$lib/storage/db.js';
import { generateEmbedding, generateEmbeddings, cosineSimilarity, DEFAULT_EMBEDDING_MODEL } from './embeddings.js';
import { chunkText, chunkTextAsync, estimateChunkTokens, type ChunkOptions } from './chunker.js';

/** Result of a similarity search */
export interface SearchResult {
  chunk: StoredChunk;
  document: StoredDocument;
  similarity: number;
}

/** Options for adding a document */
export interface AddDocumentOptions {
  /** Chunking options */
  chunkOptions?: ChunkOptions;
  /** Embedding model to use */
  embeddingModel?: string;
  /** Callback for progress updates */
  onProgress?: (current: number, total: number) => void;
  /** Project ID if the document belongs to a project */
  projectId?: string;
}

/**
 * Add a document to the knowledge base.
 * Chunks the content and generates an embedding for each chunk.
 */
export async function addDocument(
  name: string,
  content: string,
  mimeType: string,
  options: AddDocumentOptions = {}
): Promise<StoredDocument> {
  const {
    chunkOptions,
    embeddingModel = DEFAULT_EMBEDDING_MODEL,
    onProgress,
    projectId
  } = options;

  const documentId = crypto.randomUUID();
  const now = Date.now();

  // Chunk the content
  const textChunks = chunkText(content, documentId, chunkOptions);

  if (textChunks.length === 0) {
    throw new Error('Document produced no chunks');
  }

  // Generate embeddings for all chunks, in batches, reporting progress
  const chunkContents = textChunks.map(c => c.content);
  const embeddings: number[][] = [];

  const BATCH_SIZE = 5;
  for (let i = 0; i < chunkContents.length; i += BATCH_SIZE) {
    const batch = chunkContents.slice(i, i + BATCH_SIZE);
    const batchEmbeddings = await generateEmbeddings(batch, embeddingModel);
    embeddings.push(...batchEmbeddings);

    if (onProgress) {
      onProgress(Math.min(i + BATCH_SIZE, chunkContents.length), chunkContents.length);
    }
  }

  // Create stored chunks with embeddings
  const storedChunks: StoredChunk[] = textChunks.map((chunk, index) => ({
    id: chunk.id,
    documentId,
    content: chunk.content,
    embedding: embeddings[index],
    startIndex: chunk.startIndex,
    endIndex: chunk.endIndex,
    tokenCount: estimateChunkTokens(chunk.content)
  }));

  // Create document record
  const document: StoredDocument = {
    id: documentId,
    name,
    mimeType,
    size: content.length,
    createdAt: now,
    updatedAt: now,
    chunkCount: storedChunks.length,
    embeddingModel,
    projectId: projectId ?? null,
    embeddingStatus: 'ready'
  };

  // Store document and chunks in one transaction
  await db.transaction('rw', [db.documents, db.chunks], async () => {
    await db.documents.add(document);
    await db.chunks.bulkAdd(storedChunks);
  });

  return document;
}

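// Usage sketch (hypothetical caller, not part of this module). The field names
// on chunkOptions follow ChunkOptions in ./chunker.js and are assumed here to
// be { chunkSize, overlap }, sized per the 1500/200 values mentioned in the
// commit message — check chunker.js for the actual option names.
//
//   const doc = await addDocument('notes.md', fileText, 'text/markdown', {
//     chunkOptions: { chunkSize: 1500, overlap: 200 },
//     projectId: activeProject?.id,
//     onProgress: (done, total) => console.log(`embedded ${done}/${total} chunks`)
//   });
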
/** Options for async document upload */
export interface AddDocumentAsyncOptions extends AddDocumentOptions {
  /** Callback when embedding generation completes */
  onComplete?: (doc: StoredDocument) => void;
  /** Callback when embedding generation fails */
  onError?: (error: Error) => void;
}

/**
 * Add a document asynchronously: stores the record immediately and generates
 * embeddings in the background.
 * Returns at once with the document in the 'pending' state.
 */
export async function addDocumentAsync(
  name: string,
  content: string,
  mimeType: string,
  options: AddDocumentAsyncOptions = {}
): Promise<StoredDocument> {
  const {
    chunkOptions,
    embeddingModel = DEFAULT_EMBEDDING_MODEL,
    onProgress,
    onComplete,
    onError,
    projectId
  } = options;

  const documentId = crypto.randomUUID();
  const now = Date.now();

  // Create the document record immediately; the chunk count is not known yet
  // and will be updated after chunking in the background.
  const document: StoredDocument = {
    id: documentId,
    name,
    mimeType,
    size: content.length,
    createdAt: now,
    updatedAt: now,
    chunkCount: 0, // updated after chunking
    embeddingModel,
    projectId: projectId ?? null,
    embeddingStatus: 'pending'
  };

  // Store document immediately
  await db.documents.add(document);

  // Process everything in the background (non-blocking), including chunking
  setTimeout(async () => {
    console.log('[Embedding] Starting for:', name, 'content length:', content.length);
    try {
      // Update status to processing
      await db.documents.update(documentId, { embeddingStatus: 'processing' });
      console.log('[Embedding] Status updated, starting chunking...');

      // Chunk the content using the async version (yields periodically)
      let textChunks;
      try {
        textChunks = await chunkTextAsync(content, documentId, chunkOptions);
      } catch (chunkError) {
        console.error('[Embedding] Chunking failed:', chunkError);
        throw chunkError;
      }
      console.log('[Embedding] Chunked into', textChunks.length, 'chunks');

      if (textChunks.length === 0) {
        throw new Error('Document produced no chunks');
      }

      // Update chunk count
      await db.documents.update(documentId, { chunkCount: textChunks.length });

      const chunkContents = textChunks.map(c => c.content);
      const embeddings: number[][] = [];

      // Process embeddings in batches with progress
      const BATCH_SIZE = 5;
      const totalBatches = Math.ceil(chunkContents.length / BATCH_SIZE);
      for (let i = 0; i < chunkContents.length; i += BATCH_SIZE) {
        const batchNum = Math.floor(i / BATCH_SIZE) + 1;
        console.log(`[Embedding] Batch ${batchNum}/${totalBatches}...`);
        const batch = chunkContents.slice(i, i + BATCH_SIZE);
        const batchEmbeddings = await generateEmbeddings(batch, embeddingModel);
        embeddings.push(...batchEmbeddings);

        if (onProgress) {
          onProgress(Math.min(i + BATCH_SIZE, chunkContents.length), chunkContents.length);
        }
      }

      // Create stored chunks with embeddings
      const storedChunks: StoredChunk[] = textChunks.map((chunk, index) => ({
        id: chunk.id,
        documentId,
        content: chunk.content,
        embedding: embeddings[index],
        startIndex: chunk.startIndex,
        endIndex: chunk.endIndex,
        tokenCount: estimateChunkTokens(chunk.content)
      }));

      // Store chunks and update document status
      await db.transaction('rw', [db.documents, db.chunks], async () => {
        await db.chunks.bulkAdd(storedChunks);
        await db.documents.update(documentId, {
          embeddingStatus: 'ready',
          updatedAt: Date.now()
        });
      });

      console.log('[Embedding] Complete for:', name);
      const updatedDoc = await db.documents.get(documentId);
      if (updatedDoc && onComplete) {
        onComplete(updatedDoc);
      }
    } catch (error) {
      console.error('[Embedding] Failed:', error);
      await db.documents.update(documentId, { embeddingStatus: 'failed' });
      if (onError) {
        onError(error instanceof Error ? error : new Error(String(error)));
      }
    }
  }, 0);

  return document;
}

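// Usage sketch (hypothetical caller): the upload returns immediately in the
// 'pending' state and the UI reacts via callbacks. `progress`,
// `refreshDocumentList`, and `toast` are assumed app-side helpers, not part
// of this module.
//
//   const pendingDoc = await addDocumentAsync(file.name, text, file.type, {
//     projectId: activeProject?.id,
//     onProgress: (done, total) => progress.set(done / total),
//     onComplete: () => refreshDocumentList(),
//     onError: (err) => toast.error(err.message)
//   });
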
/** Options for similarity search */
export interface SearchOptions {
  /** Maximum number of results to return */
  topK?: number;
  /** Minimum similarity threshold (0-1) */
  threshold?: number;
  /** Embedding model to use */
  embeddingModel?: string;
  /** Filter to documents in this project only (null = global only, undefined = all) */
  projectId?: string | null;
}

/**
 * Search for similar chunks across documents
 * @param query - The search query
 * @param options - Search options including projectId filter
 */
export async function searchSimilar(
  query: string,
  options: SearchOptions = {}
): Promise<SearchResult[]> {
  const {
    topK = 5,
    threshold = 0.5,
    embeddingModel = DEFAULT_EMBEDDING_MODEL,
    projectId
  } = options;

  // Generate an embedding for the query
  const queryEmbedding = await generateEmbedding(query, embeddingModel);

  // Load all chunks. A full scan is fine for small collections; larger
  // collections would want an approximate nearest-neighbor index.
  const allChunks = await db.chunks.toArray();

  if (allChunks.length === 0) {
    return [];
  }

  // Get document IDs that match the project filter
  let allowedDocumentIds: Set<string> | null = null;
  if (projectId !== undefined) {
    const docs = await db.documents.toArray();
    const filteredDocs = docs.filter((d) =>
      projectId === null ? !d.projectId : d.projectId === projectId
    );
    allowedDocumentIds = new Set(filteredDocs.map((d) => d.id));
  }

  // Filter chunks by project
  const relevantChunks = allowedDocumentIds
    ? allChunks.filter((c) => allowedDocumentIds!.has(c.documentId))
    : allChunks;

  if (relevantChunks.length === 0) {
    return [];
  }

  // Calculate similarities
  const scored = relevantChunks.map((chunk) => ({
    chunk,
    similarity: cosineSimilarity(queryEmbedding, chunk.embedding)
  }));

  // Filter by threshold, sort by similarity, and take the top K
  const filtered = scored
    .filter((item) => item.similarity >= threshold)
    .sort((a, b) => b.similarity - a.similarity)
    .slice(0, topK);

  // Fetch document info for the results
  const documentIds = [...new Set(filtered.map((r) => r.chunk.documentId))];
  const documents = await db.documents.bulkGet(documentIds);
  const documentMap = new Map(documents.filter(Boolean).map((d) => [d!.id, d!]));

  // Build results, dropping any chunk whose document record is missing
  return filtered.flatMap((item) => {
    const document = documentMap.get(item.chunk.documentId);
    return document
      ? [{ chunk: item.chunk, document, similarity: item.similarity }]
      : [];
  });
}

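// Usage sketch (hypothetical RAG caller): the commit message says the RAG path
// now uses threshold 0.2 and topK 10 for better recall, while the defaults
// above remain 0.5 / 5 — so those values must be passed explicitly.
//
//   const results = await searchSimilar('how do I rotate the API key?', {
//     topK: 10,
//     threshold: 0.2,
//     projectId: activeProject?.id ?? null
//   });
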
/**
 * Get all documents in the knowledge base
 */
export async function listDocuments(): Promise<StoredDocument[]> {
  return db.documents.orderBy('updatedAt').reverse().toArray();
}

/**
 * Get a document by ID
 */
export async function getDocument(id: string): Promise<StoredDocument | undefined> {
  return db.documents.get(id);
}

/**
 * Get all chunks for a document
 */
export async function getDocumentChunks(documentId: string): Promise<StoredChunk[]> {
  return db.chunks.where('documentId').equals(documentId).toArray();
}

/**
 * Delete a document and its chunks
 */
export async function deleteDocument(id: string): Promise<void> {
  await db.transaction('rw', [db.documents, db.chunks], async () => {
    await db.chunks.where('documentId').equals(id).delete();
    await db.documents.delete(id);
  });
}

/**
 * Handle a document stuck in 'pending' or 'processing' (e.g. when HMR or a
 * page refresh interrupted background processing).
 *
 * The original file content is not persisted, so the embeddings cannot
 * actually be regenerated here: this cleans up any partial chunks, marks the
 * document as failed, and reports via onError that a re-upload is needed.
 * onComplete is accepted for API symmetry but is never called on this path.
 */
export async function retryDocumentEmbedding(
  documentId: string,
  onComplete?: (doc: StoredDocument) => void,
  onError?: (error: Error) => void
): Promise<void> {
  const doc = await db.documents.get(documentId);
  if (!doc) {
    throw new Error('Document not found');
  }

  // Nothing to do if embeddings already exist
  if (doc.embeddingStatus === 'ready') {
    console.log('Document already has embeddings');
    return;
  }

  // Delete any partial chunks left over from the interrupted run
  await db.chunks.where('documentId').equals(documentId).delete();

  // The raw content is not stored, so a true retry is impossible; mark the
  // document as failed so the user knows to re-upload. (Persisting the raw
  // content would make a real retry possible.)
  await db.documents.update(documentId, { embeddingStatus: 'failed' });

  if (onError) {
    onError(new Error('Cannot retry - document content not cached. Please re-upload the file.'));
  }
}

/**
 * Reset stuck documents (pending/processing) to the failed state.
 * Call this on app startup to clean up interrupted uploads.
 */
export async function resetStuckDocuments(): Promise<number> {
  // Get all documents and filter in memory (no index required)
  const allDocs = await db.documents.toArray();
  const stuckDocs = allDocs.filter(
    doc => doc.embeddingStatus === 'pending' || doc.embeddingStatus === 'processing'
  );

  for (const doc of stuckDocs) {
    await db.documents.update(doc.id, { embeddingStatus: 'failed' });
  }

  return stuckDocs.length;
}

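// Usage sketch (hypothetical app-startup hook, e.g. in a root layout's
// onMount — the surrounding wiring is assumed, not part of this module):
//
//   const reset = await resetStuckDocuments();
//   if (reset > 0) console.warn(`${reset} interrupted upload(s) marked failed`);
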
/**
 * Get total statistics for the knowledge base
 */
export async function getKnowledgeBaseStats(): Promise<{
  documentCount: number;
  chunkCount: number;
  totalTokens: number;
}> {
  const documentCount = await db.documents.count();
  const chunks = await db.chunks.toArray();

  return {
    documentCount,
    chunkCount: chunks.length,
    totalTokens: chunks.reduce((sum, c) => sum + c.tokenCount, 0)
  };
}

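// Usage sketch (hypothetical settings-page caller):
//
//   const stats = await getKnowledgeBaseStats();
//   console.log(`${stats.documentCount} docs, ${stats.chunkCount} chunks, ~${stats.totalTokens} tokens`);
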
/**
 * Format search results as context for the LLM
 */
export function formatResultsAsContext(results: SearchResult[]): string {
  if (results.length === 0) {
    return '';
  }

  const sections = results.map((r, i) => {
    const source = r.document.name;
    return `[Source ${i + 1}: ${source}]\n${r.chunk.content}`;
  });

  return `Relevant context from knowledge base:\n\n${sections.join('\n\n---\n\n')}`;
}
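
// End-to-end sketch (hypothetical caller): retrieve, format, and prepend the
// context to the system prompt. `buildSystemPrompt` and `userQuestion` are
// assumed names, not part of this module.
//
//   const hits = await searchSimilar(userQuestion, { topK: 10, threshold: 0.2 });
//   const context = formatResultsAsContext(hits);
//   const system = context ? `${buildSystemPrompt()}\n\n${context}` : buildSystemPrompt();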