Files
vessel/frontend/src/lib/memory/chunker.ts
vikingowl ddce578833 feat: implement cross-chat RAG for project conversations
- Add embedding-based chat indexing for project conversations
- Chunk long messages (1500 chars with 200 overlap) for better coverage
- Index messages when leaving a conversation (background)
- Search indexed chat history with semantic similarity
- Show other project conversations with message count and summary status
- Include relevant chat snippets in project context for LLM
- Fix chunker infinite loop bug near end of text
- Fix curl encoding error with explicit Accept-Encoding header
- Add document previews to project knowledge base context
- Lower RAG threshold to 0.2 and increase topK to 10 for better recall
2026-01-07 18:06:49 +01:00

338 lines
7.8 KiB
TypeScript

/**
* Text chunking utilities for RAG
*
* Splits documents into smaller chunks for embedding and retrieval.
* Uses overlapping windows to preserve context across chunk boundaries.
*/
import type { DocumentChunk } from './types.js';
/** Default chunk size in characters (roughly ~128 tokens) */
const DEFAULT_CHUNK_SIZE = 512;
/** Default overlap between chunks */
const DEFAULT_OVERLAP = 50;
/** Minimum chunk size to avoid tiny fragments */
const MIN_CHUNK_SIZE = 50;
/**
* Chunking options
*/
export interface ChunkOptions {
/** Target chunk size in characters */
chunkSize?: number;
/** Overlap between consecutive chunks */
overlap?: number;
/** Prefer splitting at sentence boundaries */
respectSentences?: boolean;
/** Prefer splitting at paragraph boundaries */
respectParagraphs?: boolean;
}
/**
* Split text into overlapping chunks (synchronous version)
*/
export function chunkText(
text: string,
documentId: string,
options: ChunkOptions = {}
): DocumentChunk[] {
const {
chunkSize = DEFAULT_CHUNK_SIZE,
overlap = DEFAULT_OVERLAP,
respectSentences = true,
respectParagraphs = true
} = options;
if (!text || text.length === 0) {
return [];
}
// For very short texts, return as single chunk
if (text.length <= chunkSize) {
return [{
id: crypto.randomUUID(),
documentId,
content: text.trim(),
startIndex: 0,
endIndex: text.length
}];
}
const chunks: DocumentChunk[] = [];
let currentIndex = 0;
let previousIndex = -1;
while (currentIndex < text.length) {
// Prevent infinite loop - if we haven't advanced, we're stuck
if (currentIndex === previousIndex) {
break;
}
previousIndex = currentIndex;
// Calculate end position for this chunk
let endIndex = Math.min(currentIndex + chunkSize, text.length);
// If not at end of text, try to find a good break point
if (endIndex < text.length) {
endIndex = findBreakPoint(text, currentIndex, endIndex, {
respectSentences,
respectParagraphs
});
}
// Extract chunk content
const content = text.slice(currentIndex, endIndex).trim();
// Only add non-empty chunks above minimum size
if (content.length >= MIN_CHUNK_SIZE) {
chunks.push({
id: crypto.randomUUID(),
documentId,
content,
startIndex: currentIndex,
endIndex
});
}
// If we've reached the end, we're done
if (endIndex >= text.length) {
break;
}
// Move to next chunk position (with overlap)
currentIndex = endIndex - overlap;
// Safety: ensure we always advance
if (currentIndex <= previousIndex) {
currentIndex = previousIndex + 1;
}
}
return chunks;
}
/**
* Split text into overlapping chunks (async version that yields to event loop)
* Use this for large files to avoid blocking the UI
*/
export async function chunkTextAsync(
text: string,
documentId: string,
options: ChunkOptions = {}
): Promise<DocumentChunk[]> {
const {
chunkSize = DEFAULT_CHUNK_SIZE,
overlap = DEFAULT_OVERLAP,
respectSentences = true,
respectParagraphs = true
} = options;
if (!text || text.length === 0) {
return [];
}
// For very short texts, return as single chunk
if (text.length <= chunkSize) {
return [{
id: crypto.randomUUID(),
documentId,
content: text.trim(),
startIndex: 0,
endIndex: text.length
}];
}
const chunks: DocumentChunk[] = [];
let currentIndex = 0;
let iterationCount = 0;
let previousIndex = -1;
while (currentIndex < text.length) {
// Yield every 10 chunks to let UI breathe
if (iterationCount > 0 && iterationCount % 10 === 0) {
await new Promise(r => setTimeout(r, 0));
}
iterationCount++;
// Prevent infinite loop - if we haven't advanced, we're stuck
if (currentIndex === previousIndex) {
break;
}
previousIndex = currentIndex;
// Calculate end position for this chunk
let endIndex = Math.min(currentIndex + chunkSize, text.length);
// If not at end of text, try to find a good break point
if (endIndex < text.length) {
endIndex = findBreakPoint(text, currentIndex, endIndex, {
respectSentences,
respectParagraphs
});
}
// Extract chunk content
const content = text.slice(currentIndex, endIndex).trim();
// Only add non-empty chunks above minimum size
if (content.length >= MIN_CHUNK_SIZE) {
chunks.push({
id: crypto.randomUUID(),
documentId,
content,
startIndex: currentIndex,
endIndex
});
}
// If we've reached the end, we're done
if (endIndex >= text.length) {
break;
}
// Move to next chunk position (with overlap)
currentIndex = endIndex - overlap;
// Safety: ensure we always advance
if (currentIndex <= previousIndex) {
currentIndex = previousIndex + 1;
}
}
return chunks;
}
/**
* Find a good break point for chunking
* Prefers paragraph breaks > sentence breaks > word breaks
*/
function findBreakPoint(
text: string,
startIndex: number,
endIndex: number,
options: { respectSentences: boolean; respectParagraphs: boolean }
): number {
const searchWindow = text.slice(startIndex, endIndex);
const windowLength = searchWindow.length;
// Look for break points in last 20% of chunk
const searchStart = Math.floor(windowLength * 0.8);
// Try paragraph break first
if (options.respectParagraphs) {
const paragraphBreak = findLastMatchPosition(searchWindow, /\n\s*\n/g, searchStart);
if (paragraphBreak >= 0) {
return startIndex + paragraphBreak;
}
}
// Try sentence break
if (options.respectSentences) {
const sentenceBreak = findLastMatchPosition(searchWindow, /[.!?]\s+/g, searchStart);
if (sentenceBreak >= 0) {
return startIndex + sentenceBreak;
}
}
// Fall back to word break
const wordBreak = findLastMatchPosition(searchWindow, /\s+/g, searchStart);
if (wordBreak >= 0) {
return startIndex + wordBreak;
}
// No good break point found, use original end
return endIndex;
}
/**
* Find the last match of a pattern after a given position
* Uses matchAll to find all matches and returns the last one after minPos
*/
function findLastMatchPosition(text: string, pattern: RegExp, minPos: number): number {
let lastMatch = -1;
// Use matchAll to iterate through matches
for (const match of text.matchAll(pattern)) {
if (match.index !== undefined && match.index >= minPos) {
// Track position after the match
lastMatch = match.index + match[0].length;
}
}
return lastMatch;
}
/**
* Split text by paragraphs (for simpler chunking)
*/
export function splitByParagraphs(text: string): string[] {
return text
.split(/\n\s*\n/)
.map(p => p.trim())
.filter(p => p.length > 0);
}
/**
* Split text by sentences
*/
export function splitBySentences(text: string): string[] {
// Simple sentence splitting (handles common cases)
return text
.split(/(?<=[.!?])\s+/)
.map(s => s.trim())
.filter(s => s.length > 0);
}
/**
* Estimate token count for a chunk
*/
export function estimateChunkTokens(text: string): number {
// Rough estimate: ~4 characters per token
return Math.ceil(text.length / 4);
}
/**
* Merge very small adjacent chunks
*/
export function mergeSmallChunks(
chunks: DocumentChunk[],
minSize: number = MIN_CHUNK_SIZE * 2
): DocumentChunk[] {
if (chunks.length <= 1) return chunks;
const merged: DocumentChunk[] = [];
let currentChunk: DocumentChunk | null = null;
for (const chunk of chunks) {
if (currentChunk === null) {
currentChunk = { ...chunk };
continue;
}
if (currentChunk.content.length + chunk.content.length < minSize) {
// Merge with current
currentChunk = {
id: currentChunk.id,
documentId: currentChunk.documentId,
content: currentChunk.content + '\n\n' + chunk.content,
startIndex: currentChunk.startIndex,
endIndex: chunk.endIndex,
embedding: currentChunk.embedding,
metadata: currentChunk.metadata
};
} else {
// Push current and start new
merged.push(currentChunk);
currentChunk = { ...chunk };
}
}
if (currentChunk) {
merged.push(currentChunk);
}
return merged;
}