# chromadb.py
1 from __future__ import annotations 2 3 """ChromaDB vector store implementation.""" 4 5 from pathlib import Path 6 from typing import Any 7 8 from langchain_community.vectorstores import Chroma 9 from langchain_core.documents import Document 10 11 from ..constants import DEFAULT_RETRIEVAL_K 12 from ..embeddings.protocol import Embeddings 13 from .constants import CHUNK_ID_PREFIX, DEFAULT_COLLECTION_NAME, DEFAULT_VECTOR_DB_DIR 14 from .protocol import VectorStore 15 16 17 class ChromaDBWrapper: 18 """Wrapper for ChromaDB to convert int IDs to strings.""" 19 20 def __init__(self, chroma_store: Chroma): 21 """Initialize the wrapper. 22 23 Parameters 24 ---------- 25 chroma_store 26 The underlying Chroma instance. 27 """ 28 self._store = chroma_store 29 30 def add_texts( 31 self, 32 texts: list[str], 33 metadatas: list[dict[str, Any]] | None = None, 34 ids: list[int] | None = None, 35 embeddings: list[list[float]] | None = None, 36 ) -> None: 37 """Add texts to the vector store with pre-computed embeddings. 38 39 Parameters 40 ---------- 41 texts 42 List of text strings to add. 43 metadatas 44 Optional list of metadata dictionaries. 45 ids 46 Optional list of document IDs (integers). 47 embeddings 48 Pre-computed embedding vectors. 49 """ 50 # Convert int IDs to strings with prefix for ChromaDB 51 string_ids = [f"{CHUNK_ID_PREFIX}{doc_id}" for doc_id in ids] if ids else None 52 53 self._store.add_texts( 54 texts=texts, 55 metadatas=metadatas, 56 ids=string_ids, 57 embeddings=embeddings, 58 ) 59 60 def similarity_search( 61 self, 62 query: str, 63 k: int = DEFAULT_RETRIEVAL_K, 64 filter: Any | None = None, 65 ) -> list[Document]: 66 """Search for similar documents. 67 68 Note: ChromaDB does not support metadata filtering in this implementation. 69 The filter parameter is ignored. 
70 """ 71 return self._store.similarity_search(query, k=k) 72 73 def similarity_search_with_score( 74 self, 75 query: str, 76 k: int = DEFAULT_RETRIEVAL_K, 77 filter: Any | None = None, 78 ) -> list[tuple[Document, float]]: 79 """Search for similar documents with similarity scores. 80 81 Note: ChromaDB does not support metadata filtering in this implementation. 82 The filter parameter is ignored. 83 """ 84 return self._store.similarity_search_with_score(query, k=k) 85 86 def add_documents(self, documents: list[Document]) -> list[int]: 87 """Add documents to the vector store.""" 88 result_ids = self._store.add_documents(documents) 89 # ChromaDB returns string IDs, try to convert back to ints 90 converted_ids = [] 91 for doc_id in result_ids: 92 converted_ids.append(int(doc_id.replace(CHUNK_ID_PREFIX, ""))) 93 return converted_ids 94 95 def persist(self) -> None: 96 """Persist the vector store to disk.""" 97 self._store.persist() 98 99 100 def create_chromadb_store(config: dict[str, Any]) -> VectorStore: 101 """Create a ChromaDB vector store from configuration. 102 103 Parameters 104 ---------- 105 config 106 Configuration dictionary with keys: 107 - embedding_function: Embeddings (required) - Embedding model instance 108 - persist_directory: str (optional) - Directory to persist the database 109 - collection_name: str (optional) - Name of the collection 110 111 Returns 112 ------- 113 VectorStore instance. 114 115 Raises 116 ------ 117 ValueError 118 If embedding_function is not provided. 119 """ 120 persist_directory = config.get("persist_directory", DEFAULT_VECTOR_DB_DIR) 121 collection_name = config.get("collection_name", DEFAULT_COLLECTION_NAME) 122 embedding_function: Embeddings = config.get("embedding_function") 123 124 if embedding_function is None: 125 raise ValueError( 126 "ChromaDB requires an embedding_function. 
" 127 "Pass it via config: {'embedding_function': embedder.embedding_model}" 128 ) 129 130 vector_db_path = Path(persist_directory).expanduser().resolve() 131 if not vector_db_path.exists(): 132 raise RuntimeError( 133 f"Vector database not found at {vector_db_path}. " 134 "Please run ingestion first." 135 ) 136 137 chroma_store = Chroma( 138 embedding_function=embedding_function, 139 persist_directory=persist_directory, 140 collection_name=collection_name, 141 ) 142 143 return ChromaDBWrapper(chroma_store) 144