/ src / vector_stores / chromadb.py
chromadb.py
  1  from __future__ import annotations
  2  
  3  """ChromaDB vector store implementation."""
  4  
  5  from pathlib import Path
  6  from typing import Any
  7  
  8  from langchain_community.vectorstores import Chroma
  9  from langchain_core.documents import Document
 10  
 11  from ..constants import DEFAULT_RETRIEVAL_K
 12  from ..embeddings.protocol import Embeddings
 13  from .constants import CHUNK_ID_PREFIX, DEFAULT_COLLECTION_NAME, DEFAULT_VECTOR_DB_DIR
 14  from .protocol import VectorStore
 15  
 16  
 17  class ChromaDBWrapper:
 18      """Wrapper for ChromaDB to convert int IDs to strings."""
 19  
 20      def __init__(self, chroma_store: Chroma):
 21          """Initialize the wrapper.
 22  
 23          Parameters
 24          ----------
 25          chroma_store
 26              The underlying Chroma instance.
 27          """
 28          self._store = chroma_store
 29  
 30      def add_texts(
 31          self,
 32          texts: list[str],
 33          metadatas: list[dict[str, Any]] | None = None,
 34          ids: list[int] | None = None,
 35          embeddings: list[list[float]] | None = None,
 36      ) -> None:
 37          """Add texts to the vector store with pre-computed embeddings.
 38  
 39          Parameters
 40          ----------
 41          texts
 42              List of text strings to add.
 43          metadatas
 44              Optional list of metadata dictionaries.
 45          ids
 46              Optional list of document IDs (integers).
 47          embeddings
 48              Pre-computed embedding vectors.
 49          """
 50          # Convert int IDs to strings with prefix for ChromaDB
 51          string_ids = [f"{CHUNK_ID_PREFIX}{doc_id}" for doc_id in ids] if ids else None
 52  
 53          self._store.add_texts(
 54              texts=texts,
 55              metadatas=metadatas,
 56              ids=string_ids,
 57              embeddings=embeddings,
 58          )
 59  
 60      def similarity_search(
 61          self,
 62          query: str,
 63          k: int = DEFAULT_RETRIEVAL_K,
 64          filter: Any | None = None,
 65      ) -> list[Document]:
 66          """Search for similar documents.
 67  
 68          Note: ChromaDB does not support metadata filtering in this implementation.
 69          The filter parameter is ignored.
 70          """
 71          return self._store.similarity_search(query, k=k)
 72  
 73      def similarity_search_with_score(
 74          self,
 75          query: str,
 76          k: int = DEFAULT_RETRIEVAL_K,
 77          filter: Any | None = None,
 78      ) -> list[tuple[Document, float]]:
 79          """Search for similar documents with similarity scores.
 80  
 81          Note: ChromaDB does not support metadata filtering in this implementation.
 82          The filter parameter is ignored.
 83          """
 84          return self._store.similarity_search_with_score(query, k=k)
 85  
 86      def add_documents(self, documents: list[Document]) -> list[int]:
 87          """Add documents to the vector store."""
 88          result_ids = self._store.add_documents(documents)
 89          # ChromaDB returns string IDs, try to convert back to ints
 90          converted_ids = []
 91          for doc_id in result_ids:
 92              converted_ids.append(int(doc_id.replace(CHUNK_ID_PREFIX, "")))
 93          return converted_ids
 94  
 95      def persist(self) -> None:
 96          """Persist the vector store to disk."""
 97          self._store.persist()
 98  
 99  
def create_chromadb_store(config: dict[str, Any]) -> VectorStore:
    """Create a ChromaDB vector store from configuration.

    Parameters
    ----------
    config
        Configuration dictionary with keys:
        - embedding_function: Embeddings (required) - Embedding model instance
        - persist_directory: str (optional) - Directory to persist the database;
          defaults to ``DEFAULT_VECTOR_DB_DIR``
        - collection_name: str (optional) - Name of the collection;
          defaults to ``DEFAULT_COLLECTION_NAME``

    Returns
    -------
    VectorStore instance.

    Raises
    ------
    ValueError
        If embedding_function is not provided.
    RuntimeError
        If the persist directory does not exist.
    """
    persist_directory = config.get("persist_directory", DEFAULT_VECTOR_DB_DIR)
    collection_name = config.get("collection_name", DEFAULT_COLLECTION_NAME)
    embedding_function: Embeddings | None = config.get("embedding_function")

    if embedding_function is None:
        raise ValueError(
            "ChromaDB requires an embedding_function. "
            "Pass it via config: {'embedding_function': embedder.embedding_model}"
        )

    # Expand "~" and make the path absolute so the check below and the store
    # refer to one and the same location.
    vector_db_path = Path(persist_directory).expanduser().resolve()
    if not vector_db_path.exists():
        raise RuntimeError(
            f"Vector database not found at {vector_db_path}. "
            "Please run ingestion first."
        )

    chroma_store = Chroma(
        embedding_function=embedding_function,
        # Pass the validated path, not the raw config value: an unexpanded
        # "~/..." would otherwise open a different directory than the one
        # that was just checked.
        persist_directory=str(vector_db_path),
        collection_name=collection_name,
    )

    return ChromaDBWrapper(chroma_store)
144