/ src / chunkers / protocol.py
protocol.py
 1  from __future__ import annotations
 2  
 3  """Protocol for chunker implementations."""
 4  
 5  from typing import Protocol
 6  
 7  from langchain_core.documents import Document
 8  
 9  from ..constants import DEFAULT_ENCODING
10  
11  
class Chunker(Protocol):
    """Structural interface shared by all text chunkers.

    A class that provides the three methods declared here can be used as a
    ``Chunker`` whatever strategy it applies internally (recursive,
    character-based, token-based, semantic, ...). Callers written against
    this protocol can therefore switch chunking algorithms without any other
    code changes.
    """

    def chunk_text(
        self,
        text: str,
        metadata: dict | None = None,
    ) -> list[Document]:
        """Split a raw string into a list of ``Document`` chunks.

        ``metadata`` is optional; what an implementation does with it is not
        fixed by this protocol.
        """
        ...

    def chunk_documents(self, documents: list[Document]) -> list[Document]:
        """Break the given ``Document`` objects down into smaller chunks."""
        ...

    def chunk_markdown_file(
        self, file_path: str, encoding: str = DEFAULT_ENCODING
    ) -> list[Document]:
        """Load the markdown file at ``file_path`` and return its chunks.

        ``encoding`` falls back to ``DEFAULT_ENCODING`` when omitted.
        """
        ...
35