protocol.py
"""Protocol for chunker implementations."""

# NOTE: the module docstring must be the first statement in the file. A
# ``from __future__`` import is allowed to follow it, but a string placed
# *after* the import is just a discarded expression, not the docstring.
from __future__ import annotations

from typing import Protocol

from langchain_core.documents import Document

from ..constants import DEFAULT_ENCODING


class Chunker(Protocol):
    """Protocol for text chunking implementations.

    Any class implementing these methods can chunk documents using
    different strategies (recursive, character-based, token-based,
    semantic, etc.). This allows swapping chunking algorithms without
    changing the rest of the code.

    This is a structural interface: implementations only need to provide
    methods with matching signatures; the bodies below are placeholders.
    """

    def chunk_text(self, text: str, metadata: dict | None = None) -> list[Document]:
        """Chunk a text string into Document objects.

        Args:
            text: The text to split.
            metadata: Optional metadata supplied alongside the text. How it
                is applied to the resulting chunks is up to the
                implementation.

        Returns:
            The chunks produced from ``text``, as ``Document`` objects.
        """
        ...

    def chunk_documents(self, documents: list[Document]) -> list[Document]:
        """Chunk a list of Document objects into smaller chunks.

        Args:
            documents: The documents to split.

        Returns:
            The resulting chunks, as ``Document`` objects.
        """
        ...

    def chunk_markdown_file(
        self,
        file_path: str,
        encoding: str = DEFAULT_ENCODING,
    ) -> list[Document]:
        """Load a markdown file and chunk it.

        Args:
            file_path: Path of the markdown file to load.
            encoding: Text encoding passed for reading the file; defaults
                to ``DEFAULT_ENCODING`` from the package constants.

        Returns:
            The chunks produced from the file, as ``Document`` objects.
        """
        ...