chunk_step.py
1 from __future__ import annotations 2 3 """Step that chunks the markdown text into smaller pieces.""" 4 5 import logging 6 7 from ...chunkers.protocol import Chunker 8 from ..contexts.ingestion_context import IngestionContext 9 10 11 class ChunkStep: 12 """Step that chunks the markdown text into smaller pieces.""" 13 14 def __init__(self, chunker: Chunker): 15 """Initialize the chunk step. 16 17 Parameters 18 ---------- 19 chunker 20 Chunker instance created by ChunkerFactory. 21 """ 22 self.chunker = chunker 23 24 def run(self, context: IngestionContext) -> None: 25 """Chunk the markdown file into documents. 26 27 Parameters 28 ---------- 29 context 30 Ingestion context with markdown_path set. 31 """ 32 logger = logging.getLogger(__name__) 33 if not context.markdown_path: 34 context.mark_failed("Markdown path not set. Load step must run first.") 35 return 36 37 logger.info(f"Chunking markdown file: {context.markdown_path}") 38 39 base_metadata = context.metadata.copy() if context.metadata else {} 40 base_metadata["source"] = context.source_id 41 42 chunks = self.chunker.chunk_markdown_file(str(context.markdown_path)) 43 44 for chunk in chunks: 45 chunk.metadata.update(base_metadata) 46 47 context.chunks = chunks 48 49 logger.info(f"Created {len(chunks)} chunks")