/ src / pipeline / steps / chunk_step.py
chunk_step.py
 1  from __future__ import annotations
 2  
 3  """Step that chunks the markdown text into smaller pieces."""
 4  
 5  import logging
 6  
 7  from ...chunkers.protocol import Chunker
 8  from ..contexts.ingestion_context import IngestionContext
 9  
10  
class ChunkStep:
    """Pipeline step that splits a markdown file into chunks.

    The splitting itself is delegated to the chunker supplied at construction
    time. This step tags every resulting chunk with the context's metadata and
    stores the chunks back on the context.
    """

    def __init__(self, chunker: Chunker):
        """Create the step around a chunker.

        Parameters
        ----------
        chunker
            Chunker to delegate to (an instance created by ChunkerFactory).
            Its ``chunk_markdown_file`` method is called by :meth:`run`.
        """
        self.chunker = chunker

    @staticmethod
    def _shared_metadata(context: IngestionContext) -> dict:
        """Return the metadata that is applied to every chunk of ``context``.

        The result is the context metadata (nothing when it is unset or empty)
        plus a ``"source"`` entry holding ``context.source_id``. A new dict is
        built, so the context's own metadata is left untouched.
        """
        return {**(context.metadata or {}), "source": context.source_id}

    def run(self, context: IngestionContext) -> None:
        """Chunk the context's markdown file and store the chunks on it.

        When ``context.markdown_path`` is not set, the context is marked as
        failed and the chunker is never called.

        Parameters
        ----------
        context
            Ingestion context with ``markdown_path`` set. Its ``chunks``
            attribute is overwritten with the chunker's output.
        """
        log = logging.getLogger(__name__)

        if not context.markdown_path:
            context.mark_failed("Markdown path not set. Load step must run first.")
            return

        log.info(f"Chunking markdown file: {context.markdown_path}")

        # Metadata is gathered before the chunker is invoked.
        shared = self._shared_metadata(context)
        pieces = self.chunker.chunk_markdown_file(str(context.markdown_path))

        # Shared entries overwrite same-named keys already present on a chunk.
        for piece in pieces:
            piece.metadata.update(shared)

        context.chunks = pieces
        log.info(f"Created {len(pieces)} chunks")