/ src / pipeline / steps / load_step.py
load_step.py
 1  from __future__ import annotations
 2  
 3  """Step that loads a document and converts it to markdown."""
 4  
 5  import logging
 6  
 7  from ...loaders.protocol import DocumentLoader
 8  from ..contexts.ingestion_context import IngestionContext
 9  
10  
class LoadStep:
    """Step that loads a document and converts it to markdown.

    Reading the source and writing the markdown file are delegated to the
    injected loader; the results are recorded on the ingestion context.
    """

    def __init__(self, loader: DocumentLoader):
        """Initialize the load step.

        Parameters
        ----------
        loader
            Document loader instance created by LoaderFactory. The step
            calls its ``load_documents()`` and
            ``to_markdown_file(md_text=...)`` methods.
        """
        self.loader = loader

    def run(self, context: IngestionContext) -> None:
        """Load the document and save as markdown.

        Only the first document returned by the loader is used. On success
        the context's ``metadata``, ``raw_text`` and ``markdown_path``
        attributes are set. When the loader returns nothing, the context is
        left untouched and a warning is logged.

        Parameters
        ----------
        context
            Ingestion context with file_path set.
        """
        logger = logging.getLogger(__name__)
        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.info("Loading document: %s", context.file_path)

        loaded_docs = self.loader.load_documents()
        if not loaded_docs:
            # Make the no-op visible instead of returning silently.
            logger.warning("No documents loaded from: %s", context.file_path)
            return

        first_doc = loaded_docs[0]

        # Copy so that later changes to the context metadata do not leak
        # back into the loaded document's own metadata mapping.
        context.metadata = first_doc.metadata.copy()

        # Inject default access_tags if not present in document metadata
        if "access_tags" not in context.metadata and context.access_tags:
            context.metadata["access_tags"] = context.access_tags

        # Get markdown text from the loaded document
        md_text = first_doc.page_content
        context.raw_text = md_text

        markdown_path = self.loader.to_markdown_file(md_text=md_text)
        context.markdown_path = markdown_path

        logger.info("Markdown saved to: %s", markdown_path)