# load_step.py
1 from __future__ import annotations 2 3 """Step that loads a document and converts it to markdown.""" 4 5 import logging 6 7 from ...loaders.protocol import DocumentLoader 8 from ..contexts.ingestion_context import IngestionContext 9 10 11 class LoadStep: 12 """Step that loads a document and converts it to markdown.""" 13 14 def __init__(self, loader: DocumentLoader): 15 """Initialize the load step. 16 17 Parameters 18 ---------- 19 loader 20 Document loader instance created by LoaderFactory. 21 """ 22 self.loader = loader 23 24 def run(self, context: IngestionContext) -> None: 25 """Load the document and save as markdown. 26 27 Parameters 28 ---------- 29 context 30 Ingestion context with file_path set. 31 """ 32 logger = logging.getLogger(__name__) 33 logger.info(f"Loading document: {context.file_path}") 34 35 loaded_docs = self.loader.load_documents() 36 37 if loaded_docs: 38 context.metadata = loaded_docs[0].metadata.copy() 39 40 # Inject default access_tags if not present in document metadata 41 if "access_tags" not in context.metadata and context.access_tags: 42 context.metadata["access_tags"] = context.access_tags 43 44 # Get markdown text from the loaded document 45 md_text = loaded_docs[0].page_content 46 context.raw_text = md_text 47 48 markdown_path = self.loader.to_markdown_file(md_text=md_text) 49 context.markdown_path = markdown_path 50 51 logger.info(f"Markdown saved to: {markdown_path}")