markitdown_loader.py
1 """MarkItDown file loader — converts files to Markdown using Microsoft's 2 markitdown library, then wraps the result in LlamaIndex Document objects. 3 4 Also provides `auto_ingest()` which tries docling → markitdown → classic 5 in order, returning the first that succeeds. 6 """ 7 from __future__ import annotations 8 9 import logging 10 from pathlib import Path 11 from typing import Optional 12 13 from llama_index.core.schema import Document 14 15 logger = logging.getLogger(__name__) 16 17 18 def load_with_markitdown( 19 file_path: str, 20 source: Optional[str] = None, 21 ) -> list[Document]: 22 """Convert a file to Markdown via MarkItDown and return as Documents.""" 23 from markitdown import MarkItDown 24 25 md = MarkItDown() 26 result = md.convert(file_path) 27 text = result.text_content if hasattr(result, "text_content") else str(result) 28 if not text or not text.strip(): 29 return [] 30 metadata = {"source": source or Path(file_path).name} 31 return [Document(text=text, metadata=metadata)] 32 33 34 def load_url_with_markitdown( 35 url: str, 36 source: Optional[str] = None, 37 ) -> list[Document]: 38 """Fetch a URL and convert to Markdown via MarkItDown.""" 39 from markitdown import MarkItDown 40 41 md = MarkItDown() 42 result = md.convert_url(url) 43 text = result.text_content if hasattr(result, "text_content") else str(result) 44 if not text or not text.strip(): 45 return [] 46 metadata = {"source": source or url} 47 return [Document(text=text, metadata=metadata)] 48 49 50 def _has_content(docs: list[Document]) -> bool: 51 return bool(docs) and any(d.text.strip() for d in docs) 52 53 54 def auto_ingest( 55 file_path: str, 56 source: str, 57 *, 58 manager=None, 59 opts: Optional[dict] = None, 60 ) -> tuple[list[Document], str]: 61 """Try docling → markitdown → classic in order. Return (documents, method_used). 62 63 `manager` is the multiprocessing Manager needed by docling's subprocess runner. 64 `opts` is passed to the classic loader for format-specific options. 65 """ 66 # 1. Docling — best quality for PDF/DOCX (layout-aware) 67 if manager is not None: 68 try: 69 from restai.document.runner import load_documents 70 71 docs = load_documents(manager, file_path) 72 if _has_content(docs): 73 logger.info("auto_ingest: docling succeeded for %s", source) 74 return docs, "docling" 75 except Exception as e: 76 logger.info("auto_ingest: docling failed for %s (%s), trying markitdown", source, e) 77 78 # 2. MarkItDown — broad format support, lightweight 79 try: 80 docs = load_with_markitdown(file_path, source=source) 81 if _has_content(docs): 82 logger.info("auto_ingest: markitdown succeeded for %s", source) 83 return docs, "markitdown" 84 except Exception as e: 85 logger.info("auto_ingest: markitdown failed for %s (%s), trying classic", source, e) 86 87 # 3. Classic — LlamaIndex file readers (always works for supported formats) 88 try: 89 from restai.vectordb.tools import find_file_loader 90 91 ext = Path(file_path).suffix.lower() 92 loader = find_file_loader(ext, opts or {}) 93 try: 94 docs = loader.load_data(file=Path(file_path)) 95 except TypeError: 96 docs = loader.load_data(input_file=Path(file_path)) 97 if _has_content(docs): 98 logger.info("auto_ingest: classic succeeded for %s", source) 99 return docs, "classic" 100 except Exception as e: 101 logger.warning("auto_ingest: classic also failed for %s (%s)", source, e) 102 103 return [], "classic"