Cradicle Explorer

/ restai / loaders / markitdown_loader.py
markitdown_loader.py
  1  """MarkItDown file loader — converts files to Markdown using Microsoft's
  2  markitdown library, then wraps the result in LlamaIndex Document objects.
  3  
  4  Also provides `auto_ingest()` which tries docling → markitdown → classic
  5  in order, returning the first that succeeds.
  6  """
  7  from __future__ import annotations
  8  
  9  import logging
 10  from pathlib import Path
 11  from typing import Optional
 12  
 13  from llama_index.core.schema import Document
 14  
 15  logger = logging.getLogger(__name__)
 16  
 17  
 18  def load_with_markitdown(
 19      file_path: str,
 20      source: Optional[str] = None,
 21  ) -> list[Document]:
 22      """Convert a file to Markdown via MarkItDown and return as Documents."""
 23      from markitdown import MarkItDown
 24  
 25      md = MarkItDown()
 26      result = md.convert(file_path)
 27      text = result.text_content if hasattr(result, "text_content") else str(result)
 28      if not text or not text.strip():
 29          return []
 30      metadata = {"source": source or Path(file_path).name}
 31      return [Document(text=text, metadata=metadata)]
 32  
 33  
 34  def load_url_with_markitdown(
 35      url: str,
 36      source: Optional[str] = None,
 37  ) -> list[Document]:
 38      """Fetch a URL and convert to Markdown via MarkItDown."""
 39      from markitdown import MarkItDown
 40  
 41      md = MarkItDown()
 42      result = md.convert_url(url)
 43      text = result.text_content if hasattr(result, "text_content") else str(result)
 44      if not text or not text.strip():
 45          return []
 46      metadata = {"source": source or url}
 47      return [Document(text=text, metadata=metadata)]
 48  
 49  
 50  def _has_content(docs: list[Document]) -> bool:
 51      return bool(docs) and any(d.text.strip() for d in docs)
 52  
 53  
 54  def auto_ingest(
 55      file_path: str,
 56      source: str,
 57      *,
 58      manager=None,
 59      opts: Optional[dict] = None,
 60  ) -> tuple[list[Document], str]:
 61      """Try docling → markitdown → classic in order. Return (documents, method_used).
 62  
 63      `manager` is the multiprocessing Manager needed by docling's subprocess runner.
 64      `opts` is passed to the classic loader for format-specific options.
 65      """
 66      # 1. Docling — best quality for PDF/DOCX (layout-aware)
 67      if manager is not None:
 68          try:
 69              from restai.document.runner import load_documents
 70  
 71              docs = load_documents(manager, file_path)
 72              if _has_content(docs):
 73                  logger.info("auto_ingest: docling succeeded for %s", source)
 74                  return docs, "docling"
 75          except Exception as e:
 76              logger.info("auto_ingest: docling failed for %s (%s), trying markitdown", source, e)
 77  
 78      # 2. MarkItDown — broad format support, lightweight
 79      try:
 80          docs = load_with_markitdown(file_path, source=source)
 81          if _has_content(docs):
 82              logger.info("auto_ingest: markitdown succeeded for %s", source)
 83              return docs, "markitdown"
 84      except Exception as e:
 85          logger.info("auto_ingest: markitdown failed for %s (%s), trying classic", source, e)
 86  
 87      # 3. Classic — LlamaIndex file readers (always works for supported formats)
 88      try:
 89          from restai.vectordb.tools import find_file_loader
 90  
 91          ext = Path(file_path).suffix.lower()
 92          loader = find_file_loader(ext, opts or {})
 93          try:
 94              docs = loader.load_data(file=Path(file_path))
 95          except TypeError:
 96              docs = loader.load_data(input_file=Path(file_path))
 97          if _has_content(docs):
 98              logger.info("auto_ingest: classic succeeded for %s", source)
 99              return docs, "classic"
100      except Exception as e:
101          logger.warning("auto_ingest: classic also failed for %s (%s)", source, e)
102  
103      return [], "classic"