Cradicle Explorer

/ src / loaders / docling_loader.py
docling_loader.py
  1  import logging
  2  from pathlib import Path
  3  from typing import Any
  4  
  5  from docling.datamodel.base_models import InputFormat
  6  from docling.datamodel.pipeline_options import (
  7      ConvertPipelineOptions,
  8      PdfPipelineOptions,
  9      PictureDescriptionApiOptions,
 10      TableFormerMode,
 11      TableStructureOptions,
 12  )
 13  from docling.document_converter import (
 14      DocumentConverter,
 15      PdfFormatOption,
 16      WordFormatOption,
 17  )
 18  from langchain_core.documents import Document
 19  
 20  from src.constants import DEFAULT_IMAGE_DESCRIPTION_PROMPT, DEFAULT_IMAGE_DESCRIPTION_TIMEOUT
 21  
 22  from .protocol import DocumentLoader
 23  
# Configuration keys that DoclingLoader requires (non-empty) whenever
# picture description is enabled.
# Key whose value is sent as the Bearer token in the Authorization header.
LLM_API_KEY = "llm_api_key"
# Key whose value is used as the URL of the picture-description API.
LLM_ENDPOINT = "llm_endpoint"
# Key whose value is sent as the "model" request parameter.
LLM_MODEL = "llm_model"
 27  
 28  class DoclingLoader(DocumentLoader):
 29      def __init__(self, config: dict[str, Any]) -> None:
 30          self._config = config
 31          self._logger = logging.getLogger(__name__)
 32          self._picture_description_enabled = self._config.get("picture_description_enabled", False)
 33          self._converter = self._create_converter()
 34          self.input_path = self._resolve_input_path()
 35          self.output_dir = self._resolve_output_dir()
 36  
 37      def _resolve_input_path(self) -> Path:
 38          file_path = self._config.get("file_path")
 39          if not file_path:
 40              raise ValueError("'file_path' is required in loader configuration")
 41  
 42          resolved_path = Path(file_path).expanduser().resolve()
 43          if not resolved_path.exists():
 44              raise FileNotFoundError(f"Doc not found: {resolved_path}")
 45  
 46          return resolved_path
 47  
 48      def _resolve_output_dir(self) -> Path | None:
 49          output_dir = self._config.get("output_dir")
 50          return Path(output_dir).expanduser().resolve() if output_dir else None
 51  
 52      def _create_converter(self) -> DocumentConverter:
 53          self._validate_picture_description_config(self._picture_description_enabled)
 54          self._logger.info(f"Picture description: {'enabled' if self._picture_description_enabled else 'disabled'}")
 55  
 56          pdf_options = self._create_pdf_pipeline_options(self._picture_description_enabled)
 57          docx_options = self._create_docx_pipeline_options(self._picture_description_enabled)
 58  
 59          if self._picture_description_enabled:
 60              description_options = self._create_picture_description_options()
 61              pdf_options.picture_description_options = description_options
 62              docx_options.picture_description_options = description_options
 63  
 64          return DocumentConverter(format_options={
 65              InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
 66              InputFormat.DOCX: WordFormatOption(pipeline_options=docx_options),
 67          })
 68  
 69      def _validate_picture_description_config(self, enabled: bool) -> None:
 70          if not enabled:
 71              return
 72  
 73          missing = [field for field in (LLM_API_KEY, LLM_ENDPOINT, LLM_MODEL) if not self._config.get(field)]
 74          if missing:
 75              raise ValueError(
 76                  f"picture_description_enabled=True but missing required fields: {', '.join(missing)}. "
 77                  "Either provide all credentials or set picture_description_enabled=False."
 78              )
 79  
 80      def _create_pdf_pipeline_options(self, picture_description_enabled: bool) -> PdfPipelineOptions:
 81          return PdfPipelineOptions(
 82              enable_remote_services=True,
 83              do_table_structure=True,
 84              allow_external_plugins=True,
 85              do_ocr=False,
 86              do_picture_description=picture_description_enabled,
 87              table_structure_options=TableStructureOptions(
 88                  do_cell_matching=True,
 89                  table_former_mode=TableFormerMode.ACCURATE,
 90              ),
 91          )
 92  
 93      def _create_docx_pipeline_options(self, picture_description_enabled: bool) -> ConvertPipelineOptions:
 94          return ConvertPipelineOptions(
 95              allow_external_plugins=True,
 96              enable_remote_services=True,
 97              do_picture_description=picture_description_enabled,
 98              do_ocr=False,
 99          )
100  
101      def _create_picture_description_options(self) -> PictureDescriptionApiOptions:
102          return PictureDescriptionApiOptions(
103              url=self._config[LLM_ENDPOINT],
104              headers={
105                  "Authorization": f"Bearer {self._config[LLM_API_KEY]}",
106                  "Content-Type": "application/json",
107              },
108              prompt=self._config.get("image_description_prompt", DEFAULT_IMAGE_DESCRIPTION_PROMPT),
109              params={"model": self._config[LLM_MODEL]},
110              timeout=self._config.get("image_description_timeout", DEFAULT_IMAGE_DESCRIPTION_TIMEOUT),
111          )
112  
113      def _build_metadata(self) -> dict[str, str]:
114          return {
115              "source": str(self.input_path),
116              "file_name": self.input_path.name,
117              "loader": "DoclingLoader",
118              "file_type": self.input_path.suffix.lower(),
119          }
120  
121      def _has_failed_description(self, picture) -> bool:
122          """Check if a picture has a description annotation but with empty text (API failure)."""
123          if not picture.annotations:
124              return False
125          return any(
126              getattr(ann, "kind", None) == "description" and not getattr(ann, "text", None)
127              for ann in picture.annotations
128          )
129  
130      def _check_picture_conversion_success(self, document) -> None:
131          if not self._picture_description_enabled:
132              return
133  
134          all_pictures = document.pictures
135          failed_pictures = [p for p in all_pictures if self._has_failed_description(p)]
136  
137          if failed_pictures:
138              raise RuntimeError(
139                  f"Picture descriptions: {len(failed_pictures)}/{len(all_pictures)} failed "
140                  f"(had 'description' annotation but empty text - likely API error)"
141              )
142  
143          # Count pictures that were actually processed (have description annotation with text)
144          described = sum(
145              1 for p in all_pictures
146              if any(
147                  getattr(ann, "kind", None) == "description" and getattr(ann, "text", None)
148                  for ann in (p.annotations or [])
149              )
150          )
151          skipped = len(all_pictures) - described
152          self._logger.info(f"Picture descriptions: {described} described, {skipped} skipped (decorative/small)")
153  
154      def load_documents(self) -> list[Document]:
155          try:
156              result = self._converter.convert(str(self.input_path))
157          except Exception as e:
158              raise RuntimeError(f"Docling conversion failed for {self.input_path}: {e}") from e
159  
160          if result.errors:
161              error_messages = "; ".join(str(e) for e in result.errors)
162              raise RuntimeError(f"Docling conversion had errors for {self.input_path}: {error_messages}")
163  
164          self._check_picture_conversion_success(result.document)
165  
166          return [Document(page_content=result.document.export_to_markdown(), metadata=self._build_metadata())]
167  
168  
def create_docling_loader(config: dict[str, Any]) -> DocumentLoader:
    """Factory: build a ``DoclingLoader`` from a configuration mapping.

    Args:
        config: loader configuration, passed unchanged to ``DoclingLoader``.

    Returns:
        The newly constructed loader.
    """
    loader = DoclingLoader(config=config)
    return loader