# docling_loader.py
"""Docling-backed document loader that exports a converted file as Markdown."""

import logging
from pathlib import Path
from typing import Any

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    ConvertPipelineOptions,
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
    TableFormerMode,
    TableStructureOptions,
)
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from langchain_core.documents import Document

from src.constants import DEFAULT_IMAGE_DESCRIPTION_PROMPT, DEFAULT_IMAGE_DESCRIPTION_TIMEOUT

from .protocol import DocumentLoader

# Configuration keys that must be present when picture description is enabled.
LLM_API_KEY = "llm_api_key"
LLM_ENDPOINT = "llm_endpoint"
LLM_MODEL = "llm_model"


class DoclingLoader(DocumentLoader):
    """Convert one file with Docling and expose it as a single LangChain ``Document``.

    Configuration keys read from ``config``:

    * ``file_path`` -- required; path of the file to convert.
    * ``output_dir`` -- optional; resolved and stored on ``output_dir``.
    * ``picture_description_enabled`` -- optional, defaults to ``False``.
    * ``llm_api_key`` / ``llm_endpoint`` / ``llm_model`` -- required when
      picture description is enabled.
    * ``image_description_prompt`` / ``image_description_timeout`` -- optional
      overrides of the project defaults.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Build the converter and resolve the input/output paths.

        Raises:
            ValueError: if ``file_path`` is missing, or picture description is
                enabled without all LLM settings.
            FileNotFoundError: if the resolved ``file_path`` does not exist.
        """
        self._config = config
        self._logger = logging.getLogger(__name__)
        self._picture_description_enabled = self._config.get("picture_description_enabled", False)
        self._converter = self._create_converter()
        self.input_path = self._resolve_input_path()
        self.output_dir = self._resolve_output_dir()

    def _config_or_default(self, key: str, default: Any) -> Any:
        """Return ``config[key]``, or ``default`` when the key is absent or ``None``."""
        value = self._config.get(key)
        return default if value is None else value

    def _resolve_input_path(self) -> Path:
        """Return the absolute, user-expanded ``file_path`` after checking it exists."""
        file_path = self._config.get("file_path")
        if not file_path:
            raise ValueError("'file_path' is required in loader configuration")

        resolved_path = Path(file_path).expanduser().resolve()
        if not resolved_path.exists():
            raise FileNotFoundError(f"Doc not found: {resolved_path}")

        return resolved_path

    def _resolve_output_dir(self) -> Path | None:
        """Return the absolute, user-expanded ``output_dir``, or ``None`` when unset."""
        output_dir = self._config.get("output_dir")
        return Path(output_dir).expanduser().resolve() if output_dir else None

    def _create_converter(self) -> DocumentConverter:
        """Create a ``DocumentConverter`` with PDF and DOCX pipeline options."""
        self._validate_picture_description_config(self._picture_description_enabled)
        self._logger.info(
            "Picture description: %s",
            "enabled" if self._picture_description_enabled else "disabled",
        )

        pdf_options = self._create_pdf_pipeline_options(self._picture_description_enabled)
        docx_options = self._create_docx_pipeline_options(self._picture_description_enabled)

        if self._picture_description_enabled:
            # Both pipelines share one description-options object.
            description_options = self._create_picture_description_options()
            pdf_options.picture_description_options = description_options
            docx_options.picture_description_options = description_options

        return DocumentConverter(format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options),
            InputFormat.DOCX: WordFormatOption(pipeline_options=docx_options),
        })

    def _validate_picture_description_config(self, enabled: bool) -> None:
        """Ensure every LLM setting is present when picture description is enabled.

        Raises:
            ValueError: listing the missing (absent or falsy) configuration keys.
        """
        if not enabled:
            return

        missing = [field for field in (LLM_API_KEY, LLM_ENDPOINT, LLM_MODEL) if not self._config.get(field)]
        if missing:
            raise ValueError(
                f"picture_description_enabled=True but missing required fields: {', '.join(missing)}. "
                "Either provide all credentials or set picture_description_enabled=False."
            )

    def _create_pdf_pipeline_options(self, picture_description_enabled: bool) -> PdfPipelineOptions:
        """Return the PDF pipeline options (table structure on, OCR off)."""
        return PdfPipelineOptions(
            enable_remote_services=True,
            do_table_structure=True,
            allow_external_plugins=True,
            do_ocr=False,
            do_picture_description=picture_description_enabled,
            table_structure_options=TableStructureOptions(
                do_cell_matching=True,
                table_former_mode=TableFormerMode.ACCURATE,
            ),
        )

    def _create_docx_pipeline_options(self, picture_description_enabled: bool) -> ConvertPipelineOptions:
        """Return the DOCX pipeline options (OCR off)."""
        return ConvertPipelineOptions(
            allow_external_plugins=True,
            enable_remote_services=True,
            do_picture_description=picture_description_enabled,
            do_ocr=False,
        )

    def _create_picture_description_options(self) -> PictureDescriptionApiOptions:
        """Return the API options used to request picture descriptions.

        The prompt and timeout fall back to the project defaults when the
        configuration omits them or sets them to ``None`` (for example a blank
        value in a config file), so ``None`` is never forwarded to the options
        model.
        """
        return PictureDescriptionApiOptions(
            url=self._config[LLM_ENDPOINT],
            headers={
                "Authorization": f"Bearer {self._config[LLM_API_KEY]}",
                "Content-Type": "application/json",
            },
            prompt=self._config_or_default("image_description_prompt", DEFAULT_IMAGE_DESCRIPTION_PROMPT),
            params={"model": self._config[LLM_MODEL]},
            timeout=self._config_or_default("image_description_timeout", DEFAULT_IMAGE_DESCRIPTION_TIMEOUT),
        )

    def _build_metadata(self) -> dict[str, str]:
        """Return the metadata attached to the produced ``Document``."""
        return {
            "source": str(self.input_path),
            "file_name": self.input_path.name,
            "loader": "DoclingLoader",
            "file_type": self.input_path.suffix.lower(),
        }

    @staticmethod
    def _description_texts(picture: Any) -> list[Any]:
        """Return the ``text`` of each annotation on ``picture`` whose ``kind`` is "description".

        Annotations lacking a ``kind`` attribute are ignored; a missing
        ``text`` attribute is reported as ``None``.
        """
        return [
            getattr(ann, "text", None)
            for ann in (picture.annotations or [])
            if getattr(ann, "kind", None) == "description"
        ]

    def _has_failed_description(self, picture: Any) -> bool:
        """Return True if a picture has a description annotation with empty text.

        An empty description is treated as a failed API call.
        """
        return any(not text for text in self._description_texts(picture))

    def _check_picture_conversion_success(self, document: Any) -> None:
        """Raise if any picture description came back empty; otherwise log a summary.

        Does nothing when picture description is disabled.

        Raises:
            RuntimeError: if at least one picture has an empty description.
        """
        if not self._picture_description_enabled:
            return

        all_pictures = document.pictures
        failed_pictures = [p for p in all_pictures if self._has_failed_description(p)]

        if failed_pictures:
            raise RuntimeError(
                f"Picture descriptions: {len(failed_pictures)}/{len(all_pictures)} failed "
                f"(had 'description' annotation but empty text - likely API error)"
            )

        # Count pictures that were actually processed (description annotation with text).
        described = sum(1 for p in all_pictures if any(self._description_texts(p)))
        skipped = len(all_pictures) - described
        self._logger.info(
            "Picture descriptions: %d described, %d skipped (decorative/small)",
            described,
            skipped,
        )

    def load_documents(self) -> list[Document]:
        """Convert the input file and return it as a one-element list of ``Document``.

        The page content is the Markdown export of the converted document.

        Raises:
            RuntimeError: if conversion raises, reports errors, or a picture
                description came back empty.
        """
        try:
            result = self._converter.convert(str(self.input_path))
        except Exception as e:
            # Boundary wrapper: surface any converter failure with the file path attached.
            raise RuntimeError(f"Docling conversion failed for {self.input_path}: {e}") from e

        if result.errors:
            error_messages = "; ".join(str(e) for e in result.errors)
            raise RuntimeError(f"Docling conversion had errors for {self.input_path}: {error_messages}")

        self._check_picture_conversion_success(result.document)

        return [Document(page_content=result.document.export_to_markdown(), metadata=self._build_metadata())]


def create_docling_loader(config: dict[str, Any]) -> DocumentLoader:
    """Factory returning a ``DoclingLoader`` built from ``config``."""
    return DoclingLoader(config=config)