# helpers.py
1 from __future__ import annotations 2 3 """Helper class for working with loaders.""" 4 5 from pathlib import Path 6 from typing import Any 7 8 from ..config import Config 9 from .constants import SUPPORTED_FILE_EXTENSIONS 10 11 12 class LoaderHelper: 13 """Static helper class for loader operations.""" 14 15 @staticmethod 16 def get_loader_config_for_file( 17 file_path: Path, 18 ) -> tuple[str, dict[str, Any]]: 19 """Get the loader name and config for a file based on its extension and configuration. 20 21 Parameters 22 ---------- 23 file_path 24 Path to the file. 25 26 Returns 27 ------- 28 Tuple of (loader_name, loader_config_dict). 29 30 """ 31 config = Config.get_config() 32 if not isinstance(file_path, Path): 33 raise TypeError(f"file_path must be a Path object, got {type(file_path)}") 34 35 suffix = file_path.suffix.lower() 36 37 if suffix not in SUPPORTED_FILE_EXTENSIONS: 38 supported = ", ".join(SUPPORTED_FILE_EXTENSIONS) 39 raise ValueError( 40 f"Unsupported file type: {suffix}. " 41 f"Supported types: {supported}" 42 ) 43 44 # Find the loader entry that contains this extension 45 loader_entry = None 46 for entry in config.loader.file_type_mapping: 47 if not isinstance(entry, dict): 48 raise TypeError( 49 f"Each entry in file_type_mapping must be a dictionary. " 50 f"Received: {type(entry)}" 51 ) 52 53 extensions = entry.get("extensions") 54 if not extensions: 55 raise ValueError( 56 "Each entry in file_type_mapping must have 'extensions' key " 57 "containing a list of file extensions." 58 ) 59 60 if not isinstance(extensions, list): 61 raise TypeError( 62 f"'extensions' must be a list. Received: {type(extensions)}" 63 ) 64 65 if suffix in extensions: 66 loader_entry = entry 67 break 68 69 if loader_entry is None: 70 raise ValueError( 71 f"No loader configured for file type: {suffix}. 
" 72 f"Please add an entry with '{suffix}' in the 'extensions' list " 73 f"to loader.file_type_mapping in config.yaml" 74 ) 75 76 loader_name = loader_entry.get("loader_name") 77 if not loader_name: 78 raise ValueError( 79 f"Loader entry for '{suffix}' must have 'loader_name' key. " 80 f"Received: {loader_entry}" 81 ) 82 83 if not isinstance(loader_name, str) or not loader_name.strip(): 84 raise ValueError( 85 f"Loader name for '{suffix}' must be a non-empty string. " 86 f"Received: {loader_name!r}" 87 ) 88 89 loader_config = loader_entry.get("loader_config") or {} 90 if not isinstance(loader_config, dict): 91 raise TypeError( 92 f"Loader config for '{suffix}' must be a dictionary or None. " 93 f"Received: {type(loader_config)}" 94 ) 95 96 return loader_name, loader_config 97 98 @staticmethod 99 def resolve_media_inputs(input_path: Path) -> list[Path]: 100 """Resolve input path to a list of supported media files. 101 102 Parameters 103 ---------- 104 input_path 105 Path to a file or directory. 106 107 Returns 108 ------- 109 List of file paths (sorted). 110 111 Raises 112 ------ 113 TypeError 114 If input_path is not a Path object. 115 FileNotFoundError 116 If the path doesn't exist or no supported files are found. 117 ValueError 118 If a file has an unsupported extension. 119 """ 120 if not isinstance(input_path, Path): 121 raise TypeError(f"input_path must be a Path object, got {type(input_path)}") 122 123 if not input_path.exists(): 124 raise FileNotFoundError(f"Path not found: {input_path}") 125 126 if input_path.is_file(): 127 suffix = input_path.suffix.lower() 128 if suffix not in SUPPORTED_FILE_EXTENSIONS: 129 supported = ", ".join(SUPPORTED_FILE_EXTENSIONS) 130 raise ValueError( 131 f"Unsupported file type: {suffix}. 
" 132 f"Supported types: {supported}" 133 ) 134 return [input_path] 135 136 if not input_path.is_dir(): 137 raise ValueError(f"Path is neither a file nor a directory: {input_path}") 138 139 media_files = [ 140 path for ext in SUPPORTED_FILE_EXTENSIONS 141 for path in input_path.glob(f"*{ext}") 142 if path.is_file() 143 ] 144 145 media_files = sorted(media_files) 146 if not media_files: 147 supported = ", ".join(SUPPORTED_FILE_EXTENSIONS) 148 raise FileNotFoundError( 149 f"No supported files found in directory: {input_path}. " 150 f"Supported types: {supported}" 151 ) 152 return media_files 153 154 @staticmethod 155 def prepare_output_dir(output_dir: Path) -> Path: 156 """Create output directory if it doesn't exist and return it.""" 157 output_dir.mkdir(parents=True, exist_ok=True) 158 return output_dir 159 160 @staticmethod 161 def create_loader_config( 162 file_path: Path, 163 loader_name: str, 164 loader_config_from_mapping: dict[str, Any], 165 ) -> dict[str, Any]: 166 """Create loader configuration based on file type and loader name. 167 168 Parameters 169 ---------- 170 file_path 171 Path to the media file. 172 loader_name 173 Name of the loader to use (e.g., "pymupdf", "video_loader"). 174 loader_config_from_mapping 175 Loader-specific configuration from file_type_mapping. 176 177 Returns 178 ------- 179 Configuration dictionary for the loader. 180 181 Raises 182 ------ 183 TypeError 184 If any parameter has an invalid type. 185 ValueError 186 If loader_name is empty or loader_config_from_mapping is not a dict. 
187 """ 188 if not isinstance(file_path, Path): 189 raise TypeError(f"file_path must be a Path object, got {type(file_path)}") 190 191 if not isinstance(loader_name, str) or not loader_name.strip(): 192 raise ValueError( 193 f"loader_name must be a non-empty string, got: {loader_name!r}" 194 ) 195 196 if not isinstance(loader_config_from_mapping, dict): 197 raise TypeError( 198 f"loader_config_from_mapping must be a dict, got {type(loader_config_from_mapping)}" 199 ) 200 201 config = Config.get_config() 202 output_dir = LoaderHelper.prepare_output_dir(config.paths.markdown_dir) 203 204 result = loader_config_from_mapping.copy() 205 result["file_path"] = str(file_path) 206 result["output_dir"] = str(output_dir) 207 208 return result 209