/ src / loaders / helpers.py
helpers.py
  1  from __future__ import annotations
  2  
  3  """Helper class for working with loaders."""
  4  
  5  from pathlib import Path
  6  from typing import Any
  7  
  8  from ..config import Config
  9  from .constants import SUPPORTED_FILE_EXTENSIONS
 10  
 11  
 12  class LoaderHelper:
 13      """Static helper class for loader operations."""
 14  
 15      @staticmethod
 16      def get_loader_config_for_file(
 17          file_path: Path,
 18      ) -> tuple[str, dict[str, Any]]:
 19          """Get the loader name and config for a file based on its extension and configuration.
 20  
 21          Parameters
 22          ----------
 23          file_path
 24              Path to the file.
 25  
 26          Returns
 27          -------
 28          Tuple of (loader_name, loader_config_dict).
 29  
 30          """
 31          config = Config.get_config()
 32          if not isinstance(file_path, Path):
 33              raise TypeError(f"file_path must be a Path object, got {type(file_path)}")
 34  
 35          suffix = file_path.suffix.lower()
 36  
 37          if suffix not in SUPPORTED_FILE_EXTENSIONS:
 38              supported = ", ".join(SUPPORTED_FILE_EXTENSIONS)
 39              raise ValueError(
 40                  f"Unsupported file type: {suffix}. "
 41                  f"Supported types: {supported}"
 42              )
 43  
 44          # Find the loader entry that contains this extension
 45          loader_entry = None
 46          for entry in config.loader.file_type_mapping:
 47              if not isinstance(entry, dict):
 48                  raise TypeError(
 49                      f"Each entry in file_type_mapping must be a dictionary. "
 50                      f"Received: {type(entry)}"
 51                  )
 52  
 53              extensions = entry.get("extensions")
 54              if not extensions:
 55                  raise ValueError(
 56                      "Each entry in file_type_mapping must have 'extensions' key "
 57                      "containing a list of file extensions."
 58                  )
 59  
 60              if not isinstance(extensions, list):
 61                  raise TypeError(
 62                      f"'extensions' must be a list. Received: {type(extensions)}"
 63                  )
 64  
 65              if suffix in extensions:
 66                  loader_entry = entry
 67                  break
 68  
 69          if loader_entry is None:
 70              raise ValueError(
 71                  f"No loader configured for file type: {suffix}. "
 72                  f"Please add an entry with '{suffix}' in the 'extensions' list "
 73                  f"to loader.file_type_mapping in config.yaml"
 74              )
 75  
 76          loader_name = loader_entry.get("loader_name")
 77          if not loader_name:
 78              raise ValueError(
 79                  f"Loader entry for '{suffix}' must have 'loader_name' key. "
 80                  f"Received: {loader_entry}"
 81              )
 82  
 83          if not isinstance(loader_name, str) or not loader_name.strip():
 84              raise ValueError(
 85                  f"Loader name for '{suffix}' must be a non-empty string. "
 86                  f"Received: {loader_name!r}"
 87              )
 88  
 89          loader_config = loader_entry.get("loader_config") or {}
 90          if not isinstance(loader_config, dict):
 91              raise TypeError(
 92                  f"Loader config for '{suffix}' must be a dictionary or None. "
 93                  f"Received: {type(loader_config)}"
 94              )
 95  
 96          return loader_name, loader_config
 97  
 98      @staticmethod
 99      def resolve_media_inputs(input_path: Path) -> list[Path]:
100          """Resolve input path to a list of supported media files.
101  
102          Parameters
103          ----------
104          input_path
105              Path to a file or directory.
106  
107          Returns
108          -------
109          List of file paths (sorted).
110  
111          Raises
112          ------
113          TypeError
114              If input_path is not a Path object.
115          FileNotFoundError
116              If the path doesn't exist or no supported files are found.
117          ValueError
118              If a file has an unsupported extension.
119          """
120          if not isinstance(input_path, Path):
121              raise TypeError(f"input_path must be a Path object, got {type(input_path)}")
122  
123          if not input_path.exists():
124              raise FileNotFoundError(f"Path not found: {input_path}")
125  
126          if input_path.is_file():
127              suffix = input_path.suffix.lower()
128              if suffix not in SUPPORTED_FILE_EXTENSIONS:
129                  supported = ", ".join(SUPPORTED_FILE_EXTENSIONS)
130                  raise ValueError(
131                      f"Unsupported file type: {suffix}. "
132                      f"Supported types: {supported}"
133                  )
134              return [input_path]
135  
136          if not input_path.is_dir():
137              raise ValueError(f"Path is neither a file nor a directory: {input_path}")
138  
139          media_files = [
140              path for ext in SUPPORTED_FILE_EXTENSIONS
141              for path in input_path.glob(f"*{ext}")
142              if path.is_file()
143          ]
144  
145          media_files = sorted(media_files)
146          if not media_files:
147              supported = ", ".join(SUPPORTED_FILE_EXTENSIONS)
148              raise FileNotFoundError(
149                  f"No supported files found in directory: {input_path}. "
150                  f"Supported types: {supported}"
151              )
152          return media_files
153  
154      @staticmethod
155      def prepare_output_dir(output_dir: Path) -> Path:
156          """Create output directory if it doesn't exist and return it."""
157          output_dir.mkdir(parents=True, exist_ok=True)
158          return output_dir
159  
160      @staticmethod
161      def create_loader_config(
162          file_path: Path,
163          loader_name: str,
164          loader_config_from_mapping: dict[str, Any],
165      ) -> dict[str, Any]:
166          """Create loader configuration based on file type and loader name.
167  
168          Parameters
169          ----------
170          file_path
171              Path to the media file.
172          loader_name
173              Name of the loader to use (e.g., "pymupdf", "video_loader").
174          loader_config_from_mapping
175              Loader-specific configuration from file_type_mapping.
176  
177          Returns
178          -------
179          Configuration dictionary for the loader.
180  
181          Raises
182          ------
183          TypeError
184              If any parameter has an invalid type.
185          ValueError
186              If loader_name is empty or loader_config_from_mapping is not a dict.
187          """
188          if not isinstance(file_path, Path):
189              raise TypeError(f"file_path must be a Path object, got {type(file_path)}")
190  
191          if not isinstance(loader_name, str) or not loader_name.strip():
192              raise ValueError(
193                  f"loader_name must be a non-empty string, got: {loader_name!r}"
194              )
195  
196          if not isinstance(loader_config_from_mapping, dict):
197              raise TypeError(
198                  f"loader_config_from_mapping must be a dict, got {type(loader_config_from_mapping)}"
199              )
200  
201          config = Config.get_config()
202          output_dir = LoaderHelper.prepare_output_dir(config.paths.markdown_dir)
203  
204          result = loader_config_from_mapping.copy()
205          result["file_path"] = str(file_path)
206          result["output_dir"] = str(output_dir)
207  
208          return result
209