protocol.py
"""Protocol for input source implementations."""

from __future__ import annotations

from pathlib import Path
from typing import Protocol


class InputSource(Protocol):
    """Structural interface for objects that supply files to be processed.

    Defines how files are accessed from different sources (local
    filesystem, S3, etc.). An implementation must provide every method
    below to be compatible; because this is a ``typing.Protocol``, it does
    not need to inherit from this class.
    """

    def list_files(self, path: str = "", extensions: list[str] | None = None) -> list[str]:
        """List the files available under ``path``.

        Parameters
        ----------
        path
            Location to list files from (a directory path or an S3 prefix).
            If empty or omitted, the base location configured for the
            source is used:

            - Local sources: ``base_path`` from config
            - S3 sources: ``prefix`` from config
        extensions
            Optional list of file extensions to filter by
            (e.g., ``['.pdf', '.docx']``). If None, all files are returned.

        Returns
        -------
        List of file identifiers (paths or URIs) that can be passed to
        ``get_file()``.
        """
        ...

    def get_file(self, file_id: str) -> Path:
        """Return a local filesystem path for the file named by ``file_id``.

        For local sources the path is returned directly. For remote
        sources (S3) the file is downloaded to a temporary location first.

        Parameters
        ----------
        file_id
            File identifier (local path or S3 URI).

        Returns
        -------
        Path to the local file (the original, or the downloaded temp file).
        """
        ...

    def cleanup(self) -> None:
        """Release temporary files and other resources held by the source.

        Called after processing is complete to remove downloaded files,
        close connections, etc.
        """
        ...