# mime_helpers.py
"""
Utility functions for handling MIME types.
"""

import codecs
import os
from typing import Dict, Optional, Set

_OCTET_STREAM = "application/octet-stream"

# Number of leading bytes inspected when sniffing octet-stream content.
_UTF8_SNIFF_BYTES = 1024

TEXT_CONTAINER_MIME_TYPES: Set[str] = {
    "text/plain",
    "text/markdown",
    "text/html",
    "application/json",
    "application/yaml",
    "text/yaml",
    "application/x-yaml",
    "text/x-yaml",
    "application/xml",
    "text/xml",
    "text/csv",
}

_TEXT_BASED_PRIMARY_TYPES = {"text"}
_TEXT_BASED_SUBTYPE_WHOLE = {
    "json",
    "xml",
    "yaml",
    "x-yaml",
    "yml",
    "csv",
    "javascript",
    "ecmascript",
    "xhtml+xml",
    "svg+xml",
    "atom+xml",
    "rss+xml",
    "sparql-query",
    "sparql-update",
    "sql",
    "graphql",
    "markdown",
    "html",
    "rtf",
    "sgml",
}
_TEXT_BASED_SUBTYPE_SUFFIXES_AFTER_PLUS = {
    "json",
    "xml",
    "yaml",
    "csv",
    "svg",
    "xhtml",
}


def _normalize_mime_type(mime_type: Optional[str]) -> str:
    """
    Returns the bare ``type/subtype`` part of a MIME type string.

    The value is lowercased, parameters (everything from the first ``;``,
    e.g. ``; charset=utf-8``) are dropped and surrounding whitespace is
    stripped.

    Args:
        mime_type: The raw MIME type string, possibly with parameters.

    Returns:
        The normalized MIME type, or an empty string if *mime_type* is falsy.
    """
    if not mime_type:
        return ""
    return mime_type.lower().split(";")[0].strip()


def _is_utf8_sample(content_bytes: bytes) -> bool:
    """
    Checks whether the leading bytes of *content_bytes* are valid UTF-8.

    Only the first ``_UTF8_SNIFF_BYTES`` bytes are inspected. When the content
    is longer than the sample, a multi-byte character cut in half at the
    sample boundary is tolerated instead of being reported as invalid.

    Args:
        content_bytes: The raw content to inspect.

    Returns:
        True if the inspected sample decodes as UTF-8, False otherwise.
    """
    sample = content_bytes[:_UTF8_SNIFF_BYTES]
    was_truncated = len(content_bytes) > len(sample)
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        # final=False lets the decoder buffer an incomplete trailing sequence
        # rather than raising; invalid bytes inside the sample still raise.
        decoder.decode(sample, final=not was_truncated)
    except UnicodeDecodeError:
        return False
    return True


def is_text_based_mime_type(mime_type: Optional[str]) -> bool:
    """
    Checks if a given MIME type is considered text-based.

    Parameters such as ``; charset=utf-8`` are ignored, consistent with the
    other helpers in this module.

    Args:
        mime_type: The MIME type string (e.g., "text/plain", "application/json").

    Returns:
        True if the MIME type is text-based, False otherwise.
    """
    if not mime_type:
        return False

    normalized_mime_type = _normalize_mime_type(mime_type)
    return (
        normalized_mime_type.startswith("text/")
        or normalized_mime_type in TEXT_CONTAINER_MIME_TYPES
    )


def is_text_based_file(
    mime_type: Optional[str], content_bytes: Optional[bytes] = None
) -> bool:
    """
    Determines if a file is text-based based on its MIME type and content.

    The MIME type decides first (parameters are ignored). Only for
    ``application/octet-stream`` with content supplied is the content itself
    sniffed: the file counts as text when its leading bytes are valid UTF-8.

    Args:
        mime_type: The MIME type of the file.
        content_bytes: The content of the file as bytes.

    Returns:
        True if the file is text-based, False otherwise.
    """
    if not mime_type:
        return False

    normalized_mime_type = _normalize_mime_type(mime_type)
    primary_type, _, subtype = normalized_mime_type.partition("/")

    if primary_type in _TEXT_BASED_PRIMARY_TYPES:
        return True
    if subtype in _TEXT_BASED_SUBTYPE_WHOLE:
        return True
    if "+" in subtype:
        # Structured syntax suffix, e.g. "vnd.api+json" -> "json".
        specific_format = subtype.split("+")[-1]
        return specific_format in _TEXT_BASED_SUBTYPE_SUFFIXES_AFTER_PLUS
    if normalized_mime_type == _OCTET_STREAM and content_bytes is not None:
        return _is_utf8_sample(content_bytes)

    return False


# Canonical MIME-type ↔ extension mapping, used bi-directionally
_MIME_TO_EXTENSION: Dict[str, str] = {
    # Default
    _OCTET_STREAM: ".bin",
    # Text / code formats
    "text/plain": ".txt",
    "text/html": ".html",
    "text/css": ".css",
    "text/javascript": ".js",
    "text/csv": ".csv",
    "text/markdown": ".md",
    "text/xml": ".xml",
    "text/yaml": ".yaml",
    "text/x-typescript": ".ts",
    "text/jsx": ".jsx",
    "text/x-toml": ".toml",
    "text/x-rust": ".rs",
    "text/x-go": ".go",
    "text/x-kotlin": ".kt",
    "text/x-swift": ".swift",
    "text/x-ruby": ".rb",
    "text/x-php": ".php",
    "text/x-c": ".c",
    "text/x-c++": ".cpp",
    "text/x-python": ".py",
    "text/x-java-source": ".java",
    # Application formats
    "application/json": ".json",
    "application/x-yaml": ".yaml",
    "application/yaml": ".yaml",
    "application/x-sh": ".sh",
    "application/pdf": ".pdf",
    "application/zip": ".zip",
    # Image formats
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/gif": ".gif",
    "image/bmp": ".bmp",
    "image/webp": ".webp",
    "image/svg+xml": ".svg",
    # Audio formats
    "audio/wav": ".wav",
    "audio/mp3": ".mp3",
    "audio/mpeg": ".mp3",
    "audio/ogg": ".ogg",
    "audio/flac": ".flac",
    "audio/aac": ".aac",
    "audio/m4a": ".m4a",
    # Video formats
    "video/mp4": ".mp4",
    "video/webm": ".webm",
    "video/x-msvideo": ".avi",
    "video/quicktime": ".mov",
}

# Reverse map. Where several MIME types share an extension, the one listed
# last in _MIME_TO_EXTENSION wins unless overridden by the aliases below.
_EXTENSION_TO_MIME: Dict[str, str] = {
    ext: mime for mime, ext in _MIME_TO_EXTENSION.items()
}
# Remove default _OCTET_STREAM mapping
_EXTENSION_TO_MIME.pop(".bin", None)
# Add aliases for MIME types with more than one extension
_EXTENSION_TO_MIME.update({
    ".yaml": "text/yaml",
    ".yml": "text/yaml",
    ".jpg": "image/jpeg",
    # ".jpeg" is accepted by _INLINE_VISION_EXTENSIONS, so it must resolve too.
    ".jpeg": "image/jpeg",
    ".tsx": "text/x-typescript",
    ".bash": "application/x-sh",
    ".env": "text/plain",
    ".ini": "text/plain",
    ".cfg": "text/plain",
    ".hpp": "text/x-c++",
    ".h": "text/x-c",
    ".mmd": "text/plain",
})


# Raster image extensions that vision-capable LLMs can process as inline binary.
# SVG is excluded: it is XML-based and cannot be processed as inline binary by LLMs.
_INLINE_VISION_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}


def is_image_artifact(filename: Optional[str], mime_type: Optional[str]) -> bool:
    """Determine whether an artifact should be treated as an inline vision image.

    Uses *mime_type* as the source of truth. Falls back to file extension only
    when mime_type is missing or ``application/octet-stream``.

    SVG (``image/svg+xml``) is explicitly excluded because it is XML-based and
    most LLMs cannot process it as inline binary vision data.

    Args:
        filename: The artifact's filename (used only for the extension fallback).
        mime_type: The MIME type reported for the artifact, if any.

    Returns:
        True if the artifact is a raster image suitable for inline vision input.
    """
    if mime_type:
        normalized = _normalize_mime_type(mime_type)
        if normalized == "image/svg+xml":
            return False
        if normalized.startswith("image/"):
            return True
        # Known non-image mime type — do not fall through to extension check
        if normalized != _OCTET_STREAM:
            return False

    # Fallback: check file extension when mime_type is absent / octet-stream
    if not filename:
        return False
    ext = os.path.splitext(filename)[1].lower()
    return ext in _INLINE_VISION_EXTENSIONS


def get_extension_for_mime_type(
    mime_type: Optional[str], default_extension: str = ".dat"
) -> str:
    """
    Returns a file extension for a given MIME type.

    Args:
        mime_type: The MIME type string (e.g., 'image/png', 'application/json').
        default_extension: The extension to return if the MIME type is not found.

    Returns:
        The corresponding file extension (e.g., '.png', '.json').
    """
    if not mime_type:
        return default_extension

    return _MIME_TO_EXTENSION.get(_normalize_mime_type(mime_type), default_extension)


def _lookup_extension(filename: str) -> str:
    """
    Returns the lowercased extension of *filename* for MIME lookup.

    ``os.path.splitext`` treats a leading dot as part of the name, so a
    dotfile such as ``.env`` has no extension. In that case the whole
    basename is used so that dotfile aliases (e.g. ``.env``) can match.
    """
    base = os.path.basename(filename)
    ext = os.path.splitext(base)[1]
    if not ext and base.startswith("."):
        ext = base
    return ext.lower()


def resolve_mime_type(
    filename: Optional[str], provided_mime_type: Optional[str] = None
) -> str:
    """
    Resolves a MIME type from a filename when the provided type is missing or
    ``application/octet-stream`` (the browser default for unrecognised extensions).

    Resolution order:
    1. Normalize *provided_mime_type* (lowercase, strip parameters like
       ``; charset=binary``).
    2. If the normalized type is present and not ``application/octet-stream``,
       return it.
    3. Check the file extension against the canonical extension map. A bare
       dotfile name such as ``.env`` is looked up by its full name.
    4. Return ``application/octet-stream`` if nothing matched.

    Args:
        filename: The original filename (used for extension lookup).
        provided_mime_type: The MIME type reported by the client / browser.

    Returns:
        The best-effort MIME type string.
    """
    normalized = _normalize_mime_type(provided_mime_type)

    if normalized and normalized != _OCTET_STREAM:
        return normalized

    if filename:
        mapped = _EXTENSION_TO_MIME.get(_lookup_extension(filename))
        if mapped:
            return mapped

    # At this point normalized is either empty or octet-stream.
    return normalized or _OCTET_STREAM