/ src / solace_agent_mesh / common / utils / mime_helpers.py
mime_helpers.py
  1  """
  2  Utility functions for handling MIME types.
  3  """
  4  
import codecs
import os
from typing import Optional, Set
  7  
  8  _OCTET_STREAM = "application/octet-stream"
  9  
 10  TEXT_CONTAINER_MIME_TYPES: Set[str] = {
 11      "text/plain",
 12      "text/markdown",
 13      "text/html",
 14      "application/json",
 15      "application/yaml",
 16      "text/yaml",
 17      "application/x-yaml",
 18      "text/x-yaml",
 19      "application/xml",
 20      "text/xml",
 21      "text/csv",
 22  }
 23  
 24  _TEXT_BASED_PRIMARY_TYPES = {"text"}
 25  _TEXT_BASED_SUBTYPE_WHOLE = {
 26      "json",
 27      "xml",
 28      "yaml",
 29      "x-yaml",
 30      "yml",
 31      "csv",
 32      "javascript",
 33      "ecmascript",
 34      "xhtml+xml",
 35      "svg+xml",
 36      "atom+xml",
 37      "rss+xml",
 38      "sparql-query",
 39      "sparql-update",
 40      "sql",
 41      "graphql",
 42      "markdown",
 43      "html",
 44      "rtf",
 45      "sgml",
 46  }
 47  _TEXT_BASED_SUBTYPE_SUFFIXES_AFTER_PLUS = {
 48      "json",
 49      "xml",
 50      "yaml",
 51      "csv",
 52      "svg",
 53      "xhtml",
 54  }
 55  
 56  
 57  def is_text_based_mime_type(mime_type: Optional[str]) -> bool:
 58      """
 59      Checks if a given MIME type is considered text-based.
 60  
 61      Args:
 62          mime_type: The MIME type string (e.g., "text/plain", "application/json").
 63  
 64      Returns:
 65          True if the MIME type is text-based, False otherwise.
 66      """
 67      if not mime_type:
 68          return False
 69  
 70      normalized_mime_type = mime_type.lower().strip()
 71  
 72      if normalized_mime_type.startswith("text/"):
 73          return True
 74  
 75      if normalized_mime_type in TEXT_CONTAINER_MIME_TYPES:
 76          return True
 77  
 78      return False
 79  
 80  
 81  def is_text_based_file(
 82      mime_type: Optional[str], content_bytes: Optional[bytes] = None
 83  ) -> bool:
 84      """
 85      Determines if a file is text-based based on its MIME type and content.
 86      Args:
 87          mime_type: The MIME type of the file.
 88          content_bytes: The content of the file as bytes.
 89      Returns:
 90          True if the file is text-based, False otherwise.
 91      """
 92      if not mime_type:
 93          return False
 94  
 95      normalized_mime_type = mime_type.lower().strip()
 96      primary_type, _, subtype = normalized_mime_type.partition("/")
 97  
 98      if primary_type in _TEXT_BASED_PRIMARY_TYPES:
 99          return True
100      elif subtype in _TEXT_BASED_SUBTYPE_WHOLE:
101          return True
102      elif "+" in subtype:
103          specific_format = subtype.split("+")[-1]
104          if specific_format in _TEXT_BASED_SUBTYPE_SUFFIXES_AFTER_PLUS:
105              return True
106      elif (
107          normalized_mime_type == _OCTET_STREAM and content_bytes is not None
108      ):
109          try:
110              sample_size = min(1024, len(content_bytes))
111              content_bytes[:sample_size].decode("utf-8")
112              return True
113          except UnicodeDecodeError:
114              return False
115  
116      return False
117  
118  
# Canonical MIME-type ↔ extension mapping, used bi-directionally.
# Forward direction: MIME type -> preferred file extension.
_MIME_TO_EXTENSION = {
    # Default
    _OCTET_STREAM: ".bin",
    # Text / code formats
    "text/plain": ".txt",
    "text/html": ".html",
    "text/css": ".css",
    "text/javascript": ".js",
    "text/csv": ".csv",
    "text/markdown": ".md",
    "text/xml": ".xml",
    "text/yaml": ".yaml",
    "text/x-typescript": ".ts",
    "text/jsx": ".jsx",
    "text/x-toml": ".toml",
    "text/x-rust": ".rs",
    "text/x-go": ".go",
    "text/x-kotlin": ".kt",
    "text/x-swift": ".swift",
    "text/x-ruby": ".rb",
    "text/x-php": ".php",
    "text/x-c": ".c",
    "text/x-c++": ".cpp",
    "text/x-python": ".py",
    "text/x-java-source": ".java",
    # Application formats
    "application/json": ".json",
    "application/x-yaml": ".yaml",
    "application/yaml": ".yaml",
    "application/x-sh": ".sh",
    "application/pdf": ".pdf",
    "application/zip": ".zip",
    # Image formats
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/jpg": ".jpg",
    "image/gif": ".gif",
    "image/bmp": ".bmp",
    "image/webp": ".webp",
    "image/svg+xml": ".svg",
    # Audio formats
    "audio/wav": ".wav",
    "audio/mp3": ".mp3",
    "audio/mpeg": ".mp3",
    "audio/ogg": ".ogg",
    "audio/flac": ".flac",
    "audio/aac": ".aac",
    "audio/m4a": ".m4a",
    # Video formats
    "video/mp4": ".mp4",
    "video/webm": ".webm",
    "video/x-msvideo": ".avi",
    "video/quicktime": ".mov",
}

# Reverse direction: extension -> MIME type.  Several MIME types share an
# extension above; the plain inversion keeps whichever entry comes last in
# the dict, so the ambiguous ones are pinned explicitly in the update below.
_EXTENSION_TO_MIME = {ext: mime for mime, ext in _MIME_TO_EXTENSION.items()}
# Remove default _OCTET_STREAM mapping
_EXTENSION_TO_MIME.pop(".bin", None)
# Pin ambiguous extensions and add aliases for MIME types with more than one
# extension
_EXTENSION_TO_MIME.update({
    ".yaml": "text/yaml",
    ".yml": "text/yaml",
    ".jpg": "image/jpeg",
    # ".jpeg" is accepted as an image extension elsewhere in this module, so it
    # must resolve to the same MIME type as ".jpg".
    ".jpeg": "image/jpeg",
    ".tsx": "text/x-typescript",
    ".bash": "application/x-sh",
    ".env": "text/plain",
    ".ini": "text/plain",
    ".cfg": "text/plain",
    ".hpp": "text/x-c++",
    ".h": "text/x-c",
    ".mmd": "text/plain",
})
192  
193  
# Raster image extensions that vision-capable LLMs can process as inline binary.
# SVG is excluded: it is XML-based and cannot be processed as inline binary by LLMs.
_INLINE_VISION_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}


def is_image_artifact(filename: Optional[str], mime_type: Optional[str]) -> bool:
    """Determine whether an artifact should be treated as an inline vision image.

    Uses *mime_type* as the source of truth.  Falls back to file extension only
    when mime_type is missing, blank once parameters are stripped (e.g. ``"  "``
    or ``"; charset=binary"``), or ``application/octet-stream``.

    SVG (``image/svg+xml``) is explicitly excluded because it is XML-based and
    most LLMs cannot process it as inline binary vision data.

    Args:
        filename: Artifact file name; only its extension is inspected, and only
            in the fallback case.
        mime_type: Declared MIME type, compared case-insensitively with any
            ``;``-separated parameters ignored.

    Returns:
        True if the artifact should be handled as an inline image, else False.
    """
    if mime_type:
        normalized = mime_type.lower().split(";")[0].strip()
        if normalized == "image/svg+xml":
            return False
        if normalized.startswith("image/"):
            return True
        # Known non-image mime type — do not fall through to extension check.
        # An empty value carries no type information, so it is treated like a
        # missing mime_type rather than as a known non-image type.
        if normalized and normalized != _OCTET_STREAM:
            return False

    # Fallback: check file extension when mime_type is absent / octet-stream
    if filename:
        ext = os.path.splitext(filename)[1].lower()
        return ext in _INLINE_VISION_EXTENSIONS

    return False
225  
226  
def get_extension_for_mime_type(
    mime_type: Optional[str], default_extension: str = ".dat"
) -> str:
    """
    Look up the file extension registered for a MIME type.

    The MIME type is lowercased and anything from the first ";" onwards
    (parameters such as a charset) is discarded before the lookup.

    Args:
        mime_type: The MIME type string (e.g., 'image/png', 'application/json').
        default_extension: Returned when *mime_type* is None/empty or has no
            entry in the mapping.

    Returns:
        The corresponding file extension (e.g., '.png', '.json').
    """
    if mime_type:
        # Only the part before any ";" identifies the type itself.
        essence, _, _ = mime_type.lower().partition(";")
        return _MIME_TO_EXTENSION.get(essence.strip(), default_extension)

    return default_extension
245  
246  
def resolve_mime_type(
    filename: Optional[str], provided_mime_type: Optional[str] = None
) -> str:
    """
    Resolves a MIME type from a filename when the provided type is missing or
    ``application/octet-stream`` (the browser default for unrecognised extensions).

    Resolution order:
      1. Normalize *provided_mime_type* (lowercase, strip parameters like
         ``; charset=binary``).
      2. If the normalized type is present and not ``application/octet-stream``,
         return it.
      3. Check the file extension against the canonical extension map.  A
         dotfile with no further extension (e.g. ``.env``) is looked up by its
         whole name.
      4. Return ``application/octet-stream`` if nothing matched.

    Args:
        filename: The original filename (used for extension lookup).
        provided_mime_type: The MIME type reported by the client / browser.

    Returns:
        The best-effort MIME type string.
    """
    normalized = (
        provided_mime_type.lower().split(";")[0].strip()
        if provided_mime_type
        else None
    )

    if normalized and normalized != _OCTET_STREAM:
        return normalized

    if filename:
        base_name = os.path.basename(filename)
        ext = os.path.splitext(base_name)[1].lower()
        if not ext and base_name.startswith("."):
            # os.path.splitext ignores leading dots, so ".env" yields an empty
            # extension; use the whole name so dotfile entries in the
            # extension map can match.
            ext = base_name.lower()

        mapped = _EXTENSION_TO_MIME.get(ext)
        if mapped:
            return mapped

    # At this point the provided type was absent, blank, or octet-stream.
    return _OCTET_STREAM