# file_content.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import base64
import mimetypes
import os
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
from urllib.parse import unquote, urlparse

import filetype

from haystack import logging
from haystack.utils.dataclasses import _warn_on_inplace_mutation

logger = logging.getLogger(__name__)


@_warn_on_inplace_mutation
@dataclass
class FileContent:
    """
    The file content of a chat message.

    :param base64_data: A base64 string representing the file.
    :param mime_type: The MIME type of the file (e.g. "application/pdf").
        Providing this value is recommended, as most LLM providers require it.
        If not provided, the MIME type is guessed from the base64 string, which can be slow and not always reliable.
    :param filename: Optional filename of the file. Some LLM providers use this information.
    :param extra: Dictionary of extra information about the file. Can be used to store provider-specific information.
        To avoid serialization issues, values should be JSON serializable.
    :param validation: If True (default), a validation process is performed:
        - Check whether the base64 string is valid;
        - Guess the MIME type if not provided.
        Set to False to skip validation and speed up initialization.
    """

    base64_data: str
    mime_type: str | None = None
    filename: str | None = None
    extra: dict[str, Any] = field(default_factory=dict)
    validation: bool = True

    def __post_init__(self) -> None:
        """
        Validate the base64 payload and, if needed, guess the MIME type.

        Does nothing when `validation` is False.

        :raises ValueError:
            If `base64_data` cannot be decoded as strict base64.
        """
        if not self.validation:
            return

        try:
            decoded_data = base64.b64decode(self.base64_data, validate=True)
        except (ValueError, TypeError) as e:
            # binascii.Error (malformed base64) is a ValueError subclass; TypeError covers
            # inputs that are neither str nor bytes-like. Unrelated failures are left to propagate.
            raise ValueError("The base64 string is not valid") from e

        # mime_type is an important piece of information, so we try to guess it if not provided
        if not self.mime_type:
            guess = filetype.guess(decoded_data)
            if guess:
                self.mime_type = guess.mime
            else:
                msg = (
                    "Failed to guess the MIME type of the file. Omitting the MIME type may result in "
                    "processing errors or incorrect handling of the file by LLM providers."
                )
                logger.warning(msg)

    def __repr__(self) -> str:
        """
        Return a string representation of the FileContent, truncating the base64_data to 100 characters.

        The `validation` flag is not included in the representation.
        """
        if len(self.base64_data) > 100:
            truncated_data = self.base64_data[:100] + "..."
        else:
            truncated_data = self.base64_data

        fields_str = ", ".join(
            (
                f"base64_data={truncated_data!r}",
                f"mime_type={self.mime_type!r}",
                f"filename={self.filename!r}",
                f"extra={self.extra!r}",
            )
        )
        return f"{self.__class__.__name__}({fields_str})"

    def to_dict(self) -> dict[str, Any]:
        """
        Convert FileContent into a dictionary.

        :returns:
            A dictionary with one entry per dataclass field, as produced by `dataclasses.asdict`.
        """
        return asdict(self)

    def _to_trace_dict(self) -> dict[str, Any]:
        """
        Convert the FileContent to a dictionary representation for tracing.

        The base64_data is replaced with a placeholder string to avoid sending large payloads to the tracing backend.

        :returns:
            Serialized version of the object only for tracing purposes.
        """
        data = self.to_dict()
        data["base64_data"] = f"Base64 string ({len(self.base64_data)} characters)"
        return data

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FileContent":
        """
        Create a FileContent from a dictionary.

        :param data:
            Dictionary whose keys are passed as keyword arguments to the constructor
            (for example, the output of `to_dict`).

        :returns:
            A FileContent object (an instance of the class this method is called on).
        """
        # Use `cls` rather than the hard-coded class name so that subclasses round-trip to themselves,
        # consistently with `from_file_path` and `from_url`.
        return cls(**data)

    @classmethod
    def from_file_path(
        cls, file_path: str | Path, *, filename: str | None = None, extra: dict[str, Any] | None = None
    ) -> "FileContent":
        """
        Create a FileContent object from a file path.

        The MIME type is guessed from the file name via `mimetypes`; validation is skipped because the
        base64 string is produced here from the raw bytes.

        :param file_path:
            The path to the file.
        :param filename:
            Optional file name. Some LLM providers use this information. If not provided, the filename is extracted
            from the file path.
        :param extra:
            Dictionary of extra information about the file. Can be used to store provider-specific information.
            To avoid serialization issues, values should be JSON serializable.

        :returns:
            A FileContent object.
        """
        path = Path(file_path)

        mime_type = mimetypes.guess_type(path.as_posix())[0]
        data = path.read_bytes()

        return cls(
            base64_data=base64.b64encode(data).decode("utf-8"),
            mime_type=mime_type,
            filename=filename or path.name,
            extra=extra or {},
            validation=False,
        )

    @classmethod
    def from_url(
        cls,
        url: str,
        *,
        retry_attempts: int = 2,
        timeout: int = 10,
        filename: str | None = None,
        extra: dict[str, Any] | None = None,
    ) -> "FileContent":
        """
        Create a FileContent object from a URL. The file is downloaded and converted to a base64 string.

        :param url:
            The URL of the file.
        :param retry_attempts:
            The number of times to retry to fetch the URL's content.
        :param timeout:
            Timeout in seconds for the request.
        :param filename:
            Optional filename of the file. Some LLM providers use this information. If not provided, the filename is
            extracted from the URL. If the URL path has no final segment, the filename is left as None.
        :param extra:
            Dictionary of extra information about the file. Can be used to store provider-specific information.
            To avoid serialization issues, values should be JSON serializable.

        :returns:
            A FileContent object.
        """
        # NOTE(review): imported at call time rather than at module level; presumably to avoid a
        # circular import - confirm before moving it.
        from haystack.components.fetchers.link_content import LinkContentFetcher

        fetcher = LinkContentFetcher(raise_on_failure=True, retry_attempts=retry_attempts, timeout=timeout)
        bytestream = fetcher.run(urls=[url])["streams"][0]

        mime_type = bytestream.mime_type
        data = bytestream.data

        if not filename:
            # URLs such as "https://host/" have no final path segment; store None instead of "".
            filename = os.path.basename(unquote(urlparse(url).path)) or None

        return cls(
            base64_data=base64.b64encode(data).decode("utf-8"),
            mime_type=mime_type,
            filename=filename,
            extra=extra or {},
            validation=False,
        )