/ haystack / dataclasses / file_content.py
file_content.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import base64
  6  import mimetypes
  7  import os
  8  from dataclasses import asdict, dataclass, field
  9  from pathlib import Path
 10  from typing import Any
 11  from urllib.parse import unquote, urlparse
 12  
 13  import filetype
 14  
 15  from haystack import logging
 16  from haystack.utils.dataclasses import _warn_on_inplace_mutation
 17  
# Module-level logger; FileContent uses it to warn when the MIME type cannot be guessed.
logger = logging.getLogger(__name__)
 19  
 20  
 21  @_warn_on_inplace_mutation
 22  @dataclass
 23  class FileContent:
 24      """
 25      The file content of a chat message.
 26  
 27      :param base64_data: A base64 string representing the file.
 28      :param mime_type: The MIME type of the file (e.g. "application/pdf").
 29          Providing this value is recommended, as most LLM providers require it.
 30          If not provided, the MIME type is guessed from the base64 string, which can be slow and not always reliable.
 31      :param filename: Optional filename of the file. Some LLM providers use this information.
 32      :param extra: Dictionary of extra information about the file. Can be used to store provider-specific information.
 33          To avoid serialization issues, values should be JSON serializable.
 34      :param validation: If True (default), a validation process is performed:
 35          - Check whether the base64 string is valid;
 36          - Guess the MIME type if not provided.
 37          Set to False to skip validation and speed up initialization.
 38      """
 39  
 40      base64_data: str
 41      mime_type: str | None = None
 42      filename: str | None = None
 43      extra: dict[str, Any] = field(default_factory=dict)
 44      validation: bool = True
 45  
 46      def __post_init__(self) -> None:
 47          if not self.validation:
 48              return
 49  
 50          try:
 51              decoded_data = base64.b64decode(self.base64_data, validate=True)
 52          except Exception as e:
 53              raise ValueError("The base64 string is not valid") from e
 54  
 55          # mime_type is an important information, so we try to guess it if not provided
 56          if not self.mime_type:
 57              guess = filetype.guess(decoded_data)
 58              if guess:
 59                  self.mime_type = guess.mime
 60              else:
 61                  msg = (
 62                      "Failed to guess the MIME type of the file. Omitting the MIME type may result in "
 63                      "processing errors or incorrect handling of the file by LLM providers."
 64                  )
 65                  logger.warning(msg)
 66  
 67      def __repr__(self) -> str:
 68          """
 69          Return a string representation of the FileContent, truncating the base64_data to 100 bytes.
 70          """
 71          fields = []
 72  
 73          truncated_data = self.base64_data[:100] + "..." if len(self.base64_data) > 100 else self.base64_data
 74          fields.append(f"base64_data={truncated_data!r}")
 75          fields.append(f"mime_type={self.mime_type!r}")
 76          fields.append(f"filename={self.filename!r}")
 77          fields.append(f"extra={self.extra!r}")
 78          fields_str = ", ".join(fields)
 79          return f"{self.__class__.__name__}({fields_str})"
 80  
 81      def to_dict(self) -> dict[str, Any]:
 82          """
 83          Convert FileContent into a dictionary.
 84          """
 85          return asdict(self)
 86  
 87      def _to_trace_dict(self) -> dict[str, Any]:
 88          """
 89          Convert the FileContent to a dictionary representation for tracing.
 90  
 91          The base64_data is replaced with a placeholder string to avoid sending large payloads to the tracing backend.
 92  
 93          :returns:
 94              Serialized version of the object only for tracing purposes.
 95          """
 96          data = self.to_dict()
 97          data["base64_data"] = f"Base64 string ({len(self.base64_data)} characters)"
 98          return data
 99  
100      @classmethod
101      def from_dict(cls, data: dict[str, Any]) -> "FileContent":
102          """
103          Create an FileContent from a dictionary.
104          """
105          return FileContent(**data)
106  
107      @classmethod
108      def from_file_path(
109          cls, file_path: str | Path, *, filename: str | None = None, extra: dict[str, Any] | None = None
110      ) -> "FileContent":
111          """
112          Create an FileContent object from a file path.
113  
114          :param file_path:
115              The path to the file.
116          :param filename:
117              Optional file name. Some LLM providers use this information. If not provided, the filename is extracted
118              from the file path.
119          :param extra:
120              Dictionary of extra information about the file. Can be used to store provider-specific information.
121              To avoid serialization issues, values should be JSON serializable.
122  
123          :returns:
124              An FileContent object.
125          """
126          if isinstance(file_path, str):
127              file_path = Path(file_path)
128  
129          mime_type = mimetypes.guess_type(file_path.as_posix())[0]
130          filename = filename or file_path.name
131  
132          with open(file_path, "rb") as f:
133              data = f.read()
134  
135          return cls(
136              base64_data=base64.b64encode(data).decode("utf-8"),
137              mime_type=mime_type,
138              filename=filename,
139              extra=extra or {},
140              validation=False,
141          )
142  
143      @classmethod
144      def from_url(
145          cls,
146          url: str,
147          *,
148          retry_attempts: int = 2,
149          timeout: int = 10,
150          filename: str | None = None,
151          extra: dict[str, Any] | None = None,
152      ) -> "FileContent":
153          """
154          Create an FileContent object from a URL. The file is downloaded and converted to a base64 string.
155  
156          :param url:
157              The URL of the file.
158          :param retry_attempts:
159              The number of times to retry to fetch the URL's content.
160          :param timeout:
161              Timeout in seconds for the request.
162          :param filename:
163              Optional filename of the file. Some LLM providers use this information. If not provided, the filename is
164              extracted from the URL.
165          :param extra:
166              Dictionary of extra information about the file. Can be used to store provider-specific information.
167              To avoid serialization issues, values should be JSON serializable.
168  
169          :returns:
170              An FileContent object.
171          """
172          from haystack.components.fetchers.link_content import LinkContentFetcher
173  
174          fetcher = LinkContentFetcher(raise_on_failure=True, retry_attempts=retry_attempts, timeout=timeout)
175          bytestream = fetcher.run(urls=[url])["streams"][0]
176  
177          mime_type = bytestream.mime_type
178          data = bytestream.data
179  
180          if not filename:
181              filename = os.path.basename(unquote(urlparse(url).path))
182  
183          return cls(
184              base64_data=base64.b64encode(data).decode("utf-8"),
185              mime_type=mime_type,
186              filename=filename,
187              extra=extra or {},
188              validation=False,
189          )