/ haystack / dataclasses / byte_stream.py
byte_stream.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  from dataclasses import dataclass, field
  6  from pathlib import Path
  7  from typing import Any
  8  
  9  from haystack.utils.dataclasses import _warn_on_inplace_mutation
 10  from haystack.utils.misc import _guess_mime_type
 11  
 12  
 13  @_warn_on_inplace_mutation
 14  @dataclass(repr=False)
 15  class ByteStream:
 16      """
 17      Base data class representing a binary object in the Haystack API.
 18  
 19      :param data: The binary data stored in Bytestream.
 20      :param meta: Additional metadata to be stored with the ByteStream.
 21      :param mime_type: The mime type of the binary data.
 22      """
 23  
 24      data: bytes
 25      meta: dict[str, Any] = field(default_factory=dict, hash=False)
 26      mime_type: str | None = field(default=None)
 27  
 28      def to_file(self, destination_path: Path) -> None:
 29          """
 30          Write the ByteStream to a file. Note: the metadata will be lost.
 31  
 32          :param destination_path: The path to write the ByteStream to.
 33          """
 34          with open(destination_path, "wb") as fd:
 35              fd.write(self.data)
 36  
 37      @classmethod
 38      def from_file_path(
 39          cls,
 40          filepath: Path,
 41          mime_type: str | None = None,
 42          meta: dict[str, Any] | None = None,
 43          guess_mime_type: bool = False,
 44      ) -> "ByteStream":
 45          """
 46          Create a ByteStream from the contents read from a file.
 47  
 48          :param filepath: A valid path to a file.
 49          :param mime_type: The mime type of the file.
 50          :param meta: Additional metadata to be stored with the ByteStream.
 51          :param guess_mime_type: Whether to guess the mime type from the file.
 52          """
 53          if not mime_type and guess_mime_type:
 54              mime_type = _guess_mime_type(filepath)
 55          with open(filepath, "rb") as fd:
 56              return cls(data=fd.read(), mime_type=mime_type, meta=meta or {})
 57  
 58      @classmethod
 59      def from_string(
 60          cls, text: str, encoding: str = "utf-8", mime_type: str | None = None, meta: dict[str, Any] | None = None
 61      ) -> "ByteStream":
 62          """
 63          Create a ByteStream encoding a string.
 64  
 65          :param text: The string to encode
 66          :param encoding: The encoding used to convert the string into bytes
 67          :param mime_type: The mime type of the file.
 68          :param meta: Additional metadata to be stored with the ByteStream.
 69          """
 70          return cls(data=text.encode(encoding), mime_type=mime_type, meta=meta or {})
 71  
 72      def to_string(self, encoding: str = "utf-8") -> str:
 73          """
 74          Convert the ByteStream to a string, metadata will not be included.
 75  
 76          :param encoding: The encoding used to convert the bytes to a string. Defaults to "utf-8".
 77          :returns: The string representation of the ByteStream.
 78          :raises UnicodeDecodeError: If the ByteStream data cannot be decoded with the specified encoding.
 79          """
 80          return self.data.decode(encoding)
 81  
 82      def __repr__(self) -> str:
 83          """
 84          Return a string representation of the ByteStream, truncating the data to 100 bytes.
 85          """
 86          fields = []
 87          truncated_data = self.data[:100] + b"..." if len(self.data) > 100 else self.data
 88          fields.append(f"data={truncated_data!r}")
 89          fields.append(f"meta={self.meta!r}")
 90          fields.append(f"mime_type={self.mime_type!r}")
 91          fields_str = ", ".join(fields)
 92          return f"{self.__class__.__name__}({fields_str})"
 93  
 94      def to_dict(self) -> dict[str, Any]:
 95          """
 96          Convert the ByteStream to a dictionary representation.
 97  
 98          :returns: A dictionary with keys 'data', 'meta', and 'mime_type'.
 99          """
100          # Note: The data is converted to a list of integers for serialization since JSON does not support bytes
101          # directly.
102          return {"data": list(self.data), "meta": self.meta, "mime_type": self.mime_type}
103  
104      def _to_trace_dict(self) -> dict[str, Any]:
105          """
106          Convert the ByteStream to a dictionary representation for tracing.
107  
108          Binary data is replaced with a placeholder string to avoid sending large payloads to the tracing backend.
109  
110          :returns:
111              Serialized version of the object only for tracing purposes.
112          """
113          return {"data": f"Binary data ({len(self.data)} bytes)", "meta": self.meta, "mime_type": self.mime_type}
114  
115      @classmethod
116      def from_dict(cls, data: dict[str, Any]) -> "ByteStream":
117          """
118          Create a ByteStream from a dictionary representation.
119  
120          :param data: A dictionary with keys 'data', 'meta', and 'mime_type'.
121  
122          :returns: A ByteStream instance.
123          """
124          return ByteStream(data=bytes(data["data"]), meta=data.get("meta", {}), mime_type=data.get("mime_type"))