byte_stream.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from haystack.utils.dataclasses import _warn_on_inplace_mutation
from haystack.utils.misc import _guess_mime_type


# NOTE(review): `_warn_on_inplace_mutation` is a project helper not visible here; judging by its name it
# presumably warns when an instance is mutated in place -- confirm in `haystack.utils.dataclasses`.
@_warn_on_inplace_mutation
@dataclass(repr=False)
class ByteStream:
    """
    Base data class representing a binary object in the Haystack API.

    The generated dataclass `__repr__` is disabled (`repr=False`) in favour of the custom `__repr__` below,
    which truncates long payloads.

    :param data: The binary data stored in Bytestream.
    :param meta: Additional metadata to be stored with the ByteStream.
    :param mime_type: The mime type of the binary data.
    """

    data: bytes
    meta: dict[str, Any] = field(default_factory=dict, hash=False)
    mime_type: str | None = field(default=None)

    def to_file(self, destination_path: Path) -> None:
        """
        Write the ByteStream to a file. Note: the metadata will be lost.

        Only `data` is written; `meta` and `mime_type` are not persisted.

        :param destination_path: The path to write the ByteStream to.
        """
        with open(destination_path, "wb") as fd:
            fd.write(self.data)

    @classmethod
    def from_file_path(
        cls,
        filepath: Path,
        mime_type: str | None = None,
        meta: dict[str, Any] | None = None,
        guess_mime_type: bool = False,
    ) -> "ByteStream":
        """
        Create a ByteStream from the contents read from a file.

        :param filepath: A valid path to a file.
        :param mime_type: The mime type of the file.
        :param meta: Additional metadata to be stored with the ByteStream.
        :param guess_mime_type: Whether to guess the mime type from the file. Only used when no `mime_type`
            is given.
        :returns: A new instance holding the file's bytes.
        """
        # An explicitly provided mime type always wins over guessing.
        if not mime_type and guess_mime_type:
            mime_type = _guess_mime_type(filepath)
        with open(filepath, "rb") as fd:
            return cls(data=fd.read(), mime_type=mime_type, meta=meta or {})

    @classmethod
    def from_string(
        cls, text: str, encoding: str = "utf-8", mime_type: str | None = None, meta: dict[str, Any] | None = None
    ) -> "ByteStream":
        """
        Create a ByteStream encoding a string.

        :param text: The string to encode
        :param encoding: The encoding used to convert the string into bytes
        :param mime_type: The mime type of the file.
        :param meta: Additional metadata to be stored with the ByteStream.
        :returns: A new instance holding the encoded bytes of `text`.
        """
        return cls(data=text.encode(encoding), mime_type=mime_type, meta=meta or {})

    def to_string(self, encoding: str = "utf-8") -> str:
        """
        Convert the ByteStream to a string, metadata will not be included.

        :param encoding: The encoding used to convert the bytes to a string. Defaults to "utf-8".
        :returns: The string representation of the ByteStream.
        :raises UnicodeDecodeError: If the ByteStream data cannot be decoded with the specified encoding.
        """
        return self.data.decode(encoding)

    def __repr__(self) -> str:
        """
        Return a string representation of the ByteStream, truncating the data to 100 bytes.

        Payloads longer than 100 bytes are cut to their first 100 bytes followed by `b"..."`.
        """
        fields = []
        # Parenthesized for readability only: the conditional expression already binds looser than `+`.
        truncated_data = (self.data[:100] + b"...") if len(self.data) > 100 else self.data
        fields.append(f"data={truncated_data!r}")
        fields.append(f"meta={self.meta!r}")
        fields.append(f"mime_type={self.mime_type!r}")
        fields_str = ", ".join(fields)
        return f"{self.__class__.__name__}({fields_str})"

    def to_dict(self) -> dict[str, Any]:
        """
        Convert the ByteStream to a dictionary representation.

        The returned `meta` is the instance's own dictionary, not a copy.

        :returns: A dictionary with keys 'data', 'meta', and 'mime_type'.
        """
        # Note: The data is converted to a list of integers for serialization since JSON does not support bytes
        # directly.
        return {"data": list(self.data), "meta": self.meta, "mime_type": self.mime_type}

    def _to_trace_dict(self) -> dict[str, Any]:
        """
        Convert the ByteStream to a dictionary representation for tracing.

        Binary data is replaced with a placeholder string to avoid sending large payloads to the tracing backend.

        :returns:
            Serialized version of the object only for tracing purposes.
        """
        return {"data": f"Binary data ({len(self.data)} bytes)", "meta": self.meta, "mime_type": self.mime_type}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ByteStream":
        """
        Create a ByteStream from a dictionary representation.

        :param data: A dictionary with keys 'data', 'meta', and 'mime_type'. 'data' is required; 'meta' and
            'mime_type' are optional.

        :returns: A ByteStream instance (an instance of the class the method is called on).
        :raises KeyError: If the 'data' key is missing.
        """
        # Build via `cls` (like the other alternate constructors) so subclasses round-trip to their own type,
        # and normalize a missing or `None` 'meta' to an empty dict, matching `from_file_path`/`from_string`.
        return cls(data=bytes(data["data"]), meta=data.get("meta") or {}, mime_type=data.get("mime_type"))