/ haystack / dataclasses / document.py
document.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import hashlib
  6  from dataclasses import asdict, dataclass, field, fields
  7  from typing import Any
  8  
  9  from numpy import ndarray
 10  
 11  from haystack.dataclasses.byte_stream import ByteStream
 12  from haystack.dataclasses.sparse_embedding import SparseEmbedding
 13  from haystack.utils.dataclasses import _warn_on_inplace_mutation
 14  
# Keyword names that are no longer `Document` fields. They are dropped from the keyword arguments
# before `Document.__init__` runs, and `Document.from_dict` treats them as document fields (not as
# flattened metadata keys) for backward compatibility.
LEGACY_FIELDS = ["content_type", "id_hash_keys", "dataframe"]
 16  
 17  
 18  class _BackwardCompatible(type):
 19      """
 20      Metaclass that handles Document backward compatibility.
 21      """
 22  
 23      def __call__(cls, *args: Any, **kwargs: Any) -> Any:
 24          """
 25          Called before Document.__init__, handles legacy fields.
 26  
 27          Embedding was stored as NumPy arrays in 1.x, so we convert it to a list of floats.
 28          Other legacy fields are removed.
 29          """
 30          ### Conversion from 1.x Document ###
 31          content = kwargs.get("content")
 32          if content and not isinstance(content, str):
 33              raise ValueError("The `content` field must be a string or None.")
 34  
 35          # Embedding were stored as NumPy arrays in 1.x, so we convert it to the new type
 36          if isinstance(embedding := kwargs.get("embedding"), ndarray):
 37              kwargs["embedding"] = embedding.tolist()
 38  
 39          # Remove legacy fields
 40          for field_name in LEGACY_FIELDS:
 41              kwargs.pop(field_name, None)
 42  
 43          return super().__call__(*args, **kwargs)
 44  
 45  
 46  @_warn_on_inplace_mutation
 47  @dataclass
 48  class Document(metaclass=_BackwardCompatible):  # noqa: PLW1641
 49      """
 50      Base data class containing some data to be queried.
 51  
 52      Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
 53      to/from dictionary and JSON.
 54  
 55      :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
 56      :param content: Text of the document, if the document contains text.
 57      :param blob: Binary data associated with the document, if the document has any binary data associated with it.
 58      :param meta: Additional custom metadata for the document. Must be JSON-serializable.
 59      :param score: Score of the document. Used for ranking, usually assigned by retrievers.
 60      :param embedding: dense vector representation of the document.
 61      :param sparse_embedding: sparse vector representation of the document.
 62      """
 63  
 64      id: str = field(default="")
 65      content: str | None = field(default=None)
 66      blob: ByteStream | None = field(default=None)
 67      meta: dict[str, Any] = field(default_factory=dict)
 68      score: float | None = field(default=None)
 69      embedding: list[float] | None = field(default=None)
 70      sparse_embedding: SparseEmbedding | None = field(default=None)
 71  
 72      def __repr__(self) -> str:
 73          fields = []
 74          if self.content is not None:
 75              fields.append(
 76                  f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
 77              )
 78          if self.blob is not None:
 79              fields.append(f"blob: {len(self.blob.data)} bytes")
 80          if len(self.meta) > 0:
 81              fields.append(f"meta: {self.meta}")
 82          if self.score is not None:
 83              fields.append(f"score: {self.score}")
 84          if self.embedding is not None:
 85              fields.append(f"embedding: vector of size {len(self.embedding)}")
 86          if self.sparse_embedding is not None:
 87              fields.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
 88          fields_str = ", ".join(fields)
 89          return f"{self.__class__.__name__}(id={self.id}, {fields_str})"
 90  
 91      def __eq__(self, other: object) -> bool:
 92          """
 93          Compares Documents for equality.
 94  
 95          Two Documents are considered equals if their dictionary representation is identical.
 96          """
 97          if type(self) != type(other):
 98              return False
 99          return self.to_dict() == other.to_dict()
100  
101      def __post_init__(self) -> None:
102          """
103          Generate the ID based on the init parameters.
104          """
105          # Generate an id only if not explicitly set
106          self.id = self.id or self._create_id()
107  
108      def _create_id(self) -> str:
109          """
110          Creates a hash of the given content that acts as the document's ID.
111          """
112          text = self.content or None
113          dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
114          blob = self.blob.data if self.blob is not None else None
115          mime_type = self.blob.mime_type if self.blob is not None else None
116          meta = self.meta or {}
117          embedding = self.embedding if self.embedding is not None else None
118          sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
119          data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}"
120          return hashlib.sha256(data.encode("utf-8")).hexdigest()
121  
122      def to_dict(self, flatten: bool = True) -> dict[str, Any]:
123          """
124          Converts Document into a dictionary.
125  
126          `blob` field is converted to a JSON-serializable type.
127  
128          :param flatten:
129              Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
130          """
131          data = asdict(self)
132  
133          # Use `ByteStream` and `SparseEmbedding`'s to_dict methods to convert them to JSON-serializable types.
134          if self.blob is not None:
135              data["blob"] = self.blob.to_dict()
136          if self.sparse_embedding is not None:
137              data["sparse_embedding"] = self.sparse_embedding.to_dict()
138  
139          if flatten:
140              meta = data.pop("meta")
141              return {**meta, **data}
142  
143          return data
144  
145      @classmethod
146      def from_dict(cls, data: dict[str, Any]) -> "Document":
147          """
148          Creates a new Document object from a dictionary.
149  
150          The `blob` field is converted to its original type.
151          """
152          if blob := data.get("blob"):
153              data["blob"] = ByteStream.from_dict(blob)
154          if sparse_embedding := data.get("sparse_embedding"):
155              data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)
156  
157          # Store metadata for a moment while we try un-flattening allegedly flatten metadata.
158          # We don't expect both a `meta=` keyword and flatten metadata keys so we'll raise a
159          # ValueError later if this is the case.
160          meta = data.pop("meta", {})
161          # Unflatten metadata if it was flattened. We assume any keyword argument that's not
162          # a document field is a metadata key. We treat legacy fields as document fields
163          # for backward compatibility.
164          flatten_meta = {}
165          document_fields = LEGACY_FIELDS + [f.name for f in fields(cls)]
166          for key in list(data.keys()):
167              if key not in document_fields:
168                  flatten_meta[key] = data.pop(key)
169  
170          # We don't support passing both flatten keys and the `meta` keyword parameter
171          if meta and flatten_meta:
172              raise ValueError(
173                  "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
174                  "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
175              )
176  
177          # Finally put back all the metadata
178          return cls(**data, meta={**meta, **flatten_meta})
179  
180      @property
181      def content_type(self) -> str:
182          """
183          Returns the type of the content for the document.
184  
185          This is necessary to keep backward compatibility with 1.x.
186          """
187          if self.content is not None:
188              return "text"
189          raise ValueError("Content is not set.")