# document.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import hashlib
from dataclasses import asdict, dataclass, field, fields
from typing import Any

from numpy import ndarray

from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.sparse_embedding import SparseEmbedding
from haystack.utils.dataclasses import _warn_on_inplace_mutation

# Fields that existed on the 1.x Document and are no longer stored on the 2.x
# Document (``content_type`` survives as a read-only property below).
LEGACY_FIELDS = ["content_type", "id_hash_keys", "dataframe"]


class _BackwardCompatible(type):
    """
    Metaclass that handles Document backward compatibility.
    """

    def __call__(cls, *args: Any, **kwargs: Any) -> Any:
        """
        Called before Document.__init__, handles legacy fields.

        Embeddings were stored as NumPy arrays in 1.x, so we convert them to lists of floats.
        Other legacy fields are removed.

        :raises ValueError: If ``content`` is set to a non-string value.
        """
        ### Conversion from 1.x Document ###
        content = kwargs.get("content")
        # Validate eagerly so a bad value fails here rather than later in
        # __repr__ or _create_id. NOTE: the previous guard (`if content and ...`)
        # silently accepted falsy non-strings such as 0, [] or {}; checking
        # `is not None` closes that gap while still allowing content=None.
        if content is not None and not isinstance(content, str):
            raise ValueError("The `content` field must be a string or None.")

        # Embeddings were stored as NumPy arrays in 1.x, so we convert them to the new type
        if isinstance(embedding := kwargs.get("embedding"), ndarray):
            kwargs["embedding"] = embedding.tolist()

        # Drop legacy fields that no longer exist on the 2.x Document
        for legacy_name in LEGACY_FIELDS:
            kwargs.pop(legacy_name, None)

        return super().__call__(*args, **kwargs)


@_warn_on_inplace_mutation
@dataclass
class Document(metaclass=_BackwardCompatible):  # noqa: PLW1641
    """
    Base data class containing some data to be queried.

    Can contain text snippets and file paths to images or audios. Documents can be sorted by score and saved
    to/from dictionary and JSON.

    :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
    :param content: Text of the document, if the document contains text.
    :param blob: Binary data associated with the document, if the document has any binary data associated with it.
    :param meta: Additional custom metadata for the document. Must be JSON-serializable.
    :param score: Score of the document. Used for ranking, usually assigned by retrievers.
    :param embedding: dense vector representation of the document.
    :param sparse_embedding: sparse vector representation of the document.
    """

    id: str = field(default="")
    content: str | None = field(default=None)
    blob: ByteStream | None = field(default=None)
    meta: dict[str, Any] = field(default_factory=dict)
    score: float | None = field(default=None)
    embedding: list[float] | None = field(default=None)
    sparse_embedding: SparseEmbedding | None = field(default=None)

    def __repr__(self) -> str:
        # Show only the fields that are actually set, truncating long content.
        # The local name is `parts` (not `fields`) so it doesn't shadow the
        # `dataclasses.fields` function imported at module level.
        parts = []
        if self.content is not None:
            parts.append(
                f"content: '{self.content}'" if len(self.content) < 100 else f"content: '{self.content[:100]}...'"
            )
        if self.blob is not None:
            parts.append(f"blob: {len(self.blob.data)} bytes")
        if len(self.meta) > 0:
            parts.append(f"meta: {self.meta}")
        if self.score is not None:
            parts.append(f"score: {self.score}")
        if self.embedding is not None:
            parts.append(f"embedding: vector of size {len(self.embedding)}")
        if self.sparse_embedding is not None:
            parts.append(f"sparse_embedding: vector with {len(self.sparse_embedding.indices)} non-zero elements")
        fields_str = ", ".join(parts)
        return f"{self.__class__.__name__}(id={self.id}, {fields_str})"

    def __eq__(self, other: object) -> bool:
        """
        Compares Documents for equality.

        Two Documents are considered equals if their dictionary representation is identical.
        """
        # Exact type match (deliberately not isinstance) so that subclasses
        # never compare equal to plain Documents; keeps equality symmetric.
        if type(self) is not type(other):
            return False
        return self.to_dict() == other.to_dict()

    def __post_init__(self) -> None:
        """
        Generate the ID based on the init parameters.
        """
        # Generate an id only if not explicitly set
        self.id = self.id or self._create_id()

    def _create_id(self) -> str:
        """
        Creates a hash of the given content that acts as the document's ID.
        """
        # WARNING: the exact concatenation below is part of the ID contract.
        # Changing any piece — including the `None` placeholder for the removed
        # `dataframe` field or the `""` fallback for sparse_embedding — would
        # change the IDs of existing documents.
        text = self.content or None
        dataframe = None  # keeps the ID unchanged even though the dataframe field was removed
        blob = self.blob.data if self.blob is not None else None
        mime_type = self.blob.mime_type if self.blob is not None else None
        meta = self.meta or {}
        embedding = self.embedding  # may be None; interpolates as "None" exactly as before
        sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
        data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}"
        return hashlib.sha256(data.encode("utf-8")).hexdigest()

    def to_dict(self, flatten: bool = True) -> dict[str, Any]:
        """
        Converts Document into a dictionary.

        `blob` field is converted to a JSON-serializable type.

        :param flatten:
            Whether to flatten `meta` field or not. Defaults to `True` to be backward-compatible with Haystack 1.x.
        :returns: A dictionary representation of the Document.
        """
        data = asdict(self)

        # Use `ByteStream` and `SparseEmbedding`'s to_dict methods to convert them to JSON-serializable types.
        if self.blob is not None:
            data["blob"] = self.blob.to_dict()
        if self.sparse_embedding is not None:
            data["sparse_embedding"] = self.sparse_embedding.to_dict()

        if flatten:
            # Meta keys are merged first so that real Document fields win on
            # a (discouraged) name collision.
            meta = data.pop("meta")
            return {**meta, **data}

        return data

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Document":
        """
        Creates a new Document object from a dictionary.

        The `blob` field is converted to its original type.

        :param data: A dictionary with Document fields, possibly with flattened metadata keys.
        :returns: A new Document instance.
        :raises ValueError: If both a `meta` key and flattened metadata keys are passed.
        """
        if blob := data.get("blob"):
            data["blob"] = ByteStream.from_dict(blob)
        if sparse_embedding := data.get("sparse_embedding"):
            data["sparse_embedding"] = SparseEmbedding.from_dict(sparse_embedding)

        # Store metadata for a moment while we try un-flattening allegedly flatten metadata.
        # We don't expect both a `meta=` keyword and flatten metadata keys so we'll raise a
        # ValueError later if this is the case.
        meta = data.pop("meta", {})
        # Unflatten metadata if it was flattened. We assume any keyword argument that's not
        # a document field is a metadata key. We treat legacy fields as document fields
        # for backward compatibility.
        flatten_meta = {}
        document_fields = LEGACY_FIELDS + [f.name for f in fields(cls)]
        for key in list(data.keys()):
            if key not in document_fields:
                flatten_meta[key] = data.pop(key)

        # We don't support passing both flatten keys and the `meta` keyword parameter
        if meta and flatten_meta:
            raise ValueError(
                "You can pass either the 'meta' parameter or flattened metadata keys as keyword arguments, "
                "but currently you're passing both. Pass either the 'meta' parameter or flattened metadata keys."
            )

        # Finally put back all the metadata
        return cls(**data, meta={**meta, **flatten_meta})

    @property
    def content_type(self) -> str:
        """
        Returns the type of the content for the document.

        This is necessary to keep backward compatibility with 1.x.

        :raises ValueError: If `content` is not set.
        """
        if self.content is not None:
            return "text"
        raise ValueError("Content is not set.")