# test_document.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import warnings
from dataclasses import replace

import pytest

from haystack import Document
from haystack.dataclasses.byte_stream import ByteStream
from haystack.dataclasses.sparse_embedding import SparseEmbedding


@pytest.mark.parametrize(
    "doc,doc_str",
    [
        (Document(content="test text"), "content: 'test text'"),
        (Document(blob=ByteStream(b"hello, test string")), "blob: 18 bytes"),
        (Document(content="test text", blob=ByteStream(b"hello, test string")), "content: 'test text', blob: 18 bytes"),
    ],
)
def test_document_str(doc, doc_str):
    """Check that str(doc) renders the id followed by a summary of content and/or blob."""
    assert f"Document(id={doc.id}, {doc_str})" == str(doc)


def test_init():
    """Check the id and the default field values of a Document built with no arguments."""
    doc = Document()
    assert doc.id == "d4675c57fcfe114db0b95f1da46eea3c5d6f5729c17d01fb5251ae19830a3455"
    assert doc.content is None
    assert doc.blob is None
    assert doc.meta == {}
    assert doc.score is None
    assert doc.embedding is None
    assert doc.sparse_embedding is None


def test_init_with_wrong_parameters():
    """Check that an unknown keyword argument (here `text`) raises TypeError."""
    with pytest.raises(TypeError):
        Document(text="")


def test_init_with_parameters():
    """Check that every field passed to the constructor is stored unchanged and yields the expected id."""
    blob_data = b"some bytes"
    sparse_embedding = SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3])
    doc = Document(
        content="test text",
        blob=ByteStream(data=blob_data, mime_type="text/markdown"),
        meta={"text": "test text"},
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        sparse_embedding=sparse_embedding,
    )
    assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56"
    assert doc.content == "test text"
    assert doc.blob.data == blob_data
    assert doc.blob.mime_type == "text/markdown"
    assert doc.meta == {"text": "test text"}
    assert doc.score == 0.812
    assert doc.embedding == [0.1, 0.2, 0.3]
    assert doc.sparse_embedding == sparse_embedding


def test_init_with_legacy_fields():
    """
    Check that the legacy keyword arguments content_type, id_hash_keys and dataframe are accepted.

    After construction, id_hash_keys and dataframe must not be attributes of the Document, while
    content_type is still readable.
    """
    doc = Document(
        content="test text",
        content_type="text",
        id_hash_keys=["content"],
        dataframe="placeholder",
        score=0.812,
        embedding=[0.1, 0.2, 0.3],  # type: ignore
    )
    assert doc.id == "18fc2c114825872321cf5009827ca162f54d3be50ab9e9ffa027824b6ec223af"
    assert doc.content == "test text"
    assert doc.blob is None
    assert doc.meta == {}
    assert doc.score == 0.812
    assert doc.embedding == [0.1, 0.2, 0.3]
    assert doc.sparse_embedding is None

    assert doc.content_type == "text"  # this is a property now

    assert not hasattr(doc, "id_hash_keys")
    assert not hasattr(doc, "dataframe")


def test_init_with_legacy_field():
    """Check that legacy keyword arguments can be combined with an explicit meta dictionary."""
    doc = Document(
        content="test text",
        content_type="text",  # type: ignore
        id_hash_keys=["content"],  # type: ignore
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        meta={"date": "10-10-2023", "type": "article"},
    )
    assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43"
    assert doc.content == "test text"
    assert doc.meta == {"date": "10-10-2023", "type": "article"}
    assert doc.score == 0.812
    assert doc.embedding == [0.1, 0.2, 0.3]
    assert doc.sparse_embedding is None

    assert doc.content_type == "text"  # this is a property now
    assert not hasattr(doc, "id_hash_keys")


def test_basic_equality_type_mismatch():
    """Check that a Document does not compare equal to a plain string with the same text."""
    doc = Document(content="test text")
    assert doc != "test text"


def test_basic_equality_id():
    """Check that Documents with the same content are equal, and unequal once their ids differ."""
    doc1 = Document(content="test text")
    doc2 = Document(content="test text")

    assert doc1 == doc2

    # dataclasses.replace builds new instances instead of mutating the originals in place.
    doc1 = replace(doc1, id="1234")
    doc2 = replace(doc2, id="5678")

    assert doc1 != doc2


def test_to_dict():
    """Check the default (flattened) dictionary of an empty Document; it has no `meta` key."""
    doc = Document()
    assert doc.to_dict() == {
        "id": doc._create_id(),
        "content": None,
        "blob": None,
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
    }


def test_to_dict_without_flattening():
    """Check that to_dict(flatten=False) on an empty Document keeps an empty `meta` dictionary."""
    doc = Document()
    assert doc.to_dict(flatten=False) == {
        "id": doc._create_id(),
        "content": None,
        "blob": None,
        "meta": {},
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
    }


def test_to_dict_with_custom_parameters():
    """Check that the flattened dictionary serializes blob and sparse embedding and inlines the meta keys."""
    doc = Document(
        content="test text",
        blob=ByteStream(b"some bytes", mime_type="application/pdf", meta={"foo": "bar"}),
        meta={"some": "values", "test": 10},
        score=0.99,
        embedding=[10.0, 10.0],
        sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
    )

    assert doc.to_dict() == {
        "id": doc.id,
        "content": "test text",
        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {"foo": "bar"}},
        "some": "values",
        "test": 10,
        "score": 0.99,
        "embedding": [10.0, 10.0],
        "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
    }


def test_to_dict_with_custom_parameters_without_flattening():
    """Check that to_dict(flatten=False) keeps the metadata nested under the `meta` key."""
    doc = Document(
        content="test text",
        blob=ByteStream(b"some bytes", mime_type="application/pdf"),
        meta={"some": "values", "test": 10},
        score=0.99,
        embedding=[10.0, 10.0],
        sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
    )

    assert doc.to_dict(flatten=False) == {
        "id": doc.id,
        "content": "test text",
        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {}},
        "meta": {"some": "values", "test": 10},
        "score": 0.99,
        "embedding": [10.0, 10.0],
        "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
    }


def test_to_dict_field_precedence():
    """
    Test for Document.to_dict() with flatten=True.

    Test that Document's first-level fields take precedence over meta fields when flattening the dictionary
    representation.
    """

    doc = Document(content="from-content", score=0.9, meta={"content": "from-meta", "score": 0.5, "source": "web"})

    flat_dict = doc.to_dict(flatten=True)

    # First-level fields should take precedence
    assert flat_dict["content"] == "from-content"
    assert flat_dict["score"] == 0.9
    # Meta-only fields should be preserved
    assert flat_dict["source"] == "web"


def test_from_dict():
    """Check that an empty dictionary deserializes to a Document equal to the default one."""
    assert Document.from_dict({}) == Document()


def test_from_dict_with_parameters():
    """Check that a dictionary with nested blob, meta and sparse embedding matches the equivalent Document."""
    # NOTE(review): this test was previously named `from_from_dict_with_parameters`, which pytest's default
    # `test_*` discovery never collected. Confirm it passes against the current Document.from_dict().
    blob_data = b"some bytes"
    assert Document.from_dict(
        {
            "content": "test text",
            "blob": {"data": list(blob_data), "mime_type": "text/markdown", "meta": {"text": "test text"}},
            "meta": {"text": "test text"},
            "score": 0.812,
            "embedding": [0.1, 0.2, 0.3],
            "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
        }
    ) == Document(
        content="test text",
        blob=ByteStream(blob_data, mime_type="text/markdown", meta={"text": "test text"}),
        meta={"text": "test text"},
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
    )


def test_from_dict_with_legacy_fields():
    """Check that a dictionary with legacy keys matches a Document built with the same legacy arguments."""
    assert Document.from_dict(
        {
            "content": "test text",
            "content_type": "text",
            "id_hash_keys": ["content"],
            "score": 0.812,
            "embedding": [0.1, 0.2, 0.3],
        }
    ) == Document(
        content="test text",
        content_type="text",
        id_hash_keys=["content"],
        score=0.812,
        embedding=[0.1, 0.2, 0.3],  # type: ignore
    )


def test_from_dict_with_legacy_field_and_flat_meta():
    """Check that legacy keys and flattened metadata keys can appear together in the input dictionary."""
    assert Document.from_dict(
        {
            "content": "test text",
            "content_type": "text",
            "id_hash_keys": ["content"],
            "score": 0.812,
            "embedding": [0.1, 0.2, 0.3],
            "date": "10-10-2023",
            "type": "article",
        }
    ) == Document(
        content="test text",
        content_type="text",  # type: ignore
        id_hash_keys=["content"],  # type: ignore
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        meta={"date": "10-10-2023", "type": "article"},
    )


def test_from_dict_with_flat_meta():
    """Check that top-level keys that are not Document fields end up in `meta`."""
    blob_data = b"some bytes"
    assert Document.from_dict(
        {
            "content": "test text",
            "blob": {"data": list(blob_data), "mime_type": "text/markdown"},
            "score": 0.812,
            "embedding": [0.1, 0.2, 0.3],
            "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
            "date": "10-10-2023",
            "type": "article",
        }
    ) == Document(
        content="test text",
        blob=ByteStream(blob_data, mime_type="text/markdown"),
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
        meta={"date": "10-10-2023", "type": "article"},
    )


def test_from_dict_with_flat_and_non_flat_meta():
    """Check that passing both a `meta` key and flattened metadata keys raises ValueError."""
    with pytest.raises(ValueError, match="Pass either the 'meta' parameter or flattened metadata keys"):
        Document.from_dict(
            {
                "content": "test text",
                "blob": {"data": list(b"some bytes"), "mime_type": "text/markdown"},
                "score": 0.812,
                "meta": {"test": 10},
                "embedding": [0.1, 0.2, 0.3],
                "date": "10-10-2023",
                "type": "article",
            }
        )


def test_from_dict_with_dataframe():
    """
    Test for legacy support of Document.from_dict() with dataframe field.

    Test that Document.from_dict() can properly deserialize a Document dictionary obtained with
    document.to_dict(flatten=False) in haystack-ai<=2.10.0.
    We make sure that Document.from_dict() does not raise an error and that dataframe is skipped (legacy field).
    """

    # Document dictionary obtained with document.to_dict(flatten=False) in haystack-ai<=2.10.0
    doc_dict = {
        "id": "my_id",
        "content": "my_content",
        "dataframe": None,
        "blob": None,
        "meta": {"key": "value"},
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
    }

    doc = Document.from_dict(doc_dict)

    assert doc.id == "my_id"
    assert doc.content == "my_content"
    assert doc.meta == {"key": "value"}
    assert doc.score is None
    assert doc.embedding is None
    assert doc.sparse_embedding is None

    assert not hasattr(doc, "dataframe")


def test_content_type():
    """Check that content_type is "text" for textual content and raises ValueError on an empty Document."""
    assert Document(content="text").content_type == "text"

    with pytest.raises(ValueError):
        _ = Document().content_type


def test_no_warning_on_init():
    """Check that constructing a Document emits no warning."""
    with warnings.catch_warnings():
        # Escalate every warning to an exception so that any warning fails the test.
        warnings.simplefilter("error", Warning)
        Document(content="test")


def test_warn_on_inplace_mutation():
    """Check that assigning to a field of an existing Document warns and mentions dataclasses.replace."""
    doc = Document(content="test")
    with pytest.warns(Warning, match="dataclasses.replace"):
        doc.content = "other"