/ test / dataclasses / test_document.py
test_document.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import warnings
  6  from dataclasses import replace
  7  
  8  import pytest
  9  
 10  from haystack import Document
 11  from haystack.dataclasses.byte_stream import ByteStream
 12  from haystack.dataclasses.sparse_embedding import SparseEmbedding
 13  
 14  
 15  @pytest.mark.parametrize(
 16      "doc,doc_str",
 17      [
 18          (Document(content="test text"), "content: 'test text'"),
 19          (Document(blob=ByteStream(b"hello, test string")), "blob: 18 bytes"),
 20          (Document(content="test text", blob=ByteStream(b"hello, test string")), "content: 'test text', blob: 18 bytes"),
 21      ],
 22  )
 23  def test_document_str(doc, doc_str):
 24      assert f"Document(id={doc.id}, {doc_str})" == str(doc)
 25  
 26  
 27  def test_init():
 28      doc = Document()
 29      assert doc.id == "d4675c57fcfe114db0b95f1da46eea3c5d6f5729c17d01fb5251ae19830a3455"
 30      assert doc.content == None
 31      assert doc.blob == None
 32      assert doc.meta == {}
 33      assert doc.score == None
 34      assert doc.embedding == None
 35      assert doc.sparse_embedding == None
 36  
 37  
 38  def test_init_with_wrong_parameters():
 39      with pytest.raises(TypeError):
 40          Document(text="")
 41  
 42  
 43  def test_init_with_parameters():
 44      blob_data = b"some bytes"
 45      sparse_embedding = SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3])
 46      doc = Document(
 47          content="test text",
 48          blob=ByteStream(data=blob_data, mime_type="text/markdown"),
 49          meta={"text": "test text"},
 50          score=0.812,
 51          embedding=[0.1, 0.2, 0.3],
 52          sparse_embedding=sparse_embedding,
 53      )
 54      assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56"
 55      assert doc.content == "test text"
 56      assert doc.blob.data == blob_data
 57      assert doc.blob.mime_type == "text/markdown"
 58      assert doc.meta == {"text": "test text"}
 59      assert doc.score == 0.812
 60      assert doc.embedding == [0.1, 0.2, 0.3]
 61      assert doc.sparse_embedding == sparse_embedding
 62  
 63  
 64  def test_init_with_legacy_fields():
 65      doc = Document(
 66          content="test text",
 67          content_type="text",
 68          id_hash_keys=["content"],
 69          dataframe="placeholder",
 70          score=0.812,
 71          embedding=[0.1, 0.2, 0.3],  # type: ignore
 72      )
 73      assert doc.id == "18fc2c114825872321cf5009827ca162f54d3be50ab9e9ffa027824b6ec223af"
 74      assert doc.content == "test text"
 75      assert doc.blob == None
 76      assert doc.meta == {}
 77      assert doc.score == 0.812
 78      assert doc.embedding == [0.1, 0.2, 0.3]
 79      assert doc.sparse_embedding == None
 80  
 81      assert doc.content_type == "text"  # this is a property now
 82  
 83      assert not hasattr(doc, "id_hash_keys")
 84      assert not hasattr(doc, "dataframe")
 85  
 86  
 87  def test_init_with_legacy_field():
 88      doc = Document(
 89          content="test text",
 90          content_type="text",  # type: ignore
 91          id_hash_keys=["content"],  # type: ignore
 92          score=0.812,
 93          embedding=[0.1, 0.2, 0.3],
 94          meta={"date": "10-10-2023", "type": "article"},
 95      )
 96      assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43"
 97      assert doc.content == "test text"
 98      assert doc.meta == {"date": "10-10-2023", "type": "article"}
 99      assert doc.score == 0.812
100      assert doc.embedding == [0.1, 0.2, 0.3]
101      assert doc.sparse_embedding == None
102  
103      assert doc.content_type == "text"  # this is a property now
104      assert not hasattr(doc, "id_hash_keys")
105  
106  
def test_basic_equality_type_mismatch():
    """Check that a Document does not compare equal to a plain string holding the same text."""
    text = "test text"
    document = Document(content=text)
    assert document != text
110  
111  
def test_basic_equality_id():
    """
    Check how the id takes part in Document equality.

    Two Documents built from the same content compare equal; copies of them carrying different explicit ids do not.
    """
    first = Document(content="test text")
    second = Document(content="test text")
    assert first == second

    # dataclasses.replace returns copies with the id overridden.
    first_with_id = replace(first, id="1234")
    second_with_id = replace(second, id="5678")
    assert first_with_id != second_with_id
122  
123  
def test_to_dict():
    """Check the default (flattened) dictionary form of an empty Document: no ``meta`` key is present."""
    doc = Document()
    actual = doc.to_dict()
    expected = {
        "id": doc._create_id(),
        "content": None,
        "blob": None,
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
    }
    assert actual == expected
134  
135  
def test_to_dict_without_flattening():
    """Check the non-flattened dictionary form of an empty Document: ``meta`` is kept as an empty dict."""
    doc = Document()
    actual = doc.to_dict(flatten=False)
    expected = {
        "id": doc._create_id(),
        "content": None,
        "blob": None,
        "meta": {},
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
    }
    assert actual == expected
147  
148  
def test_to_dict_with_custom_parameters():
    """
    Check the default (flattened) dictionary form of a fully populated Document.

    The metadata keys appear at the top level, and the blob and the sparse embedding are rendered as nested dicts.
    """
    pdf_blob = ByteStream(b"some bytes", mime_type="application/pdf", meta={"foo": "bar"})
    sparse = SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3])
    doc = Document(
        content="test text",
        blob=pdf_blob,
        meta={"some": "values", "test": 10},
        score=0.99,
        embedding=[10.0, 10.0],
        sparse_embedding=sparse,
    )

    actual = doc.to_dict()
    expected = {
        "id": doc.id,
        "content": "test text",
        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {"foo": "bar"}},
        # Entries coming from `meta`, flattened into the top level.
        "some": "values",
        "test": 10,
        "score": 0.99,
        "embedding": [10.0, 10.0],
        "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
    }
    assert actual == expected
169  
170  
def test_to_dict_with_custom_parameters_without_flattening():
    """
    Check the non-flattened dictionary form of a fully populated Document.

    The metadata stays nested under the ``meta`` key, and a blob created without metadata is rendered with an
    empty ``meta`` dict.
    """
    pdf_blob = ByteStream(b"some bytes", mime_type="application/pdf")
    sparse = SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3])
    doc = Document(
        content="test text",
        blob=pdf_blob,
        meta={"some": "values", "test": 10},
        score=0.99,
        embedding=[10.0, 10.0],
        sparse_embedding=sparse,
    )

    actual = doc.to_dict(flatten=False)
    expected = {
        "id": doc.id,
        "content": "test text",
        "blob": {"data": list(b"some bytes"), "mime_type": "application/pdf", "meta": {}},
        "meta": {"some": "values", "test": 10},
        "score": 0.99,
        "embedding": [10.0, 10.0],
        "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
    }
    assert actual == expected
190  
191  
def test_to_dict_field_precedence():
    """
    Test key collisions in Document.to_dict() with flatten=True.

    When a meta key has the same name as one of the Document's first-level fields, the flattened dictionary must
    carry the value of the first-level field. Meta keys that do not collide must be kept.
    """
    colliding_meta = {"content": "from-meta", "score": 0.5, "source": "web"}
    doc = Document(content="from-content", score=0.9, meta=colliding_meta)

    flattened = doc.to_dict(flatten=True)

    # Colliding keys: the first-level field wins over the meta entry.
    assert flattened["content"] == "from-content"
    assert flattened["score"] == 0.9
    # Non-colliding meta key: kept as-is.
    assert flattened["source"] == "web"
209  
210  
def test_from_dict():
    """Check that deserializing an empty dict gives a Document equal to one created without arguments."""
    restored = Document.from_dict({})
    assert restored == Document()
213  
214  
def test_from_dict_with_parameters():
    """
    Check that Document.from_dict() rebuilds a fully populated Document from its non-flattened dictionary form.

    The function was previously named ``from_from_dict_with_parameters``; without the ``test_`` prefix pytest's
    default discovery did not collect it, so the check never ran.
    """
    blob_data = b"some bytes"
    serialized = {
        "content": "test text",
        "blob": {"data": list(blob_data), "mime_type": "text/markdown", "meta": {"text": "test text"}},
        "meta": {"text": "test text"},
        "score": 0.812,
        "embedding": [0.1, 0.2, 0.3],
        "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
    }

    restored = Document.from_dict(serialized)

    expected = Document(
        content="test text",
        blob=ByteStream(blob_data, mime_type="text/markdown", meta={"text": "test text"}),
        meta={"text": "test text"},
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
    )
    assert restored == expected
234  
235  
def test_from_dict_with_legacy_fields():
    """
    Check that Document.from_dict() accepts the legacy keys ``content_type`` and ``id_hash_keys``.

    The result must equal a Document constructed directly with the same legacy keyword arguments.
    """
    legacy_payload = {
        "content": "test text",
        "content_type": "text",
        "id_hash_keys": ["content"],
        "score": 0.812,
        "embedding": [0.1, 0.2, 0.3],
    }
    restored = Document.from_dict(legacy_payload)

    expected = Document(
        content="test text",
        content_type="text",
        id_hash_keys=["content"],
        score=0.812,
        embedding=[0.1, 0.2, 0.3],  # type: ignore
    )
    assert restored == expected
252  
253  
def test_from_dict_with_legacy_field_and_flat_meta():
    """
    Check Document.from_dict() when legacy keys are mixed with flattened metadata keys.

    The extra top-level keys (``date``, ``type``) must end up in ``meta``, giving a Document equal to one
    constructed directly with the same legacy keyword arguments and that metadata.
    """
    flat_payload = {
        "content": "test text",
        "content_type": "text",
        "id_hash_keys": ["content"],
        "score": 0.812,
        "embedding": [0.1, 0.2, 0.3],
        # Flattened metadata entries.
        "date": "10-10-2023",
        "type": "article",
    }
    restored = Document.from_dict(flat_payload)

    expected = Document(
        content="test text",
        content_type="text",  # type: ignore
        id_hash_keys=["content"],  # type: ignore
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        meta={"date": "10-10-2023", "type": "article"},
    )
    assert restored == expected
273  
274  
def test_from_dict_with_flat_meta():
    """
    Check that Document.from_dict() collects flattened metadata keys into ``meta``.

    The blob and the sparse embedding are given in their dictionary form and must be rebuilt as ByteStream and
    SparseEmbedding objects.
    """
    raw_bytes = b"some bytes"
    flat_payload = {
        "content": "test text",
        "blob": {"data": list(raw_bytes), "mime_type": "text/markdown"},
        "score": 0.812,
        "embedding": [0.1, 0.2, 0.3],
        "sparse_embedding": {"indices": [0, 2, 4], "values": [0.1, 0.2, 0.3]},
        # Flattened metadata entries.
        "date": "10-10-2023",
        "type": "article",
    }
    restored = Document.from_dict(flat_payload)

    expected = Document(
        content="test text",
        blob=ByteStream(raw_bytes, mime_type="text/markdown"),
        score=0.812,
        embedding=[0.1, 0.2, 0.3],
        sparse_embedding=SparseEmbedding(indices=[0, 2, 4], values=[0.1, 0.2, 0.3]),
        meta={"date": "10-10-2023", "type": "article"},
    )
    assert restored == expected
295  
296  
def test_from_dict_with_flat_and_non_flat_meta():
    """Check that Document.from_dict() raises a ValueError when given both ``meta`` and flattened metadata keys."""
    ambiguous_payload = {
        "content": "test text",
        "blob": {"data": list(b"some bytes"), "mime_type": "text/markdown"},
        "score": 0.812,
        # Nested metadata ...
        "meta": {"test": 10},
        "embedding": [0.1, 0.2, 0.3],
        # ... together with flattened metadata entries.
        "date": "10-10-2023",
        "type": "article",
    }
    with pytest.raises(ValueError, match="Pass either the 'meta' parameter or flattened metadata keys"):
        Document.from_dict(ambiguous_payload)
310  
311  
def test_from_dict_with_dataframe():
    """
    Test legacy support for the dataframe field in Document.from_dict().

    A dictionary produced by document.to_dict(flatten=False) in haystack-ai<=2.10.0 still contains a "dataframe"
    key. Document.from_dict() must deserialize it without raising an error, and the legacy dataframe field must
    be skipped.
    """

    # Shape of document.to_dict(flatten=False) output in haystack-ai<=2.10.0
    legacy_payload = {
        "id": "my_id",
        "content": "my_content",
        "dataframe": None,
        "blob": None,
        "meta": {"key": "value"},
        "score": None,
        "embedding": None,
        "sparse_embedding": None,
    }

    restored = Document.from_dict(legacy_payload)

    # The regular fields are restored as given.
    assert restored.id == "my_id"
    assert restored.content == "my_content"
    assert restored.meta == {"key": "value"}
    assert restored.score is None
    assert restored.embedding is None
    assert restored.sparse_embedding is None

    # The legacy field is not set on the instance.
    assert not hasattr(restored, "dataframe")
343  
344  
def test_content_type():
    """
    Check the ``content_type`` property.

    It is "text" for a Document with textual content, and reading it on an empty Document raises a ValueError.
    """
    text_doc = Document(content="text")
    assert text_doc.content_type == "text"

    with pytest.raises(ValueError):
        _unused = Document().content_type
350  
351  
def test_no_warning_on_init():
    """Check that constructing a Document emits no warning of any category."""
    with warnings.catch_warnings():
        # Escalate every warning to an exception so that any warning emitted here fails the test.
        warnings.simplefilter("error", category=Warning)
        Document(content="test")
356  
357  
def test_warn_on_inplace_mutation():
    """Check that assigning to a field of an existing Document warns with a message naming dataclasses.replace."""
    document = Document(content="test")
    # `match` is applied as a regular-expression search against the warning message.
    with pytest.warns(Warning, match="dataclasses.replace"):
        document.content = "other"