# test/components/rankers/test_transformers_similarity.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import logging
  6  from unittest.mock import MagicMock, patch
  7  
  8  import pytest
  9  import torch
 10  from transformers.modeling_outputs import SequenceClassifierOutput
 11  
 12  from haystack import Document
 13  from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker
 14  from haystack.utils.auth import Secret
 15  from haystack.utils.device import ComponentDevice, DeviceMap
 16  
 17  
class TestSimilarityRanker:
    """Tests for ``TransformersSimilarityRanker``.

    The unit tests mock out the Hugging Face model/tokenizer so no model is
    downloaded; tests marked ``integration``/``slow`` load a real cross-encoder
    and exercise end-to-end ranking.

    NOTE(review): several tests take a ``del_hf_env_vars`` fixture that is not
    defined in this file — presumably a conftest fixture that removes HF token
    environment variables; verify against the project's conftest.
    """

    def test_to_dict(self):
        """Default init parameters serialize to the expected dictionary."""
        component = TransformersSimilarityRanker()
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "top_k": 10,
                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
                "query_prefix": "",
                "document_prefix": "",
                "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": True,
                "calibration_factor": 1.0,
                "score_threshold": None,
                # default device is resolved at init time and injected into model_kwargs
                "model_kwargs": {"device_map": ComponentDevice.resolve_device(None).to_hf()},
                "tokenizer_kwargs": {},
                "batch_size": 16,
            },
        }

    def test_to_dict_with_custom_init_parameters(self):
        """Custom init parameters (incl. torch dtype) serialize correctly."""
        component = TransformersSimilarityRanker(
            model="my_model",
            device=ComponentDevice.from_str("cuda:0"),
            token=Secret.from_env_var("ENV_VAR", strict=False),
            top_k=5,
            query_prefix="query_instruction: ",
            document_prefix="document_instruction: ",
            scale_score=False,
            calibration_factor=None,
            score_threshold=0.01,
            model_kwargs={"torch_dtype": torch.float16},
            tokenizer_kwargs={"model_max_length": 512},
            batch_size=32,
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                # the explicit device is folded into model_kwargs["device_map"], so "device" stays None
                "device": None,
                "model": "my_model",
                "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"},
                "top_k": 5,
                "query_prefix": "query_instruction: ",
                "document_prefix": "document_instruction: ",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": False,
                "calibration_factor": None,
                "score_threshold": 0.01,
                "model_kwargs": {
                    "torch_dtype": "torch.float16",
                    "device_map": ComponentDevice.from_str("cuda:0").to_hf(),
                },  # torch_dtype is correctly serialized
                "tokenizer_kwargs": {"model_max_length": 512},
                "batch_size": 32,
            },
        }

    def test_to_dict_with_quantization_options(self):
        """bitsandbytes quantization kwargs serialize, with dtype as a string."""
        component = TransformersSimilarityRanker(
            model_kwargs={
                "load_in_4bit": True,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_compute_dtype": torch.bfloat16,
            }
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "top_k": 10,
                "query_prefix": "",
                "document_prefix": "",
                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
                "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": True,
                "calibration_factor": 1.0,
                "score_threshold": None,
                "model_kwargs": {
                    "load_in_4bit": True,
                    "bnb_4bit_use_double_quant": True,
                    "bnb_4bit_quant_type": "nf4",
                    # torch dtypes round-trip through their string representation
                    "bnb_4bit_compute_dtype": "torch.bfloat16",
                    "device_map": ComponentDevice.resolve_device(None).to_hf(),
                },
                "tokenizer_kwargs": {},
                "batch_size": 16,
            },
        }

    @pytest.mark.parametrize(
        "device_map,expected",
        [
            # "auto" passes through untouched; concrete devices are normalized via ComponentDevice
            ("auto", "auto"),
            ("cpu:0", ComponentDevice.from_str("cpu:0").to_hf()),
            ({"": "cpu:0"}, ComponentDevice.from_multiple(DeviceMap.from_hf({"": "cpu:0"})).to_hf()),
        ],
    )
    def test_to_dict_device_map(self, device_map, expected):
        """A user-supplied device_map is normalized in the serialized output."""
        component = TransformersSimilarityRanker(model_kwargs={"device_map": device_map}, token=None)
        data = component.to_dict()

        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "top_k": 10,
                "token": None,
                "query_prefix": "",
                "document_prefix": "",
                "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": True,
                "calibration_factor": 1.0,
                "score_threshold": None,
                "model_kwargs": {"device_map": expected},
                "tokenizer_kwargs": {},
                "batch_size": 16,
            },
        }

    def test_from_dict(self):
        """Deserialization restores all attributes, incl. torch_dtype."""
        data = {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "model": "my_model",
                "token": None,
                "top_k": 5,
                "query_prefix": "",
                "document_prefix": "",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": False,
                "calibration_factor": None,
                "score_threshold": 0.01,
                "model_kwargs": {"torch_dtype": "torch.float16"},
                "tokenizer_kwargs": {"model_max_length": 512},
                "batch_size": 32,
            },
        }

        component = TransformersSimilarityRanker.from_dict(data)
        assert component.device is None
        assert component.model_name_or_path == "my_model"
        assert component.token is None
        assert component.top_k == 5
        assert component.query_prefix == ""
        assert component.document_prefix == ""
        assert component.meta_fields_to_embed == []
        assert component.embedding_separator == "\n"
        assert not component.scale_score
        assert component.calibration_factor is None
        assert component.score_threshold == 0.01
        # torch_dtype is correctly deserialized
        assert component.model_kwargs == {
            "torch_dtype": torch.float16,
            "device_map": ComponentDevice.resolve_device(None).to_hf(),
        }
        assert component.tokenizer_kwargs == {"model_max_length": 512}
        assert component.batch_size == 32

    def test_from_dict_no_default_parameters(self):
        """Deserializing with empty init_parameters falls back to all defaults."""
        data = {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {},
        }

        component = TransformersSimilarityRanker.from_dict(data)
        assert component.device is None
        assert component.model_name_or_path == "cross-encoder/ms-marco-MiniLM-L-6-v2"
        assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False)
        assert component.top_k == 10
        assert component.query_prefix == ""
        assert component.document_prefix == ""
        assert component.meta_fields_to_embed == []
        assert component.embedding_separator == "\n"
        assert component.scale_score
        assert component.calibration_factor == 1.0
        assert component.score_threshold is None
        # torch_dtype is correctly deserialized
        assert component.model_kwargs == {"device_map": ComponentDevice.resolve_device(None).to_hf()}
        assert component.tokenizer_kwargs == {}
        assert component.batch_size == 16

    @patch("torch.sigmoid")
    @patch("torch.sort")
    @patch("torch.stack")
    def test_embed_meta(self, mocked_stack, mocked_sort, mocked_sigmoid):
        """meta_fields_to_embed values are prepended to each document's content
        (joined by embedding_separator) before tokenization."""
        # stub out the torch score pipeline so run() completes with the mocked model
        mocked_stack.return_value = torch.tensor([0])
        mocked_sort.return_value = (None, torch.tensor([0]))
        mocked_sigmoid.return_value = torch.tensor([0])
        embedder = TransformersSimilarityRanker(
            model="model", meta_fields_to_embed=["meta_field"], embedding_separator="\n"
        )
        embedder.model = MagicMock()
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)]

        embedder.run(query="test", documents=documents)

        # the tokenizer must receive [query, meta + separator + content] pairs
        embedder.tokenizer.assert_called_once_with(
            [
                ["test", "meta_value 0\ndocument number 0"],
                ["test", "meta_value 1\ndocument number 1"],
                ["test", "meta_value 2\ndocument number 2"],
                ["test", "meta_value 3\ndocument number 3"],
                ["test", "meta_value 4\ndocument number 4"],
            ],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

    @patch("torch.sigmoid")
    @patch("torch.sort")
    @patch("torch.stack")
    def test_prefix(self, mocked_stack, mocked_sort, mocked_sigmoid):
        """query_prefix/document_prefix are prepended before tokenization."""
        mocked_stack.return_value = torch.tensor([0])
        mocked_sort.return_value = (None, torch.tensor([0]))
        mocked_sigmoid.return_value = torch.tensor([0])
        embedder = TransformersSimilarityRanker(
            model="model", query_prefix="query_instruction: ", document_prefix="document_instruction: "
        )
        embedder.model = MagicMock()
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)]

        embedder.run(query="test", documents=documents)

        embedder.tokenizer.assert_called_once_with(
            [
                ["query_instruction: test", "document_instruction: document number 0"],
                ["query_instruction: test", "document_instruction: document number 1"],
                ["query_instruction: test", "document_instruction: document number 2"],
                ["query_instruction: test", "document_instruction: document number 3"],
                ["query_instruction: test", "document_instruction: document number 4"],
            ],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

    @patch("torch.sort")
    @patch("torch.stack")
    def test_scale_score_false(self, mocked_stack, mocked_sort):
        """With scale_score=False the raw logits are used as scores (no sigmoid)."""
        mocked_stack.return_value = torch.FloatTensor([-10.6859, -8.9874])
        mocked_sort.return_value = (None, torch.tensor([0, 1]))
        embedder = TransformersSimilarityRanker(model="model", scale_score=False)
        embedder.model = MagicMock()
        embedder.model.return_value = SequenceClassifierOutput(
            loss=None, logits=torch.FloatTensor([[-10.6859], [-8.9874]]), hidden_states=None, attentions=None
        )
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content="document number 0"), Document(content="document number 1")]
        out = embedder.run(query="test", documents=documents)
        # raw (negative) logits pass straight through as document scores
        assert out["documents"][0].score == pytest.approx(-10.6859, abs=1e-4)
        assert out["documents"][1].score == pytest.approx(-8.9874, abs=1e-4)

    @patch("torch.sort")
    @patch("torch.stack")
    def test_score_threshold(self, mocked_stack, mocked_sort):
        """Documents scoring below score_threshold are dropped from the output."""
        mocked_stack.return_value = torch.FloatTensor([0.955, 0.001])
        mocked_sort.return_value = (None, torch.tensor([0, 1]))
        embedder = TransformersSimilarityRanker(model="model", scale_score=False, score_threshold=0.1)
        embedder.model = MagicMock()
        embedder.model.return_value = SequenceClassifierOutput(
            loss=None, logits=torch.FloatTensor([[0.955], [0.001]]), hidden_states=None, attentions=None
        )
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content="document number 0"), Document(content="document number 1")]
        out = embedder.run(query="test", documents=documents)
        # only the 0.955-score document survives the 0.1 threshold
        assert len(out["documents"]) == 1

    def test_device_map_and_device_raises(self, caplog):
        """Providing both `device` and a model_kwargs `device_map` logs a warning.

        NOTE(review): despite the "_raises" suffix this asserts a WARNING log,
        not an exception — the name is misleading but kept for test-ID stability.
        """
        with caplog.at_level(logging.WARNING):
            _ = TransformersSimilarityRanker(
                "model", model_kwargs={"device_map": "cpu"}, device=ComponentDevice.from_str("cuda")
            )
            assert (
                "The parameters `device` and `device_map` from `model_kwargs` are both provided. Ignoring `device` "
                "and using `device_map`." in caplog.text
            )

    @patch("haystack.components.rankers.transformers_similarity.AutoTokenizer.from_pretrained")
    @patch("haystack.components.rankers.transformers_similarity.AutoModelForSequenceClassification.from_pretrained")
    def test_device_map_dict(self, mocked_automodel, _mocked_autotokenizer, del_hf_env_vars):
        """A dict device_map is forwarded to from_pretrained and, after warm_up,
        the component's device reflects the model's hf_device_map."""
        ranker = TransformersSimilarityRanker("model", model_kwargs={"device_map": {"layer_1": 1, "classifier": "cpu"}})

        # minimal stand-in exposing only the hf_device_map attribute warm_up reads
        class MockedModel:
            def __init__(self):
                self.hf_device_map = {"layer_1": 1, "classifier": "cpu"}

        mocked_automodel.return_value = MockedModel()
        ranker.warm_up()

        mocked_automodel.assert_called_once_with("model", token=None, device_map={"layer_1": 1, "classifier": "cpu"})
        assert ranker.device == ComponentDevice.from_multiple(DeviceMap.from_hf({"layer_1": 1, "classifier": "cpu"}))

    def test_returns_empty_list_if_no_documents_are_provided(self):
        """run() with an empty document list short-circuits to an empty result."""
        sampler = TransformersSimilarityRanker()
        # Mock all attributes that are set during warm_up
        sampler.model = MagicMock()
        sampler.tokenizer = MagicMock()
        sampler.device = MagicMock()

        output = sampler.run(query="City in Germany", documents=[])
        assert not output["documents"]

    @patch("torch.stack")
    def test_run_deduplicates_documents(self, mocked_stack):
        """Documents sharing an id are deduplicated; the first occurrence wins."""
        mocked_stack.return_value = torch.tensor([0.42, 0.12])
        ranker = TransformersSimilarityRanker()
        ranker.model = MagicMock()
        ranker.tokenizer = MagicMock()
        ranker.device = MagicMock()

        documents = [
            Document(id="duplicate", content="keep me", score=0.9),
            Document(id="duplicate", content="drop me", score=0.1),
            Document(id="unique", content="unique"),
        ]
        result = ranker.run(query="test", documents=documents)
        assert len(result["documents"]) == 2
        assert result["documents"][0].content == "keep me"
        assert result["documents"][1].content == "unique"

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run(self, del_hf_env_vars):
        """
        Test if the component ranks documents correctly.
        """

        ranker = TransformersSimilarityRanker(model="cross-encoder-testing/reranker-bert-tiny-gooaq-bce")

        query = "City in Bosnia and Herzegovina"
        docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
        expected_first_text = "Sarajevo"
        # scores pinned for this specific tiny test model checkpoint
        expected_scores = [0.14568544924259186, 0.18189962208271027, 0.5728498697280884]

        docs_before = [Document(content=text) for text in docs_before_texts]
        output = ranker.run(query=query, documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 3
        assert docs_after[0].content == expected_first_text

        sorted_scores = sorted(expected_scores, reverse=True)
        assert docs_after[0].score == pytest.approx(sorted_scores[0], abs=1e-6)
        assert docs_after[1].score == pytest.approx(sorted_scores[1], abs=1e-6)
        assert docs_after[2].score == pytest.approx(sorted_scores[2], abs=1e-6)

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_top_k(self, del_hf_env_vars):
        """
        Test if the component ranks documents correctly with a custom top_k.
        """
        ranker = TransformersSimilarityRanker(model="cross-encoder-testing/reranker-bert-tiny-gooaq-bce", top_k=2)

        query = "City in Bosnia and Herzegovina"
        docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
        expected_first_text = "Sarajevo"

        docs_before = [Document(content=text) for text in docs_before_texts]
        output = ranker.run(query=query, documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 2
        assert docs_after[0].content == expected_first_text

        # output must be sorted by score, descending
        sorted_scores = sorted([doc.score for doc in docs_after], reverse=True)
        assert [doc.score for doc in docs_after] == sorted_scores

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_single_document(self, del_hf_env_vars):
        """
        Test if the component runs with a single document.
        """
        ranker = TransformersSimilarityRanker(model="cross-encoder-testing/reranker-bert-tiny-gooaq-bce", device=None)
        docs_before = [Document(content="Berlin")]
        output = ranker.run(query="City in Germany", documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 1