# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import logging
from unittest.mock import MagicMock, patch

import pytest
import torch
from transformers.modeling_outputs import SequenceClassifierOutput

from haystack import Document
from haystack.components.rankers.transformers_similarity import TransformersSimilarityRanker
from haystack.utils.auth import Secret
from haystack.utils.device import ComponentDevice, DeviceMap


class TestSimilarityRanker:
    """Unit and integration tests for TransformersSimilarityRanker (serialization, ranking, edge cases)."""

    def test_to_dict(self):
        """Default-constructed ranker serializes with all documented default init parameters."""
        component = TransformersSimilarityRanker()
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "top_k": 10,
                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
                "query_prefix": "",
                "document_prefix": "",
                "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": True,
                "calibration_factor": 1.0,
                "score_threshold": None,
                "model_kwargs": {"device_map": ComponentDevice.resolve_device(None).to_hf()},
                "tokenizer_kwargs": {},
                "batch_size": 16,
            },
        }

    def test_to_dict_with_custom_init_parameters(self):
        """Custom init parameters round-trip into the serialized dict (incl. torch dtype serialization)."""
        component = TransformersSimilarityRanker(
            model="my_model",
            device=ComponentDevice.from_str("cuda:0"),
            token=Secret.from_env_var("ENV_VAR", strict=False),
            top_k=5,
            query_prefix="query_instruction: ",
            document_prefix="document_instruction: ",
            scale_score=False,
            calibration_factor=None,
            score_threshold=0.01,
            model_kwargs={"torch_dtype": torch.float16},
            tokenizer_kwargs={"model_max_length": 512},
            batch_size=32,
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "model": "my_model",
                "token": {"env_vars": ["ENV_VAR"], "strict": False, "type": "env_var"},
                "top_k": 5,
                "query_prefix": "query_instruction: ",
                "document_prefix": "document_instruction: ",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": False,
                "calibration_factor": None,
                "score_threshold": 0.01,
                "model_kwargs": {
                    "torch_dtype": "torch.float16",
                    "device_map": ComponentDevice.from_str("cuda:0").to_hf(),
                },  # torch_dtype is correctly serialized
                "tokenizer_kwargs": {"model_max_length": 512},
                "batch_size": 32,
            },
        }

    def test_to_dict_with_quantization_options(self):
        """bitsandbytes quantization kwargs serialize, with the compute dtype rendered as a string."""
        component = TransformersSimilarityRanker(
            model_kwargs={
                "load_in_4bit": True,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_compute_dtype": torch.bfloat16,
            }
        )
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "top_k": 10,
                "query_prefix": "",
                "document_prefix": "",
                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
                "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": True,
                "calibration_factor": 1.0,
                "score_threshold": None,
                "model_kwargs": {
                    "load_in_4bit": True,
                    "bnb_4bit_use_double_quant": True,
                    "bnb_4bit_quant_type": "nf4",
                    "bnb_4bit_compute_dtype": "torch.bfloat16",
                    "device_map": ComponentDevice.resolve_device(None).to_hf(),
                },
                "tokenizer_kwargs": {},
                "batch_size": 16,
            },
        }

    @pytest.mark.parametrize(
        "device_map,expected",
        [
            ("auto", "auto"),
            ("cpu:0", ComponentDevice.from_str("cpu:0").to_hf()),
            ({"": "cpu:0"}, ComponentDevice.from_multiple(DeviceMap.from_hf({"": "cpu:0"})).to_hf()),
        ],
    )
    def test_to_dict_device_map(self, device_map, expected):
        """A user-supplied `device_map` in model_kwargs serializes to its HF representation."""
        component = TransformersSimilarityRanker(model_kwargs={"device_map": device_map}, token=None)
        data = component.to_dict()

        assert data == {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "top_k": 10,
                "token": None,
                "query_prefix": "",
                "document_prefix": "",
                "model": "cross-encoder/ms-marco-MiniLM-L-6-v2",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": True,
                "calibration_factor": 1.0,
                "score_threshold": None,
                "model_kwargs": {"device_map": expected},
                "tokenizer_kwargs": {},
                "batch_size": 16,
            },
        }

    def test_from_dict(self):
        """A serialized dict with explicit parameters deserializes into a matching component."""
        data = {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {
                "device": None,
                "model": "my_model",
                "token": None,
                "top_k": 5,
                "query_prefix": "",
                "document_prefix": "",
                "meta_fields_to_embed": [],
                "embedding_separator": "\n",
                "scale_score": False,
                "calibration_factor": None,
                "score_threshold": 0.01,
                "model_kwargs": {"torch_dtype": "torch.float16"},
                "tokenizer_kwargs": {"model_max_length": 512},
                "batch_size": 32,
            },
        }

        component = TransformersSimilarityRanker.from_dict(data)
        assert component.device is None
        assert component.model_name_or_path == "my_model"
        assert component.token is None
        assert component.top_k == 5
        assert component.query_prefix == ""
        assert component.document_prefix == ""
        assert component.meta_fields_to_embed == []
        assert component.embedding_separator == "\n"
        assert not component.scale_score
        assert component.calibration_factor is None
        assert component.score_threshold == 0.01
        # torch_dtype is correctly deserialized
        assert component.model_kwargs == {
            "torch_dtype": torch.float16,
            "device_map": ComponentDevice.resolve_device(None).to_hf(),
        }
        assert component.tokenizer_kwargs == {"model_max_length": 512}
        assert component.batch_size == 32

    def test_from_dict_no_default_parameters(self):
        """Deserializing an empty init_parameters dict yields the documented defaults."""
        data = {
            "type": "haystack.components.rankers.transformers_similarity.TransformersSimilarityRanker",
            "init_parameters": {},
        }

        component = TransformersSimilarityRanker.from_dict(data)
        assert component.device is None
        assert component.model_name_or_path == "cross-encoder/ms-marco-MiniLM-L-6-v2"
        assert component.token == Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False)
        assert component.top_k == 10
        assert component.query_prefix == ""
        assert component.document_prefix == ""
        assert component.meta_fields_to_embed == []
        assert component.embedding_separator == "\n"
        assert component.scale_score
        assert component.calibration_factor == 1.0
        assert component.score_threshold is None
        # torch_dtype is correctly deserialized
        assert component.model_kwargs == {"device_map": ComponentDevice.resolve_device(None).to_hf()}
        assert component.tokenizer_kwargs == {}
        assert component.batch_size == 16

    @patch("torch.sigmoid")
    @patch("torch.sort")
    @patch("torch.stack")
    def test_embed_meta(self, mocked_stack, mocked_sort, mocked_sigmoid):
        """Selected meta fields are prepended to document content (joined by the separator) before tokenization."""
        mocked_stack.return_value = torch.tensor([0])
        mocked_sort.return_value = (None, torch.tensor([0]))
        mocked_sigmoid.return_value = torch.tensor([0])
        embedder = TransformersSimilarityRanker(
            model="model", meta_fields_to_embed=["meta_field"], embedding_separator="\n"
        )
        embedder.model = MagicMock()
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)]

        embedder.run(query="test", documents=documents)

        embedder.tokenizer.assert_called_once_with(
            [
                ["test", "meta_value 0\ndocument number 0"],
                ["test", "meta_value 1\ndocument number 1"],
                ["test", "meta_value 2\ndocument number 2"],
                ["test", "meta_value 3\ndocument number 3"],
                ["test", "meta_value 4\ndocument number 4"],
            ],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

    @patch("torch.sigmoid")
    @patch("torch.sort")
    @patch("torch.stack")
    def test_prefix(self, mocked_stack, mocked_sort, mocked_sigmoid):
        """Query and document prefixes are applied to the tokenizer inputs."""
        mocked_stack.return_value = torch.tensor([0])
        mocked_sort.return_value = (None, torch.tensor([0]))
        mocked_sigmoid.return_value = torch.tensor([0])
        embedder = TransformersSimilarityRanker(
            model="model", query_prefix="query_instruction: ", document_prefix="document_instruction: "
        )
        embedder.model = MagicMock()
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content=f"document number {i}", meta={"meta_field": f"meta_value {i}"}) for i in range(5)]

        embedder.run(query="test", documents=documents)

        embedder.tokenizer.assert_called_once_with(
            [
                ["query_instruction: test", "document_instruction: document number 0"],
                ["query_instruction: test", "document_instruction: document number 1"],
                ["query_instruction: test", "document_instruction: document number 2"],
                ["query_instruction: test", "document_instruction: document number 3"],
                ["query_instruction: test", "document_instruction: document number 4"],
            ],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

    @patch("torch.sort")
    @patch("torch.stack")
    def test_scale_score_false(self, mocked_stack, mocked_sort):
        """With scale_score=False the raw logits are used as scores (no sigmoid applied)."""
        mocked_stack.return_value = torch.FloatTensor([-10.6859, -8.9874])
        mocked_sort.return_value = (None, torch.tensor([0, 1]))
        embedder = TransformersSimilarityRanker(model="model", scale_score=False)
        embedder.model = MagicMock()
        embedder.model.return_value = SequenceClassifierOutput(
            loss=None, logits=torch.FloatTensor([[-10.6859], [-8.9874]]), hidden_states=None, attentions=None
        )
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content="document number 0"), Document(content="document number 1")]
        out = embedder.run(query="test", documents=documents)
        assert out["documents"][0].score == pytest.approx(-10.6859, abs=1e-4)
        assert out["documents"][1].score == pytest.approx(-8.9874, abs=1e-4)

    @patch("torch.sort")
    @patch("torch.stack")
    def test_score_threshold(self, mocked_stack, mocked_sort):
        """Documents scoring below score_threshold are dropped from the output."""
        mocked_stack.return_value = torch.FloatTensor([0.955, 0.001])
        mocked_sort.return_value = (None, torch.tensor([0, 1]))
        embedder = TransformersSimilarityRanker(model="model", scale_score=False, score_threshold=0.1)
        embedder.model = MagicMock()
        embedder.model.return_value = SequenceClassifierOutput(
            loss=None, logits=torch.FloatTensor([[0.955], [0.001]]), hidden_states=None, attentions=None
        )
        embedder.tokenizer = MagicMock()
        embedder.device = MagicMock()

        documents = [Document(content="document number 0"), Document(content="document number 1")]
        out = embedder.run(query="test", documents=documents)
        assert len(out["documents"]) == 1

    def test_device_map_and_device_raises(self, caplog):
        """Providing both `device` and a `device_map` logs a warning; `device_map` wins."""
        with caplog.at_level(logging.WARNING):
            _ = TransformersSimilarityRanker(
                "model", model_kwargs={"device_map": "cpu"}, device=ComponentDevice.from_str("cuda")
            )
            assert (
                "The parameters `device` and `device_map` from `model_kwargs` are both provided. Ignoring `device` "
                "and using `device_map`." in caplog.text
            )

    @patch("haystack.components.rankers.transformers_similarity.AutoTokenizer.from_pretrained")
    @patch("haystack.components.rankers.transformers_similarity.AutoModelForSequenceClassification.from_pretrained")
    def test_device_map_dict(self, mocked_automodel, _mocked_autotokenizer, del_hf_env_vars):
        """A dict `device_map` is forwarded to the model and reflected in `ranker.device` after warm_up."""
        ranker = TransformersSimilarityRanker("model", model_kwargs={"device_map": {"layer_1": 1, "classifier": "cpu"}})

        class MockedModel:
            def __init__(self):
                self.hf_device_map = {"layer_1": 1, "classifier": "cpu"}

        mocked_automodel.return_value = MockedModel()
        ranker.warm_up()

        mocked_automodel.assert_called_once_with("model", token=None, device_map={"layer_1": 1, "classifier": "cpu"})
        assert ranker.device == ComponentDevice.from_multiple(DeviceMap.from_hf({"layer_1": 1, "classifier": "cpu"}))

    def test_returns_empty_list_if_no_documents_are_provided(self):
        """Running on an empty document list returns an empty result without touching the model."""
        sampler = TransformersSimilarityRanker()
        # Mock all attributes that are set during warm_up
        sampler.model = MagicMock()
        sampler.tokenizer = MagicMock()
        sampler.device = MagicMock()

        output = sampler.run(query="City in Germany", documents=[])
        assert not output["documents"]

    @patch("torch.stack")
    def test_run_deduplicates_documents(self, mocked_stack):
        """Documents sharing an id are deduplicated; the first occurrence is kept."""
        mocked_stack.return_value = torch.tensor([0.42, 0.12])
        ranker = TransformersSimilarityRanker()
        ranker.model = MagicMock()
        ranker.tokenizer = MagicMock()
        ranker.device = MagicMock()

        documents = [
            Document(id="duplicate", content="keep me", score=0.9),
            Document(id="duplicate", content="drop me", score=0.1),
            Document(id="unique", content="unique"),
        ]
        result = ranker.run(query="test", documents=documents)
        assert len(result["documents"]) == 2
        assert result["documents"][0].content == "keep me"
        assert result["documents"][1].content == "unique"

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run(self, del_hf_env_vars):
        """
        Test if the component ranks documents correctly.
        """

        ranker = TransformersSimilarityRanker(model="cross-encoder-testing/reranker-bert-tiny-gooaq-bce")

        query = "City in Bosnia and Herzegovina"
        docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
        expected_first_text = "Sarajevo"
        expected_scores = [0.14568544924259186, 0.18189962208271027, 0.5728498697280884]

        docs_before = [Document(content=text) for text in docs_before_texts]
        output = ranker.run(query=query, documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 3
        assert docs_after[0].content == expected_first_text

        sorted_scores = sorted(expected_scores, reverse=True)
        assert docs_after[0].score == pytest.approx(sorted_scores[0], abs=1e-6)
        assert docs_after[1].score == pytest.approx(sorted_scores[1], abs=1e-6)
        assert docs_after[2].score == pytest.approx(sorted_scores[2], abs=1e-6)

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_top_k(self, del_hf_env_vars):
        """
        Test if the component ranks documents correctly with a custom top_k.
        """
        ranker = TransformersSimilarityRanker(model="cross-encoder-testing/reranker-bert-tiny-gooaq-bce", top_k=2)

        query = "City in Bosnia and Herzegovina"
        docs_before_texts = ["Berlin", "Belgrade", "Sarajevo"]
        expected_first_text = "Sarajevo"

        docs_before = [Document(content=text) for text in docs_before_texts]
        output = ranker.run(query=query, documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 2
        assert docs_after[0].content == expected_first_text

        sorted_scores = sorted([doc.score for doc in docs_after], reverse=True)
        assert [doc.score for doc in docs_after] == sorted_scores

    @pytest.mark.integration
    @pytest.mark.slow
    def test_run_single_document(self, del_hf_env_vars):
        """
        Test if the component runs with a single document.
        """
        ranker = TransformersSimilarityRanker(model="cross-encoder-testing/reranker-bert-tiny-gooaq-bce", device=None)
        docs_before = [Document(content="Berlin")]
        output = ranker.run(query="City in Germany", documents=docs_before)
        docs_after = output["documents"]

        assert len(docs_after) == 1