test_embedding_based_document_splitter.py
1 # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai> 2 # 3 # SPDX-License-Identifier: Apache-2.0 4 5 import os 6 from dataclasses import replace 7 from unittest.mock import AsyncMock, Mock, patch 8 9 import pytest 10 11 from haystack import Document 12 from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder, SentenceTransformersDocumentEmbedder 13 from haystack.components.preprocessors import EmbeddingBasedDocumentSplitter 14 from haystack.utils import ComponentDevice 15 16 os.environ["TOKENIZERS_PARALLELISM"] = "false" 17 18 # disable tqdm entirely for tests 19 from tqdm import tqdm 20 21 tqdm.disable = True 22 23 24 class TestEmbeddingBasedDocumentSplitter: 25 def test_init(self): 26 mock_embedder = Mock() 27 splitter = EmbeddingBasedDocumentSplitter( 28 document_embedder=mock_embedder, sentences_per_group=2, percentile=0.9, min_length=50, max_length=1000 29 ) 30 31 assert splitter.document_embedder == mock_embedder 32 assert splitter.sentences_per_group == 2 33 assert splitter.percentile == 0.9 34 assert splitter.min_length == 50 35 assert splitter.max_length == 1000 36 37 def test_init_invalid_sentences_per_group(self): 38 mock_embedder = Mock() 39 with pytest.raises(ValueError, match="sentences_per_group must be greater than 0"): 40 EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, sentences_per_group=0) 41 42 def test_init_invalid_percentile(self): 43 mock_embedder = Mock() 44 with pytest.raises(ValueError, match="percentile must be between 0.0 and 1.0"): 45 EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, percentile=1.5) 46 47 def test_init_invalid_min_length(self): 48 mock_embedder = Mock() 49 with pytest.raises(ValueError, match="min_length must be greater than or equal to 0"): 50 EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=-1) 51 52 def test_init_invalid_max_length(self): 53 mock_embedder = Mock() 54 with pytest.raises(ValueError, match="max_length must be greater than min_length"): 55 EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=100, max_length=50) 56 57 def test_warm_up(self): 58 mock_embedder = Mock() 59 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 60 61 with patch( 62 "haystack.components.preprocessors.embedding_based_document_splitter.SentenceSplitter" 63 ) as mock_splitter_class: 64 mock_splitter = Mock() 65 mock_splitter_class.return_value = mock_splitter 66 67 splitter.warm_up() 68 69 assert splitter.sentence_splitter == mock_splitter 70 mock_splitter_class.assert_called_once() 71 72 def test_run_not_warmed_up(self): 73 mock_embedder = Mock() 74 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 75 76 with patch.object(splitter, "warm_up", wraps=splitter.warm_up) as mock_warm_up: 77 splitter.run(documents=[]) 78 assert splitter._is_warmed_up 79 mock_warm_up.assert_called_once() 80 81 @pytest.mark.asyncio 82 async def test_run_not_warmed_up_async(self) -> None: 83 mock_embedder = AsyncMock() 84 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 85 86 with patch.object(splitter, "warm_up", wraps=splitter.warm_up) as mock_warm_up: 87 await splitter.run_async(documents=[]) 88 assert splitter._is_warmed_up 89 mock_warm_up.assert_called_once() 90 91 def test_run_invalid_input(self): 92 mock_embedder = Mock() 93 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 94 splitter.sentence_splitter = Mock() 95 splitter._is_warmed_up = True 96 97 with pytest.raises(TypeError, match="expects a List of Documents"): 98 splitter.run(documents="not a list") 99 100 @pytest.mark.asyncio 101 async def test_run_invalid_input_async(self) -> None: 102 mock_embedder = AsyncMock() 103 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 104 splitter.sentence_splitter = AsyncMock() 105 splitter._is_warmed_up = True 106 107 with pytest.raises(TypeError, match="expects a List of Documents"): 108 await splitter.run_async(documents="not a list") 109 110 def test_run_document_with_none_content(self): 111 mock_embedder = Mock() 112 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 113 splitter.sentence_splitter = Mock() 114 splitter._is_warmed_up = True 115 116 with pytest.raises(ValueError, match="content for document ID"): 117 splitter.run(documents=[Document(content=None)]) 118 119 @pytest.mark.asyncio 120 async def test_run_document_with_none_content_async(self) -> None: 121 mock_embedder = AsyncMock() 122 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 123 splitter.sentence_splitter = AsyncMock() 124 splitter._is_warmed_up = True 125 126 with pytest.raises(ValueError, match="content for document ID"): 127 await splitter.run_async(documents=[Document(content=None)]) 128 129 def test_run_empty_document(self): 130 mock_embedder = Mock() 131 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 132 splitter.sentence_splitter = Mock() 133 splitter._is_warmed_up = True 134 135 result = splitter.run(documents=[Document(content="")]) 136 assert result["documents"] == [] 137 138 @pytest.mark.asyncio 139 async def test_run_empty_document_async(self) -> None: 140 mock_embedder = AsyncMock() 141 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 142 splitter.sentence_splitter = AsyncMock() 143 splitter._is_warmed_up = True 144 145 result = await splitter.run_async(documents=[Document(content="")]) 146 assert result["documents"] == [] 147 148 def test_group_sentences_single(self): 149 mock_embedder = Mock() 150 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, sentences_per_group=1) 151 152 sentences = ["Sentence 1.", "Sentence 2.", "Sentence 3."] 153 groups = splitter._group_sentences(sentences) 154 155 assert groups == sentences 156 157 def test_group_sentences_multiple(self): 158 mock_embedder = Mock() 159 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, sentences_per_group=2) 160 161 sentences = ["Sentence 1. ", "Sentence 2. ", "Sentence 3. ", "Sentence 4."] 162 groups = splitter._group_sentences(sentences) 163 164 assert groups == ["Sentence 1. Sentence 2. ", "Sentence 3. Sentence 4."] 165 166 def test_cosine_distance(self): 167 mock_embedder = Mock() 168 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 169 170 # Test with identical vectors 171 embedding1 = [1.0, 0.0, 0.0] 172 embedding2 = [1.0, 0.0, 0.0] 173 distance = splitter._cosine_distance(embedding1, embedding2) 174 assert distance == 0.0 175 176 # Test with orthogonal vectors 177 embedding1 = [1.0, 0.0, 0.0] 178 embedding2 = [0.0, 1.0, 0.0] 179 distance = splitter._cosine_distance(embedding1, embedding2) 180 assert distance == 1.0 181 182 # Test with zero vectors 183 embedding1 = [0.0, 0.0, 0.0] 184 embedding2 = [1.0, 0.0, 0.0] 185 distance = splitter._cosine_distance(embedding1, embedding2) 186 assert distance == 1.0 187 188 def test_find_split_points_empty(self): 189 mock_embedder = Mock() 190 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 191 192 split_points = splitter._find_split_points([]) 193 assert split_points == [] 194 195 split_points = splitter._find_split_points([[1.0, 0.0]]) 196 assert split_points == [] 197 198 def test_find_split_points(self): 199 mock_embedder = Mock() 200 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, percentile=0.5) 201 202 # Create embeddings where the second pair has high distance 203 embeddings = [ 204 [1.0, 0.0, 0.0], # Similar to next 205 [0.9, 0.1, 0.0], # Similar to previous 206 [0.0, 1.0, 0.0], # Very different from next 207 [0.1, 0.9, 0.0], # Similar to previous 208 ] 209 210 split_points = splitter._find_split_points(embeddings) 211 # Should find a split point after the second embedding (index 2) 212 assert 2 in split_points 213 214 def test_create_splits_from_points(self): 215 mock_embedder = Mock() 216 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 217 218 sentence_groups = ["Group 1 ", "Group 2 ", "Group 3 ", "Group 4"] 219 split_points = [2] # Split after index 1 220 221 splits = splitter._create_splits_from_points(sentence_groups, split_points) 222 assert splits == ["Group 1 Group 2 ", "Group 3 Group 4"] 223 224 def test_create_splits_from_points_no_points(self): 225 mock_embedder = Mock() 226 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 227 228 sentence_groups = ["Group 1 ", "Group 2 ", "Group 3"] 229 split_points = [] 230 231 splits = splitter._create_splits_from_points(sentence_groups, split_points) 232 assert splits == ["Group 1 Group 2 Group 3"] 233 234 def test_merge_small_splits(self): 235 mock_embedder = Mock() 236 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=10) 237 238 splits = ["Short ", "Also short ", "Long enough text ", "Another short"] 239 merged = splitter._merge_small_splits(splits) 240 241 assert len(merged) == 3 242 assert merged[0] == "Short Also short " 243 assert merged[1] == "Long enough text " 244 assert merged[2] == "Another short" 245 246 def test_merge_small_splits_respect_max_length(self): 247 mock_embedder = Mock() 248 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=10, max_length=15) 249 250 splits = ["123456", "123456789", "1234"] 251 merged = splitter._merge_small_splits(splits=splits) 252 253 assert len(merged) == 2 254 # First split remains beneath min_length b/c next split is too long 255 assert merged[0] == "123456" 256 # Second split is merged with third split to get above min_length and still beneath max_length 257 assert merged[1] == "1234567891234" 258 259 def test_create_documents_from_splits(self): 260 mock_embedder = Mock() 261 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 262 263 original_doc = Document(content="test", meta={"key": "value"}) 264 splits = ["Split 1", "Split 2"] 265 266 documents = splitter._create_documents_from_splits(splits, original_doc) 267 268 assert len(documents) == 2 269 assert documents[0].content == "Split 1" 270 assert documents[0].meta["source_id"] == original_doc.id 271 assert documents[0].meta["split_id"] == 0 272 assert documents[0].meta["key"] == "value" 273 assert documents[1].content == "Split 2" 274 assert documents[1].meta["split_id"] == 1 275 276 def test_create_documents_from_splits_with_page_numbers(self): 277 mock_embedder = Mock() 278 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 279 280 original_doc = Document(content="Page 1 content.\fPage 2 content.\f\fPage 4 content.", meta={"key": "value"}) 281 splits = ["Page 1 content.\f", "Page 2 content.\f\f", "Page 4 content."] 282 283 documents = splitter._create_documents_from_splits(splits, original_doc) 284 285 assert len(documents) == 3 286 assert documents[0].content == "Page 1 content.\f" 287 assert documents[0].meta["page_number"] == 1 288 assert documents[1].content == "Page 2 content.\f\f" 289 assert documents[1].meta["page_number"] == 2 290 assert documents[2].content == "Page 4 content." 291 assert documents[2].meta["page_number"] == 4 292 293 def test_create_documents_from_splits_with_consecutive_page_breaks(self): 294 mock_embedder = Mock() 295 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 296 297 # Test with consecutive page breaks at the end 298 original_doc = Document(content="Page 1 content.\fPage 2 content.\f\f\f", meta={"key": "value"}) 299 splits = ["Page 1 content.\f", "Page 2 content.\f\f\f"] 300 301 documents = splitter._create_documents_from_splits(splits, original_doc) 302 303 assert len(documents) == 2 304 assert documents[0].content == "Page 1 content.\f" 305 assert documents[0].meta["page_number"] == 1 306 assert documents[1].content == "Page 2 content.\f\f\f" 307 # Should be page 2, not 4, because consecutive page breaks at the end are adjusted 308 assert documents[1].meta["page_number"] == 2 309 310 def test_calculate_embeddings(self): 311 mock_embedder = Mock() 312 313 # Mock the document embedder to return documents with embeddings 314 def mock_run(documents): 315 return {"documents": [replace(doc, embedding=[1.0, 2.0, 3.0]) for doc in documents]} 316 317 mock_embedder.run = Mock(side_effect=mock_run) 318 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 319 320 sentence_groups = ["Group 1", "Group 2", "Group 3"] 321 embeddings = splitter._calculate_embeddings(sentence_groups) 322 323 assert len(embeddings) == 3 324 assert all(embedding == [1.0, 2.0, 3.0] for embedding in embeddings) 325 mock_embedder.run.assert_called_once() 326 327 @pytest.mark.asyncio 328 async def test_calculate_embeddings_async(self) -> None: 329 mock_embedder = AsyncMock() 330 331 # Mock the document embedder to return documents with embeddings 332 async def mock_run_async(documents): 333 return {"documents": [replace(doc, embedding=[1.0, 2.0, 3.0]) for doc in documents]} 334 335 mock_embedder.run_async = AsyncMock(side_effect=mock_run_async) 336 splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder) 337 338 sentence_groups = ["Group 1", "Group 2", "Group 3"] 339 embeddings = await splitter._calculate_embeddings_async(sentence_groups) 340 341 assert len(embeddings) == 3 342 assert all(embedding == [1.0, 2.0, 3.0] for embedding in embeddings) 343 mock_embedder.run_async.assert_called_once() 344 345 def test_to_dict(self): 346 mock_embedder = Mock() 347 mock_embedder.to_dict.return_value = {"type": "MockEmbedder"} 348 349 splitter = EmbeddingBasedDocumentSplitter( 350 document_embedder=mock_embedder, sentences_per_group=2, percentile=0.9, min_length=50, max_length=1000 351 ) 352 353 result = splitter.to_dict() 354 355 assert "EmbeddingBasedDocumentSplitter" in result["type"] 356 assert result["init_parameters"]["sentences_per_group"] == 2 357 assert result["init_parameters"]["percentile"] == 0.9 358 assert result["init_parameters"]["min_length"] == 50 359 assert result["init_parameters"]["max_length"] == 1000 360 assert "document_embedder" in result["init_parameters"] 361 362 @pytest.mark.integration 363 @pytest.mark.slow 364 def test_split_document_with_multiple_topics(self, del_hf_env_vars, monkeypatch): 365 import torch 366 367 # Force CPU usage to avoid MPS memory issues 368 monkeypatch.setenv("PYTORCH_ENABLE_MPS_FALLBACK", "1") 369 torch.backends.mps.is_available = lambda: False 370 371 embedder = SentenceTransformersDocumentEmbedder( 372 model="sentence-transformers/all-MiniLM-L6-v2", device=ComponentDevice.from_str("cpu") 373 ) 374 375 splitter = EmbeddingBasedDocumentSplitter( 376 document_embedder=embedder, sentences_per_group=2, percentile=0.9, min_length=30, max_length=300 377 ) 378 379 # A document with multiple topics 380 text = ( 381 "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. " 382 "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. " 383 "Deep learning models achieve remarkable accuracy on complex tasks. " 384 "Cooking is both an art and a science. Fresh ingredients make all the difference. " 385 "Proper seasoning enhances the natural flavors of food. " 386 "The history of ancient civilizations fascinates researchers. Archaeological discoveries reveal new insights. " # noqa: E501 387 "Ancient texts provide valuable information about past societies." 388 ) 389 doc = Document(content=text) 390 391 result = splitter.run(documents=[doc]) 392 split_docs = result["documents"] 393 394 # There should be more than one split 395 assert len(split_docs) > 1 396 # Each split should be non-empty and respect min_length 397 for split_doc in split_docs: 398 assert split_doc.content.strip() != "" 399 assert len(split_doc.content) >= 30 400 # The splits should cover the original text 401 combined = "".join([d.content for d in split_docs]) 402 original = text 403 assert combined in original or original in combined 404 405 @pytest.mark.asyncio 406 @pytest.mark.skipif( 407 not os.environ.get("TEI_URL", None), 408 reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", 409 ) 410 @pytest.mark.slow 411 @pytest.mark.integration 412 async def test_split_document_with_multiple_topics_async(self) -> None: 413 embedder = HuggingFaceAPIDocumentEmbedder( 414 api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} 415 ) 416 417 splitter = EmbeddingBasedDocumentSplitter( 418 document_embedder=embedder, sentences_per_group=2, percentile=0.9, min_length=30, max_length=300 419 ) 420 421 # A document with multiple topics 422 text = ( 423 "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. " 424 "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. " 425 "Deep learning models achieve remarkable accuracy on complex tasks. " 426 "Cooking is both an art and a science. Fresh ingredients make all the difference. " 427 "Proper seasoning enhances the natural flavors of food. " 428 "The history of ancient civilizations fascinates researchers. Archaeological discoveries reveal new insights. " # noqa: E501 429 "Ancient texts provide valuable information about past societies." 430 ) 431 doc = Document(content=text) 432 433 result = await splitter.run_async(documents=[doc]) 434 split_docs = result["documents"] 435 436 # There should be more than one split 437 assert len(split_docs) > 1 438 # Each split should be non-empty and respect min_length 439 for split_doc in split_docs: 440 assert split_doc.content.strip() != "" 441 assert len(split_doc.content) >= 30 442 # The splits should cover the original text 443 combined = "".join([d.content for d in split_docs]) 444 original = text 445 assert combined in original or original in combined 446 447 @pytest.mark.slow 448 @pytest.mark.integration 449 def test_trailing_whitespace_is_preserved(self, del_hf_env_vars): 450 embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") 451 452 splitter = EmbeddingBasedDocumentSplitter(document_embedder=embedder, sentences_per_group=1) 453 454 # Normal trailing whitespace 455 text = "The weather today is beautiful. " 456 result = splitter.run(documents=[Document(content=text)]) 457 assert result["documents"][0].content == text 458 459 # Newline at the end 460 text = "The weather today is beautiful.\n" 461 result = splitter.run(documents=[Document(content=text)]) 462 assert result["documents"][0].content == text 463 464 # Page break at the end 465 text = "The weather today is beautiful.\f" 466 result = splitter.run(documents=[Document(content=text)]) 467 assert result["documents"][0].content == text 468 469 @pytest.mark.asyncio 470 @pytest.mark.skipif( 471 not os.environ.get("TEI_URL", None), 472 reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", 473 ) 474 @pytest.mark.slow 475 @pytest.mark.integration 476 async def test_trailing_whitespace_is_preserved_async(self) -> None: 477 embedder = HuggingFaceAPIDocumentEmbedder( 478 api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} 479 ) 480 splitter = EmbeddingBasedDocumentSplitter(document_embedder=embedder, sentences_per_group=1) 481 482 # Normal trailing whitespace 483 text = "The weather today is beautiful. " 484 result = await splitter.run_async(documents=[Document(content=text)]) 485 assert result["documents"][0].content == text 486 487 # Newline at the end 488 text = "The weather today is beautiful.\n" 489 result = await splitter.run_async(documents=[Document(content=text)]) 490 assert result["documents"][0].content == text 491 492 # Page break at the end 493 text = "The weather today is beautiful.\f" 494 result = await splitter.run_async(documents=[Document(content=text)]) 495 assert result["documents"][0].content == text 496 497 @pytest.mark.integration 498 @pytest.mark.slow 499 def test_no_extra_whitespaces_between_sentences(self, del_hf_env_vars): 500 embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") 501 502 splitter = EmbeddingBasedDocumentSplitter( 503 document_embedder=embedder, sentences_per_group=1, percentile=0.9, min_length=10, max_length=500 504 ) 505 506 text = ( 507 "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. " 508 "There are no clouds and no rain. Machine learning has revolutionized many industries. " 509 "Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks." # noqa: E501 510 ) 511 doc = Document(content=text) 512 513 result = splitter.run(documents=[doc]) 514 split_docs = result["documents"] 515 assert len(split_docs) == 2 516 # Expect the original whitespace structure with trailing spaces where they exist 517 assert ( 518 split_docs[0].content 519 == "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. There are no clouds and no rain. " # noqa: E501 520 ) # noqa: E501 521 assert ( 522 split_docs[1].content 523 == "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks." # noqa: E501 524 ) # noqa: E501 525 526 @pytest.mark.asyncio 527 @pytest.mark.skipif( 528 not os.environ.get("TEI_URL", None), 529 reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", 530 ) 531 @pytest.mark.integration 532 @pytest.mark.slow 533 async def test_no_extra_whitespaces_between_sentences_async(self) -> None: 534 embedder = HuggingFaceAPIDocumentEmbedder( 535 api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} 536 ) 537 538 splitter = EmbeddingBasedDocumentSplitter( 539 document_embedder=embedder, sentences_per_group=1, percentile=0.9, min_length=10, max_length=500 540 ) 541 542 text = ( 543 "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. " 544 "There are no clouds and no rain. Machine learning has revolutionized many industries. " 545 "Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks." # noqa: E501 546 ) 547 doc = Document(content=text) 548 549 result = await splitter.run_async(documents=[doc]) 550 split_docs = result["documents"] 551 assert len(split_docs) == 2 552 # Expect the original whitespace structure with trailing spaces where they exist 553 assert ( 554 split_docs[0].content 555 == "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. There are no clouds and no rain. " # noqa: E501 556 ) # noqa: E501 557 assert ( 558 split_docs[1].content 559 == "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks." # noqa: E501 560 ) # noqa: E501 561 562 @pytest.mark.integration 563 @pytest.mark.slow 564 def test_split_large_splits_recursion(self, del_hf_env_vars): 565 """ 566 Test that _split_large_splits() works correctly without infinite loops. 567 This test uses a longer text that will trigger the recursive splitting logic. 568 If the chunk cannot be split further, it is allowed to be larger than max_length. 569 """ 570 embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2", batch_size=32) 571 semantic_chunker = EmbeddingBasedDocumentSplitter( 572 document_embedder=embedder, sentences_per_group=5, percentile=0.95, min_length=50, max_length=1000 573 ) 574 575 text = """# Artificial intelligence and its Impact on Society 576 ## Article from Wikipedia, the free encyclopedia 577 ### Introduction to Artificial Intelligence 578 Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. 579 580 ### The History of Software 581 The history of software is closely tied to the development of digital computers in the mid-20th century. Early programs were written in the machine language specific to the hardware. The introduction of high-level programming languages in 1958 allowed for more human-readable instructions, making software development easier and more portable across different computer architectures. Software in a programming language is run through a compiler or interpreter to execute on the architecture's hardware. Over time, software has become complex, owing to developments in networking, operating systems, and databases.""" # noqa: E501 582 583 doc = Document(content=text) 584 result = semantic_chunker.run(documents=[doc]) 585 split_docs = result["documents"] 586 587 assert len(split_docs) == 1 588 589 # If the chunk cannot be split further, it is allowed to be larger than max_length 590 # At least one split should be larger than max_length in this test case 591 assert any(len(split_doc.content) > 1000 for split_doc in split_docs) 592 593 # Verify that the splits cover the original content 594 combined_content = "".join([d.content for d in split_docs]) 595 assert combined_content == text 596 597 for i, split_doc in enumerate(split_docs): 598 assert split_doc.meta["source_id"] == doc.id 599 assert split_doc.meta["split_id"] == i 600 assert "page_number" in split_doc.meta 601 602 @pytest.mark.asyncio 603 @pytest.mark.skipif( 604 not os.environ.get("TEI_URL", None), 605 reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", 606 ) 607 @pytest.mark.integration 608 @pytest.mark.slow 609 async def test_split_large_splits_recursion_async(self) -> None: 610 """ 611 Test that _split_large_splits() works correctly without infinite loops. 612 This test uses a longer text that will trigger the recursive splitting logic. 613 If the chunk cannot be split further, it is allowed to be larger than max_length. 614 """ 615 embedder = HuggingFaceAPIDocumentEmbedder( 616 api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} 617 ) 618 semantic_chunker = EmbeddingBasedDocumentSplitter( 619 document_embedder=embedder, sentences_per_group=5, percentile=0.95, min_length=50, max_length=1000 620 ) 621 622 text = """# Artificial intelligence and its Impact on Society 623 ## Article from Wikipedia, the free encyclopedia 624 ### Introduction to Artificial Intelligence 625 Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. 626 627 ### The History of Software 628 The history of software is closely tied to the development of digital computers in the mid-20th century. Early programs were written in the machine language specific to the hardware. The introduction of high-level programming languages in 1958 allowed for more human-readable instructions, making software development easier and more portable across different computer architectures. Software in a programming language is run through a compiler or interpreter to execute on the architecture's hardware. Over time, software has become complex, owing to developments in networking, operating systems, and databases.""" # noqa: E501 629 630 doc = Document(content=text) 631 result = await semantic_chunker.run_async(documents=[doc]) 632 split_docs = result["documents"] 633 634 assert len(split_docs) == 1 635 636 # If the chunk cannot be split further, it is allowed to be larger than max_length 637 # At least one split should be larger than max_length in this test case 638 assert any(len(split_doc.content) > 1000 for split_doc in split_docs) 639 640 # Verify that the splits cover the original content 641 combined_content = "".join([d.content for d in split_docs]) 642 assert combined_content == text 643 644 for i, split_doc in enumerate(split_docs): 645 assert split_doc.meta["source_id"] == doc.id 646 assert split_doc.meta["split_id"] == i 647 assert "page_number" in split_doc.meta 648 649 @pytest.mark.integration 650 @pytest.mark.slow 651 def test_split_large_splits_actually_splits(self, del_hf_env_vars): 652 """ 653 Test that _split_large_splits() actually works and can split long texts into multiple chunks. 654 This test uses a very long text that should be split into multiple chunks. 655 """ 656 embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2", batch_size=32) 657 semantic_chunker = EmbeddingBasedDocumentSplitter( 658 document_embedder=embedder, 659 sentences_per_group=3, 660 percentile=0.85, # Lower percentile to create more splits 661 min_length=100, 662 max_length=500, # Smaller max_length to force more splits 663 ) 664 665 # Create a very long text with multiple paragraphs and topics 666 text = """# Comprehensive Guide to Machine Learning and Artificial Intelligence 667 668 ## Introduction to Machine Learning 669 Machine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves. The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly. 670 671 ## Types of Machine Learning 672 There are several types of machine learning algorithms, each with their own strengths and weaknesses. Supervised learning involves training a model on a labeled dataset, where the correct answers are provided. The model learns to map inputs to outputs based on these examples. Unsupervised learning, on the other hand, deals with unlabeled data and seeks to find hidden patterns or structures within the data. Reinforcement learning is a type of learning where an agent learns to behave in an environment by performing certain actions and receiving rewards or penalties. 673 674 ## Deep Learning and Neural Networks 675 Deep learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns. Neural networks are inspired by the human brain and consist of interconnected nodes or neurons. Each connection between neurons has a weight that is adjusted during training. The network learns by adjusting these weights based on the error between predicted and actual outputs. Deep learning has been particularly successful in areas such as computer vision, natural language processing, and speech recognition. 676 677 \f 678 679 ## Natural Language Processing 680 Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and human language. It involves developing algorithms and models that can understand, interpret, and generate human language. NLP applications include machine translation, sentiment analysis, text summarization, and question answering systems. Recent advances in deep learning have significantly improved the performance of NLP systems, leading to more accurate and sophisticated language models. 681 682 ## Computer Vision and Image Recognition 683 Computer vision is another important area of artificial intelligence that deals with how computers can gain high-level understanding from digital images or videos. It involves developing algorithms that can identify and understand visual information from the world. Applications include facial recognition, object detection, medical image analysis, and autonomous vehicle navigation. Deep learning models, particularly convolutional neural networks (CNNs), have revolutionized computer vision by achieving human-level performance on many tasks. 684 685 ## The Future of Artificial Intelligence 686 The future of artificial intelligence holds immense potential for transforming various industries and aspects of human life. We can expect to see more sophisticated AI systems that can handle complex reasoning tasks, understand context better, and interact more naturally with humans. However, this rapid advancement also brings challenges related to ethics, privacy, and the impact on employment. It's crucial to develop AI systems that are not only powerful but also safe, fair, and beneficial to society as a whole. 687 688 \f 689 690 ## Ethical Considerations in AI 691 As artificial intelligence becomes more prevalent, ethical considerations become increasingly important. Issues such as bias in AI systems, privacy concerns, and the potential for misuse need to be carefully addressed. AI systems can inherit biases from their training data, leading to unfair outcomes for certain groups. Privacy concerns arise from the vast amounts of data required to train AI systems. Additionally, there are concerns about the potential for AI to be used maliciously or to replace human workers in certain industries. 692 693 ## Applications in Healthcare 694 Artificial intelligence has the potential to revolutionize healthcare by improving diagnosis, treatment planning, and patient care. Machine learning algorithms can analyze medical images to detect diseases earlier and more accurately than human doctors. AI systems can also help in drug discovery by predicting the effectiveness of potential treatments. In addition, AI-powered chatbots and virtual assistants can provide basic healthcare information and support to patients, reducing the burden on healthcare professionals. 695 696 ## AI in Finance and Banking 697 The financial industry has been quick to adopt artificial intelligence for various applications. AI systems can analyze market data to make investment decisions, detect fraudulent transactions, and provide personalized financial advice. Machine learning algorithms can assess credit risk more accurately than traditional methods, leading to better lending decisions. Additionally, AI-powered chatbots can handle customer service inquiries, reducing costs and improving customer satisfaction. 698 699 \f 700 701 ## Transportation and Autonomous Vehicles 702 Autonomous vehicles represent one of the most visible applications of artificial intelligence in transportation. Self-driving cars use a combination of sensors, cameras, and AI algorithms to navigate roads safely. These systems can detect obstacles, read traffic signs, and make decisions about speed and direction. Beyond autonomous cars, AI is also being used in logistics and supply chain management to optimize routes and reduce delivery times. 703 704 ## Education and Personalized Learning 705 Artificial intelligence is transforming education by enabling personalized learning experiences. AI systems can adapt to individual student needs, providing customized content and pacing. Intelligent tutoring systems can provide immediate feedback and support to students, helping them learn more effectively. Additionally, AI can help educators by automating administrative tasks and providing insights into student performance and learning patterns.""" # noqa: E501 706 707 doc = Document(content=text) 708 result = semantic_chunker.run(documents=[doc]) 709 split_docs = result["documents"] 710 711 assert len(split_docs) == 11 712 713 # Verify that the splits cover the original content 714 combined_content = "".join([d.content for d in split_docs]) 715 assert combined_content == text 716 717 for i, split_doc in enumerate(split_docs): 718 assert split_doc.meta["source_id"] == doc.id 719 assert split_doc.meta["split_id"] == i 720 assert "page_number" in split_doc.meta 721 722 if i in [0, 1, 2, 3]: 723 assert split_doc.meta["page_number"] == 1 724 if i in [4, 5, 6]: 725 assert split_doc.meta["page_number"] == 2 726 if i in [7, 8]: 727 assert split_doc.meta["page_number"] == 3 728 if i in [9, 10]: 729 assert split_doc.meta["page_number"] == 4 730 731 @pytest.mark.asyncio 732 @pytest.mark.skipif( 733 not os.environ.get("TEI_URL", None), 734 reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", 735 ) 736 @pytest.mark.integration 737 @pytest.mark.slow 738 async def test_split_large_splits_actually_splits_async(self) -> None: 739 """ 740 Test that _split_large_splits() actually works and can split long texts into multiple chunks. 741 This test uses a very long text that should be split into multiple chunks. 742 """ 743 embedder = HuggingFaceAPIDocumentEmbedder( 744 api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} 745 ) 746 semantic_chunker = EmbeddingBasedDocumentSplitter( 747 document_embedder=embedder, 748 sentences_per_group=3, 749 percentile=0.85, # Lower percentile to create more splits 750 min_length=100, 751 max_length=500, # Smaller max_length to force more splits 752 ) 753 754 # Create a very long text with multiple paragraphs and topics 755 text = """# Comprehensive Guide to Machine Learning and Artificial Intelligence 756 757 ## Introduction to Machine Learning 758 Machine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves. The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly. 759 760 ## Types of Machine Learning 761 There are several types of machine learning algorithms, each with their own strengths and weaknesses. Supervised learning involves training a model on a labeled dataset, where the correct answers are provided. The model learns to map inputs to outputs based on these examples. Unsupervised learning, on the other hand, deals with unlabeled data and seeks to find hidden patterns or structures within the data. Reinforcement learning is a type of learning where an agent learns to behave in an environment by performing certain actions and receiving rewards or penalties. 762 763 ## Deep Learning and Neural Networks 764 Deep learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns. Neural networks are inspired by the human brain and consist of interconnected nodes or neurons. Each connection between neurons has a weight that is adjusted during training. The network learns by adjusting these weights based on the error between predicted and actual outputs. Deep learning has been particularly successful in areas such as computer vision, natural language processing, and speech recognition. 765 766 \f 767 768 ## Natural Language Processing 769 Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and human language. It involves developing algorithms and models that can understand, interpret, and generate human language. NLP applications include machine translation, sentiment analysis, text summarization, and question answering systems. Recent advances in deep learning have significantly improved the performance of NLP systems, leading to more accurate and sophisticated language models. 770 771 ## Computer Vision and Image Recognition 772 Computer vision is another important area of artificial intelligence that deals with how computers can gain high-level understanding from digital images or videos. It involves developing algorithms that can identify and understand visual information from the world. Applications include facial recognition, object detection, medical image analysis, and autonomous vehicle navigation. Deep learning models, particularly convolutional neural networks (CNNs), have revolutionized computer vision by achieving human-level performance on many tasks. 773 774 ## The Future of Artificial Intelligence 775 The future of artificial intelligence holds immense potential for transforming various industries and aspects of human life. We can expect to see more sophisticated AI systems that can handle complex reasoning tasks, understand context better, and interact more naturally with humans. However, this rapid advancement also brings challenges related to ethics, privacy, and the impact on employment. It's crucial to develop AI systems that are not only powerful but also safe, fair, and beneficial to society as a whole. 776 777 \f 778 779 ## Ethical Considerations in AI 780 As artificial intelligence becomes more prevalent, ethical considerations become increasingly important. Issues such as bias in AI systems, privacy concerns, and the potential for misuse need to be carefully addressed. AI systems can inherit biases from their training data, leading to unfair outcomes for certain groups. Privacy concerns arise from the vast amounts of data required to train AI systems. Additionally, there are concerns about the potential for AI to be used maliciously or to replace human workers in certain industries. 781 782 ## Applications in Healthcare 783 Artificial intelligence has the potential to revolutionize healthcare by improving diagnosis, treatment planning, and patient care. Machine learning algorithms can analyze medical images to detect diseases earlier and more accurately than human doctors. AI systems can also help in drug discovery by predicting the effectiveness of potential treatments. In addition, AI-powered chatbots and virtual assistants can provide basic healthcare information and support to patients, reducing the burden on healthcare professionals. 784 785 ## AI in Finance and Banking 786 The financial industry has been quick to adopt artificial intelligence for various applications. AI systems can analyze market data to make investment decisions, detect fraudulent transactions, and provide personalized financial advice. Machine learning algorithms can assess credit risk more accurately than traditional methods, leading to better lending decisions. Additionally, AI-powered chatbots can handle customer service inquiries, reducing costs and improving customer satisfaction. 787 788 \f 789 790 ## Transportation and Autonomous Vehicles 791 Autonomous vehicles represent one of the most visible applications of artificial intelligence in transportation. Self-driving cars use a combination of sensors, cameras, and AI algorithms to navigate roads safely. These systems can detect obstacles, read traffic signs, and make decisions about speed and direction. Beyond autonomous cars, AI is also being used in logistics and supply chain management to optimize routes and reduce delivery times. 792 793 ## Education and Personalized Learning 794 Artificial intelligence is transforming education by enabling personalized learning experiences. AI systems can adapt to individual student needs, providing customized content and pacing. Intelligent tutoring systems can provide immediate feedback and support to students, helping them learn more effectively. Additionally, AI can help educators by automating administrative tasks and providing insights into student performance and learning patterns.""" # noqa: E501 795 796 doc = Document(content=text) 797 result = await semantic_chunker.run_async(documents=[doc]) 798 split_docs = result["documents"] 799 800 assert len(split_docs) == 11 801 802 # Verify that the splits cover the original content 803 combined_content = "".join([d.content for d in split_docs]) 804 assert combined_content == text 805 806 for i, split_doc in enumerate(split_docs): 807 assert split_doc.meta["source_id"] == doc.id 808 assert split_doc.meta["split_id"] == i 809 assert "page_number" in split_doc.meta 810 811 if i in [0, 1, 2, 3]: 812 assert split_doc.meta["page_number"] == 1 813 if i in [4, 5, 6]: 814 assert split_doc.meta["page_number"] == 2 815 if i in [7, 8]: 816 assert split_doc.meta["page_number"] == 3 817 if i in [9, 10]: 818 assert split_doc.meta["page_number"] == 4