# test/components/preprocessors/test_embedding_based_document_splitter.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  import os
  6  from dataclasses import replace
  7  from unittest.mock import AsyncMock, Mock, patch
  8  
  9  import pytest
 10  
 11  from haystack import Document
 12  from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder, SentenceTransformersDocumentEmbedder
 13  from haystack.components.preprocessors import EmbeddingBasedDocumentSplitter
 14  from haystack.utils import ComponentDevice
 15  
# Hugging Face tokenizers can deadlock/warn when forked by pytest workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# disable tqdm entirely for tests (keeps test output free of progress bars)
from tqdm import tqdm

tqdm.disable = True
 22  
 23  
 24  class TestEmbeddingBasedDocumentSplitter:
 25      def test_init(self):
 26          mock_embedder = Mock()
 27          splitter = EmbeddingBasedDocumentSplitter(
 28              document_embedder=mock_embedder, sentences_per_group=2, percentile=0.9, min_length=50, max_length=1000
 29          )
 30  
 31          assert splitter.document_embedder == mock_embedder
 32          assert splitter.sentences_per_group == 2
 33          assert splitter.percentile == 0.9
 34          assert splitter.min_length == 50
 35          assert splitter.max_length == 1000
 36  
 37      def test_init_invalid_sentences_per_group(self):
 38          mock_embedder = Mock()
 39          with pytest.raises(ValueError, match="sentences_per_group must be greater than 0"):
 40              EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, sentences_per_group=0)
 41  
 42      def test_init_invalid_percentile(self):
 43          mock_embedder = Mock()
 44          with pytest.raises(ValueError, match="percentile must be between 0.0 and 1.0"):
 45              EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, percentile=1.5)
 46  
 47      def test_init_invalid_min_length(self):
 48          mock_embedder = Mock()
 49          with pytest.raises(ValueError, match="min_length must be greater than or equal to 0"):
 50              EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=-1)
 51  
 52      def test_init_invalid_max_length(self):
 53          mock_embedder = Mock()
 54          with pytest.raises(ValueError, match="max_length must be greater than min_length"):
 55              EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=100, max_length=50)
 56  
 57      def test_warm_up(self):
 58          mock_embedder = Mock()
 59          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
 60  
 61          with patch(
 62              "haystack.components.preprocessors.embedding_based_document_splitter.SentenceSplitter"
 63          ) as mock_splitter_class:
 64              mock_splitter = Mock()
 65              mock_splitter_class.return_value = mock_splitter
 66  
 67              splitter.warm_up()
 68  
 69              assert splitter.sentence_splitter == mock_splitter
 70              mock_splitter_class.assert_called_once()
 71  
 72      def test_run_not_warmed_up(self):
 73          mock_embedder = Mock()
 74          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
 75  
 76          with patch.object(splitter, "warm_up", wraps=splitter.warm_up) as mock_warm_up:
 77              splitter.run(documents=[])
 78              assert splitter._is_warmed_up
 79              mock_warm_up.assert_called_once()
 80  
 81      @pytest.mark.asyncio
 82      async def test_run_not_warmed_up_async(self) -> None:
 83          mock_embedder = AsyncMock()
 84          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
 85  
 86          with patch.object(splitter, "warm_up", wraps=splitter.warm_up) as mock_warm_up:
 87              await splitter.run_async(documents=[])
 88              assert splitter._is_warmed_up
 89              mock_warm_up.assert_called_once()
 90  
 91      def test_run_invalid_input(self):
 92          mock_embedder = Mock()
 93          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
 94          splitter.sentence_splitter = Mock()
 95          splitter._is_warmed_up = True
 96  
 97          with pytest.raises(TypeError, match="expects a List of Documents"):
 98              splitter.run(documents="not a list")
 99  
100      @pytest.mark.asyncio
101      async def test_run_invalid_input_async(self) -> None:
102          mock_embedder = AsyncMock()
103          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
104          splitter.sentence_splitter = AsyncMock()
105          splitter._is_warmed_up = True
106  
107          with pytest.raises(TypeError, match="expects a List of Documents"):
108              await splitter.run_async(documents="not a list")
109  
110      def test_run_document_with_none_content(self):
111          mock_embedder = Mock()
112          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
113          splitter.sentence_splitter = Mock()
114          splitter._is_warmed_up = True
115  
116          with pytest.raises(ValueError, match="content for document ID"):
117              splitter.run(documents=[Document(content=None)])
118  
119      @pytest.mark.asyncio
120      async def test_run_document_with_none_content_async(self) -> None:
121          mock_embedder = AsyncMock()
122          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
123          splitter.sentence_splitter = AsyncMock()
124          splitter._is_warmed_up = True
125  
126          with pytest.raises(ValueError, match="content for document ID"):
127              await splitter.run_async(documents=[Document(content=None)])
128  
129      def test_run_empty_document(self):
130          mock_embedder = Mock()
131          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
132          splitter.sentence_splitter = Mock()
133          splitter._is_warmed_up = True
134  
135          result = splitter.run(documents=[Document(content="")])
136          assert result["documents"] == []
137  
138      @pytest.mark.asyncio
139      async def test_run_empty_document_async(self) -> None:
140          mock_embedder = AsyncMock()
141          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
142          splitter.sentence_splitter = AsyncMock()
143          splitter._is_warmed_up = True
144  
145          result = await splitter.run_async(documents=[Document(content="")])
146          assert result["documents"] == []
147  
148      def test_group_sentences_single(self):
149          mock_embedder = Mock()
150          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, sentences_per_group=1)
151  
152          sentences = ["Sentence 1.", "Sentence 2.", "Sentence 3."]
153          groups = splitter._group_sentences(sentences)
154  
155          assert groups == sentences
156  
157      def test_group_sentences_multiple(self):
158          mock_embedder = Mock()
159          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, sentences_per_group=2)
160  
161          sentences = ["Sentence 1. ", "Sentence 2. ", "Sentence 3. ", "Sentence 4."]
162          groups = splitter._group_sentences(sentences)
163  
164          assert groups == ["Sentence 1. Sentence 2. ", "Sentence 3. Sentence 4."]
165  
166      def test_cosine_distance(self):
167          mock_embedder = Mock()
168          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
169  
170          # Test with identical vectors
171          embedding1 = [1.0, 0.0, 0.0]
172          embedding2 = [1.0, 0.0, 0.0]
173          distance = splitter._cosine_distance(embedding1, embedding2)
174          assert distance == 0.0
175  
176          # Test with orthogonal vectors
177          embedding1 = [1.0, 0.0, 0.0]
178          embedding2 = [0.0, 1.0, 0.0]
179          distance = splitter._cosine_distance(embedding1, embedding2)
180          assert distance == 1.0
181  
182          # Test with zero vectors
183          embedding1 = [0.0, 0.0, 0.0]
184          embedding2 = [1.0, 0.0, 0.0]
185          distance = splitter._cosine_distance(embedding1, embedding2)
186          assert distance == 1.0
187  
188      def test_find_split_points_empty(self):
189          mock_embedder = Mock()
190          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
191  
192          split_points = splitter._find_split_points([])
193          assert split_points == []
194  
195          split_points = splitter._find_split_points([[1.0, 0.0]])
196          assert split_points == []
197  
198      def test_find_split_points(self):
199          mock_embedder = Mock()
200          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, percentile=0.5)
201  
202          # Create embeddings where the second pair has high distance
203          embeddings = [
204              [1.0, 0.0, 0.0],  # Similar to next
205              [0.9, 0.1, 0.0],  # Similar to previous
206              [0.0, 1.0, 0.0],  # Very different from next
207              [0.1, 0.9, 0.0],  # Similar to previous
208          ]
209  
210          split_points = splitter._find_split_points(embeddings)
211          # Should find a split point after the second embedding (index 2)
212          assert 2 in split_points
213  
214      def test_create_splits_from_points(self):
215          mock_embedder = Mock()
216          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
217  
218          sentence_groups = ["Group 1 ", "Group 2 ", "Group 3 ", "Group 4"]
219          split_points = [2]  # Split after index 1
220  
221          splits = splitter._create_splits_from_points(sentence_groups, split_points)
222          assert splits == ["Group 1 Group 2 ", "Group 3 Group 4"]
223  
224      def test_create_splits_from_points_no_points(self):
225          mock_embedder = Mock()
226          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
227  
228          sentence_groups = ["Group 1 ", "Group 2 ", "Group 3"]
229          split_points = []
230  
231          splits = splitter._create_splits_from_points(sentence_groups, split_points)
232          assert splits == ["Group 1 Group 2 Group 3"]
233  
234      def test_merge_small_splits(self):
235          mock_embedder = Mock()
236          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=10)
237  
238          splits = ["Short ", "Also short ", "Long enough text ", "Another short"]
239          merged = splitter._merge_small_splits(splits)
240  
241          assert len(merged) == 3
242          assert merged[0] == "Short Also short "
243          assert merged[1] == "Long enough text "
244          assert merged[2] == "Another short"
245  
246      def test_merge_small_splits_respect_max_length(self):
247          mock_embedder = Mock()
248          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder, min_length=10, max_length=15)
249  
250          splits = ["123456", "123456789", "1234"]
251          merged = splitter._merge_small_splits(splits=splits)
252  
253          assert len(merged) == 2
254          # First split remains beneath min_length b/c next split is too long
255          assert merged[0] == "123456"
256          # Second split is merged with third split to get above min_length and still beneath max_length
257          assert merged[1] == "1234567891234"
258  
259      def test_create_documents_from_splits(self):
260          mock_embedder = Mock()
261          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
262  
263          original_doc = Document(content="test", meta={"key": "value"})
264          splits = ["Split 1", "Split 2"]
265  
266          documents = splitter._create_documents_from_splits(splits, original_doc)
267  
268          assert len(documents) == 2
269          assert documents[0].content == "Split 1"
270          assert documents[0].meta["source_id"] == original_doc.id
271          assert documents[0].meta["split_id"] == 0
272          assert documents[0].meta["key"] == "value"
273          assert documents[1].content == "Split 2"
274          assert documents[1].meta["split_id"] == 1
275  
276      def test_create_documents_from_splits_with_page_numbers(self):
277          mock_embedder = Mock()
278          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
279  
280          original_doc = Document(content="Page 1 content.\fPage 2 content.\f\fPage 4 content.", meta={"key": "value"})
281          splits = ["Page 1 content.\f", "Page 2 content.\f\f", "Page 4 content."]
282  
283          documents = splitter._create_documents_from_splits(splits, original_doc)
284  
285          assert len(documents) == 3
286          assert documents[0].content == "Page 1 content.\f"
287          assert documents[0].meta["page_number"] == 1
288          assert documents[1].content == "Page 2 content.\f\f"
289          assert documents[1].meta["page_number"] == 2
290          assert documents[2].content == "Page 4 content."
291          assert documents[2].meta["page_number"] == 4
292  
293      def test_create_documents_from_splits_with_consecutive_page_breaks(self):
294          mock_embedder = Mock()
295          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
296  
297          # Test with consecutive page breaks at the end
298          original_doc = Document(content="Page 1 content.\fPage 2 content.\f\f\f", meta={"key": "value"})
299          splits = ["Page 1 content.\f", "Page 2 content.\f\f\f"]
300  
301          documents = splitter._create_documents_from_splits(splits, original_doc)
302  
303          assert len(documents) == 2
304          assert documents[0].content == "Page 1 content.\f"
305          assert documents[0].meta["page_number"] == 1
306          assert documents[1].content == "Page 2 content.\f\f\f"
307          # Should be page 2, not 4, because consecutive page breaks at the end are adjusted
308          assert documents[1].meta["page_number"] == 2
309  
310      def test_calculate_embeddings(self):
311          mock_embedder = Mock()
312  
313          # Mock the document embedder to return documents with embeddings
314          def mock_run(documents):
315              return {"documents": [replace(doc, embedding=[1.0, 2.0, 3.0]) for doc in documents]}
316  
317          mock_embedder.run = Mock(side_effect=mock_run)
318          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
319  
320          sentence_groups = ["Group 1", "Group 2", "Group 3"]
321          embeddings = splitter._calculate_embeddings(sentence_groups)
322  
323          assert len(embeddings) == 3
324          assert all(embedding == [1.0, 2.0, 3.0] for embedding in embeddings)
325          mock_embedder.run.assert_called_once()
326  
327      @pytest.mark.asyncio
328      async def test_calculate_embeddings_async(self) -> None:
329          mock_embedder = AsyncMock()
330  
331          # Mock the document embedder to return documents with embeddings
332          async def mock_run_async(documents):
333              return {"documents": [replace(doc, embedding=[1.0, 2.0, 3.0]) for doc in documents]}
334  
335          mock_embedder.run_async = AsyncMock(side_effect=mock_run_async)
336          splitter = EmbeddingBasedDocumentSplitter(document_embedder=mock_embedder)
337  
338          sentence_groups = ["Group 1", "Group 2", "Group 3"]
339          embeddings = await splitter._calculate_embeddings_async(sentence_groups)
340  
341          assert len(embeddings) == 3
342          assert all(embedding == [1.0, 2.0, 3.0] for embedding in embeddings)
343          mock_embedder.run_async.assert_called_once()
344  
345      def test_to_dict(self):
346          mock_embedder = Mock()
347          mock_embedder.to_dict.return_value = {"type": "MockEmbedder"}
348  
349          splitter = EmbeddingBasedDocumentSplitter(
350              document_embedder=mock_embedder, sentences_per_group=2, percentile=0.9, min_length=50, max_length=1000
351          )
352  
353          result = splitter.to_dict()
354  
355          assert "EmbeddingBasedDocumentSplitter" in result["type"]
356          assert result["init_parameters"]["sentences_per_group"] == 2
357          assert result["init_parameters"]["percentile"] == 0.9
358          assert result["init_parameters"]["min_length"] == 50
359          assert result["init_parameters"]["max_length"] == 1000
360          assert "document_embedder" in result["init_parameters"]
361  
362      @pytest.mark.integration
363      @pytest.mark.slow
364      def test_split_document_with_multiple_topics(self, del_hf_env_vars, monkeypatch):
365          import torch
366  
367          # Force CPU usage to avoid MPS memory issues
368          monkeypatch.setenv("PYTORCH_ENABLE_MPS_FALLBACK", "1")
369          torch.backends.mps.is_available = lambda: False
370  
371          embedder = SentenceTransformersDocumentEmbedder(
372              model="sentence-transformers/all-MiniLM-L6-v2", device=ComponentDevice.from_str("cpu")
373          )
374  
375          splitter = EmbeddingBasedDocumentSplitter(
376              document_embedder=embedder, sentences_per_group=2, percentile=0.9, min_length=30, max_length=300
377          )
378  
379          # A document with multiple topics
380          text = (
381              "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. "
382              "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. "
383              "Deep learning models achieve remarkable accuracy on complex tasks. "
384              "Cooking is both an art and a science. Fresh ingredients make all the difference. "
385              "Proper seasoning enhances the natural flavors of food. "
386              "The history of ancient civilizations fascinates researchers. Archaeological discoveries reveal new insights. "  # noqa: E501
387              "Ancient texts provide valuable information about past societies."
388          )
389          doc = Document(content=text)
390  
391          result = splitter.run(documents=[doc])
392          split_docs = result["documents"]
393  
394          # There should be more than one split
395          assert len(split_docs) > 1
396          # Each split should be non-empty and respect min_length
397          for split_doc in split_docs:
398              assert split_doc.content.strip() != ""
399              assert len(split_doc.content) >= 30
400          # The splits should cover the original text
401          combined = "".join([d.content for d in split_docs])
402          original = text
403          assert combined in original or original in combined
404  
405      @pytest.mark.asyncio
406      @pytest.mark.skipif(
407          not os.environ.get("TEI_URL", None),
408          reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
409      )
410      @pytest.mark.slow
411      @pytest.mark.integration
412      async def test_split_document_with_multiple_topics_async(self) -> None:
413          embedder = HuggingFaceAPIDocumentEmbedder(
414              api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
415          )
416  
417          splitter = EmbeddingBasedDocumentSplitter(
418              document_embedder=embedder, sentences_per_group=2, percentile=0.9, min_length=30, max_length=300
419          )
420  
421          # A document with multiple topics
422          text = (
423              "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. "
424              "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. "
425              "Deep learning models achieve remarkable accuracy on complex tasks. "
426              "Cooking is both an art and a science. Fresh ingredients make all the difference. "
427              "Proper seasoning enhances the natural flavors of food. "
428              "The history of ancient civilizations fascinates researchers. Archaeological discoveries reveal new insights. "  # noqa: E501
429              "Ancient texts provide valuable information about past societies."
430          )
431          doc = Document(content=text)
432  
433          result = await splitter.run_async(documents=[doc])
434          split_docs = result["documents"]
435  
436          # There should be more than one split
437          assert len(split_docs) > 1
438          # Each split should be non-empty and respect min_length
439          for split_doc in split_docs:
440              assert split_doc.content.strip() != ""
441              assert len(split_doc.content) >= 30
442          # The splits should cover the original text
443          combined = "".join([d.content for d in split_docs])
444          original = text
445          assert combined in original or original in combined
446  
447      @pytest.mark.slow
448      @pytest.mark.integration
449      def test_trailing_whitespace_is_preserved(self, del_hf_env_vars):
450          embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
451  
452          splitter = EmbeddingBasedDocumentSplitter(document_embedder=embedder, sentences_per_group=1)
453  
454          # Normal trailing whitespace
455          text = "The weather today is beautiful.  "
456          result = splitter.run(documents=[Document(content=text)])
457          assert result["documents"][0].content == text
458  
459          # Newline at the end
460          text = "The weather today is beautiful.\n"
461          result = splitter.run(documents=[Document(content=text)])
462          assert result["documents"][0].content == text
463  
464          # Page break at the end
465          text = "The weather today is beautiful.\f"
466          result = splitter.run(documents=[Document(content=text)])
467          assert result["documents"][0].content == text
468  
469      @pytest.mark.asyncio
470      @pytest.mark.skipif(
471          not os.environ.get("TEI_URL", None),
472          reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
473      )
474      @pytest.mark.slow
475      @pytest.mark.integration
476      async def test_trailing_whitespace_is_preserved_async(self) -> None:
477          embedder = HuggingFaceAPIDocumentEmbedder(
478              api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
479          )
480          splitter = EmbeddingBasedDocumentSplitter(document_embedder=embedder, sentences_per_group=1)
481  
482          # Normal trailing whitespace
483          text = "The weather today is beautiful.  "
484          result = await splitter.run_async(documents=[Document(content=text)])
485          assert result["documents"][0].content == text
486  
487          # Newline at the end
488          text = "The weather today is beautiful.\n"
489          result = await splitter.run_async(documents=[Document(content=text)])
490          assert result["documents"][0].content == text
491  
492          # Page break at the end
493          text = "The weather today is beautiful.\f"
494          result = await splitter.run_async(documents=[Document(content=text)])
495          assert result["documents"][0].content == text
496  
497      @pytest.mark.integration
498      @pytest.mark.slow
499      def test_no_extra_whitespaces_between_sentences(self, del_hf_env_vars):
500          embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
501  
502          splitter = EmbeddingBasedDocumentSplitter(
503              document_embedder=embedder, sentences_per_group=1, percentile=0.9, min_length=10, max_length=500
504          )
505  
506          text = (
507              "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. "
508              "There are no clouds and no rain. Machine learning has revolutionized many industries. "
509              "Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks."  # noqa: E501
510          )
511          doc = Document(content=text)
512  
513          result = splitter.run(documents=[doc])
514          split_docs = result["documents"]
515          assert len(split_docs) == 2
516          # Expect the original whitespace structure with trailing spaces where they exist
517          assert (
518              split_docs[0].content
519              == "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. There are no clouds and no rain. "  # noqa: E501
520          )  # noqa: E501
521          assert (
522              split_docs[1].content
523              == "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks."  # noqa: E501
524          )  # noqa: E501
525  
526      @pytest.mark.asyncio
527      @pytest.mark.skipif(
528          not os.environ.get("TEI_URL", None),
529          reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
530      )
531      @pytest.mark.integration
532      @pytest.mark.slow
533      async def test_no_extra_whitespaces_between_sentences_async(self) -> None:
534          embedder = HuggingFaceAPIDocumentEmbedder(
535              api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
536          )
537  
538          splitter = EmbeddingBasedDocumentSplitter(
539              document_embedder=embedder, sentences_per_group=1, percentile=0.9, min_length=10, max_length=500
540          )
541  
542          text = (
543              "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. "
544              "There are no clouds and no rain. Machine learning has revolutionized many industries. "
545              "Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks."  # noqa: E501
546          )
547          doc = Document(content=text)
548  
549          result = await splitter.run_async(documents=[doc])
550          split_docs = result["documents"]
551          assert len(split_docs) == 2
552          # Expect the original whitespace structure with trailing spaces where they exist
553          assert (
554              split_docs[0].content
555              == "The weather today is beautiful. The sun is shining brightly. The temperature is perfect for a walk. There are no clouds and no rain. "  # noqa: E501
556          )  # noqa: E501
557          assert (
558              split_docs[1].content
559              == "Machine learning has revolutionized many industries. Neural networks can process vast amounts of data. Deep learning models achieve remarkable accuracy on complex tasks."  # noqa: E501
560          )  # noqa: E501
561  
    @pytest.mark.integration
    @pytest.mark.slow
    def test_split_large_splits_recursion(self, del_hf_env_vars):
        """
        Test that _split_large_splits() works correctly without infinite loops.
        This test uses a longer text that will trigger the recursive splitting logic.
        If the chunk cannot be split further, it is allowed to be larger than max_length.
        Note: the triple-quoted text below is byte-exact on purpose — the final
        assertion compares the recombined splits to it character for character.
        """
        embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2", batch_size=32)
        semantic_chunker = EmbeddingBasedDocumentSplitter(
            document_embedder=embedder, sentences_per_group=5, percentile=0.95, min_length=50, max_length=1000
        )

        text = """# Artificial intelligence and its Impact on Society
## Article from Wikipedia, the free encyclopedia
### Introduction to Artificial Intelligence
Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.

### The History of Software
The history of software is closely tied to the development of digital computers in the mid-20th century. Early programs were written in the machine language specific to the hardware. The introduction of high-level programming languages in 1958 allowed for more human-readable instructions, making software development easier and more portable across different computer architectures. Software in a programming language is run through a compiler or interpreter to execute on the architecture's hardware. Over time, software has become complex, owing to developments in networking, operating systems, and databases."""  # noqa: E501

        doc = Document(content=text)
        result = semantic_chunker.run(documents=[doc])
        split_docs = result["documents"]

        # For this text the chunker cannot find a valid sub-split, so one chunk comes back.
        assert len(split_docs) == 1

        # If the chunk cannot be split further, it is allowed to be larger than max_length
        # At least one split should be larger than max_length in this test case
        assert any(len(split_doc.content) > 1000 for split_doc in split_docs)

        # Verify that the splits cover the original content
        combined_content = "".join([d.content for d in split_docs])
        assert combined_content == text

        # Every split carries provenance metadata pointing back to the source document.
        for i, split_doc in enumerate(split_docs):
            assert split_doc.meta["source_id"] == doc.id
            assert split_doc.meta["split_id"] == i
            assert "page_number" in split_doc.meta
601  
    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.environ.get("TEI_URL", None),
        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
    )
    @pytest.mark.integration
    @pytest.mark.slow
    async def test_split_large_splits_recursion_async(self) -> None:
        """
        Test that _split_large_splits() works correctly without infinite loops.
        This test uses a longer text that will trigger the recursive splitting logic.
        If the chunk cannot be split further, it is allowed to be larger than max_length.
        Note: the triple-quoted text below is byte-exact on purpose — the final
        assertion compares the recombined splits to it character for character.
        """
        embedder = HuggingFaceAPIDocumentEmbedder(
            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
        )
        semantic_chunker = EmbeddingBasedDocumentSplitter(
            document_embedder=embedder, sentences_per_group=5, percentile=0.95, min_length=50, max_length=1000
        )

        text = """# Artificial intelligence and its Impact on Society
## Article from Wikipedia, the free encyclopedia
### Introduction to Artificial Intelligence
Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals.

### The History of Software
The history of software is closely tied to the development of digital computers in the mid-20th century. Early programs were written in the machine language specific to the hardware. The introduction of high-level programming languages in 1958 allowed for more human-readable instructions, making software development easier and more portable across different computer architectures. Software in a programming language is run through a compiler or interpreter to execute on the architecture's hardware. Over time, software has become complex, owing to developments in networking, operating systems, and databases."""  # noqa: E501

        doc = Document(content=text)
        result = await semantic_chunker.run_async(documents=[doc])
        split_docs = result["documents"]

        # For this text the chunker cannot find a valid sub-split, so one chunk comes back.
        assert len(split_docs) == 1

        # If the chunk cannot be split further, it is allowed to be larger than max_length
        # At least one split should be larger than max_length in this test case
        assert any(len(split_doc.content) > 1000 for split_doc in split_docs)

        # Verify that the splits cover the original content
        combined_content = "".join([d.content for d in split_docs])
        assert combined_content == text

        # Every split carries provenance metadata pointing back to the source document.
        for i, split_doc in enumerate(split_docs):
            assert split_doc.meta["source_id"] == doc.id
            assert split_doc.meta["split_id"] == i
            assert "page_number" in split_doc.meta
648  
    @pytest.mark.integration
    @pytest.mark.slow
    def test_split_large_splits_actually_splits(self, del_hf_env_vars):
        """
        Test that _split_large_splits() actually works and can split long texts into multiple chunks.
        This test uses a very long text that should be split into multiple chunks.
        """
        # NOTE(review): `del_hf_env_vars` is a fixture defined elsewhere in this file;
        # presumably it clears Hugging Face related env vars for the test — confirm.
        # Local SentenceTransformers embedder (downloads the model on first run).
        embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2", batch_size=32)
        semantic_chunker = EmbeddingBasedDocumentSplitter(
            document_embedder=embedder,
            sentences_per_group=3,
            percentile=0.85,  # Lower percentile to create more splits
            min_length=100,
            max_length=500,  # Smaller max_length to force more splits
        )

        # Create a very long text with multiple paragraphs and topics.
        # The \f form-feed characters mark page breaks, which the page_number
        # assertions at the bottom of this test rely on.
        text = """# Comprehensive Guide to Machine Learning and Artificial Intelligence

## Introduction to Machine Learning
Machine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves. The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly.

## Types of Machine Learning
There are several types of machine learning algorithms, each with their own strengths and weaknesses. Supervised learning involves training a model on a labeled dataset, where the correct answers are provided. The model learns to map inputs to outputs based on these examples. Unsupervised learning, on the other hand, deals with unlabeled data and seeks to find hidden patterns or structures within the data. Reinforcement learning is a type of learning where an agent learns to behave in an environment by performing certain actions and receiving rewards or penalties.

## Deep Learning and Neural Networks
Deep learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns. Neural networks are inspired by the human brain and consist of interconnected nodes or neurons. Each connection between neurons has a weight that is adjusted during training. The network learns by adjusting these weights based on the error between predicted and actual outputs. Deep learning has been particularly successful in areas such as computer vision, natural language processing, and speech recognition.

\f

## Natural Language Processing
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and human language. It involves developing algorithms and models that can understand, interpret, and generate human language. NLP applications include machine translation, sentiment analysis, text summarization, and question answering systems. Recent advances in deep learning have significantly improved the performance of NLP systems, leading to more accurate and sophisticated language models.

## Computer Vision and Image Recognition
Computer vision is another important area of artificial intelligence that deals with how computers can gain high-level understanding from digital images or videos. It involves developing algorithms that can identify and understand visual information from the world. Applications include facial recognition, object detection, medical image analysis, and autonomous vehicle navigation. Deep learning models, particularly convolutional neural networks (CNNs), have revolutionized computer vision by achieving human-level performance on many tasks.

## The Future of Artificial Intelligence
The future of artificial intelligence holds immense potential for transforming various industries and aspects of human life. We can expect to see more sophisticated AI systems that can handle complex reasoning tasks, understand context better, and interact more naturally with humans. However, this rapid advancement also brings challenges related to ethics, privacy, and the impact on employment. It's crucial to develop AI systems that are not only powerful but also safe, fair, and beneficial to society as a whole.

\f

## Ethical Considerations in AI
As artificial intelligence becomes more prevalent, ethical considerations become increasingly important. Issues such as bias in AI systems, privacy concerns, and the potential for misuse need to be carefully addressed. AI systems can inherit biases from their training data, leading to unfair outcomes for certain groups. Privacy concerns arise from the vast amounts of data required to train AI systems. Additionally, there are concerns about the potential for AI to be used maliciously or to replace human workers in certain industries.

## Applications in Healthcare
Artificial intelligence has the potential to revolutionize healthcare by improving diagnosis, treatment planning, and patient care. Machine learning algorithms can analyze medical images to detect diseases earlier and more accurately than human doctors. AI systems can also help in drug discovery by predicting the effectiveness of potential treatments. In addition, AI-powered chatbots and virtual assistants can provide basic healthcare information and support to patients, reducing the burden on healthcare professionals.

## AI in Finance and Banking
The financial industry has been quick to adopt artificial intelligence for various applications. AI systems can analyze market data to make investment decisions, detect fraudulent transactions, and provide personalized financial advice. Machine learning algorithms can assess credit risk more accurately than traditional methods, leading to better lending decisions. Additionally, AI-powered chatbots can handle customer service inquiries, reducing costs and improving customer satisfaction.

\f

## Transportation and Autonomous Vehicles
Autonomous vehicles represent one of the most visible applications of artificial intelligence in transportation. Self-driving cars use a combination of sensors, cameras, and AI algorithms to navigate roads safely. These systems can detect obstacles, read traffic signs, and make decisions about speed and direction. Beyond autonomous cars, AI is also being used in logistics and supply chain management to optimize routes and reduce delivery times.

## Education and Personalized Learning
Artificial intelligence is transforming education by enabling personalized learning experiences. AI systems can adapt to individual student needs, providing customized content and pacing. Intelligent tutoring systems can provide immediate feedback and support to students, helping them learn more effectively. Additionally, AI can help educators by automating administrative tasks and providing insights into student performance and learning patterns."""  # noqa: E501

        doc = Document(content=text)
        result = semantic_chunker.run(documents=[doc])
        split_docs = result["documents"]

        # Expected split count for this exact model/settings/text combination.
        assert len(split_docs) == 11

        # Verify that the splits cover the original content
        combined_content = "".join([d.content for d in split_docs])
        assert combined_content == text

        for i, split_doc in enumerate(split_docs):
            assert split_doc.meta["source_id"] == doc.id
            assert split_doc.meta["split_id"] == i
            assert "page_number" in split_doc.meta

            # Page numbers follow the \f page breaks embedded in the text above.
            if i in [0, 1, 2, 3]:
                assert split_doc.meta["page_number"] == 1
            if i in [4, 5, 6]:
                assert split_doc.meta["page_number"] == 2
            if i in [7, 8]:
                assert split_doc.meta["page_number"] == 3
            if i in [9, 10]:
                assert split_doc.meta["page_number"] == 4
730  
    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.environ.get("TEI_URL", None),
        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
    )
    @pytest.mark.integration
    @pytest.mark.slow
    async def test_split_large_splits_actually_splits_async(self) -> None:
        """
        Test that _split_large_splits() actually works and can split long texts into multiple chunks.
        This test uses a very long text that should be split into multiple chunks.
        """
        # Async twin of test_split_large_splits_actually_splits, but using a remote
        # TEI embedder (URL from TEI_URL, validated by the skipif marker above).
        embedder = HuggingFaceAPIDocumentEmbedder(
            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
        )
        semantic_chunker = EmbeddingBasedDocumentSplitter(
            document_embedder=embedder,
            sentences_per_group=3,
            percentile=0.85,  # Lower percentile to create more splits
            min_length=100,
            max_length=500,  # Smaller max_length to force more splits
        )

        # Create a very long text with multiple paragraphs and topics.
        # The \f form-feed characters mark page breaks, which the page_number
        # assertions at the bottom of this test rely on.
        text = """# Comprehensive Guide to Machine Learning and Artificial Intelligence

## Introduction to Machine Learning
Machine learning is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves. The process of learning begins with observations or data, such as examples, direct experience, or instruction, in order to look for patterns in data and make better decisions in the future based on the examples that we provide. The primary aim is to allow the computers learn automatically without human intervention or assistance and adjust actions accordingly.

## Types of Machine Learning
There are several types of machine learning algorithms, each with their own strengths and weaknesses. Supervised learning involves training a model on a labeled dataset, where the correct answers are provided. The model learns to map inputs to outputs based on these examples. Unsupervised learning, on the other hand, deals with unlabeled data and seeks to find hidden patterns or structures within the data. Reinforcement learning is a type of learning where an agent learns to behave in an environment by performing certain actions and receiving rewards or penalties.

## Deep Learning and Neural Networks
Deep learning is a subset of machine learning that uses neural networks with multiple layers to model and understand complex patterns. Neural networks are inspired by the human brain and consist of interconnected nodes or neurons. Each connection between neurons has a weight that is adjusted during training. The network learns by adjusting these weights based on the error between predicted and actual outputs. Deep learning has been particularly successful in areas such as computer vision, natural language processing, and speech recognition.

\f

## Natural Language Processing
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and human language. It involves developing algorithms and models that can understand, interpret, and generate human language. NLP applications include machine translation, sentiment analysis, text summarization, and question answering systems. Recent advances in deep learning have significantly improved the performance of NLP systems, leading to more accurate and sophisticated language models.

## Computer Vision and Image Recognition
Computer vision is another important area of artificial intelligence that deals with how computers can gain high-level understanding from digital images or videos. It involves developing algorithms that can identify and understand visual information from the world. Applications include facial recognition, object detection, medical image analysis, and autonomous vehicle navigation. Deep learning models, particularly convolutional neural networks (CNNs), have revolutionized computer vision by achieving human-level performance on many tasks.

## The Future of Artificial Intelligence
The future of artificial intelligence holds immense potential for transforming various industries and aspects of human life. We can expect to see more sophisticated AI systems that can handle complex reasoning tasks, understand context better, and interact more naturally with humans. However, this rapid advancement also brings challenges related to ethics, privacy, and the impact on employment. It's crucial to develop AI systems that are not only powerful but also safe, fair, and beneficial to society as a whole.

\f

## Ethical Considerations in AI
As artificial intelligence becomes more prevalent, ethical considerations become increasingly important. Issues such as bias in AI systems, privacy concerns, and the potential for misuse need to be carefully addressed. AI systems can inherit biases from their training data, leading to unfair outcomes for certain groups. Privacy concerns arise from the vast amounts of data required to train AI systems. Additionally, there are concerns about the potential for AI to be used maliciously or to replace human workers in certain industries.

## Applications in Healthcare
Artificial intelligence has the potential to revolutionize healthcare by improving diagnosis, treatment planning, and patient care. Machine learning algorithms can analyze medical images to detect diseases earlier and more accurately than human doctors. AI systems can also help in drug discovery by predicting the effectiveness of potential treatments. In addition, AI-powered chatbots and virtual assistants can provide basic healthcare information and support to patients, reducing the burden on healthcare professionals.

## AI in Finance and Banking
The financial industry has been quick to adopt artificial intelligence for various applications. AI systems can analyze market data to make investment decisions, detect fraudulent transactions, and provide personalized financial advice. Machine learning algorithms can assess credit risk more accurately than traditional methods, leading to better lending decisions. Additionally, AI-powered chatbots can handle customer service inquiries, reducing costs and improving customer satisfaction.

\f

## Transportation and Autonomous Vehicles
Autonomous vehicles represent one of the most visible applications of artificial intelligence in transportation. Self-driving cars use a combination of sensors, cameras, and AI algorithms to navigate roads safely. These systems can detect obstacles, read traffic signs, and make decisions about speed and direction. Beyond autonomous cars, AI is also being used in logistics and supply chain management to optimize routes and reduce delivery times.

## Education and Personalized Learning
Artificial intelligence is transforming education by enabling personalized learning experiences. AI systems can adapt to individual student needs, providing customized content and pacing. Intelligent tutoring systems can provide immediate feedback and support to students, helping them learn more effectively. Additionally, AI can help educators by automating administrative tasks and providing insights into student performance and learning patterns."""  # noqa: E501

        doc = Document(content=text)
        result = await semantic_chunker.run_async(documents=[doc])
        split_docs = result["documents"]

        # Expected split count for this exact embedder/settings/text combination.
        assert len(split_docs) == 11

        # Verify that the splits cover the original content
        combined_content = "".join([d.content for d in split_docs])
        assert combined_content == text

        for i, split_doc in enumerate(split_docs):
            assert split_doc.meta["source_id"] == doc.id
            assert split_doc.meta["split_id"] == i
            assert "page_number" in split_doc.meta

            # Page numbers follow the \f page breaks embedded in the text above.
            if i in [0, 1, 2, 3]:
                assert split_doc.meta["page_number"] == 1
            if i in [4, 5, 6]:
                assert split_doc.meta["page_number"] == 2
            if i in [7, 8]:
                assert split_doc.meta["page_number"] == 3
            if i in [9, 10]:
                assert split_doc.meta["page_number"] == 4