# test/components/preprocessors/test_sentence_tokenizer.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
  4  
import time
from pathlib import Path
from unittest.mock import patch

import pytest
from pytest import LogCaptureFixture

from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE, SentenceSplitter
 13  
 14  
 15  def test_apply_split_rules_no_join() -> None:
 16      text = "This is a test. This is another test. And a third test."
 17      spans = [(0, 15), (16, 36), (37, 54)]
 18      result = SentenceSplitter._apply_split_rules(text, spans)
 19      assert len(result) == 3
 20      assert result == [(0, 15), (16, 36), (37, 54)]
 21  
 22  
 23  def test_apply_split_rules_join_case_1():
 24      text = 'He said "This is sentence one. This is sentence two." Then he left.'
 25      result = SentenceSplitter._apply_split_rules(text, [(0, 30), (31, 53), (54, 67)])
 26      assert len(result) == 2
 27      assert result == [(0, 53), (54, 67)]
 28  
 29  
 30  def test_apply_split_rules_join_case_3():
 31      splitter = SentenceSplitter(language="en", use_split_rules=True)
 32      text = """
 33      1. First item
 34      2. Second item
 35      3. Third item."""
 36      spans = [(0, 7), (8, 25), (26, 44), (45, 56)]
 37      result = splitter._apply_split_rules(text, spans)
 38      assert len(result) == 1
 39      assert result == [(0, 56)]
 40  
 41  
 42  def test_apply_split_rules_join_case_4() -> None:
 43      text = "This is a test. (With a parenthetical statement.) And another sentence."
 44      spans = [(0, 15), (16, 50), (51, 74)]
 45      result = SentenceSplitter._apply_split_rules(text, spans)
 46      assert len(result) == 2
 47      assert result == [(0, 50), (51, 74)]
 48  
 49  
 50  @pytest.fixture
 51  def mock_file_content():
 52      return "Mr.\nDr.\nProf."
 53  
 54  
 55  def test_read_abbreviations_existing_file(tmp_path, mock_file_content):
 56      abbrev_dir = tmp_path / "data" / "abbreviations"
 57      abbrev_dir.mkdir(parents=True)
 58      abbrev_file = abbrev_dir / "en.txt"
 59      abbrev_file.write_text(mock_file_content)
 60  
 61      with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path:
 62          mock_path.return_value.parent.parent.parent = tmp_path
 63          result = SentenceSplitter._read_abbreviations("en")
 64          assert result == ["Mr.", "Dr.", "Prof."]
 65  
 66  
 67  def test_read_abbreviations_missing_file(caplog: LogCaptureFixture):
 68      with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mock_path:
 69          mock_path.return_value.parent.parent = Path("/nonexistent")
 70          result = SentenceSplitter._read_abbreviations("pt")
 71          assert result == []
 72          assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text
 73  
 74  
 75  def test_quote_spans_regex():
 76      # double quotes
 77      text1 = 'He said "Hello world" and left.'
 78      matches1 = list(QUOTE_SPANS_RE.finditer(text1))
 79      assert len(matches1) == 1
 80      assert matches1[0].group() == '"Hello world"'
 81  
 82      # single quotes
 83      text2 = "She replied 'Goodbye world' and smiled."
 84      matches2 = list(QUOTE_SPANS_RE.finditer(text2))
 85      assert len(matches2) == 1
 86      assert matches2[0].group() == "'Goodbye world'"
 87  
 88      # multiple quotes
 89      text3 = 'First "quote" and second "quote" in same text.'
 90      matches3 = list(QUOTE_SPANS_RE.finditer(text3))
 91      assert len(matches3) == 2
 92      assert matches3[0].group() == '"quote"'
 93      assert matches3[1].group() == '"quote"'
 94  
 95      # quotes containing newlines
 96      text4 = 'Text with "quote\nspanning\nmultiple\nlines"'
 97      matches4 = list(QUOTE_SPANS_RE.finditer(text4))
 98      assert len(matches4) == 1
 99      assert matches4[0].group() == '"quote\nspanning\nmultiple\nlines"'
100  
101      # no quotes
102      text5 = "This text has no quotes."
103      matches5 = list(QUOTE_SPANS_RE.finditer(text5))
104      assert len(matches5) == 0
105  
106  
def test_split_sentences_performance() -> None:
    """Guard against Regex Denial of Service (ReDoS) in the sentence splitter.

    https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
    A pathological ~50 MB input (a run of quote characters followed by a huge
    run of a single character) must be processed within the 2-second budget.
    """
    splitter = SentenceSplitter()
    text = " " + '"' * 20 + "A" * 50000000 + "B"
    # time.perf_counter() is monotonic and high-resolution; time.time() is
    # wall-clock and can jump with NTP/system clock adjustments, which would
    # make this timing assertion flaky.
    start = time.perf_counter()
    _ = splitter.split_sentences(text)
    elapsed = time.perf_counter() - start

    assert elapsed < 2, f"Execution time exceeded 2 seconds: {elapsed:.2f} seconds"