test_sentence_tokenizer.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import time
from pathlib import Path
from unittest.mock import patch

import pytest
from pytest import LogCaptureFixture

from haystack.components.preprocessors.sentence_tokenizer import QUOTE_SPANS_RE, SentenceSplitter


def test_apply_split_rules_no_join() -> None:
    """Spans separated by ordinary sentence punctuation are returned unmerged."""
    sample = "This is a test. This is another test. And a third test."
    original_spans = [(0, 15), (16, 36), (37, 54)]
    merged = SentenceSplitter._apply_split_rules(sample, original_spans)
    assert len(merged) == 3
    assert merged == original_spans


def test_apply_split_rules_join_case_1():
    """A sentence break that falls inside a quoted passage is merged back together."""
    sample = 'He said "This is sentence one. This is sentence two." Then he left.'
    merged = SentenceSplitter._apply_split_rules(sample, [(0, 30), (31, 53), (54, 67)])
    assert len(merged) == 2
    assert merged == [(0, 53), (54, 67)]


def test_apply_split_rules_join_case_3():
    """Consecutive numbered-list items are collapsed into a single span."""
    splitter = SentenceSplitter(language="en", use_split_rules=True)
    # The exact whitespace of this literal matters: span offsets below index into it.
    sample = """
    1. First item
    2. Second item
    3. Third item."""
    merged = splitter._apply_split_rules(sample, [(0, 7), (8, 25), (26, 44), (45, 56)])
    assert len(merged) == 1
    assert merged == [(0, 56)]


def test_apply_split_rules_join_case_4() -> None:
    """A parenthetical statement stays attached to the sentence it follows."""
    sample = "This is a test. (With a parenthetical statement.) And another sentence."
    merged = SentenceSplitter._apply_split_rules(sample, [(0, 15), (16, 50), (51, 74)])
    assert len(merged) == 2
    assert merged == [(0, 50), (51, 74)]


@pytest.fixture
def mock_file_content():
    """Minimal abbreviations-file payload: one abbreviation per line."""
    return "Mr.\nDr.\nProf."
def test_read_abbreviations_existing_file(tmp_path, mock_file_content):
    """When a language-specific abbreviations file exists, its lines are returned."""
    abbreviations_dir = tmp_path / "data" / "abbreviations"
    abbreviations_dir.mkdir(parents=True)
    (abbreviations_dir / "en.txt").write_text(mock_file_content)

    with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mocked_path:
        # Redirect the module's path resolution so it finds the tmp_path fixture tree.
        mocked_path.return_value.parent.parent.parent = tmp_path
        abbreviations = SentenceSplitter._read_abbreviations("en")
    assert abbreviations == ["Mr.", "Dr.", "Prof."]


def test_read_abbreviations_missing_file(caplog: LogCaptureFixture):
    """A missing abbreviations file yields an empty list and logs a fallback message."""
    with patch("haystack.components.preprocessors.sentence_tokenizer.Path") as mocked_path:
        # NOTE(review): the existing-file test patches three `.parent` levels, this one
        # only two — confirm against `_read_abbreviations` that both exercise the
        # intended path lookup.
        mocked_path.return_value.parent.parent = Path("/nonexistent")
        abbreviations = SentenceSplitter._read_abbreviations("pt")
    assert abbreviations == []
    assert "No abbreviations file found for pt. Using default abbreviations." in caplog.text


def test_quote_spans_regex():
    """QUOTE_SPANS_RE matches double, single, repeated, and newline-spanning quotes."""
    cases = [
        # (input text, expected matched quote spans in order)
        ('He said "Hello world" and left.', ['"Hello world"']),
        ("She replied 'Goodbye world' and smiled.", ["'Goodbye world'"]),
        ('First "quote" and second "quote" in same text.', ['"quote"', '"quote"']),
        ('Text with "quote\nspanning\nmultiple\nlines"', ['"quote\nspanning\nmultiple\nlines"']),
        ("This text has no quotes.", []),
    ]
    for sample, expected in cases:
        found = [match.group() for match in QUOTE_SPANS_RE.finditer(sample)]
        assert found == expected


def test_split_sentences_performance() -> None:
    """Guard against Regex Denial of Service (ReDoS).

    https://owasp.org/www-community/attacks/Regular_expression_Denial_of_Service_-_ReDoS
    A roughly 50 MB adversarial string must be processed in under 2 seconds.
    """
    splitter = SentenceSplitter()
    adversarial = " " + '"' * 20 + "A" * 50000000 + "B"
    started = time.time()
    _ = splitter.split_sentences(adversarial)
    elapsed = time.time() - started

    assert elapsed < 2, f"Execution time exceeded 2 seconds: {elapsed:.2f} seconds"