test_text_cleaner.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack.components.preprocessors import TextCleaner


def test_init_default():
    """A cleaner created without arguments has no patterns, all flags falsy, and no regex or translator set."""
    default_cleaner = TextCleaner()
    assert default_cleaner._remove_regexps is None
    # The three boolean options are only required to be falsy, not strictly ``False``.
    for flag_name in ("_convert_to_lowercase", "_remove_punctuation", "_remove_numbers"):
        assert not getattr(default_cleaner, flag_name)
    assert default_cleaner._regex is None
    assert default_cleaner._translator is None


def test_run():
    """With default settings, ``run`` hands the texts back unchanged under the single ``"texts"`` key."""
    inputs = ["Some text", "Some other text", "Yet another text"]
    output = TextCleaner().run(texts=inputs)
    assert len(output) == 1
    assert output["texts"] == inputs


def test_run_with_empty_inputs():
    """An empty list of texts yields an empty list under ``"texts"``."""
    output = TextCleaner().run(texts=[])
    assert len(output) == 1
    assert output["texts"] == []


def test_run_with_regex():
    """Matches of a single removal pattern (digit runs here) are deleted from each text."""
    digit_stripper = TextCleaner(remove_regexps=[r"\d+"])
    output = digit_stripper.run(texts=["Open123 Source", "HaystackAI"])
    assert len(output) == 1
    assert output["texts"] == ["Open Source", "HaystackAI"]


def test_run_with_multiple_regexps():
    """Matches of every supplied removal pattern are deleted from each text."""
    patterns = [r"\d+", r"[^\w\s]"]
    output = TextCleaner(remove_regexps=patterns).run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == ["Open Source", "HaystackAI"]


def test_run_with_convert_to_lowercase():
    """``convert_to_lowercase=True`` lowercases the texts and leaves digits and punctuation in place."""
    lowercaser = TextCleaner(convert_to_lowercase=True)
    output = lowercaser.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == ["open123! source", "haystack.ai"]


def test_run_with_remove_punctuation():
    """``remove_punctuation=True`` drops punctuation and leaves letters, digits and spaces in place."""
    punctuation_stripper = TextCleaner(remove_punctuation=True)
    output = punctuation_stripper.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == ["Open123 Source", "HaystackAI"]


def test_run_with_remove_numbers():
    """``remove_numbers=True`` drops digits and leaves punctuation and letter case in place."""
    number_stripper = TextCleaner(remove_numbers=True)
    output = number_stripper.run(texts=["Open123! Source", "Haystack.AI"])
    assert len(output) == 1
    assert output["texts"] == ["Open! Source", "Haystack.AI"]


def test_run_with_multiple_parameters():
    """All options enabled together: patterns, lowercasing, punctuation removal and number removal."""
    options = {
        "remove_regexps": [r"\d+", r"[^\w\s]"],
        "convert_to_lowercase": True,
        "remove_punctuation": True,
        "remove_numbers": True,
    }
    output = TextCleaner(**options).run(texts=["Open%123. !$Source", "Haystack.AI##"])
    assert len(output) == 1
    assert output["texts"] == ["open source", "haystackai"]