/ test / components / preprocessors / test_text_cleaner.py
test_text_cleaner.py
 1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
 2  #
 3  # SPDX-License-Identifier: Apache-2.0
 4  
 5  from haystack.components.preprocessors import TextCleaner
 6  
 7  
 8  def test_init_default():
 9      cleaner = TextCleaner()
10      assert cleaner._remove_regexps is None
11      assert not cleaner._convert_to_lowercase
12      assert not cleaner._remove_punctuation
13      assert not cleaner._remove_numbers
14      assert cleaner._regex is None
15      assert cleaner._translator is None
16  
17  
def test_run():
    """Check that a default TextCleaner returns the input texts unchanged under the ``texts`` key.

    The expected value is a separate literal rather than the input list itself, so the
    assertion would also catch a ``run`` that altered the list it was given in place.
    """
    cleaner = TextCleaner()
    texts = ["Some text", "Some other text", "Yet another text"]
    result = cleaner.run(texts=texts)
    # The result is expected to hold exactly one entry, "texts".
    assert len(result) == 1
    assert result["texts"] == ["Some text", "Some other text", "Yet another text"]
24  
25  
def test_run_with_empty_inputs():
    """Check that running a default TextCleaner on an empty list yields an empty ``texts`` list."""
    output = TextCleaner().run(texts=[])
    assert len(output) == 1
    assert output["texts"] == []
31  
32  
def test_run_with_regex():
    """Check that text matching a single ``remove_regexps`` pattern is removed from each input."""
    digit_pattern = r"\d+"
    regex_cleaner = TextCleaner(remove_regexps=[digit_pattern])
    raw_texts = ["Open123 Source", "HaystackAI"]

    output = regex_cleaner.run(texts=raw_texts)

    expected_texts = ["Open Source", "HaystackAI"]
    assert len(output) == 1
    assert output["texts"] == expected_texts
38  
39  
def test_run_with_multiple_regexps():
    """Check that text matching any of several ``remove_regexps`` patterns is removed."""
    patterns = [r"\d+", r"[^\w\s]"]
    regex_cleaner = TextCleaner(remove_regexps=patterns)
    raw_texts = ["Open123! Source", "Haystack.AI"]

    output = regex_cleaner.run(texts=raw_texts)

    expected_texts = ["Open Source", "HaystackAI"]
    assert len(output) == 1
    assert output["texts"] == expected_texts
45  
46  
def test_run_with_convert_to_lowercase():
    """Check that ``convert_to_lowercase=True`` lowercases letters and leaves digits and punctuation alone."""
    lowercasing_cleaner = TextCleaner(convert_to_lowercase=True)
    raw_texts = ["Open123! Source", "Haystack.AI"]

    output = lowercasing_cleaner.run(texts=raw_texts)

    expected_texts = ["open123! source", "haystack.ai"]
    assert len(output) == 1
    assert output["texts"] == expected_texts
52  
53  
def test_run_with_remove_punctuation():
    """Check that ``remove_punctuation=True`` strips punctuation while keeping letters and digits."""
    punctuation_cleaner = TextCleaner(remove_punctuation=True)
    raw_texts = ["Open123! Source", "Haystack.AI"]

    output = punctuation_cleaner.run(texts=raw_texts)

    expected_texts = ["Open123 Source", "HaystackAI"]
    assert len(output) == 1
    assert output["texts"] == expected_texts
59  
60  
def test_run_with_remove_numbers():
    """Check that ``remove_numbers=True`` strips digits while keeping letters and punctuation."""
    number_cleaner = TextCleaner(remove_numbers=True)
    raw_texts = ["Open123! Source", "Haystack.AI"]

    output = number_cleaner.run(texts=raw_texts)

    expected_texts = ["Open! Source", "Haystack.AI"]
    assert len(output) == 1
    assert output["texts"] == expected_texts
66  
67  
def test_run_with_multiple_parameters():
    """Check the combined effect of regex removal, lowercasing, punctuation removal and number removal."""
    full_cleaner = TextCleaner(
        remove_regexps=[r"\d+", r"[^\w\s]"],
        convert_to_lowercase=True,
        remove_punctuation=True,
        remove_numbers=True,
    )
    raw_texts = ["Open%123. !$Source", "Haystack.AI##"]

    output = full_cleaner.run(texts=raw_texts)

    expected_texts = ["open source", "haystackai"]
    assert len(output) == 1
    assert output["texts"] == expected_texts