/ test / components / preprocessors / test_csv_document_cleaner.py
test_csv_document_cleaner.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  from haystack import Document
  6  from haystack.components.preprocessors.csv_document_cleaner import CSVDocumentCleaner
  7  
  8  
  9  def test_empty_column() -> None:
 10      csv_content = """,A,B,C
 11  ,1,2,3
 12  ,4,5,6
 13  """
 14      csv_document = Document(content=csv_content)
 15      csv_document_cleaner = CSVDocumentCleaner()
 16      result = csv_document_cleaner.run([csv_document])
 17      cleaned_document = result["documents"][0]
 18      assert cleaned_document.content == "A,B,C\n1,2,3\n4,5,6\n"
 19  
 20  
 21  def test_empty_row() -> None:
 22      csv_content = """A,B,C
 23  1,2,3
 24  ,,
 25  4,5,6
 26  """
 27      csv_document = Document(content=csv_content)
 28      csv_document_cleaner = CSVDocumentCleaner()
 29      result = csv_document_cleaner.run([csv_document])
 30      cleaned_document = result["documents"][0]
 31      assert cleaned_document.content == "A,B,C\n1,2,3\n4,5,6\n"
 32  
 33  
 34  def test_empty_column_and_row() -> None:
 35      csv_content = """,A,B,C
 36  ,1,2,3
 37  ,,,
 38  ,4,5,6
 39  """
 40      csv_document = Document(content=csv_content)
 41      csv_document_cleaner = CSVDocumentCleaner()
 42      result = csv_document_cleaner.run([csv_document])
 43      cleaned_document = result["documents"][0]
 44      assert cleaned_document.content == "A,B,C\n1,2,3\n4,5,6\n"
 45  
 46  
 47  def test_ignore_rows() -> None:
 48      csv_content = """,,
 49  A,B,C
 50  4,5,6
 51  7,8,9
 52  """
 53      csv_document = Document(content=csv_content, meta={"name": "test.csv"})
 54      csv_document_cleaner = CSVDocumentCleaner(ignore_rows=1)
 55      result = csv_document_cleaner.run([csv_document])
 56      cleaned_document = result["documents"][0]
 57      assert cleaned_document.content == ",,\nA,B,C\n4,5,6\n7,8,9\n"
 58      assert cleaned_document.meta == {"name": "test.csv"}
 59  
 60  
 61  def test_ignore_rows_2() -> None:
 62      csv_content = """A,B,C
 63  ,,
 64  4,5,6
 65  7,8,9
 66  """
 67      csv_document = Document(content=csv_content, meta={"name": "test.csv"})
 68      csv_document_cleaner = CSVDocumentCleaner(ignore_rows=1)
 69      result = csv_document_cleaner.run([csv_document])
 70      cleaned_document = result["documents"][0]
 71      assert cleaned_document.content == "A,B,C\n4,5,6\n7,8,9\n"
 72      assert cleaned_document.meta == {"name": "test.csv"}
 73  
 74  
 75  def test_ignore_rows_3() -> None:
 76      csv_content = """A,B,C
 77  4,,6
 78  7,,9
 79  """
 80      csv_document = Document(content=csv_content, meta={"name": "test.csv"})
 81      csv_document_cleaner = CSVDocumentCleaner(ignore_rows=1)
 82      result = csv_document_cleaner.run([csv_document])
 83      cleaned_document = result["documents"][0]
 84      assert cleaned_document.content == "A,C\n4,6\n7,9\n"
 85      assert cleaned_document.meta == {"name": "test.csv"}
 86  
 87  
 88  def test_ignore_columns() -> None:
 89      csv_content = """,,A,B
 90  ,2,3,4
 91  ,7,8,9
 92  """
 93      csv_document = Document(content=csv_content)
 94      csv_document_cleaner = CSVDocumentCleaner(ignore_columns=1)
 95      result = csv_document_cleaner.run([csv_document])
 96      cleaned_document = result["documents"][0]
 97      assert cleaned_document.content == ",,A,B\n,2,3,4\n,7,8,9\n"
 98  
 99  
def test_too_many_ignore_rows() -> None:
    """``ignore_rows`` larger than the row count leaves the content unchanged."""
    raw_csv = (
        ",,\n"
        "A,B,C\n"
        "4,5,6\n"
    )
    source_doc = Document(content=raw_csv)
    # The CSV has three rows; asking to ignore four must not fail.
    cleaner = CSVDocumentCleaner(ignore_rows=4)
    cleaned = cleaner.run([source_doc])["documents"][0]
    assert cleaned.content == ",,\nA,B,C\n4,5,6\n"
110  
111  
def test_too_many_ignore_columns() -> None:
    """``ignore_columns`` larger than the column count leaves the content unchanged."""
    raw_csv = (
        ",,\n"
        "A,B,C\n"
        "4,5,6\n"
    )
    source_doc = Document(content=raw_csv)
    # The CSV has three columns; asking to ignore four must not fail.
    cleaner = CSVDocumentCleaner(ignore_columns=4)
    cleaned = cleaner.run([source_doc])["documents"][0]
    assert cleaned.content == ",,\nA,B,C\n4,5,6\n"
122  
123  
def test_ignore_rows_and_columns() -> None:
    """Ignoring the first row and first column still removes an empty last column.

    Column ``C`` has a header in the ignored row but no values below it, and
    the expected output no longer contains it.
    """
    raw_csv = (
        ",A,B,C\n"
        "1,item,s,\n"
        "2,item2,fd,\n"
    )
    source_doc = Document(content=raw_csv)
    cleaner = CSVDocumentCleaner(ignore_columns=1, ignore_rows=1)
    cleaned = cleaner.run([source_doc])["documents"][0]
    assert cleaned.content == ",A,B\n1,item,s\n2,item2,fd\n"
134  
135  
def test_zero_ignore_rows_and_columns() -> None:
    """With nothing ignored, a column whose only value is its header is kept."""
    raw_csv = (
        ",A,B,C\n"
        "1,item,s,\n"
        "2,item2,fd,\n"
    )
    source_doc = Document(content=raw_csv)
    cleaner = CSVDocumentCleaner(ignore_columns=0, ignore_rows=0)
    cleaned = cleaner.run([source_doc])["documents"][0]
    # Output equals the input: no row or column is completely empty.
    assert cleaned.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"
146  
147  
def test_empty_document() -> None:
    """A document with empty content comes back with empty content and empty meta."""
    blank_doc = Document(content="")
    output = CSVDocumentCleaner().run([blank_doc])
    cleaned = output["documents"][0]
    assert cleaned.content == ""
    assert cleaned.meta == {}
155  
156  
def test_empty_documents() -> None:
    """Running the cleaner on an empty list yields an empty ``documents`` list."""
    output = CSVDocumentCleaner().run([])
    assert output["documents"] == []
161  
162  
def test_keep_id() -> None:
    """With ``keep_id=True`` the cleaned document keeps the input document's ID."""
    raw_csv = (
        ",A,B,C\n"
        "1,item,s,\n"
    )
    source_doc = Document(id="123", content=raw_csv)
    cleaner = CSVDocumentCleaner(keep_id=True)
    cleaned = cleaner.run([source_doc])["documents"][0]
    assert cleaned.id == "123"
    assert cleaned.content == ",A,B,C\n1,item,s,\n"
173  
174  
def test_id_not_none() -> None:
    """Without ``keep_id``, the cleaned document still has a non-empty ID.

    The input document is created without an explicit ID and the cleaner is
    built with default arguments. The resulting document must have an ID that
    is neither ``None`` nor the empty string, and its content must be unchanged.
    """
    csv_content = """,A,B,C
1,item,s,
"""
    csv_document = Document(content=csv_content)
    csv_document_cleaner = CSVDocumentCleaner()
    result = csv_document_cleaner.run([csv_document])
    cleaned_document = result["documents"][0]
    # `None != ""` evaluates to True, so the empty-string check alone would let
    # a missing ID slip through; check for None explicitly as the name promises.
    assert cleaned_document.id is not None
    assert cleaned_document.id != ""
    assert cleaned_document.content == ",A,B,C\n1,item,s,\n"
185  
186  
def test_remove_empty_rows_false() -> None:
    """With ``remove_empty_rows=False`` empty rows stay; empty columns still go."""
    raw_csv = (
        ",B,C\n"
        ",,\n"
        ",5,6\n"
    )
    source_doc = Document(content=raw_csv)
    cleaner = CSVDocumentCleaner(remove_empty_rows=False)
    cleaned = cleaner.run([source_doc])["documents"][0]
    # The empty first column is gone; the empty middle row remains as ",".
    assert cleaned.content == "B,C\n,\n5,6\n"
197  
198  
def test_remove_empty_columns_false() -> None:
    """With ``remove_empty_columns=False`` empty columns stay; empty rows still go."""
    raw_csv = (
        ",B,C\n"
        ",,\n"
        ",,4\n"
    )
    source_doc = Document(content=raw_csv)
    cleaner = CSVDocumentCleaner(remove_empty_columns=False)
    cleaned = cleaner.run([source_doc])["documents"][0]
    # The empty middle row is gone; the empty first column remains.
    assert cleaned.content == ",B,C\n,,4\n"
209  
210  
def test_remove_empty_rows_and_columns_false() -> None:
    """With both removal options disabled, the content is returned unchanged."""
    raw_csv = (
        ",B,C\n"
        ",,4\n"
        ",,\n"
    )
    source_doc = Document(content=raw_csv)
    cleaner = CSVDocumentCleaner(remove_empty_rows=False, remove_empty_columns=False)
    cleaned = cleaner.run([source_doc])["documents"][0]
    assert cleaned.content == ",B,C\n,,4\n,,\n"