test_csv_document_cleaner.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from haystack import Document
from haystack.components.preprocessors.csv_document_cleaner import CSVDocumentCleaner


def _clean_one(document: Document, **cleaner_options) -> Document:
    """Run a ``CSVDocumentCleaner`` built from *cleaner_options* on one document.

    Returns the first entry of the ``"documents"`` list produced by ``run``.
    """
    cleaner = CSVDocumentCleaner(**cleaner_options)
    return cleaner.run([document])["documents"][0]


def test_empty_column() -> None:
    """A leading all-empty column is dropped with default settings."""
    source = Document(
        content=""",A,B,C
,1,2,3
,4,5,6
"""
    )
    cleaned = _clean_one(source)
    assert cleaned.content == "A,B,C\n1,2,3\n4,5,6\n"


def test_empty_row() -> None:
    """An all-empty row in the middle of the table is dropped with default settings."""
    source = Document(
        content="""A,B,C
1,2,3
,,
4,5,6
"""
    )
    cleaned = _clean_one(source)
    assert cleaned.content == "A,B,C\n1,2,3\n4,5,6\n"


def test_empty_column_and_row() -> None:
    """An empty column and an empty row are both dropped in a single pass."""
    source = Document(
        content=""",A,B,C
,1,2,3
,,,
,4,5,6
"""
    )
    cleaned = _clean_one(source)
    assert cleaned.content == "A,B,C\n1,2,3\n4,5,6\n"


def test_ignore_rows() -> None:
    """With ``ignore_rows=1`` an empty first row is kept and the meta is carried over."""
    source = Document(
        content=""",,
A,B,C
4,5,6
7,8,9
""",
        meta={"name": "test.csv"},
    )
    cleaned = _clean_one(source, ignore_rows=1)
    assert cleaned.content == ",,\nA,B,C\n4,5,6\n7,8,9\n"
    assert cleaned.meta == {"name": "test.csv"}


def test_ignore_rows_2() -> None:
    """With ``ignore_rows=1`` an empty row after the first row is still removed."""
    source = Document(
        content="""A,B,C
,,
4,5,6
7,8,9
""",
        meta={"name": "test.csv"},
    )
    cleaned = _clean_one(source, ignore_rows=1)
    assert cleaned.content == "A,B,C\n4,5,6\n7,8,9\n"
    assert cleaned.meta == {"name": "test.csv"}


def test_ignore_rows_3() -> None:
    """With ``ignore_rows=1`` a column that is empty below the first row is removed."""
    source = Document(
        content="""A,B,C
4,,6
7,,9
""",
        meta={"name": "test.csv"},
    )
    cleaned = _clean_one(source, ignore_rows=1)
    assert cleaned.content == "A,C\n4,6\n7,9\n"
    assert cleaned.meta == {"name": "test.csv"}


def test_ignore_columns() -> None:
    """With ``ignore_columns=1`` the content comes back unchanged, empty first column included."""
    source = Document(
        content=""",,A,B
,2,3,4
,7,8,9
"""
    )
    cleaned = _clean_one(source, ignore_columns=1)
    assert cleaned.content == ",,A,B\n,2,3,4\n,7,8,9\n"


def test_too_many_ignore_rows() -> None:
    """``ignore_rows`` larger than the number of rows leaves the content unchanged."""
    source = Document(
        content=""",,
A,B,C
4,5,6
"""
    )
    cleaned = _clean_one(source, ignore_rows=4)
    assert cleaned.content == ",,\nA,B,C\n4,5,6\n"


def test_too_many_ignore_columns() -> None:
    """``ignore_columns`` larger than the number of columns leaves the content unchanged."""
    source = Document(
        content=""",,
A,B,C
4,5,6
"""
    )
    cleaned = _clean_one(source, ignore_columns=4)
    assert cleaned.content == ",,\nA,B,C\n4,5,6\n"


def test_ignore_rows_and_columns() -> None:
    """Ignoring the first row and first column still drops the trailing column that is empty below the header."""
    source = Document(
        content=""",A,B,C
1,item,s,
2,item2,fd,
"""
    )
    cleaned = _clean_one(source, ignore_columns=1, ignore_rows=1)
    assert cleaned.content == ",A,B\n1,item,s\n2,item2,fd\n"


def test_zero_ignore_rows_and_columns() -> None:
    """Explicit zeros for both ignore options leave a table without empty rows/columns unchanged."""
    source = Document(
        content=""",A,B,C
1,item,s,
2,item2,fd,
"""
    )
    cleaned = _clean_one(source, ignore_columns=0, ignore_rows=0)
    assert cleaned.content == ",A,B,C\n1,item,s,\n2,item2,fd,\n"


def test_empty_document() -> None:
    """A document with empty content comes back with empty content and empty meta."""
    cleaned = _clean_one(Document(content=""))
    assert cleaned.content == ""
    assert cleaned.meta == {}


def test_empty_documents() -> None:
    """Running on an empty list yields an empty ``documents`` list."""
    outcome = CSVDocumentCleaner().run([])
    assert outcome["documents"] == []


def test_keep_id() -> None:
    """With ``keep_id=True`` the output document carries the input document's id."""
    source = Document(
        id="123",
        content=""",A,B,C
1,item,s,
""",
    )
    cleaned = _clean_one(source, keep_id=True)
    assert cleaned.id == "123"
    assert cleaned.content == ",A,B,C\n1,item,s,\n"


def test_id_not_none() -> None:
    """With default settings the output document has a non-empty id."""
    source = Document(
        content=""",A,B,C
1,item,s,
"""
    )
    cleaned = _clean_one(source)
    assert cleaned.id != ""
    assert cleaned.content == ",A,B,C\n1,item,s,\n"


def test_remove_empty_rows_false() -> None:
    """With ``remove_empty_rows=False`` the empty row stays while the empty column is removed."""
    source = Document(
        content=""",B,C
,,
,5,6
"""
    )
    cleaned = _clean_one(source, remove_empty_rows=False)
    assert cleaned.content == "B,C\n,\n5,6\n"


def test_remove_empty_columns_false() -> None:
    """With ``remove_empty_columns=False`` the empty column stays while the empty row is removed."""
    source = Document(
        content=""",B,C
,,
,,4
"""
    )
    cleaned = _clean_one(source, remove_empty_columns=False)
    assert cleaned.content == ",B,C\n,,4\n"


def test_remove_empty_rows_and_columns_false() -> None:
    """With both removal options disabled the content is returned unchanged."""
    source = Document(
        content=""",B,C
,,4
,,
"""
    )
    cleaned = _clean_one(source, remove_empty_rows=False, remove_empty_columns=False)
    assert cleaned.content == ",B,C\n,,4\n,,\n"