testtextractor.py
"""
Textractor module tests
"""

import platform
import unittest

from txtai.pipeline import Textractor

# pylint: disable=C0411
from utils import Utils


class TestTextractor(unittest.TestCase):
    """
    Textractor tests.
    """

    def testClean(self):
        """
        Test text cleaning method
        """

        # Default text cleaning
        textractor = Textractor()
        self.assertEqual(textractor(" a b c "), "a b c")

        # Require text to be minlength
        textractor = Textractor(minlength=10)
        self.assertIsNone(textractor(" a b c "))

        # Disable text cleaning
        textractor = Textractor(cleantext=False, minlength=10)
        self.assertEqual(textractor(" a b c "), " a b c ")

    def testChonkie(self):
        """
        Test a chonkie chunker
        """

        # Test chonkie chunking
        textractor = Textractor(chunker="sentence", chunk_size=5, chunk_overlap=0)
        self.assertEqual(textractor("This is a test. And another test."), ["This is a test.", "And another test."])

        # Test bad chunker throws an exception. Only construction is under test, the instance is not needed.
        with self.assertRaises(AttributeError):
            Textractor(chunker="badchunker")

    def testDefault(self):
        """
        Test default text extraction
        """

        # Text input
        textractor = Textractor(backend=None)
        text = textractor(f"{Utils.PATH}/tabular.csv")
        self.assertEqual(len(text), 125)

        # Markdown input
        textractor = Textractor(sections=True)
        sections = textractor("# Heading 1\nText1\n\n# Heading 2\nText2\n")

        # Check number of sections is as expected
        self.assertEqual(len(sections), 2)

    @unittest.skipIf(platform.system() == "Darwin", "Docling skipped on macOS to avoid MPS issues")
    def testDocling(self):
        """
        Test docling backend
        """

        textractor = Textractor(backend="docling")

        # Extract text and check for Markdown formatting
        text = textractor(f"{Utils.PATH}/article.pdf")
        self.assertIn("## Introducing txtai", text)

    def testLines(self):
        """
        Test extraction to lines
        """

        textractor = Textractor(lines=True)

        # Extract text as lines
        lines = textractor(f"{Utils.PATH}/article.pdf")

        # Check number of lines is as expected
        self.assertEqual(len(lines), 35)

    def testHTML(self):
        """
        Test HTML to Markdown
        """

        # Headings
        self.assertMarkdown("<h1>This is a test</h1>", "# This is a test")
        self.assertMarkdown("<h6>This is a test</h6>", "###### This is a test")

        # Blockquotes
        self.assertMarkdown("<blockquote>This is a test</blockquote>", "> This is a test")

        # Lists
        self.assertMarkdown("<ul><li>Test1</li><li>Test2</li></ul>", "- Test1\n- Test2")
        self.assertMarkdown("<ol><li>Test1</li><li>Test2</li></ol>", "1. Test1\n2. Test2")

        # Code
        self.assertMarkdown("<code>This is a test</code>", "```\nThis is a test\n```")
        self.assertMarkdown("<pre>This is a test</pre>", "```\nThis is a test\n```")

        # Tables
        self.assertMarkdown(
            "<table><tr><th>Header1</th><th>Header2</th></tr><tr><td>Test1</td><td>Test2</td></tr></table>",
            "|Header1|Header2|\n|---|---|\n|Test1|Test2|",
        )

        # Ignore list
        self.assertMarkdown("<aside>This is a test</aside>", "")

        # Text formatting
        # NOTE(review): two inputs below are not well-formed HTML - the <b> case closes with "</b</p>"
        # and the <a> case never closes its <p>. Inputs are kept as-is, confirm whether this is intentional.
        self.assertMarkdown("<p>This is a test</p>", "This is a test")
        self.assertMarkdown("<p>This is a <b>test</b</p>", "This is a **test**")
        self.assertMarkdown("<p>This is a <strong>test</strong></p>", "This is a **test**")
        self.assertMarkdown("<p>This is a <i>test</i></p>", "This is a *test*")
        self.assertMarkdown("<p>This is a <em>test</em></p>", "This is a *test*")
        self.assertMarkdown("<p>This is a <a href='link'>test</a>", "This is a [test](link)")

        # Collapse to outer tag
        self.assertMarkdown("<p>This is a <strong><em>test</em></strong></p>", "This is a **test**")
        self.assertMarkdown("<p>This is a <em><strong>test</strong></em></p>", "This is a *test*")

    def testParagraphs(self):
        """
        Test extraction to paragraphs
        """

        textractor = Textractor(paragraphs=True)

        # Extract text as paragraphs
        paragraphs = textractor(f"{Utils.PATH}/article.pdf")

        # Check number of paragraphs is as expected
        self.assertEqual(len(paragraphs), 11)

    def testSections(self):
        """
        Test extraction to sections
        """

        textractor = Textractor(sections=True)

        # Extract as sections
        sections = textractor(f"{Utils.PATH}/document.pdf")

        # Check number of sections is as expected
        self.assertEqual(len(sections), 3)

    def testSentences(self):
        """
        Test extraction to sentences
        """

        textractor = Textractor(sentences=True)

        # Extract text as sentences
        sentences = textractor(f"{Utils.PATH}/article.pdf")

        # Check number of sentences is as expected
        self.assertEqual(len(sentences), 17)

    def testSingle(self):
        """
        Test a single extraction with no tokenization of the results
        """

        textractor = Textractor()

        # Extract text as a single block
        text = textractor(f"{Utils.PATH}/article.pdf")

        # Check length of text is as expected
        self.assertEqual(len(text), 2471)

    def testTable(self):
        """
        Test table extraction
        """

        textractor = Textractor()

        # Each file runs as its own subtest so a failure names the offending file
        for name in ["document.docx", "spreadsheet.xlsx"]:
            with self.subTest(name=name):
                # Extract text as a single block
                text = textractor(f"{Utils.PATH}/{name}")

                # Check for table header
                self.assertIn("|---|", text)

    def testTikaFlag(self):
        """
        Test legacy tika flag
        """

        # Flag enabled: the html attribute is expected to be set
        textractor = Textractor(tika=True)
        self.assertIsNotNone(textractor.html)

        # Flag disabled: the html attribute is expected to be None
        textractor = Textractor(tika=False)
        self.assertIsNone(textractor.html)

    def testTuples(self):
        """
        Test output tuples
        """

        # Return (path, text) tuples instead of text only
        textractor = Textractor(tuples=True)

        source = f"{Utils.PATH}/article.pdf"
        path, text = textractor(source)

        # Input path is passed through unchanged alongside the extracted text
        self.assertEqual(path, source)
        self.assertEqual(len(text), 2471)

    def testURL(self):
        """
        Test parsing a remote URL
        """

        # Test parsing URLs for each backend, each as its own subtest so a failure names the backend
        for backend in ["docling", "tika"]:
            with self.subTest(backend=backend):
                textractor = Textractor(backend=backend)
                text = textractor("https://github.com/neuml/txtai")
                self.assertIn("txtai is an all-in-one AI framework", text)

    def assertMarkdown(self, html, expected):
        """
        Helper method to assert generated markdown is as expected.

        Args:
            html: input html snippet
            expected: expected markdown text
        """

        # Wrap the snippet in a minimal html document before running it through a default Textractor
        textractor = Textractor()
        self.assertEqual(textractor(f"<html><body>{html}</body></html>"), expected)