/ test / python / testpipeline / testdata / testtextractor.py
testtextractor.py
  1  """
  2  Textractor module tests
  3  """
  4  
  5  import platform
  6  import unittest
  7  
  8  from txtai.pipeline import Textractor
  9  
 10  # pylint: disable=C0411
 11  from utils import Utils
 12  
 13  
 14  class TestTextractor(unittest.TestCase):
 15      """
 16      Textractor tests.
 17      """
 18  
 19      def testClean(self):
 20          """
 21          Test text cleaning method
 22          """
 23  
 24          # Default text cleaning
 25          textractor = Textractor()
 26          self.assertEqual(textractor(" a  b  c "), "a b c")
 27  
 28          # Require text to be minlength
 29          textractor = Textractor(minlength=10)
 30          self.assertEqual(textractor(" a  b  c "), None)
 31  
 32          # Disable text cleaning
 33          textractor = Textractor(cleantext=False, minlength=10)
 34          self.assertEqual(textractor(" a  b  c "), " a  b  c ")
 35  
 36      def testChonkie(self):
 37          """
 38          Test a chonkie chunker
 39          """
 40  
 41          # Test chonkie chunking
 42          textractor = Textractor(chunker="sentence", chunk_size=5, chunk_overlap=0)
 43          self.assertEqual(textractor("This is a test. And another test."), ["This is a test.", "And another test."])
 44  
 45          # Test bad chunker throws an exception
 46          with self.assertRaises(AttributeError):
 47              textractor = Textractor(chunker="badchunker")
 48  
 49      def testDefault(self):
 50          """
 51          Test default text extraction
 52          """
 53  
 54          # Text input
 55          textractor = Textractor(backend=None)
 56          text = textractor(Utils.PATH + "/tabular.csv")
 57          self.assertEqual(len(text), 125)
 58  
 59          # Markdown input
 60          textractor = Textractor(sections=True)
 61          sections = textractor("# Heading 1\nText1\n\n# Heading 2\nText2\n")
 62  
 63          # Check number of sections is as expected
 64          self.assertEqual(len(sections), 2)
 65  
 66      @unittest.skipIf(platform.system() == "Darwin", "Docling skipped on macOS to avoid MPS issues")
 67      def testDocling(self):
 68          """
 69          Test docling backend
 70          """
 71  
 72          textractor = Textractor(backend="docling")
 73  
 74          # Extract text and check for Markdown formatting
 75          text = textractor(Utils.PATH + "/article.pdf")
 76          self.assertTrue("## Introducing txtai" in text)
 77  
 78      def testLines(self):
 79          """
 80          Test extraction to lines
 81          """
 82  
 83          textractor = Textractor(lines=True)
 84  
 85          # Extract text as lines
 86          lines = textractor(Utils.PATH + "/article.pdf")
 87  
 88          # Check number of lines is as expected
 89          self.assertEqual(len(lines), 35)
 90  
 91      def testHTML(self):
 92          """
 93          Test HTML to Markdown
 94          """
 95  
 96          # Headings
 97          self.assertMarkdown("<h1>This is a test</h1>", "# This is a test")
 98          self.assertMarkdown("<h6>This is a test</h6>", "###### This is a test")
 99  
100          # Blockquotes
101          self.assertMarkdown("<blockquote>This is a test</blockquote>", "> This is a test")
102  
103          # Lists
104          self.assertMarkdown("<ul><li>Test1</li><li>Test2</li></ul>", "- Test1\n- Test2")
105          self.assertMarkdown("<ol><li>Test1</li><li>Test2</li></ol>", "1. Test1\n2. Test2")
106  
107          # Code
108          self.assertMarkdown("<code>This is a test</code>", "```\nThis is a test\n```")
109          self.assertMarkdown("<pre>This is a test</pre>", "```\nThis is a test\n```")
110  
111          # Tables
112          self.assertMarkdown(
113              "<table><tr><th>Header1</th><th>Header2</th></tr><tr><td>Test1</td><td>Test2</td></tr></table>",
114              "|Header1|Header2|\n|---|---|\n|Test1|Test2|",
115          )
116  
117          # Ignore list
118          self.assertMarkdown("<aside>This is a test</aside>", "")
119  
120          # Text formatting
121          self.assertMarkdown("<p>This is a test</p>", "This is a test")
122          self.assertMarkdown("<p>This is a <b>test</b</p>", "This is a **test**")
123          self.assertMarkdown("<p>This is a <strong>test</strong></p>", "This is a **test**")
124          self.assertMarkdown("<p>This is a <i>test</i></p>", "This is a *test*")
125          self.assertMarkdown("<p>This is a <em>test</em></p>", "This is a *test*")
126          self.assertMarkdown("<p>This is a <a href='link'>test</a>", "This is a [test](link)")
127  
128          # Collapse to outer tag
129          self.assertMarkdown("<p>This is a <strong><em>test</em></strong></p>", "This is a **test**")
130          self.assertMarkdown("<p>This is a <em><strong>test</strong></em></p>", "This is a *test*")
131  
132      def testParagraphs(self):
133          """
134          Test extraction to paragraphs
135          """
136  
137          textractor = Textractor(paragraphs=True)
138  
139          # Extract text as paragraphs
140          paragraphs = textractor(Utils.PATH + "/article.pdf")
141  
142          # Check number of paragraphs is as expected
143          self.assertEqual(len(paragraphs), 11)
144  
145      def testSections(self):
146          """
147          Test extraction to sections
148          """
149  
150          textractor = Textractor(sections=True)
151  
152          # Extract as sections
153          sections = textractor(Utils.PATH + "/document.pdf")
154  
155          # Check number of sections is as expected
156          self.assertEqual(len(sections), 3)
157  
158      def testSentences(self):
159          """
160          Test extraction to sentences
161          """
162  
163          textractor = Textractor(sentences=True)
164  
165          # Extract text as sentences
166          sentences = textractor(Utils.PATH + "/article.pdf")
167  
168          # Check number of sentences is as expected
169          self.assertEqual(len(sentences), 17)
170  
171      def testSingle(self):
172          """
173          Test a single extraction with no tokenization of the results
174          """
175  
176          textractor = Textractor()
177  
178          # Extract text as a single block
179          text = textractor(Utils.PATH + "/article.pdf")
180  
181          # Check length of text is as expected
182          self.assertEqual(len(text), 2471)
183  
184      def testTable(self):
185          """
186          Test table extraction
187          """
188  
189          textractor = Textractor()
190  
191          # Extract text as a single block
192          for name in ["document.docx", "spreadsheet.xlsx"]:
193              text = textractor(f"{Utils.PATH}/{name}")
194  
195              # Check for table header
196              self.assertTrue("|---|" in text)
197  
198      def testTikaFlag(self):
199          """
200          Test legacy tika flag
201          """
202  
203          textractor = Textractor(tika=True)
204          self.assertIsNotNone(textractor.html)
205  
206          textractor = Textractor(tika=False)
207          self.assertIsNone(textractor.html)
208  
209      def testTuples(self):
210          """
211          Test output tuples
212          """
213  
214          # Default text cleaning
215          textractor = Textractor(tuples=True)
216  
217          path, text = textractor(Utils.PATH + "/article.pdf")
218          self.assertEqual(path, Utils.PATH + "/article.pdf")
219          self.assertEqual(len(text), 2471)
220  
221      def testURL(self):
222          """
223          Test parsing a remote URL
224          """
225  
226          # Test parsing URLs for each backend
227          for backend in ["docling", "tika"]:
228              textractor = Textractor(backend=backend)
229              text = textractor("https://github.com/neuml/txtai")
230              self.assertTrue("txtai is an all-in-one AI framework" in text)
231  
232      def assertMarkdown(self, html, expected):
233          """
234          Helper method to assert generated markdown is as expected.
235  
236          Args:
237              html: input html snippet
238              expected: expected markdown text
239          """
240  
241          textractor = Textractor()
242          self.assertEqual(textractor(f"<html><body>{html}</body></html>"), expected)