"""
Tokenizer module tests
"""

import unittest

from txtai.pipeline import Tokenizer


class TestTokenizer(unittest.TestCase):
    """
    Tokenizer tests.
    """

    def testAlphanumTokenize(self):
        """
        Test alphanumeric tokenization
        """

        # Alphanumeric tokenization through the backwards-compatible static method
        self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"])
        self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"])
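
        # The assertions above document the legacy filtering behavior: text is
        # lowercased and tokens such as stop words and numeric-only strings are
        # removed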

    def testEmptyTokenize(self):
        """
        Test handling empty and None inputs
        """

        # Test that the tokenizer handles empty or None strings
        self.assertEqual(Tokenizer.tokenize(""), [])
        self.assertEqual(Tokenizer.tokenize(None), None)
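
        # None passes through unchanged rather than raising an error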

    def testStandardTokenize(self):
        """
        Test standard tokenization
        """

        # Default standard tokenizer parameters
        tokenizer = Tokenizer()

        # Define token tests as (input text, expected tokens) pairs
        tests = [
            ("Y this is a test!", ["y", "this", "is", "a", "test"]),
            ("abc123 ABC 123", ["abc123", "abc", "123"]),
            ("Testing hy-phenated words", ["testing", "hy", "phenated", "words"]),
            ("111-111-1111", ["111", "111", "1111"]),
            ("Test.1234", ["test", "1234"]),
        ]

        # Run through tests
        for test, result in tests:
            # Standard tokenization applies Unicode Text Segmentation per Unicode Annex #29
            self.assertEqual(tokenizer(test), result)
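

# Standard unittest entry point, so this module can also be run directly
if __name__ == "__main__":
    unittest.main()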