testtokenizer.py
1 """ 2 Tokenizer module tests 3 """ 4 5 import unittest 6 7 from txtai.pipeline import Tokenizer 8 9 10 class TestTokenizer(unittest.TestCase): 11 """ 12 Tokenizer tests. 13 """ 14 15 def testAlphanumTokenize(self): 16 """ 17 Test alphanumeric tokenization 18 """ 19 20 # Alphanumeric tokenization through backwards compatible static method 21 self.assertEqual(Tokenizer.tokenize("Y this is a test!"), ["test"]) 22 self.assertEqual(Tokenizer.tokenize("abc123 ABC 123"), ["abc123", "abc"]) 23 24 def testEmptyTokenize(self): 25 """ 26 Test handling empty and None inputs 27 """ 28 29 # Test that parser can handle empty or None strings 30 self.assertEqual(Tokenizer.tokenize(""), []) 31 self.assertEqual(Tokenizer.tokenize(None), None) 32 33 def testStandardTokenize(self): 34 """ 35 Test standard tokenization 36 """ 37 38 # Default standard tokenizer parameters 39 tokenizer = Tokenizer() 40 41 # Define token tests 42 tests = [ 43 ("Y this is a test!", ["y", "this", "is", "a", "test"]), 44 ("abc123 ABC 123", ["abc123", "abc", "123"]), 45 ("Testing hy-phenated words", ["testing", "hy", "phenated", "words"]), 46 ("111-111-1111", ["111", "111", "1111"]), 47 ("Test.1234", ["test", "1234"]), 48 ] 49 50 # Run through tests 51 for test, result in tests: 52 # Unicode Text Segmentation per Unicode Annex #29 53 self.assertEqual(tokenizer(test), result)