tokenizer.test.ts
/**
 * Unit tests for the search tokenizer helpers.
 *
 * Each `describe` group below targets one function exported by
 * `src/lib/search/tokenizer`.
 */
import { describe, it, expect } from 'vitest';
import {
  tokenize,
  tokenizeUrl,
  normalizeToken,
  isValidToken,
  simpleStem,
  extractDomain,
  stringSimilarity,
} from '../../../src/lib/search/tokenizer';

// normalizeToken: case folding, diacritic stripping and trimming.
describe('normalizeToken', () => {
  it('converts to lowercase', () => {
    for (const raw of ['HELLO', 'HeLLo']) {
      expect(normalizeToken(raw)).toBe('hello');
    }
  });

  it('removes diacritics', () => {
    const cases = [
      ['café', 'cafe'],
      ['naïve', 'naive'],
    ] as const;

    for (const [accented, plain] of cases) {
      expect(normalizeToken(accented)).toBe(plain);
    }
  });

  it('trims whitespace', () => {
    const padded = ' hello ';
    expect(normalizeToken(padded)).toBe('hello');
  });
});

// isValidToken: minimum length, stop-word and "must contain a letter" checks.
describe('isValidToken', () => {
  it('rejects tokens shorter than 2 characters', () => {
    const cases = [
      ['a', false],
      ['ab', true],
    ] as const;

    for (const [candidate, verdict] of cases) {
      expect(isValidToken(candidate)).toBe(verdict);
    }
  });

  it('rejects stop words', () => {
    for (const stopWord of ['the', 'and', 'www']) {
      expect(isValidToken(stopWord)).toBe(false);
    }
  });

  it('rejects tokens without letters', () => {
    const cases = [
      ['123', false],
      ['12a', true],
    ] as const;

    for (const [candidate, verdict] of cases) {
      expect(isValidToken(candidate)).toBe(verdict);
    }
  });

  it('accepts valid tokens', () => {
    for (const word of ['react', 'javascript']) {
      expect(isValidToken(word)).toBe(true);
    }
  });
});

// simpleStem: suffix removal, leaving short words untouched.
describe('simpleStem', () => {
  it('removes common suffixes', () => {
    const cases = [
      ['running', 'runn'],
      ['jumped', 'jump'],
      ['tests', 'test'],
    ] as const;

    for (const [word, stem] of cases) {
      expect(simpleStem(word)).toBe(stem);
    }
  });

  it('preserves short words', () => {
    for (const shortWord of ['go', 'run']) {
      expect(simpleStem(shortWord)).toBe(shortWord);
    }
  });
});

// extractDomain: host extraction without the "www." prefix; null on bad input.
describe('extractDomain', () => {
  it('extracts domain from URL', () => {
    const cases = [
      ['https://www.example.com/path', 'example.com'],
      ['https://github.com/user/repo', 'github.com'],
    ] as const;

    for (const [url, domain] of cases) {
      expect(extractDomain(url)).toBe(domain);
    }
  });

  it('removes www prefix', () => {
    const domain = extractDomain('https://www.google.com');
    expect(domain).toBe('google.com');
  });

  it('returns null for invalid URLs', () => {
    const domain = extractDomain('not-a-url');
    expect(domain).toBeNull();
  });
});

// tokenize: free-text splitting, punctuation and stop-word filtering, dedup.
describe('tokenize', () => {
  it('splits text into tokens', () => {
    const terms = tokenize('Hello World');

    for (const word of ['hello', 'world']) {
      expect(terms).toContain(word);
    }
  });

  it('removes punctuation', () => {
    const terms = tokenize('Hello, World!');

    for (const word of ['hello', 'world']) {
      expect(terms).toContain(word);
    }
    expect(terms).not.toContain(',');
  });

  it('filters stop words', () => {
    const terms = tokenize('the quick brown fox');

    expect(terms).not.toContain('the');
    for (const word of ['quick', 'brown', 'fox']) {
      expect(terms).toContain(word);
    }
  });

  it('deduplicates tokens within same text', () => {
    const terms = tokenize('hello hello world');
    const helloOccurrences = terms.filter((term) => term === 'hello');

    expect(helloOccurrences).toHaveLength(1);
  });

  it('returns empty array for empty input', () => {
    for (const blank of ['', ' ']) {
      expect(tokenize(blank)).toEqual([]);
    }
  });
});

// tokenizeUrl: tokens drawn from the host, path segments and query string.
describe('tokenizeUrl', () => {
  it('extracts domain parts', () => {
    const terms = tokenizeUrl('https://github.com/user/repo');

    expect(terms).toContain('github');
  });

  it('extracts path segments', () => {
    const terms = tokenizeUrl('https://example.com/docs/getting-started');

    for (const segment of ['docs', 'getting', 'started']) {
      expect(terms).toContain(segment);
    }
  });

  it('handles URLs with query parameters', () => {
    const terms = tokenizeUrl('https://example.com/search?query=test');

    for (const part of ['search', 'query']) {
      expect(terms).toContain(part);
    }
  });
});

// stringSimilarity: score of 1 for equal strings, 0 when either side is empty.
describe('stringSimilarity', () => {
  it('returns 1 for identical strings', () => {
    const score = stringSimilarity('hello', 'hello');
    expect(score).toBe(1);
  });

  it('returns 0 for empty strings', () => {
    const cases = [
      ['', 'hello'],
      ['hello', ''],
    ] as const;

    for (const [left, right] of cases) {
      expect(stringSimilarity(left, right)).toBe(0);
    }
  });

  it('returns partial score for similar strings', () => {
    const score = stringSimilarity('hello', 'helo');

    expect(score).toBeGreaterThan(0.5);
    expect(score).toBeLessThan(1);
  });

  it('returns higher score when one string contains the other', () => {
    const score = stringSimilarity('react', 'reactjs');

    expect(score).toBeGreaterThan(0.5);
  });
});