// tests/unit/search/tokenizer.test.ts
  1  /**
  2   * Tests for search tokenizer
  3   */
  4  import { describe, it, expect } from 'vitest';
  5  import {
  6    tokenize,
  7    tokenizeUrl,
  8    normalizeToken,
  9    isValidToken,
 10    simpleStem,
 11    extractDomain,
 12    stringSimilarity,
 13  } from '../../../src/lib/search/tokenizer';
 14  
 15  describe('normalizeToken', () => {
 16    it('converts to lowercase', () => {
 17      expect(normalizeToken('HELLO')).toBe('hello');
 18      expect(normalizeToken('HeLLo')).toBe('hello');
 19    });
 20  
 21    it('removes diacritics', () => {
 22      expect(normalizeToken('café')).toBe('cafe');
 23      expect(normalizeToken('naïve')).toBe('naive');
 24    });
 25  
 26    it('trims whitespace', () => {
 27      expect(normalizeToken('  hello  ')).toBe('hello');
 28    });
 29  });
 30  
 31  describe('isValidToken', () => {
 32    it('rejects tokens shorter than 2 characters', () => {
 33      expect(isValidToken('a')).toBe(false);
 34      expect(isValidToken('ab')).toBe(true);
 35    });
 36  
 37    it('rejects stop words', () => {
 38      expect(isValidToken('the')).toBe(false);
 39      expect(isValidToken('and')).toBe(false);
 40      expect(isValidToken('www')).toBe(false);
 41    });
 42  
 43    it('rejects tokens without letters', () => {
 44      expect(isValidToken('123')).toBe(false);
 45      expect(isValidToken('12a')).toBe(true);
 46    });
 47  
 48    it('accepts valid tokens', () => {
 49      expect(isValidToken('react')).toBe(true);
 50      expect(isValidToken('javascript')).toBe(true);
 51    });
 52  });
 53  
 54  describe('simpleStem', () => {
 55    it('removes common suffixes', () => {
 56      expect(simpleStem('running')).toBe('runn');
 57      expect(simpleStem('jumped')).toBe('jump');
 58      expect(simpleStem('tests')).toBe('test');
 59    });
 60  
 61    it('preserves short words', () => {
 62      expect(simpleStem('go')).toBe('go');
 63      expect(simpleStem('run')).toBe('run');
 64    });
 65  });
 66  
 67  describe('extractDomain', () => {
 68    it('extracts domain from URL', () => {
 69      expect(extractDomain('https://www.example.com/path')).toBe('example.com');
 70      expect(extractDomain('https://github.com/user/repo')).toBe('github.com');
 71    });
 72  
 73    it('removes www prefix', () => {
 74      expect(extractDomain('https://www.google.com')).toBe('google.com');
 75    });
 76  
 77    it('returns null for invalid URLs', () => {
 78      expect(extractDomain('not-a-url')).toBe(null);
 79    });
 80  });
 81  
 82  describe('tokenize', () => {
 83    it('splits text into tokens', () => {
 84      const tokens = tokenize('Hello World');
 85      expect(tokens).toContain('hello');
 86      expect(tokens).toContain('world');
 87    });
 88  
 89    it('removes punctuation', () => {
 90      const tokens = tokenize('Hello, World!');
 91      expect(tokens).toContain('hello');
 92      expect(tokens).toContain('world');
 93      expect(tokens).not.toContain(',');
 94    });
 95  
 96    it('filters stop words', () => {
 97      const tokens = tokenize('the quick brown fox');
 98      expect(tokens).not.toContain('the');
 99      expect(tokens).toContain('quick');
100      expect(tokens).toContain('brown');
101      expect(tokens).toContain('fox');
102    });
103  
104    it('deduplicates tokens within same text', () => {
105      const tokens = tokenize('hello hello world');
106      expect(tokens.filter((t) => t === 'hello')).toHaveLength(1);
107    });
108  
109    it('returns empty array for empty input', () => {
110      expect(tokenize('')).toEqual([]);
111      expect(tokenize('   ')).toEqual([]);
112    });
113  });
114  
115  describe('tokenizeUrl', () => {
116    it('extracts domain parts', () => {
117      const tokens = tokenizeUrl('https://github.com/user/repo');
118      expect(tokens).toContain('github');
119    });
120  
121    it('extracts path segments', () => {
122      const tokens = tokenizeUrl('https://example.com/docs/getting-started');
123      expect(tokens).toContain('docs');
124      expect(tokens).toContain('getting');
125      expect(tokens).toContain('started');
126    });
127  
128    it('handles URLs with query parameters', () => {
129      const tokens = tokenizeUrl('https://example.com/search?query=test');
130      expect(tokens).toContain('search');
131      expect(tokens).toContain('query');
132    });
133  });
134  
135  describe('stringSimilarity', () => {
136    it('returns 1 for identical strings', () => {
137      expect(stringSimilarity('hello', 'hello')).toBe(1);
138    });
139  
140    it('returns 0 for empty strings', () => {
141      expect(stringSimilarity('', 'hello')).toBe(0);
142      expect(stringSimilarity('hello', '')).toBe(0);
143    });
144  
145    it('returns partial score for similar strings', () => {
146      const similarity = stringSimilarity('hello', 'helo');
147      expect(similarity).toBeGreaterThan(0.5);
148      expect(similarity).toBeLessThan(1);
149    });
150  
151    it('returns higher score when one string contains the other', () => {
152      const containsSimilarity = stringSimilarity('react', 'reactjs');
153      expect(containsSimilarity).toBeGreaterThan(0.5);
154    });
155  });