/ src / chunkers / tokenizers / factory.py
factory.py
 1  from __future__ import annotations
 2  
 3  """Factory for creating tokenizer implementations."""
 4  
 5  from collections.abc import Callable
 6  from typing import Any, ClassVar
 7  
 8  from .base_tokenizer import Tokenizer
 9  from .gemini import create_gemini_tokenizer
10  from .simple import create_simple_tokenizer
11  from .types import TokenizerType
12  
13  
class TokenizerFactory:
    """Registry-backed factory for tokenizer implementations.

    Factory callables are kept in a class-level registry keyed by tokenizer
    type. Add new tokenizers with :meth:`register` and build instances with
    :meth:`create`.
    """

    # Maps a tokenizer type to the callable that builds the tokenizer.
    _registry: ClassVar[dict[TokenizerType, Callable[[dict[str, Any]], Tokenizer]]] = {}

    @classmethod
    def register(
        cls, tokenizer_type: TokenizerType
    ) -> Callable[
        [Callable[[dict[str, Any]], Tokenizer]],
        Callable[[dict[str, Any]], Tokenizer],
    ]:
        """Register a new tokenizer factory.

        Registering a type that is already present replaces the previous
        factory.

        Parameters
        ----------
        tokenizer_type
            Type to register the tokenizer under.

        Returns
        -------
        Decorator that stores the decorated callable in the registry and
        returns it unchanged.
        """
        def decorator(
            factory_func: Callable[[dict[str, Any]], Tokenizer],
        ) -> Callable[[dict[str, Any]], Tokenizer]:
            cls._registry[tokenizer_type] = factory_func
            return factory_func
        return decorator

    @classmethod
    def create(cls, tokenizer_type: TokenizerType, **kwargs: Any) -> Tokenizer:
        """Create a tokenizer by type.

        Parameters
        ----------
        tokenizer_type
            Type of the tokenizer to create.
        **kwargs
            Additional arguments; they are collected into a single dict and
            passed to the registered factory as its only positional argument.

        Returns
        -------
        Tokenizer instance.

        Raises
        ------
        ValueError
            If no factory is registered for ``tokenizer_type``.
        """
        # Only the lookup is guarded, so a KeyError raised inside a factory
        # is never misreported as an unknown tokenizer type.
        try:
            factory_func = cls._registry[tokenizer_type]
        except KeyError:
            # str() keeps the message buildable when enum values are not
            # strings; otherwise join() would raise TypeError and mask the
            # intended ValueError.
            available = ", ".join(str(t.value) for t in cls._registry)
            raise ValueError(
                f"Unknown tokenizer: {tokenizer_type}. "
                f"Available tokenizers: {available}"
            ) from None
        return factory_func(kwargs)
55  
56  
# Register the built-in tokenizers at import time, in declaration order.
for _tokenizer_type, _factory_func in (
    (TokenizerType.SIMPLE, create_simple_tokenizer),
    (TokenizerType.GEMINI, create_gemini_tokenizer),
):
    TokenizerFactory.register(_tokenizer_type)(_factory_func)
# Drop the loop variables so the module namespace is unchanged.
del _tokenizer_type, _factory_func
59