# factory.py
"""Factory for creating tokenizer implementations."""

from __future__ import annotations

from collections.abc import Callable
from typing import Any, ClassVar

from .base_tokenizer import Tokenizer
from .gemini import create_gemini_tokenizer
from .simple import create_simple_tokenizer
from .types import TokenizerType


class TokenizerFactory:
    """Factory for creating tokenizer implementations. Easily extensible.

    Factory callables are stored in a class-level registry keyed by
    ``TokenizerType``. Each callable receives a single ``dict`` holding the
    keyword arguments given to :meth:`create` and returns a ``Tokenizer``.
    """

    # Shared by the class and all subclasses; populated through ``register``.
    _registry: ClassVar[dict[TokenizerType, Callable[[dict[str, Any]], Tokenizer]]] = {}

    @classmethod
    def register(
        cls, tokenizer_type: TokenizerType
    ) -> Callable[
        [Callable[[dict[str, Any]], Tokenizer]],
        Callable[[dict[str, Any]], Tokenizer],
    ]:
        """Register a new tokenizer factory.

        Registering a type that is already present replaces the previous
        factory callable.

        Parameters
        ----------
        tokenizer_type
            Type to register the tokenizer under.

        Returns
        -------
        A decorator that stores the decorated callable in the registry and
        returns that callable unchanged.
        """

        def decorator(
            factory_func: Callable[[dict[str, Any]], Tokenizer],
        ) -> Callable[[dict[str, Any]], Tokenizer]:
            cls._registry[tokenizer_type] = factory_func
            return factory_func

        return decorator

    @classmethod
    def create(cls, tokenizer_type: TokenizerType, **kwargs: Any) -> Tokenizer:
        """Create a tokenizer by type.

        Parameters
        ----------
        tokenizer_type
            Type of the tokenizer to create.
        **kwargs
            Additional arguments passed to the tokenizer factory. They are
            handed over as one ``dict``, not unpacked.

        Returns
        -------
        Tokenizer instance.

        Raises
        ------
        ValueError
            If no factory is registered for ``tokenizer_type``.
        """
        # Only the registry lookup is guarded, so a KeyError raised inside the
        # factory callable itself is never reported as an unknown tokenizer.
        try:
            factory_func = cls._registry[tokenizer_type]
        except KeyError:
            available = ", ".join(t.value for t in cls._registry)
            raise ValueError(
                f"Unknown tokenizer: {tokenizer_type}. "
                f"Available tokenizers: {available}"
            ) from None
        return factory_func(kwargs)


# Register the built-in tokenizers at import time so they are available
# as soon as this module is imported.
TokenizerFactory.register(TokenizerType.SIMPLE)(create_simple_tokenizer)
TokenizerFactory.register(TokenizerType.GEMINI)(create_gemini_tokenizer)