# simple.py
"""Simple approximation tokenizer implementation."""

# NOTE: the module docstring must precede ``from __future__`` imports
# (PEP 236); otherwise the string above is a discarded expression and
# the module has no docstring.
from __future__ import annotations

from typing import Any

from pydantic import ConfigDict

from .base_tokenizer import Tokenizer


class SimpleTokenizer(Tokenizer):
    """Simple tokenizer that uses character-based approximation for token counting.

    This tokenizer doesn't require any external API and works independently
    of the embedding model. It uses a fast approximation
    (``CHARS_PER_TOKEN`` characters per token; 4 by default).
    """

    model_config = ConfigDict(extra='allow')

    # Approximation ratio used by count_tokens(). Kept as a class attribute
    # (rather than a hard-coded literal) so subclasses can tune the estimate
    # without re-implementing count_tokens(). Default preserves the original
    # 4-chars-per-token behavior.
    CHARS_PER_TOKEN: int = 4

    def __init__(
        self,
        max_tokens: int = 2048,
        chars_per_token_ratio: float | None = None,
        split_buffer_size: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the approximation tokenizer.

        Parameters
        ----------
        max_tokens
            Maximum number of tokens per chunk (default: 2048).
        chars_per_token_ratio
            Ratio of characters to tokens for threshold estimation.
        split_buffer_size
            Number of words to buffer before checking limits.
        **kwargs
            Extra keyword arguments forwarded to the ``Tokenizer`` base class.
        """
        super().__init__(
            chars_per_token_ratio=chars_per_token_ratio,
            split_buffer_size=split_buffer_size,
            **kwargs,
        )
        self.max_tokens = max_tokens

    def count_tokens(self, text: str) -> int:
        """Count tokens using character-based approximation.

        Parameters
        ----------
        text
            The text to count tokens for.

        Returns
        -------
        Number of tokens, approximated as ``CHARS_PER_TOKEN`` characters
        per token (0 for the empty string).
        """
        return len(text) // self.CHARS_PER_TOKEN

    def get_max_tokens(self) -> int:
        """Return the maximum tokens allowed per chunk."""
        return self.max_tokens

    def _hash_attributes(self) -> tuple:
        """Return hashable attributes that uniquely identify this tokenizer.

        Returns
        -------
        Tuple containing class type and configuration attributes.
        """
        # chars_per_token_ratio / split_buffer_size are presumably set by the
        # Tokenizer base __init__ — not visible here; verify in base_tokenizer.
        return (
            type(self),
            self.max_tokens,
            self.chars_per_token_ratio,
            self.split_buffer_size,
        )


def create_simple_tokenizer(config: dict[str, Any]) -> Tokenizer:
    """Create a simple approximation tokenizer from configuration.

    Parameters
    ----------
    config
        Configuration dictionary with keys:
        - max_tokens: int (optional) - Maximum tokens per chunk (default: 2048)
        - chars_per_token_ratio: float (optional) - Char-to-token ratio for threshold (default: 1.5)
        - split_buffer_size: int (optional) - Words to buffer before checking limits (default: 5)

    Returns
    -------
    Tokenizer instance.
    """
    return SimpleTokenizer(
        max_tokens=config.get("max_tokens", 2048),
        chars_per_token_ratio=config.get("chars_per_token_ratio"),
        split_buffer_size=config.get("split_buffer_size"),
    )