# src/chunkers/tokenizers/simple.py
  1  from __future__ import annotations
  2  
  3  """Simple approximation tokenizer implementation."""
  4  
  5  from typing import Any
  6  
  7  from pydantic import ConfigDict
  8  
  9  from .base_tokenizer import Tokenizer
 10  
 11  
 12  class SimpleTokenizer(Tokenizer):
 13      """Simple tokenizer that uses character-based approximation for token counting.
 14  
 15      This tokenizer doesn't require any external API and works independently
 16      of the embedding model. It uses a fast approximation (4 chars per token).
 17      """
 18  
 19      model_config = ConfigDict(extra='allow')
 20  
 21      def __init__(
 22          self,
 23          max_tokens: int = 2048,
 24          chars_per_token_ratio: float | None = None,
 25          split_buffer_size: int | None = None,
 26          **kwargs
 27      ):
 28          """Initialize the approximation tokenizer.
 29  
 30          Parameters
 31          ----------
 32          max_tokens
 33              Maximum number of tokens per chunk (default: 2048).
 34          chars_per_token_ratio
 35              Ratio of characters to tokens for threshold estimation.
 36          split_buffer_size
 37              Number of words to buffer before checking limits.
 38          """
 39          super().__init__(
 40              chars_per_token_ratio=chars_per_token_ratio,
 41              split_buffer_size=split_buffer_size,
 42              **kwargs
 43          )
 44          self.max_tokens = max_tokens
 45  
 46      def count_tokens(self, text: str) -> int:
 47          """Count tokens using character-based approximation.
 48  
 49          Parameters
 50          ----------
 51          text
 52              The text to count tokens for.
 53  
 54          Returns
 55          -------
 56          Number of tokens (approximated as 4 characters per token).
 57          """
 58          return len(text) // 4
 59  
 60      def get_max_tokens(self) -> int:
 61          """Returns the maximum tokens allowed per chunk."""
 62          return self.max_tokens
 63  
 64      def _hash_attributes(self) -> tuple:
 65          """Return hashable attributes that uniquely identify this tokenizer.
 66  
 67          Returns
 68          -------
 69          Tuple containing class type and configuration attributes.
 70          """
 71          return (
 72              type(self),
 73              self.max_tokens,
 74              self.chars_per_token_ratio,
 75              self.split_buffer_size,
 76          )
 77  
 78  def create_simple_tokenizer(config: dict[str, Any]) -> Tokenizer:
 79      """Create a simple approximation tokenizer from configuration.
 80  
 81      Parameters
 82      ----------
 83      config
 84          Configuration dictionary with keys:
 85          - max_tokens: int (optional) - Maximum tokens per chunk (default: 2048)
 86          - chars_per_token_ratio: float (optional) - Char-to-token ratio for threshold (default: 1.5)
 87          - split_buffer_size: int (optional) - Words to buffer before checking limits (default: 5)
 88  
 89      Returns
 90      -------
 91      Tokenizer instance.
 92      """
 93      max_tokens = config.get("max_tokens", 2048)
 94      chars_per_token_ratio = config.get("chars_per_token_ratio")
 95      split_buffer_size = config.get("split_buffer_size")
 96      return SimpleTokenizer(
 97          max_tokens=max_tokens,
 98          chars_per_token_ratio=chars_per_token_ratio,
 99          split_buffer_size=split_buffer_size,
100      )
101