Source code for lexilux.tokenizer

"""
Tokenizer API client (optional dependency on transformers).

Provides local tokenization with support for offline/online modes and automatic model downloading.
"""

from __future__ import annotations

from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Any

from lexilux.usage import Json, ResultBase, Usage

if TYPE_CHECKING:
    pass


class TokenizeResult(ResultBase):
    """
    Container for the outcome of a tokenization call.

    Attributes:
        input_ids: Token ID sequences, one inner list per input text
            (List[List[int]] for batch input).
        attention_mask: Attention mask sequences matching ``input_ids``
            (List[List[int]] for batch input), or None when not requested.
        usage: Usage statistics (at least input_tokens is provided).
        raw: Raw tokenizer output.

    Examples:
        >>> result = tokenizer("Hello, world!")
        >>> print(result.input_ids)  # [[15496, 11, 1917, 0]]
        >>> print(result.usage.input_tokens)  # 4
    """

    def __init__(
        self,
        *,
        input_ids: list[list[int]],
        attention_mask: list[list[int]] | None,
        usage: Usage,
        raw: Json | None = None,
    ):
        """
        Build a TokenizeResult.

        Args:
            input_ids: Token ID sequences.
            attention_mask: Attention mask sequences, or None.
            usage: Usage statistics, forwarded to ResultBase.
            raw: Raw tokenizer output, forwarded to ResultBase.
        """
        # The base class receives the shared fields (usage / raw).
        super().__init__(usage=usage, raw=raw)
        self.attention_mask = attention_mask
        self.input_ids = input_ids

    def __repr__(self) -> str:
        """Return a compact summary: number of sequences plus usage."""
        sequence_count = len(self.input_ids)
        return f"TokenizeResult(input_ids=[{sequence_count} sequences], usage={self.usage!r})"
class Tokenizer:
    """
    Tokenizer client (uses transformers library).

    Provides local tokenization with support for:

    - Offline mode (offline=True): Only uses local cache, fails if model not found
    - Online mode (offline=False): Prioritizes local cache, downloads if not found

    Examples:
        >>> # Offline mode (for air-gapped environments)
        >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct", offline=True, cache_dir="/models/hf")
        >>> result = tokenizer("Hello, world!")
        >>> print(result.usage.input_tokens)

        >>> # Online mode (default, downloads if needed)
        >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct", offline=False)
        >>> result = tokenizer("Hello, world!")
    """

    def __init__(
        self,
        model: str,
        *,
        cache_dir: str | None = None,
        offline: bool = False,
        revision: str | None = None,
        trust_remote_code: bool = False,
        require_transformers: bool = True,
    ):
        """
        Initialize Tokenizer client.

        Args:
            model: HuggingFace model identifier (e.g., "Qwen/Qwen2.5-7B-Instruct").
            cache_dir: Directory to cache models (defaults to HuggingFace cache).
                Supports "~" for home directory expansion.
            offline: If True, only use local cache (fail if not found).
                If False, prioritize local cache, download if not found.
            revision: Model revision/branch/tag (optional).
            trust_remote_code: Whether to allow remote code execution.
            require_transformers: If True, raise error immediately if transformers
                is not installed. If False, delay error until first use.

        Raises:
            ImportError: If transformers is not installed and require_transformers=True.
        """
        self.model = model
        # Expand "~" to the home directory if cache_dir is provided.
        self.cache_dir = str(Path(cache_dir).expanduser()) if cache_dir else None
        self.offline = offline
        self.revision = revision
        self.trust_remote_code = trust_remote_code
        self.require_transformers = require_transformers

        # The tokenizer itself is loaded lazily on first use.
        self._tokenizer = None
        self._transformers_available = False

        # Probe for transformers now so a missing dependency surfaces early.
        try:
            import transformers  # noqa: F401

            self._transformers_available = True
        except ImportError as e:
            if require_transformers:
                raise ImportError(
                    "transformers library is required for Tokenizer. "
                    "Install it with: pip install lexilux[tokenizer] (or lexilux[token]) or pip install transformers"
                ) from e
            # If require_transformers=False, we'll check again on first use.

    @staticmethod
    def list_tokenizer_files(
        model: str,
        *,
        revision: str | None = None,
    ) -> list[str]:
        """
        List tokenizer-related files for a given model.

        This method queries the HuggingFace Hub to identify which files are
        needed for tokenization, without downloading them.

        Args:
            model: HuggingFace model identifier (e.g., "Qwen/Qwen2.5-7B-Instruct").
            revision: Model revision/branch/tag (optional).

        Returns:
            Sorted list of repository file paths that are tokenizer-related.

        Raises:
            ImportError: If huggingface_hub is not installed.
            Exception: If unable to list files from HuggingFace Hub.

        Example:
            >>> files = Tokenizer.list_tokenizer_files("Qwen/Qwen2.5-7B-Instruct")
            >>> print(files)
            ['tokenizer.json', 'tokenizer_config.json', 'vocab.json', 'merges.txt', ...]
        """
        try:
            from huggingface_hub import list_repo_files
        except ImportError as e:
            raise ImportError(
                "huggingface_hub library is required to list tokenizer files. "
                "Install it with: pip install huggingface-hub"
            ) from e

        # List all files in the repository.
        all_files = list_repo_files(
            repo_id=model,
            revision=revision,
        )

        # Common tokenizer file name fragments. Matching is by substring on the
        # base name, so e.g. "config.json" also matches "tokenizer_config.json".
        tokenizer_patterns = (
            "tokenizer.json",
            "tokenizer_config.json",
            "vocab.json",
            "merges.txt",
            "special_tokens_map.json",
            "added_tokens.json",
            "preprocessor_config.json",
            "config.json",  # Model config (may contain tokenizer info)
        )

        tokenizer_files = [
            repo_file
            for repo_file in all_files
            # Keep a file when its base name contains a known fragment, or when
            # it lives in a "tokenizer/" subdirectory.
            if any(pattern in Path(repo_file).name for pattern in tokenizer_patterns)
            or repo_file.startswith("tokenizer/")
        ]
        return sorted(tokenizer_files)

    def _resolve_cached_commit(self, model_cache_path: Path) -> str | None:
        """Map ``self.revision`` to the snapshot directory name to look for, or None if no revision is set."""
        if self.revision is None:
            return None
        # Branches/tags are stored as refs/<name> files holding a commit hash.
        ref_file = model_cache_path / "refs" / self.revision
        if ref_file.is_file():
            commit = ref_file.read_text(encoding="utf-8", errors="replace").strip()
            if commit:
                return commit
        # No usable ref: treat the revision itself as a commit hash.
        return self.revision

    def _check_tokenizer_files_exist(self) -> tuple[bool, str | None]:
        """
        Check if tokenizer files exist in cache using direct filesystem check.

        This avoids potential network requests or hanging from huggingface_hub
        functions. When ``self.revision`` is set, only the snapshot belonging
        to that revision is accepted; otherwise any cached snapshot qualifies.

        Returns:
            Tuple of (files_exist: bool, cache_path: str | None)
        """
        try:
            # Determine cache directory.
            if self.cache_dir:
                base_cache_dir = Path(self.cache_dir)
            else:
                # Use default HuggingFace cache location.
                try:
                    from huggingface_hub import HF_HUB_CACHE

                    base_cache_dir = Path(HF_HUB_CACHE)
                except ImportError:
                    # Fallback to common default location.
                    base_cache_dir = Path.home() / ".cache" / "huggingface" / "hub"

            # HuggingFace cache structure:
            #   models--{org}--{name}/refs/{revision}        -> commit hash
            #   models--{org}--{name}/snapshots/{commit}/    -> files
            model_cache_name = self.model.replace("/", "--")
            model_cache_path = base_cache_dir / f"models--{model_cache_name}"
            snapshots_dir = model_cache_path / "snapshots"
            if not model_cache_path.exists() or not snapshots_dir.exists():
                return False, None

            commit = self._resolve_cached_commit(model_cache_path)
            if commit is not None:
                # A specific revision was requested: never substitute another one.
                candidates = [snapshots_dir / commit]
            else:
                # Sorted so the chosen snapshot is deterministic.
                candidates = sorted(snapshots_dir.iterdir())

            # A snapshot counts as usable if it has at least one of these files.
            required_files = ("tokenizer_config.json", "tokenizer.json")
            for snapshot_dir in candidates:
                if snapshot_dir.is_dir() and any(
                    (snapshot_dir / filename).exists() for filename in required_files
                ):
                    return True, str(snapshot_dir)

            return False, None
        except (OSError, ImportError, AttributeError) as e:
            # If filesystem check fails for any reason, fail fast.
            import warnings

            warnings.warn(
                f"Failed to check local cache for model '{self.model}': {e}. "
                "Assuming files don't exist.",
                UserWarning,
            )
            return False, None

    def _ensure_model_downloaded(self) -> str:
        """
        Ensure tokenizer files are available.

        In offline mode: Check for necessary files and throw exception if not found.
        In online mode: Check files first, only download if missing.

        Returns:
            Path of a local snapshot directory, or the bare model identifier
            when huggingface_hub is unavailable in online mode.

        Raises:
            OSError: If files are missing in offline mode, or the download fails.
        """
        # First, check if tokenizer files already exist in cache.
        files_exist, cached_path = self._check_tokenizer_files_exist()

        if self.offline:
            # Offline mode: files must exist, throw exception if not found.
            if not files_exist:
                raise OSError(
                    f"Model '{self.model}' tokenizer files not found in cache. "
                    f"Offline mode requires tokenizer files to be pre-downloaded. "
                    f"Cache dir: {self.cache_dir or 'default HuggingFace cache'}. "
                    f"Please download the model first in online mode."
                )
            # Files exist, return cached path or model ID.
            return cached_path if cached_path else self.model

        # Online mode: if files exist, use them directly.
        if files_exist and cached_path:
            return cached_path

        # Files don't exist or couldn't be found, need to download.
        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            # If huggingface_hub is not available, let AutoTokenizer handle downloads.
            return self.model

        try:
            # Download only files relevant for tokenization, ignoring model weights.
            # The returned path points to a local directory with the downloaded files.
            return snapshot_download(
                repo_id=self.model,
                cache_dir=self.cache_dir,
                revision=self.revision,
                local_files_only=False,  # Ensure we download if not present
                allow_patterns=["tokenizer*", "*.json", "*.txt", "vocab.*", "merges.*"],
                ignore_patterns=["*.safetensors", "*.bin", "*.pt", "*.onnx"],
            )
        except (OSError, ValueError) as e:
            # If download fails, provide a clear error message.
            raise OSError(
                f"Failed to download tokenizer files for model '{self.model}'. "
                f"Cache dir: {self.cache_dir or 'default'}. Error: {e}"
            ) from e

    def _ensure_tokenizer(self):
        """
        Ensure tokenizer is loaded (lazy loading).

        Uses the local_files_only parameter instead of environment variables,
        so no global state is affected.

        Raises:
            ImportError: If transformers is not available.
            OSError: If model cannot be loaded (e.g., offline mode and model not found).
        """
        if self._tokenizer is not None:
            return

        # Re-check transformers availability (it may have been deferred).
        if not self._transformers_available:
            try:
                import transformers  # noqa: F401

                self._transformers_available = True
            except ImportError as e:
                raise ImportError(
                    "transformers library is required for Tokenizer. "
                    "Install it with: pip install lexilux[tokenizer] or pip install transformers"
                ) from e

        from transformers import AutoTokenizer

        # Ensure model is downloaded (if needed).
        model_path = self._ensure_model_downloaded()

        # Downloads are normally handled by _ensure_model_downloaded(), which then
        # returns a local snapshot directory; in that case (and always in offline
        # mode) AutoTokenizer must not touch the network. The only time it is
        # allowed to download is the online fallback where huggingface_hub could
        # not be imported and a bare repo id was returned.
        local_only = self.offline or Path(model_path).is_dir()

        try:
            self._tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                cache_dir=self.cache_dir,
                revision=self.revision,
                trust_remote_code=self.trust_remote_code,
                local_files_only=local_only,
            )
        except (OSError, ValueError) as e:
            # This should rarely happen since _ensure_model_downloaded()
            # already verified files exist, but provide helpful error anyway.
            mode_text = "offline" if self.offline else "online"
            raise OSError(
                f"Failed to load tokenizer for model '{self.model}' in {mode_text} mode. "
                f"Files should have been prepared by _ensure_model_downloaded(). "
                f"Cache dir: {self.cache_dir or 'default HuggingFace cache'}. "
                f"Original error: {e}"
            ) from e

    def __call__(
        self,
        text: str | Sequence[str],
        *,
        add_special_tokens: bool = True,
        truncation: bool | str = False,
        max_length: int | None = None,
        padding: bool | str = False,
        return_attention_mask: bool = True,
        extra: Json | None = None,
        return_raw: bool = False,
    ) -> TokenizeResult:
        """
        Tokenize text.

        Args:
            text: Single text string or sequence of text strings.
            add_special_tokens: Whether to add special tokens (e.g., BOS, EOS).
            truncation: Truncation strategy (True, False, or "longest_first", etc.).
            max_length: Maximum sequence length.
            padding: Padding strategy (True, False, or "max_length", etc.).
            return_attention_mask: Whether to return attention mask.
            extra: Additional tokenizer parameters; these override the named
                arguments above when keys collide.
            return_raw: Whether to include raw tokenizer output.

        Returns:
            TokenizeResult with input_ids, attention_mask, and usage.

        Raises:
            ImportError: If transformers is not available.
            OSError: If model cannot be loaded (offline mode).
            ValueError: If an empty sequence of texts is passed.
        """
        # Ensure tokenizer is loaded.
        self._ensure_tokenizer()

        # Normalize input to a list so single and batch input share one path.
        text_list = [text] if isinstance(text, str) else list(text)
        if not text_list:
            raise ValueError("Text cannot be empty")

        # Prepare tokenizer arguments.
        tokenizer_kwargs: dict[str, Any] = {
            "add_special_tokens": add_special_tokens,
            "truncation": truncation,
            "padding": padding,
            "return_attention_mask": return_attention_mask,
        }
        if max_length is not None:
            tokenizer_kwargs["max_length"] = max_length
        if extra:
            tokenizer_kwargs.update(extra)

        # Tokenize.
        encoded = self._tokenizer(text_list, **tokenizer_kwargs)

        # Extract results.
        input_ids = encoded["input_ids"]
        attention_mask = encoded.get("attention_mask") if return_attention_mask else None

        # Usage is the total number of token IDs across all sequences.
        total_tokens = sum(len(ids) for ids in input_ids)
        usage = Usage(
            input_tokens=total_tokens,
            output_tokens=None,  # Not applicable for tokenization
            total_tokens=total_tokens,
        )

        return TokenizeResult(
            input_ids=input_ids,
            attention_mask=attention_mask,
            usage=usage,
            raw=encoded if return_raw else {},
        )