"""
Tokenizer API client (optional dependency on transformers).
Provides local tokenization with support for offline/online modes and automatic model downloading.
"""
from __future__ import annotations
from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Any
from lexilux.usage import Json, ResultBase, Usage
if TYPE_CHECKING:
pass
[docs]
class TokenizeResult(ResultBase):
    """
    Container for the outcome of a tokenization call.

    Attributes:
        input_ids: Token ID sequences, one inner list per input text.
        attention_mask: Attention mask sequences matching ``input_ids``,
            or ``None`` when no mask was requested.
        usage: Usage statistics forwarded to ``ResultBase``.
        raw: Raw tokenizer output forwarded to ``ResultBase``.

    Examples:
        >>> result = tokenizer("Hello, world!")
        >>> print(result.input_ids)  # [[15496, 11, 1917, 0]]
        >>> print(result.usage.input_tokens)  # 4
    """

    def __init__(
        self,
        *,
        input_ids: list[list[int]],
        attention_mask: list[list[int]] | None,
        usage: Usage,
        raw: Json | None = None,
    ):
        """
        Build a result from already-computed tokenizer output.

        Args:
            input_ids: Token ID sequences.
            attention_mask: Attention mask sequences, or ``None``.
            usage: Usage statistics for this call.
            raw: Raw tokenizer output (optional).
        """
        # The base class stores the shared bookkeeping (usage and raw).
        super().__init__(usage=usage, raw=raw)
        self.attention_mask = attention_mask
        self.input_ids = input_ids

    def __repr__(self) -> str:
        """Return a compact summary: number of sequences plus usage."""
        sequence_count = len(self.input_ids)
        return f"TokenizeResult(input_ids=[{sequence_count} sequences], usage={self.usage!r})"
[docs]
class Tokenizer:
    """
    Tokenizer client (uses the optional ``transformers`` library).

    Provides local tokenization with support for:

    - Offline mode (offline=True): Only uses local cache, fails if model not found
    - Online mode (offline=False): Prioritizes local cache, downloads if not found

    Examples:
        >>> # Offline mode (for air-gapped environments)
        >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct", offline=True, cache_dir="/models/hf")
        >>> result = tokenizer("Hello, world!")
        >>> print(result.usage.input_tokens)
        >>> # Online mode (default, downloads if needed)
        >>> tokenizer = Tokenizer("Qwen/Qwen2.5-7B-Instruct", offline=False)
        >>> result = tokenizer("Hello, world!")
    """

    # A cached snapshot is considered usable when it holds at least one of these.
    _REQUIRED_TOKENIZER_FILES = ("tokenizer_config.json", "tokenizer.json")

    # Substrings looked for in a repository file's base name by
    # list_tokenizer_files(). Matching is by containment, so "config.json"
    # also selects e.g. "generation_config.json".
    _TOKENIZER_FILE_PATTERNS = (
        "tokenizer.json",
        "tokenizer_config.json",
        "vocab.json",
        "merges.txt",
        "special_tokens_map.json",
        "added_tokens.json",
        "preprocessor_config.json",
        "config.json",  # Model config (may contain tokenizer info)
    )

    def __init__(
        self,
        model: str,
        *,
        cache_dir: str | None = None,
        offline: bool = False,
        revision: str | None = None,
        trust_remote_code: bool = False,
        require_transformers: bool = True,
    ):
        """
        Initialize Tokenizer client.

        The underlying tokenizer is not loaded here; loading happens lazily on
        the first call.

        Args:
            model: HuggingFace model identifier (e.g., "Qwen/Qwen2.5-7B-Instruct").
            cache_dir: Directory to cache models (defaults to HuggingFace cache).
                Supports "~" for home directory expansion. An empty string is
                treated like ``None``.
            offline: If True, only use local cache (fail if not found).
                If False, prioritize local cache, download if not found.
            revision: Model revision/branch/tag (optional).
            trust_remote_code: Whether to allow remote code execution.
            require_transformers: If True, raise error immediately if transformers
                is not installed. If False, delay error until first use.

        Raises:
            ImportError: If transformers is not installed and require_transformers=True.
        """
        self.model = model
        # Expand "~" to home directory if cache_dir is provided
        self.cache_dir = str(Path(cache_dir).expanduser()) if cache_dir else None
        self.offline = offline
        self.revision = revision
        self.trust_remote_code = trust_remote_code
        self.require_transformers = require_transformers
        # Populated lazily by _ensure_tokenizer()
        self._tokenizer: Any = None
        self._transformers_available = False
        try:
            import transformers  # noqa: F401
        except ImportError as exc:
            if require_transformers:
                raise ImportError(
                    "transformers library is required for Tokenizer. "
                    "Install it with: pip install lexilux[tokenizer] (or lexilux[token]) or pip install transformers"
                ) from exc
            # With require_transformers=False the check is repeated on first use.
        else:
            self._transformers_available = True

    @staticmethod
    def list_tokenizer_files(
        model: str,
        *,
        revision: str | None = None,
    ) -> list[str]:
        """
        List tokenizer-related files for a given model.

        Queries the HuggingFace Hub for the repository file listing and keeps
        the entries that look tokenizer-related; nothing is downloaded.

        Args:
            model: HuggingFace model identifier (e.g., "Qwen/Qwen2.5-7B-Instruct").
            revision: Model revision/branch/tag (optional).

        Returns:
            Sorted list of repository file paths whose base name contains one
            of the known tokenizer file names, or that live under "tokenizer/".

        Raises:
            ImportError: If huggingface_hub is not installed.
            Exception: If unable to list files from HuggingFace Hub.

        Example:
            >>> files = Tokenizer.list_tokenizer_files("Qwen/Qwen2.5-7B-Instruct")
            >>> print(files)
            ['tokenizer.json', 'tokenizer_config.json', 'vocab.json', 'merges.txt', ...]
        """
        try:
            from huggingface_hub import list_repo_files
        except ImportError as exc:
            raise ImportError(
                "huggingface_hub library is required to list tokenizer files. "
                "Install it with: pip install huggingface-hub"
            ) from exc

        all_files = list_repo_files(
            repo_id=model,
            revision=revision,
        )
        patterns = Tokenizer._TOKENIZER_FILE_PATTERNS
        return sorted(
            file
            for file in all_files
            if file.startswith("tokenizer/")
            or any(pattern in Path(file).name for pattern in patterns)
        )

    @staticmethod
    def _default_hub_cache() -> Path:
        """Return the default HuggingFace hub cache directory."""
        try:
            # The cache constant is defined in huggingface_hub.constants; the
            # second name is the legacy spelling kept for older releases.
            from huggingface_hub import constants
        except ImportError:
            constants = None
        for name in ("HF_HUB_CACHE", "HUGGINGFACE_HUB_CACHE"):
            location = getattr(constants, name, None)
            if location:
                return Path(location)
        # Fallback to common default location
        return Path.home() / ".cache" / "huggingface" / "hub"

    @staticmethod
    def _read_ref(model_cache_path: Path, name: str) -> str | None:
        """Return the commit hash stored in ``refs/<name>``, or None if unreadable."""
        try:
            commit = (model_cache_path / "refs" / name).read_text(encoding="utf-8")
        except OSError:
            return None
        return commit.strip() or None

    def _candidate_snapshots(self, model_cache_path: Path) -> list[Path]:
        """Return the cached snapshot directories eligible for ``self.revision``."""
        snapshots_dir = model_cache_path / "snapshots"
        if not snapshots_dir.is_dir():
            return []
        if self.revision is None:
            # No revision requested: any snapshot will do, but prefer the one
            # that "main" points to and keep the rest in a deterministic order.
            main_commit = self._read_ref(model_cache_path, "main")
            snapshots = sorted(p for p in snapshots_dir.iterdir() if p.is_dir())
            snapshots.sort(key=lambda p: p.name != main_commit)
            return snapshots
        # A revision is either a branch/tag recorded under refs/ or the commit
        # hash that names the snapshot directory itself.
        commit = self._read_ref(model_cache_path, self.revision) or self.revision
        snapshot = snapshots_dir / commit
        return [snapshot] if snapshot.is_dir() else []

    def _check_tokenizer_files_exist(self) -> tuple[bool, str | None]:
        """
        Check if tokenizer files exist in cache using direct filesystem check.

        Only the local filesystem is inspected, so no network request is made.
        When ``self.revision`` is set, only the snapshot for that revision is
        accepted; otherwise any snapshot qualifies.

        Returns:
            Tuple of (files_exist: bool, cache_path: str | None) where
            cache_path is the snapshot directory holding the tokenizer files.
        """
        try:
            base_cache_dir = (
                Path(self.cache_dir) if self.cache_dir else self._default_hub_cache()
            )
            # HuggingFace cache structure:
            #   models--{org}--{name}/snapshots/{commit_hash}/
            #   models--{org}--{name}/refs/{branch_or_tag}  -> commit hash
            model_cache_name = self.model.replace("/", "--")
            model_cache_path = base_cache_dir / f"models--{model_cache_name}"
            for snapshot_dir in self._candidate_snapshots(model_cache_path):
                if any(
                    (snapshot_dir / filename).exists()
                    for filename in self._REQUIRED_TOKENIZER_FILES
                ):
                    return True, str(snapshot_dir)
            return False, None
        except (OSError, ImportError, AttributeError) as e:
            # If filesystem check fails for any reason, fail fast
            import warnings

            warnings.warn(
                f"Failed to check local cache for model '{self.model}': {e}. "
                "Assuming files don't exist.",
                UserWarning,
            )
            return False, None

    def _ensure_model_downloaded(self) -> str:
        """
        Ensure tokenizer files are available.

        In offline mode: Check for necessary files and throw exception if not found.
        In online mode: Check files first, only download if missing.

        Returns:
            Path of a local snapshot directory, or the bare model identifier
            when huggingface_hub is unavailable in online mode (the caller
            then lets AutoTokenizer fetch the files).

        Raises:
            OSError: If the files are missing in offline mode or the download fails.
        """
        # First, check if tokenizer files already exist in cache
        files_exist, cached_path = self._check_tokenizer_files_exist()

        if self.offline:
            # Offline mode: files must exist, throw exception if not found
            if not files_exist:
                raise OSError(
                    f"Model '{self.model}' tokenizer files not found in cache. "
                    f"Offline mode requires tokenizer files to be pre-downloaded. "
                    f"Cache dir: {self.cache_dir or 'default HuggingFace cache'}. "
                    f"Please download the model first in online mode."
                )
            return cached_path if cached_path else self.model

        # Online mode: if files exist, use them directly
        if files_exist and cached_path:
            return cached_path

        try:
            from huggingface_hub import snapshot_download
        except ImportError:
            # If huggingface_hub is not available, let AutoTokenizer handle downloads.
            return self.model

        try:
            # Download only files relevant for tokenization, ignoring model weights.
            # The returned path points to a local directory with the downloaded files.
            return snapshot_download(
                repo_id=self.model,
                cache_dir=self.cache_dir,
                revision=self.revision,
                local_files_only=False,  # Ensure we download if not present
                allow_patterns=["tokenizer*", "*.json", "*.txt", "vocab.*", "merges.*"],
                ignore_patterns=["*.safetensors", "*.bin", "*.pt", "*.onnx"],
            )
        except (OSError, ValueError) as e:
            # If download fails, provide a clear error message.
            raise OSError(
                f"Failed to download tokenizer files for model '{self.model}'. "
                f"Cache dir: {self.cache_dir or 'default'}. Error: {e}"
            ) from e

    def _ensure_tokenizer(self) -> None:
        """
        Ensure tokenizer is loaded (lazy loading).

        Uses the local_files_only parameter instead of environment variables,
        so no global state is touched.

        Raises:
            ImportError: If transformers is not available.
            OSError: If model cannot be loaded (e.g., offline mode and model not found).
        """
        if self._tokenizer is not None:
            return

        if not self._transformers_available:
            try:
                import transformers  # noqa: F401
            except ImportError as exc:
                raise ImportError(
                    "transformers library is required for Tokenizer. "
                    "Install it with: pip install lexilux[tokenizer] or pip install transformers"
                ) from exc
            self._transformers_available = True

        from transformers import AutoTokenizer

        model_path = self._ensure_model_downloaded()
        # A resolved snapshot directory never needs the network. The bare model
        # ID only comes back in online mode when huggingface_hub is missing;
        # in that case AutoTokenizer must be allowed to download the files.
        local_files_only = self.offline or model_path != self.model
        try:
            self._tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                cache_dir=self.cache_dir,
                revision=self.revision,
                trust_remote_code=self.trust_remote_code,
                local_files_only=local_files_only,
            )
        except (OSError, ValueError) as e:
            mode_text = "offline" if self.offline else "online"
            raise OSError(
                f"Failed to load tokenizer for model '{self.model}' in {mode_text} mode. "
                f"Files should have been prepared by _ensure_model_downloaded(). "
                f"Cache dir: {self.cache_dir or 'default HuggingFace cache'}. "
                f"Original error: {e}"
            ) from e

    def __call__(
        self,
        text: str | Sequence[str],
        *,
        add_special_tokens: bool = True,
        truncation: bool | str = False,
        max_length: int | None = None,
        padding: bool | str = False,
        return_attention_mask: bool = True,
        extra: Json | None = None,
        return_raw: bool = False,
    ) -> TokenizeResult:
        """
        Tokenize text.

        Args:
            text: Single text string or sequence of text strings. A single
                string is tokenized as a batch of one.
            add_special_tokens: Whether to add special tokens (e.g., BOS, EOS).
            truncation: Truncation strategy (True, False, or "longest_first", etc.).
            max_length: Maximum sequence length.
            padding: Padding strategy (True, False, or "max_length", etc.).
            return_attention_mask: Whether to return attention mask.
            extra: Additional tokenizer parameters; these override the
                keyword arguments above when keys collide.
            return_raw: Whether to include raw tokenizer output.

        Returns:
            TokenizeResult with input_ids, attention_mask, and usage.

        Raises:
            ImportError: If transformers is not available.
            OSError: If model cannot be loaded (offline mode).
            ValueError: If an empty sequence of texts is given.
        """
        self._ensure_tokenizer()

        # Normalize input to list
        text_list = [text] if isinstance(text, str) else list(text)
        if not text_list:
            raise ValueError("Text cannot be empty")

        tokenizer_kwargs: dict[str, Any] = {
            "add_special_tokens": add_special_tokens,
            "truncation": truncation,
            "padding": padding,
            "return_attention_mask": return_attention_mask,
        }
        if max_length is not None:
            tokenizer_kwargs["max_length"] = max_length
        if extra:
            tokenizer_kwargs.update(extra)

        encoded = self._tokenizer(text_list, **tokenizer_kwargs)

        input_ids = encoded["input_ids"]
        attention_mask = (
            encoded.get("attention_mask") if return_attention_mask else None
        )

        # Total tokens across all sequences. Lengths come from the returned
        # input_ids, so any padding positions are included in the count.
        total_tokens = sum(len(ids) for ids in input_ids)
        usage = Usage(
            input_tokens=total_tokens,
            output_tokens=None,  # Not applicable for tokenization
            total_tokens=total_tokens,
        )

        return TokenizeResult(
            input_ids=input_ids,
            attention_mask=attention_mask,
            usage=usage,
            raw=encoded if return_raw else {},
        )