Tokenizer Example

Tokenizer example:

#!/usr/bin/env python
"""
22 Tokenizer - Count and Analyze Tokens

Learn how to count tokens without making API calls.
This is useful for cost estimation and input length management.

Note: Requires lexilux[tokenizer] extra installation:
    pip install lexilux[tokenizer]

Level: Other APIs
"""

from config_loader import parse_args

from lexilux import Tokenizer


 19def main():
 20    """Demonstrate tokenization."""
 21    parse_args()  # Parse args for consistency (supports --config if needed)
 22
 23    # Example 1: Basic token counting
 24    print("=" * 50)
 25    print("Example 1: Basic Token Counting")
 26    print("=" * 50)
 27
 28    # For this example, we'll use offline mode to avoid downloads
 29    # In real usage, you'd use an actual model name like "Qwen/Qwen2.5-7B-Instruct"
 30    # For demo purposes, we'll show what would happen
 31
 32    print("Tokenizing text without API calls...\n")
 33
 34    # Note: This requires a valid model. For the demo, we'll show the concept.
 35    try:
 36        # Try with a common model (will download if not cached)
 37        tokenizer = Tokenizer("gpt2", offline=False)
 38
 39        text = "Hello, world!"
 40        result = tokenizer(text)
 41
 42        print(f"Text: {text}")
 43        print(f"Input tokens: {result.usage.input_tokens}")
 44        print(f"Token IDs: {result.input_ids[0]}\n")
 45
 46        # Example 2: Comparing texts
 47        print("=" * 50)
 48        print("Example 2: Comparing Token Counts")
 49        print("=" * 50)
 50
 51        texts = [
 52            "Hello",
 53            "Hello, world!",
 54            "The quick brown fox jumps over the lazy dog.",
 55            "Python is a high-level programming language.",
 56        ]
 57
 58        print("Token counts:")
 59        for text in texts:
 60            result = tokenizer(text)
 61            print(f"  {result.usage.input_tokens:3d} tokens: {text}")
 62
 63        print("\nNotice: Longer text ≠ proportionally more tokens!")
 64        print("Words and characters are tokenized differently.\n")
 65
 66        # Example 3: Batch tokenization
 67        print("=" * 50)
 68        print("Example 3: Batch Tokenization")
 69        print("=" * 50)
 70
 71        batch_texts = ["Hello", "World", "How are you?"]
 72        result = tokenizer(batch_texts)
 73
 74        print(f"Batch tokenized {len(batch_texts)} texts:")
 75        for i, text in enumerate(batch_texts):
 76            print(f"  '{text}': {result.usage.input_tokens} tokens")
 77        print()
 78
 79        # Example 4: Truncation and padding
 80        print("=" * 50)
 81        print("Example 4: Truncation (limiting length)")
 82        print("=" * 50)
 83
 84        long_text = "This is a very long text that would be truncated. " * 10
 85        result = tokenizer(
 86            long_text,
 87            max_length=20,
 88            truncation=True,
 89        )
 90
 91        print(f"Original text length: ~{len(long_text.split())} words")
 92        print(f"Truncated to: {result.usage.input_tokens} tokens")
 93        print(f"Truncated IDs length: {len(result.input_ids[0])}\n")
 94
 95        # Example 5: Cost estimation
 96        print("=" * 50)
 97        print("Example 5: Cost Estimation")
 98        print("=" * 50)
 99
100        # Estimate tokens for common scenarios
101        scenarios = {
102            "Short question": "What is the capital of France?",
103            "Medium paragraph": (
104                "Python is a versatile programming language "
105                "that is widely used in web development, "
106                "data science, and automation."
107            ),
108            "Long document": "Chapter 1: Introduction. " * 50,
109        }
110
111        # Example pricing (adjust based on your API)
112        price_per_1k_tokens = 0.001
113
114        print("Estimated costs (assuming $0.001 per 1K tokens):\n")
115        for name, text in scenarios.items():
116            result = tokenizer(text)
117            tokens = result.usage.input_tokens
118            cost = (tokens / 1000) * price_per_1k_tokens
119            print(f"{name}:")
120            print(f"  Tokens: {tokens}")
121            print(f"  Cost: ${cost:.6f}\n")
122
123    except Exception as e:
124        print("Note: Tokenization requires transformers library.")
125        print(f"Error: {e}")
126        print("\nTo install tokenizer support:")
127        print("  pip install lexilux[tokenizer]")
128        print("\nOr install transformers directly:")
129        print("  pip install transformers tokenizers")
130
131
132if __name__ == "__main__":
133    main()