Tokenizer Example
Tokenizer example:
1#!/usr/bin/env python
2"""
322 Tokenizer - Count and Analyze Tokens
4
5Learn how to count tokens without making API calls.
6This is useful for cost estimation and input length management.
7
8Note: Requires lexilux[tokenizer] extra installation:
9 pip install lexilux[tokenizer]
10
11Level: Other APIs
12"""
13
14from config_loader import parse_args
15
16from lexilux import Tokenizer
17
18
def _print_header(title):
    """Print *title* framed above and below by a 50-character ``=`` rule."""
    rule = "=" * 50
    print(rule)
    print(title)
    print(rule)


def _print_install_hint():
    """Print the pip commands that install tokenizer support."""
    print("\nTo install tokenizer support:")
    print(" pip install lexilux[tokenizer]")
    print("\nOr install transformers directly:")
    print(" pip install transformers tokenizers")


def main():
    """Demonstrate tokenization.

    Walks through five console examples: basic token counting, comparing
    token counts across texts, batch tokenization, truncation, and cost
    estimation. Any failure while creating or using the tokenizer is
    reported on stdout (with installation hints) instead of being raised,
    so the demo always exits cleanly.
    """
    parse_args()  # Parse args for consistency (supports --config if needed)

    # Example 1: Basic token counting
    _print_header("Example 1: Basic Token Counting")

    # In real usage, you'd use an actual model name like
    # "Qwen/Qwen2.5-7B-Instruct"; "gpt2" is used here for the demo.
    print("Tokenizing text without API calls...\n")

    try:
        # Try with a common model (will download if not cached)
        tokenizer = Tokenizer("gpt2", offline=False)

        text = "Hello, world!"
        result = tokenizer(text)

        print(f"Text: {text}")
        print(f"Input tokens: {result.usage.input_tokens}")
        print(f"Token IDs: {result.input_ids[0]}\n")

        # Example 2: Comparing texts
        _print_header("Example 2: Comparing Token Counts")

        texts = [
            "Hello",
            "Hello, world!",
            "The quick brown fox jumps over the lazy dog.",
            "Python is a high-level programming language.",
        ]

        print("Token counts:")
        for text in texts:
            result = tokenizer(text)
            print(f" {result.usage.input_tokens:3d} tokens: {text}")

        print("\nNotice: Longer text ≠ proportionally more tokens!")
        print("Words and characters are tokenized differently.\n")

        # Example 3: Batch tokenization
        _print_header("Example 3: Batch Tokenization")

        batch_texts = ["Hello", "World", "How are you?"]
        result = tokenizer(batch_texts)

        print(f"Batch tokenized {len(batch_texts)} texts:")
        # result.usage.input_tokens is a single value for the whole call, so
        # the per-text count is taken from the matching row of input_ids.
        # NOTE(review): assumes input_ids has one unpadded ID list per input
        # text (no padding is requested here) - confirm against lexilux docs.
        for text, ids in zip(batch_texts, result.input_ids):
            print(f" '{text}': {len(ids)} tokens")
        print(f" Total: {result.usage.input_tokens} tokens")
        print()

        # Example 4: Truncation and padding
        _print_header("Example 4: Truncation (limiting length)")

        long_text = "This is a very long text that would be truncated. " * 10
        result = tokenizer(
            long_text,
            max_length=20,
            truncation=True,
        )

        print(f"Original text length: ~{len(long_text.split())} words")
        print(f"Truncated to: {result.usage.input_tokens} tokens")
        print(f"Truncated IDs length: {len(result.input_ids[0])}\n")

        # Example 5: Cost estimation
        _print_header("Example 5: Cost Estimation")

        # Estimate tokens for common scenarios
        scenarios = {
            "Short question": "What is the capital of France?",
            "Medium paragraph": (
                "Python is a versatile programming language "
                "that is widely used in web development, "
                "data science, and automation."
            ),
            "Long document": "Chapter 1: Introduction. " * 50,
        }

        # Example pricing (adjust based on your API)
        price_per_1k_tokens = 0.001

        # The displayed price is derived from the variable so the message
        # cannot drift from the value used in the calculation below.
        print(
            f"Estimated costs (assuming ${price_per_1k_tokens} "
            "per 1K tokens):\n"
        )
        for name, text in scenarios.items():
            result = tokenizer(text)
            tokens = result.usage.input_tokens
            cost = (tokens / 1000) * price_per_1k_tokens
            print(f"{name}:")
            print(f" Tokens: {tokens}")
            print(f" Cost: ${cost:.6f}\n")

    except ImportError as e:
        # A missing optional dependency is the case the install hint targets.
        print("Note: Tokenization requires transformers library.")
        print(f"Error: {e}")
        _print_install_hint()
    except Exception as e:
        # Best-effort demo: report any other failure (e.g. the model could
        # not be fetched) without blaming a missing library, and still show
        # the install hint in case the backend reports a missing dependency
        # through a different exception type.
        print("Note: Tokenization failed.")
        print(f"Error: {e}")
        _print_install_hint()


if __name__ == "__main__":
    main()