Multimodal Example

Vision capabilities with image inputs:

  1#!/usr/bin/env python
  2"""
  331 Multimodal - Text and Images Together
  4
  5Learn how to process images along with text.
  6This enables vision capabilities like analyzing images, reading charts, etc.
  7
  8Level: Advanced Feature
  9"""
 10
 11import base64
 12from pathlib import Path
 13
 14from config_loader import get_chat_config, parse_args
 15
 16from lexilux import Chat, ImageContentBlock, TextContentBlock
 17
 18
 19def encode_image(image_path: str) -> str:
 20    """Encode an image file to base64."""
 21    with open(image_path, "rb") as f:
 22        return base64.b64encode(f.read()).decode("utf-8")
 23
 24
 25def main():
 26    """Demonstrate multimodal capabilities."""
 27    args = parse_args()
 28    try:
 29        config = get_chat_config(config_path=args.config)
 30    except (FileNotFoundError, KeyError) as e:
 31        print(f"Configuration error: {e}")
 32        print("\nUsing placeholder values. Please configure test_endpoints.json")
 33        config = {
 34            "base_url": "https://api.example.com/v1",
 35            "api_key": "your-api-key",
 36            "model": "gpt-4-vision-preview",
 37        }
 38
 39    chat = Chat(**config)
 40
 41    # Example 1: Image from URL
 42    print("=" * 50)
 43    print("Example 1: Analyze Image from URL")
 44    print("=" * 50)
 45
 46    messages = [
 47        {
 48            "role": "user",
 49            "content": [
 50                TextContentBlock(text="What's in this image? Describe it briefly."),
 51                ImageContentBlock(
 52                    image_url={
 53                        "url": "https://raw.githubusercontent.com/python/python-logo/main/Python-logo-notext.svg"
 54                    }
 55                ),
 56            ],
 57        }
 58    ]
 59
 60    try:
 61        result = chat(messages)
 62        print(f"Response: {result.text}\n")
 63    except Exception as e:
 64        print("Note: This example requires a vision-enabled model.")
 65        print(f"Error: {e}\n")
 66
 67    # Example 2: Local image (base64 encoded)
 68    print("=" * 50)
 69    print("Example 2: Analyze Local Image")
 70    print("=" * 50)
 71
 72    # Check if we have a sample image
 73    sample_images = [
 74        "examples/sample_image.png",
 75        "examples/sample_image.jpg",
 76        "/tmp/sample_image.png",
 77    ]
 78
 79    image_path = None
 80    for path in sample_images:
 81        if Path(path).exists():
 82            image_path = path
 83            break
 84
 85    if image_path:
 86        base64_image = encode_image(image_path)
 87
 88        messages = [
 89            {
 90                "role": "user",
 91                "content": [
 92                    TextContentBlock(text="Describe this image in detail."),
 93                    ImageContentBlock(
 94                        image_url={"url": f"data:image/png;base64,{base64_image}"}
 95                    ),
 96                ],
 97            }
 98        ]
 99
100        try:
101            result = chat(messages)
102            print(f"Response: {result.text}\n")
103        except Exception as e:
104            print(f"Error: {e}\n")
105    else:
106        print("No sample image found. To test local images:")
107        print("  1. Place an image at examples/sample_image.png")
108        print("  2. Or update the image_path variable\n")
109
110    # Example 3: Multiple images
111    print("=" * 50)
112    print("Example 3: Compare Multiple Images")
113    print("=" * 50)
114
115    messages = [
116        {
117            "role": "user",
118            "content": [
119                TextContentBlock(
120                    text="What are the differences between these two images?"
121                ),
122                ImageContentBlock(
123                    image_url={"url": "https://placehold.co/100x100/red/red"}
124                ),
125                ImageContentBlock(
126                    image_url={"url": "https://placehold.co/100x100/blue/blue"}
127                ),
128            ],
129        }
130    ]
131
132    try:
133        result = chat(messages)
134        print(f"Response: {result.text}\n")
135    except Exception as e:
136        print("Note: This example requires network access and vision model.")
137        print(f"Error: {e}\n")
138
139    # Example 4: Image detail levels
140    print("=" * 50)
141    print("Example 4: Image Detail Levels")
142    print("=" * 50)
143
144    print("Different detail levels for different use cases:\n")
145
146    print("Low detail - faster, less detailed analysis:")
147    print("  ImageUrlDetail.LOW\n")
148
149    print("High detail - slower, more detailed analysis:")
150    print("  ImageUrlDetail.HIGH\n")
151
152    print("Auto detail - lets the model decide:")
153    print("  ImageUrlDetail.AUTO\n")
154
155    # Example usage
156    from lexilux import ImageUrlDetail
157
158    messages = [
159        {
160            "role": "user",
161            "content": [
162                TextContentBlock(text="Quick: what color is this?"),
163                ImageContentBlock(
164                    image_url={
165                        "url": "https://placehold.co/100x100/green/green",
166                        "detail": ImageUrlDetail.LOW,
167                    }
168                ),
169            ],
170        }
171    ]
172
173    try:
174        result = chat(messages)
175        print(f"\nWith LOW detail: {result.text}\n")
176    except Exception as e:
177        print(f"Error: {e}\n")
178
179    # Example 5: Reading text from images (OCR)
180    print("=" * 50)
181    print("Example 5: OCR - Reading Text from Images")
182    print("=" * 50)
183
184    messages = [
185        {
186            "role": "user",
187            "content": [
188                TextContentBlock(text="Read and transcribe any text in this image."),
189                ImageContentBlock(
190                    image_url={
191                        "url": "https://placehold.co/300x100/000000/FFF?text=Hello+World"
192                    }
193                ),
194            ],
195        }
196    ]
197
198    try:
199        result = chat(messages)
200        print(f"Transcribed text: {result.text}\n")
201    except Exception as e:
202        print("Note: This requires a vision-enabled model.")
203        print(f"Error: {e}\n")
204
205
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()