Why Multimodal Adds Complexity
Adding image or audio inputs to an LLM-powered system introduces challenges that don't exist in text-only systems:
- Input size variance: A 1080p image is roughly 6MB uncompressed, and a 1-hour audio file can easily reach 50MB. Payloads of this size break most text-oriented architectures.
- Cost structure changes: Vision tokens are 3–10× more expensive than text tokens per equivalent "information unit"
- Preprocessing pipelines: Images need resizing, audio needs chunking and transcription, video needs frame extraction
- Latency: Image encoding adds 100–500ms depending on model and resolution
This guide covers the engineering decisions that matter.
Image Input Pipelines
How vision LLMs tokenize images
Modern vision LLMs (GPT-4o, Claude 3.5, Gemini 1.5) encode images by dividing them into tiles and running a vision encoder (typically a ViT variant) on each tile.
GPT-4o tile pricing:
- Base cost: 85 tokens per image (minimum)
- Each 512×512 tile: +170 tokens
- A 1024×1024 image = 85 + 4×170 = 765 tokens ≈ $0.0023 at current prices
Key insight: Resolution directly drives cost. Downsizing images before sending them is the single highest-leverage cost optimization.
Image preprocessing service
from PIL import Image
import io
import base64
from typing import Literal
def prepare_image_for_llm(
    image_path: str,
    target_detail: Literal["low", "high", "auto"] = "auto",
    max_dimension: int = 2048,
) -> tuple[str, dict]:
    """
    Resize and encode an image for LLM vision APIs.

    Args:
        image_path: Path of the image file to load.
        target_detail: Detail level the image will be sent with. For
            ``"low"`` the estimate is the flat base cost (no per-tile
            charge); ``"high"`` and ``"auto"`` use the tile-based estimate.
        max_dimension: Longest allowed side in pixels. Larger images are
            scaled down proportionally; smaller images are left as they are.

    Returns:
        ``(base64_string, cost_estimate)``. ``base64_string`` is the image
        re-encoded as JPEG (quality 85) and base64-encoded. ``cost_estimate``
        is a dict with ``"tokens"`` (estimated vision tokens) and
        ``"size_kb"`` (encoded JPEG size in whole kilobytes).

    Raises:
        ValueError: If ``max_dimension`` is not positive.
    """
    if max_dimension <= 0:
        raise ValueError(f"max_dimension must be positive, got {max_dimension}")

    # The context manager closes the file handle. convert() returns a separate
    # in-memory image, so `img` stays usable after the source is closed.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    # Determine if we need to resize
    orig_w, orig_h = img.size
    longest_side = max(orig_w, orig_h)
    if longest_side > max_dimension:
        ratio = max_dimension / longest_side
        # Clamp to 1 px so an extreme aspect ratio never rounds a side to zero.
        new_size = (max(1, int(orig_w * ratio)), max(1, int(orig_h * ratio)))
        img = img.resize(new_size, Image.LANCZOS)

    # Estimate token cost: 85 base tokens plus 170 per (partial) 512x512 tile.
    if target_detail == "low":
        # Low detail is billed at the base cost only, regardless of size.
        estimated_tokens = 85
    else:
        w, h = img.size
        tiles = ((w + 511) // 512) * ((h + 511) // 512)
        estimated_tokens = 85 + tiles * 170

    # Encode; getvalue() copies the buffer, so take the bytes once.
    buffer = io.BytesIO()
    img.save(buffer, format="JPEG", quality=85)
    jpeg_bytes = buffer.getvalue()
    encoded = base64.standard_b64encode(jpeg_bytes).decode()
    return encoded, {"tokens": estimated_tokens, "size_kb": len(jpeg_bytes) // 1024}
# Usage: cap a local invoice image at 1024 px on its longest side, then report
# the estimated token count and the size of the encoded JPEG.
b64, cost_info = prepare_image_for_llm("invoice.jpg", max_dimension=1024)
print(f"Estimated tokens: {cost_info['tokens']}, size: {cost_info['size_kb']}KB")
Calling vision APIs
from anthropic import Anthropic
# Module-level Anthropic client, created once and used by analyze_image.
client = Anthropic()
def analyze_image(image_path: str, prompt: str) -> str:
    """Send one image plus a text prompt to the model and return its text reply.

    The image is prepared with ``prepare_image_for_llm`` using its default
    settings; the cost estimate it returns is discarded.
    """
    encoded_jpeg, _cost = prepare_image_for_llm(image_path)

    # The image block is placed before the text block in the user turn.
    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/jpeg",
            "data": encoded_jpeg,
        },
    }
    text_block = {"type": "text", "text": prompt}
    user_turn = {"role": "user", "content": [image_block, text_block]}

    message = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1024,
        messages=[user_turn],
    )
    # Only the first content block of the response is returned.
    return message.content[0].text
Caching image analysis results
Image analysis is expensive, and re-running the same image with the same prompt normally yields an equivalent result, so always cache:
import hashlib
import redis
# Module-level Redis connection used as the result cache by cached_analyze_image.
# decode_responses=True asks the client to return str values instead of bytes.
cache = redis.Redis(host="localhost", decode_responses=True)
def cached_analyze_image(image_path: str, prompt: str, ttl: int = 86400) -> str:
    """Return the analysis for an image/prompt pair, reusing a cached result if present.

    Args:
        image_path: Path of the image file to analyze.
        prompt: Text prompt sent along with the image.
        ttl: Lifetime of a newly stored cache entry, in seconds.

    Returns:
        The cached analysis when the key exists, otherwise the fresh result
        from ``analyze_image`` (which is stored before returning).
    """
    # Hash by file content + prompt (not path, which can change)
    with open(image_path, "rb") as f:
        content_hash = hashlib.sha256(f.read() + prompt.encode()).hexdigest()[:16]
    cache_key = f"vision:{content_hash}"

    cached = cache.get(cache_key)
    # Compare against None, not truthiness: a stored empty string is still a
    # valid hit and must not trigger another model call.
    if cached is not None:
        return cached

    result = analyze_image(image_path, prompt)
    cache.setex(cache_key, ttl, result)
    return result
Audio Input Pipelines
Transcription-first vs. native audio
Two approaches:
- Transcription-first: Audio → Whisper/Deepgram → text → LLM
- Native audio: Audio → multimodal LLM directly (GPT-4o audio, Gemini)
When to use transcription-first:
- You need word-level timestamps
- Audio is > 5 minutes (native audio has context window limits)
- Cost is a priority (Whisper at $0.006/minute vs. GPT-4o audio at ~$0.10/minute)
- You need to store/index the transcript
When to use native audio:
- Tone, emotion, or speaker identity matters
- Conversational AI where prosody changes meaning
- Short clips (< 2 minutes)
Chunked transcription pipeline
from openai import OpenAI
from pydub import AudioSegment
import io
# Bound to its own name so it does not replace the module-level Anthropic
# `client` that the vision helpers in this file call via client.messages.create.
openai_client = OpenAI()


def transcribe_long_audio(
    audio_path: str,
    chunk_minutes: int = 10,
    language: str = "en",
) -> str:
    """Transcribe an audio file by splitting it into fixed-length chunks.

    Args:
        audio_path: Path of the audio file to load.
        chunk_minutes: Length of each chunk in minutes.
        language: Language code passed to the transcription endpoint.

    Returns:
        The text of every chunk, in order, joined with single spaces.

    Raises:
        ValueError: If ``chunk_minutes`` is not positive.
    """
    if chunk_minutes <= 0:
        raise ValueError(f"chunk_minutes must be positive, got {chunk_minutes}")

    audio = AudioSegment.from_file(audio_path)
    chunk_ms = chunk_minutes * 60 * 1000  # chunk length in milliseconds
    transcripts = []
    for i, start in enumerate(range(0, len(audio), chunk_ms)):
        chunk = audio[start : start + chunk_ms]
        buffer = io.BytesIO()
        # A low bitrate keeps each uploaded chunk small.
        chunk.export(buffer, format="mp3", bitrate="64k")
        buffer.seek(0)  # rewind so the upload reads from the start
        buffer.name = f"chunk_{i}.mp3"  # give the in-memory file an .mp3 filename
        transcript = openai_client.audio.transcriptions.create(
            model="whisper-1",
            file=buffer,
            language=language,
            response_format="verbose_json",  # includes timestamps; only .text is kept here
        )
        transcripts.append(transcript.text)
    return " ".join(transcripts)
Document Understanding (PDF + Images)
PDFs are the most common multimodal input in enterprise applications. Two approaches:
Approach A: PDF → Images → Vision LLM
Best for PDFs with complex layouts, charts, or non-standard formatting.
from pdf2image import convert_from_path
def pdf_to_analyzed_text(pdf_path: str, dpi: int = 150) -> str:
    """Render each PDF page to a JPEG and ask the vision model to transcribe it.

    Returns one "## Page N" section per page (N starting at 1), with the
    sections separated by a Markdown horizontal rule.
    """
    extraction_prompt = "Extract all text content from this document page. Preserve structure including headers, tables, and lists. Output in Markdown."
    results = []

    # Rasterize every page at the requested DPI, then process pages in order.
    for i, page_img in enumerate(convert_from_path(pdf_path, dpi=dpi)):
        # Encode the page as an in-memory JPEG.
        jpeg_buffer = io.BytesIO()
        page_img.save(jpeg_buffer, format="JPEG", quality=85)
        b64 = base64.standard_b64encode(jpeg_buffer.getvalue()).decode()

        image_block = {
            "type": "image",
            "source": {"type": "base64", "media_type": "image/jpeg", "data": b64},
        }
        text_block = {"type": "text", "text": extraction_prompt}

        result = client.messages.create(
            model="claude-opus-4-6",
            max_tokens=2048,
            messages=[{"role": "user", "content": [image_block, text_block]}],
        )
        results.append(f"## Page {i+1}\n\n{result.content[0].text}")

    return "\n\n---\n\n".join(results)
Approach B: PDF text extraction → LLM
Best when PDFs are digitally created (not scanned), cost matters, and layout is simple.
import pymupdf # formerly fitz
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the embedded plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of each page, in page order, joined by a Markdown
        horizontal rule ("---") on its own line.
    """
    # The context manager closes the document even if extraction fails midway.
    with pymupdf.open(pdf_path) as doc:
        # "text" is plain-text extraction. get_text() has no "markdown" option
        # and falls back to plain text for unrecognized names, so this is the
        # same output as before, requested explicitly.
        # NOTE(review): real Markdown output needs a separate converter.
        pages = [page.get_text("text") for page in doc]
    return "\n\n---\n\n".join(pages)
Cost comparison for a 10-page PDF:
| Method | Tokens | Cost (claude-opus-4-6) |
|---|---|---|
| Vision per page (150 DPI) | ~8,500 | ~$0.085 |
| Text extraction | ~3,000 | ~$0.030 |
| Vision at 72 DPI | ~4,000 | ~$0.040 |
Cost Optimization Strategies
1. Route by input type and complexity
Not every image needs a frontier model:
def route_vision_request(image_path: str, task: str) -> str:
    """Choose a model from the task type and image size, then run the request.

    Args:
        image_path: Path of the image file to analyze.
        task: Task name; used both for routing and as the key into ``PROMPTS``.

    Returns:
        The value returned by ``call_vision_model`` for the chosen model.

    Raises:
        KeyError: If ``task`` has no entry in ``PROMPTS``.
    """
    # Only the dimensions are needed, so release the file handle right away.
    with Image.open(image_path) as img:
        w, h = img.size

    # Simple tasks (OCR, classification) on images under 2000 px → small fast
    # model; everything else → frontier model.
    simple_tasks = ("extract_text", "classify_document_type", "detect_language")
    is_simple = task in simple_tasks and max(w, h) < 2000
    model = "claude-haiku-4-5-20251001" if is_simple else "claude-opus-4-6"
    return call_vision_model(model, image_path, PROMPTS[task])
2. Thumbnail for classification, full resolution for extraction
def two_stage_document_processing(image_path: str) -> dict:
    """Classify a document from a small render, then extract from a larger one if worthwhile.

    Returns the result of ``extract_structured_data`` when the document type
    is in ``VALUABLE_DOC_TYPES``; otherwise a dict with the detected
    ``"doc_type"`` and ``"extracted"`` set to ``None``.
    """
    # Stage 1: classify using a render capped at 512 px (cheap).
    # NOTE(review): prepare_image_for_llm returns a (base64, cost_info) tuple
    # and the whole tuple is passed on here and in stage 2 — confirm that
    # classify_document_type / extract_structured_data expect that shape.
    doc_type = classify_document_type(
        prepare_image_for_llm(image_path, max_dimension=512)
    )

    # Not worth a full-resolution pass: report the type only.
    if doc_type not in VALUABLE_DOC_TYPES:
        return {"doc_type": doc_type, "extracted": None}

    # Stage 2: extract from a render capped at 2048 px.
    return extract_structured_data(
        prepare_image_for_llm(image_path, max_dimension=2048),
        doc_type,
    )
3. Async batch processing
Vision tasks are often not latency-critical, so process them concurrently with a cap on the number of in-flight requests:
import asyncio
from anthropic import AsyncAnthropic
# Module-level async Anthropic client, created once and used by process_images_batch.
async_client = AsyncAnthropic()
async def process_images_batch(image_paths: list[str], concurrency: int = 5) -> list[str]:
    """Describe many images concurrently with a bounded number of in-flight tasks.

    Args:
        image_paths: Paths of the image files to describe.
        concurrency: Maximum number of images being prepared or awaiting a
            model response at the same time.

    Returns:
        One description per input path, in the same order as ``image_paths``.
    """
    semaphore = asyncio.Semaphore(concurrency)

    async def process_one(path):
        """Prepare one image and return the model's text reply for it."""
        async with semaphore:
            # Decoding and resizing is blocking work; run it in a worker
            # thread so the event loop can keep servicing other requests.
            b64, _ = await asyncio.to_thread(prepare_image_for_llm, path)
            response = await async_client.messages.create(
                model="claude-haiku-4-5-20251001",
                max_tokens=512,
                messages=[{"role": "user", "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}},
                    {"type": "text", "text": "Describe this image in one sentence."},
                ]}],
            )
            return response.content[0].text

    return await asyncio.gather(*[process_one(p) for p in image_paths])
Integrating multimodal capabilities into a RAG system? See our guide on RAG Systems at Scale.