# Backend Configuration Guide
|
|
|
|
Complete guide to configuring Outlines with different model backends.
|
|
|
|
## Table of Contents
|
|
- Local Models (Transformers, llama.cpp, vLLM)
|
|
- API Models (OpenAI)
|
|
- Backend Comparison
|
|
- Performance Tuning
|
|
- Production Deployment
|
|
|
|
## Transformers (Hugging Face)
|
|
|
|
### Basic Setup
|
|
|
|
```python
|
|
import outlines
|
|
|
|
# Load model from Hugging Face
|
|
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")
|
|
|
|
# Use with generator
|
|
generator = outlines.generate.json(model, YourModel)
|
|
result = generator("Your prompt")
|
|
```
|
|
|
|
### GPU Configuration
|
|
|
|
```python
|
|
# Use CUDA GPU
|
|
model = outlines.models.transformers(
|
|
"microsoft/Phi-3-mini-4k-instruct",
|
|
device="cuda"
|
|
)
|
|
|
|
# Use specific GPU
|
|
model = outlines.models.transformers(
|
|
"microsoft/Phi-3-mini-4k-instruct",
|
|
device="cuda:0" # GPU 0
|
|
)
|
|
|
|
# Use CPU
|
|
model = outlines.models.transformers(
|
|
"microsoft/Phi-3-mini-4k-instruct",
|
|
device="cpu"
|
|
)
|
|
|
|
# Use Apple Silicon MPS
|
|
model = outlines.models.transformers(
|
|
"microsoft/Phi-3-mini-4k-instruct",
|
|
device="mps"
|
|
)
|
|
```
|
|
|
|
### Advanced Configuration
|
|
|
|
```python
|
|
# FP16 for faster inference
|
|
model = outlines.models.transformers(
|
|
"microsoft/Phi-3-mini-4k-instruct",
|
|
device="cuda",
|
|
model_kwargs={
|
|
"torch_dtype": "float16"
|
|
}
|
|
)
|
|
|
|
# 8-bit quantization (less memory)
|
|
model = outlines.models.transformers(
|
|
"microsoft/Phi-3-mini-4k-instruct",
|
|
device="cuda",
|
|
model_kwargs={
|
|
"load_in_8bit": True,
|
|
"device_map": "auto"
|
|
}
|
|
)
|
|
|
|
# 4-bit quantization (even less memory)
|
|
model = outlines.models.transformers(
|
|
"meta-llama/Llama-3.1-70B-Instruct",
|
|
device="cuda",
|
|
model_kwargs={
|
|
"load_in_4bit": True,
|
|
"device_map": "auto",
|
|
"bnb_4bit_compute_dtype": "float16"
|
|
}
|
|
)
|
|
|
|
# Multi-GPU
|
|
model = outlines.models.transformers(
|
|
"meta-llama/Llama-3.1-70B-Instruct",
|
|
device="cuda",
|
|
model_kwargs={
|
|
"device_map": "auto", # Automatic GPU distribution
|
|
"max_memory": {0: "40GB", 1: "40GB"} # Per-GPU limits
|
|
}
|
|
)
|
|
```
|
|
|
|
### Popular Models
|
|
|
|
```python
|
|
# Phi (Microsoft)
|
|
model = outlines.models.transformers("microsoft/Phi-4-mini-instruct")
|
|
model = outlines.models.transformers("microsoft/Phi-3-medium-4k-instruct")
|
|
|
|
# Llama 3.1 (Meta)
|
|
model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
|
|
model = outlines.models.transformers("meta-llama/Llama-3.1-70B-Instruct")
|
|
model = outlines.models.transformers("meta-llama/Llama-3.1-405B-Instruct")
|
|
|
|
# Mistral (Mistral AI)
|
|
model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.3")
|
|
model = outlines.models.transformers("mistralai/Mixtral-8x7B-Instruct-v0.1")
|
|
model = outlines.models.transformers("mistralai/Mixtral-8x22B-Instruct-v0.1")
|
|
|
|
# Qwen (Alibaba)
|
|
model = outlines.models.transformers("Qwen/Qwen2.5-7B-Instruct")
|
|
model = outlines.models.transformers("Qwen/Qwen2.5-14B-Instruct")
|
|
model = outlines.models.transformers("Qwen/Qwen2.5-72B-Instruct")
|
|
|
|
# Gemma (Google)
|
|
model = outlines.models.transformers("google/gemma-2-9b-it")
|
|
model = outlines.models.transformers("google/gemma-2-27b-it")
|
|
|
|
# Llava (Vision)
|
|
model = outlines.models.transformers("llava-hf/llava-v1.6-mistral-7b-hf")
|
|
```
|
|
|
|
### Custom Model Loading
|
|
|
|
```python
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
import outlines
|
|
|
|
# Load model manually
|
|
tokenizer = AutoTokenizer.from_pretrained("your-model")
|
|
model_hf = AutoModelForCausalLM.from_pretrained(
|
|
"your-model",
|
|
device_map="auto",
|
|
torch_dtype="float16"
|
|
)
|
|
|
|
# Use with Outlines
|
|
model = outlines.models.transformers(
|
|
model=model_hf,
|
|
tokenizer=tokenizer
|
|
)
|
|
```
|
|
|
|
## llama.cpp
|
|
|
|
### Basic Setup
|
|
|
|
```python
|
|
import outlines
|
|
|
|
# Load GGUF model
|
|
model = outlines.models.llamacpp(
|
|
"./models/llama-3.1-8b-instruct.Q4_K_M.gguf",
|
|
n_ctx=4096 # Context window
|
|
)
|
|
|
|
# Use with generator
|
|
generator = outlines.generate.json(model, YourModel)
|
|
```
|
|
|
|
### GPU Configuration
|
|
|
|
```python
|
|
# CPU only
|
|
model = outlines.models.llamacpp(
|
|
"./models/model.gguf",
|
|
n_ctx=4096,
|
|
n_threads=8 # Use 8 CPU threads
|
|
)
|
|
|
|
# GPU offload (partial)
|
|
model = outlines.models.llamacpp(
|
|
"./models/model.gguf",
|
|
n_ctx=4096,
|
|
n_gpu_layers=35, # Offload 35 layers to GPU
|
|
n_threads=4 # CPU threads for remaining layers
|
|
)
|
|
|
|
# Full GPU offload
|
|
model = outlines.models.llamacpp(
|
|
"./models/model.gguf",
|
|
n_ctx=8192,
|
|
n_gpu_layers=-1 # All layers on GPU
|
|
)
|
|
```
|
|
|
|
### Advanced Configuration
|
|
|
|
```python
|
|
model = outlines.models.llamacpp(
|
|
"./models/llama-3.1-8b.Q4_K_M.gguf",
|
|
n_ctx=8192, # Context window (tokens)
|
|
n_gpu_layers=35, # GPU layers
|
|
n_threads=8, # CPU threads
|
|
n_batch=512, # Batch size for prompt processing
|
|
use_mmap=True, # Memory-map model file (faster loading)
|
|
use_mlock=False, # Don't lock model in RAM (set True to prevent swapping)
|
|
seed=42, # Random seed for reproducibility
|
|
verbose=False # Suppress verbose output
|
|
)
|
|
```
|
|
|
|
### Quantization Formats
|
|
|
|
```python
|
|
# Q4_K_M (4-bit, recommended for most cases)
|
|
# - Size: ~4.5GB for 7B model
|
|
# - Quality: Good
|
|
# - Speed: Fast
|
|
model = outlines.models.llamacpp("./models/model.Q4_K_M.gguf")
|
|
|
|
# Q5_K_M (5-bit, better quality)
|
|
# - Size: ~5.5GB for 7B model
|
|
# - Quality: Very good
|
|
# - Speed: Slightly slower than Q4
|
|
model = outlines.models.llamacpp("./models/model.Q5_K_M.gguf")
|
|
|
|
# Q6_K (6-bit, high quality)
|
|
# - Size: ~6.5GB for 7B model
|
|
# - Quality: Excellent
|
|
# - Speed: Slower than Q5
|
|
model = outlines.models.llamacpp("./models/model.Q6_K.gguf")
|
|
|
|
# Q8_0 (8-bit, near-original quality)
|
|
# - Size: ~8GB for 7B model
|
|
# - Quality: Near FP16
|
|
# - Speed: Slower than Q6
|
|
model = outlines.models.llamacpp("./models/model.Q8_0.gguf")
|
|
|
|
# F16 (16-bit float, original quality)
|
|
# - Size: ~14GB for 7B model
|
|
# - Quality: Original
|
|
# - Speed: Slowest
|
|
model = outlines.models.llamacpp("./models/model.F16.gguf")
|
|
```
|
|
|
|
### Popular GGUF Models
|
|
|
|
```python
|
|
# Llama 3.1
|
|
model = outlines.models.llamacpp("llama-3.1-8b-instruct.Q4_K_M.gguf")
|
|
model = outlines.models.llamacpp("llama-3.1-70b-instruct.Q4_K_M.gguf")
|
|
|
|
# Mistral
|
|
model = outlines.models.llamacpp("mistral-7b-instruct-v0.3.Q4_K_M.gguf")
|
|
|
|
# Phi-4
|
|
model = outlines.models.llamacpp("phi-4-mini-instruct.Q4_K_M.gguf")
|
|
|
|
# Qwen
|
|
model = outlines.models.llamacpp("qwen2.5-7b-instruct.Q4_K_M.gguf")
|
|
```
|
|
|
|
### Apple Silicon Optimization
|
|
|
|
```python
|
|
# Optimized for M1/M2/M3 Macs
|
|
model = outlines.models.llamacpp(
|
|
"./models/llama-3.1-8b.Q4_K_M.gguf",
|
|
n_ctx=4096,
|
|
n_gpu_layers=-1, # Use Metal GPU acceleration
|
|
use_mmap=True, # Efficient memory mapping
|
|
n_threads=8 # Use performance cores
|
|
)
|
|
```
|
|
|
|
## vLLM (Production)
|
|
|
|
### Basic Setup
|
|
|
|
```python
|
|
import outlines
|
|
|
|
# Load model with vLLM
|
|
model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")
|
|
|
|
# Use with generator
|
|
generator = outlines.generate.json(model, YourModel)
|
|
```
|
|
|
|
### Single GPU
|
|
|
|
```python
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
gpu_memory_utilization=0.9, # Use 90% of GPU memory
|
|
max_model_len=4096 # Max sequence length
|
|
)
|
|
```
|
|
|
|
### Multi-GPU
|
|
|
|
```python
|
|
# Tensor parallelism (split model across GPUs)
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-70B-Instruct",
|
|
tensor_parallel_size=4, # Use 4 GPUs
|
|
gpu_memory_utilization=0.9
|
|
)
|
|
|
|
# Pipeline parallelism (rare, for very large models)
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-405B-Instruct",
|
|
pipeline_parallel_size=8, # 8-GPU pipeline
|
|
tensor_parallel_size=4 # 4-GPU tensor split
|
|
# Total: 32 GPUs
|
|
)
|
|
```
|
|
|
|
### Quantization
|
|
|
|
```python
|
|
# AWQ quantization (4-bit)
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
quantization="awq",
|
|
dtype="float16"
|
|
)
|
|
|
|
# GPTQ quantization (4-bit)
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
quantization="gptq"
|
|
)
|
|
|
|
# SqueezeLLM quantization
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
quantization="squeezellm"
|
|
)
|
|
```
|
|
|
|
### Advanced Configuration
|
|
|
|
```python
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
tensor_parallel_size=1,
|
|
gpu_memory_utilization=0.9,
|
|
max_model_len=8192,
|
|
max_num_seqs=256, # Max concurrent sequences
|
|
max_num_batched_tokens=8192, # Max tokens per batch
|
|
dtype="float16",
|
|
trust_remote_code=True,
|
|
enforce_eager=False, # Use CUDA graphs (faster)
|
|
swap_space=4 # CPU swap space (GB)
|
|
)
|
|
```
|
|
|
|
### Batch Processing
|
|
|
|
```python
|
|
# vLLM optimized for high-throughput batch processing
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
max_num_seqs=128 # Process 128 sequences in parallel
|
|
)
|
|
|
|
generator = outlines.generate.json(model, YourModel)
|
|
|
|
# Process many prompts efficiently
|
|
prompts = ["prompt1", "prompt2", ..., "prompt100"]
|
|
results = [generator(p) for p in prompts]
|
|
# vLLM automatically batches and optimizes
|
|
```
|
|
|
|
## OpenAI (Limited Support)
|
|
|
|
### Basic Setup
|
|
|
|
```python
|
|
import outlines
|
|
|
|
# Basic OpenAI support
|
|
model = outlines.models.openai("gpt-4o-mini", api_key="your-api-key")
|
|
|
|
# Use with generator
|
|
generator = outlines.generate.json(model, YourModel)
|
|
result = generator("Your prompt")
|
|
```
|
|
|
|
### Configuration
|
|
|
|
```python
|
|
model = outlines.models.openai(
|
|
"gpt-4o-mini",
|
|
api_key="your-api-key", # Or set OPENAI_API_KEY env var
|
|
max_tokens=2048,
|
|
temperature=0.7
|
|
)
|
|
```
|
|
|
|
### Available Models
|
|
|
|
```python
|
|
# GPT-4o (latest)
|
|
model = outlines.models.openai("gpt-4o")
|
|
|
|
# GPT-4o Mini (cost-effective)
|
|
model = outlines.models.openai("gpt-4o-mini")
|
|
|
|
# GPT-4 Turbo
|
|
model = outlines.models.openai("gpt-4-turbo")
|
|
|
|
# GPT-3.5 Turbo
|
|
model = outlines.models.openai("gpt-3.5-turbo")
|
|
```
|
|
|
|
**Note**: OpenAI support is limited compared to local models. Some advanced features may not work.
|
|
|
|
## Backend Comparison
|
|
|
|
### Feature Matrix
|
|
|
|
| Feature | Transformers | llama.cpp | vLLM | OpenAI |
|
|
|---------|-------------|-----------|------|--------|
|
|
| Structured Generation | ✅ Full | ✅ Full | ✅ Full | ⚠️ Limited |
|
|
| FSM Optimization | ✅ Yes | ✅ Yes | ✅ Yes | ❌ No |
|
|
| GPU Support | ✅ Yes | ✅ Yes | ✅ Yes | N/A |
|
|
| Multi-GPU | ✅ Yes | ✅ Yes | ✅ Yes | N/A |
|
|
| Quantization | ✅ Yes | ✅ Yes | ✅ Yes | N/A |
|
|
| High Throughput | ⚠️ Medium | ⚠️ Medium | ✅ Excellent | ⚠️ API-limited |
|
|
| Setup Difficulty | Easy | Medium | Medium | Easy |
|
|
| Cost | Hardware | Hardware | Hardware | API usage |
|
|
|
|
### Performance Characteristics
|
|
|
|
**Transformers:**
|
|
- **Latency**: 50-200ms (single request, GPU)
|
|
- **Throughput**: 10-50 tokens/sec (depends on hardware)
|
|
- **Memory**: 2-4GB per 1B parameters (FP16)
|
|
- **Best for**: Development, small-scale deployment, flexibility
|
|
|
|
**llama.cpp:**
|
|
- **Latency**: 30-150ms (single request)
|
|
- **Throughput**: 20-150 tokens/sec (depends on quantization)
|
|
- **Memory**: 0.5-2GB per 1B parameters (Q4-Q8)
|
|
- **Best for**: CPU inference, Apple Silicon, edge deployment, low memory
|
|
|
|
**vLLM:**
|
|
- **Latency**: 30-100ms (single request)
|
|
- **Throughput**: 100-1000+ tokens/sec (batch processing)
|
|
- **Memory**: 2-4GB per 1B parameters (FP16)
|
|
- **Best for**: Production, high-throughput, batch processing, serving
|
|
|
|
**OpenAI:**
|
|
- **Latency**: 200-500ms (API call)
|
|
- **Throughput**: API rate limits
|
|
- **Memory**: N/A (cloud-based)
|
|
- **Best for**: Quick prototyping, no infrastructure
|
|
|
|
### Memory Requirements
|
|
|
|
**7B Model:**
|
|
- FP16: ~14GB
|
|
- 8-bit: ~7GB
|
|
- 4-bit: ~4GB
|
|
- Q4_K_M (GGUF): ~4.5GB
|
|
|
|
**13B Model:**
|
|
- FP16: ~26GB
|
|
- 8-bit: ~13GB
|
|
- 4-bit: ~7GB
|
|
- Q4_K_M (GGUF): ~8GB
|
|
|
|
**70B Model:**
|
|
- FP16: ~140GB (multi-GPU)
|
|
- 8-bit: ~70GB (multi-GPU)
|
|
- 4-bit: ~35GB (single A100/H100)
|
|
- Q4_K_M (GGUF): ~40GB
|
|
|
|
## Performance Tuning
|
|
|
|
### Transformers Optimization
|
|
|
|
```python
|
|
# Use FP16
|
|
model = outlines.models.transformers(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
device="cuda",
|
|
model_kwargs={"torch_dtype": "float16"}
|
|
)
|
|
|
|
# Use flash attention (2-4x faster)
|
|
model = outlines.models.transformers(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
device="cuda",
|
|
model_kwargs={
|
|
"torch_dtype": "float16",
|
|
"attn_implementation": "flash_attention_2" # replaces deprecated use_flash_attention_2
|
|
}
|
|
)
|
|
|
|
# Use 8-bit quantization (2x less memory)
|
|
model = outlines.models.transformers(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
device="cuda",
|
|
model_kwargs={
|
|
"load_in_8bit": True,
|
|
"device_map": "auto"
|
|
}
|
|
)
|
|
```
|
|
|
|
### llama.cpp Optimization
|
|
|
|
```python
|
|
# Maximize GPU usage
|
|
model = outlines.models.llamacpp(
|
|
"./models/model.Q4_K_M.gguf",
|
|
n_gpu_layers=-1, # All layers on GPU
|
|
n_ctx=8192,
|
|
n_batch=512 # Larger batch = faster
|
|
)
|
|
|
|
# Optimize for CPU (Apple Silicon)
|
|
model = outlines.models.llamacpp(
|
|
"./models/model.Q4_K_M.gguf",
|
|
n_ctx=4096,
|
|
n_threads=8, # Use all performance cores
|
|
use_mmap=True
|
|
)
|
|
```
|
|
|
|
### vLLM Optimization
|
|
|
|
```python
|
|
# High throughput
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-8B-Instruct",
|
|
gpu_memory_utilization=0.95, # Use 95% of GPU
|
|
max_num_seqs=256, # High concurrency
|
|
enforce_eager=False # Use CUDA graphs
|
|
)
|
|
|
|
# Multi-GPU
|
|
model = outlines.models.vllm(
|
|
"meta-llama/Llama-3.1-70B-Instruct",
|
|
tensor_parallel_size=4, # 4 GPUs
|
|
gpu_memory_utilization=0.9
|
|
)
|
|
```
|
|
|
|
## Production Deployment
|
|
|
|
### Docker with vLLM
|
|
|
|
```dockerfile
|
|
FROM vllm/vllm-openai:latest
|
|
|
|
# Install outlines
|
|
RUN pip install outlines
|
|
|
|
# Copy your code
|
|
COPY app.py /app/
|
|
|
|
# Run
|
|
CMD ["python", "/app/app.py"]
|
|
```
|
|
|
|
### Environment Variables
|
|
|
|
```bash
|
|
# Transformers cache
|
|
export HF_HOME="/path/to/cache"
|
|
export TRANSFORMERS_CACHE="/path/to/cache" # deprecated; prefer HF_HOME
|
|
|
|
# GPU selection
|
|
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
|
|
|
# OpenAI API key
|
|
export OPENAI_API_KEY="sk-..."
|
|
|
|
# Disable tokenizers parallelism warning
|
|
export TOKENIZERS_PARALLELISM=false
|
|
```
|
|
|
|
### Model Serving
|
|
|
|
```python
|
|
# Simple HTTP server with vLLM
|
|
import outlines
|
|
from fastapi import FastAPI
|
|
from pydantic import BaseModel
|
|
|
|
app = FastAPI()
|
|
|
|
# Load model once at startup
|
|
model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")
|
|
|
|
class User(BaseModel):
|
|
name: str
|
|
age: int
|
|
email: str
|
|
|
|
generator = outlines.generate.json(model, User)
|
|
|
|
@app.post("/extract")
|
|
def extract(text: str):
|
|
result = generator(f"Extract user from: {text}")
|
|
return result.model_dump()
|
|
```
|
|
|
|
## Resources
|
|
|
|
- **Transformers**: https://huggingface.co/docs/transformers
|
|
- **llama.cpp**: https://github.com/ggerganov/llama.cpp
|
|
- **vLLM**: https://docs.vllm.ai
|
|
- **Outlines**: https://github.com/outlines-dev/outlines
|