Files
hermes-sync/skills/mlops/inference/outlines/references/backends.md

14 KiB

Backend Configuration Guide

Complete guide to configuring Outlines with different model backends.

Table of Contents

  • Local Models (Transformers, llama.cpp, vLLM)
  • API Models (OpenAI)
  • Backend Comparison
  • Configuration Examples
  • Production Deployment

Transformers (Hugging Face)

Basic Setup

import outlines

# Load model from Hugging Face
model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")

# Use with generator
generator = outlines.generate.json(model, YourModel)
result = generator("Your prompt")

GPU Configuration

# Use CUDA GPU
model = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    device="cuda"
)

# Use specific GPU
model = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    device="cuda:0"  # GPU 0
)

# Use CPU
model = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    device="cpu"
)

# Use Apple Silicon MPS
model = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    device="mps"
)

Advanced Configuration

# FP16 for faster inference
model = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    device="cuda",
    model_kwargs={
        "torch_dtype": "float16"
    }
)

# 8-bit quantization (less memory)
model = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    device="cuda",
    model_kwargs={
        "load_in_8bit": True,
        "device_map": "auto"
    }
)

# 4-bit quantization (even less memory)
model = outlines.models.transformers(
    "meta-llama/Llama-3.1-70B-Instruct",
    device="cuda",
    model_kwargs={
        "load_in_4bit": True,
        "device_map": "auto",
        "bnb_4bit_compute_dtype": "float16"
    }
)

# Multi-GPU
model = outlines.models.transformers(
    "meta-llama/Llama-3.1-70B-Instruct",
    device="cuda",
    model_kwargs={
        "device_map": "auto",  # Automatic GPU distribution
        "max_memory": {0: "40GB", 1: "40GB"}  # Per-GPU limits
    }
)
# Phi-4 (Microsoft)
model = outlines.models.transformers("microsoft/Phi-4-mini-instruct")

# Phi-3 (Microsoft)
model = outlines.models.transformers("microsoft/Phi-3-medium-4k-instruct")

# Llama 3.1 (Meta)
model = outlines.models.transformers("meta-llama/Llama-3.1-8B-Instruct")
model = outlines.models.transformers("meta-llama/Llama-3.1-70B-Instruct")
model = outlines.models.transformers("meta-llama/Llama-3.1-405B-Instruct")

# Mistral (Mistral AI)
model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.3")
model = outlines.models.transformers("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = outlines.models.transformers("mistralai/Mixtral-8x22B-Instruct-v0.1")

# Qwen (Alibaba)
model = outlines.models.transformers("Qwen/Qwen2.5-7B-Instruct")
model = outlines.models.transformers("Qwen/Qwen2.5-14B-Instruct")
model = outlines.models.transformers("Qwen/Qwen2.5-72B-Instruct")

# Gemma (Google)
model = outlines.models.transformers("google/gemma-2-9b-it")
model = outlines.models.transformers("google/gemma-2-27b-it")

# Llava (Vision)
model = outlines.models.transformers("llava-hf/llava-v1.6-mistral-7b-hf")

Custom Model Loading

from transformers import AutoTokenizer, AutoModelForCausalLM
import outlines

# Load model manually
tokenizer = AutoTokenizer.from_pretrained("your-model")
model_hf = AutoModelForCausalLM.from_pretrained(
    "your-model",
    device_map="auto",
    torch_dtype="float16"
)

# Use with Outlines
model = outlines.models.transformers(
    model=model_hf,
    tokenizer=tokenizer
)

llama.cpp

Basic Setup

import outlines

# Load GGUF model
model = outlines.models.llamacpp(
    "./models/llama-3.1-8b-instruct.Q4_K_M.gguf",
    n_ctx=4096  # Context window
)

# Use with generator
generator = outlines.generate.json(model, YourModel)

GPU Configuration

# CPU only
model = outlines.models.llamacpp(
    "./models/model.gguf",
    n_ctx=4096,
    n_threads=8  # Use 8 CPU threads
)

# GPU offload (partial)
model = outlines.models.llamacpp(
    "./models/model.gguf",
    n_ctx=4096,
    n_gpu_layers=35,  # Offload 35 layers to GPU
    n_threads=4       # CPU threads for remaining layers
)

# Full GPU offload
model = outlines.models.llamacpp(
    "./models/model.gguf",
    n_ctx=8192,
    n_gpu_layers=-1  # All layers on GPU
)

Advanced Configuration

model = outlines.models.llamacpp(
    "./models/llama-3.1-8b.Q4_K_M.gguf",
    n_ctx=8192,          # Context window (tokens)
    n_gpu_layers=35,     # GPU layers
    n_threads=8,         # CPU threads
    n_batch=512,         # Batch size for prompt processing
    use_mmap=True,       # Memory-map model file (faster loading)
    use_mlock=False,     # Lock model in RAM (prevents swapping)
    seed=42,             # Random seed for reproducibility
    verbose=False        # Suppress verbose output
)

Quantization Formats

# Q4_K_M (4-bit, recommended for most cases)
# - Size: ~4.5GB for 7B model
# - Quality: Good
# - Speed: Fast
model = outlines.models.llamacpp("./models/model.Q4_K_M.gguf")

# Q5_K_M (5-bit, better quality)
# - Size: ~5.5GB for 7B model
# - Quality: Very good
# - Speed: Slightly slower than Q4
model = outlines.models.llamacpp("./models/model.Q5_K_M.gguf")

# Q6_K (6-bit, high quality)
# - Size: ~6.5GB for 7B model
# - Quality: Excellent
# - Speed: Slower than Q5
model = outlines.models.llamacpp("./models/model.Q6_K.gguf")

# Q8_0 (8-bit, near-original quality)
# - Size: ~8GB for 7B model
# - Quality: Near FP16
# - Speed: Slower than Q6
model = outlines.models.llamacpp("./models/model.Q8_0.gguf")

# F16 (16-bit float, original quality)
# - Size: ~14GB for 7B model
# - Quality: Original
# - Speed: Slowest
model = outlines.models.llamacpp("./models/model.F16.gguf")
# Llama 3.1
model = outlines.models.llamacpp("llama-3.1-8b-instruct.Q4_K_M.gguf")
model = outlines.models.llamacpp("llama-3.1-70b-instruct.Q4_K_M.gguf")

# Mistral
model = outlines.models.llamacpp("mistral-7b-instruct-v0.3.Q4_K_M.gguf")

# Phi-4
model = outlines.models.llamacpp("phi-4-mini-instruct.Q4_K_M.gguf")

# Qwen
model = outlines.models.llamacpp("qwen2.5-7b-instruct.Q4_K_M.gguf")

Apple Silicon Optimization

# Optimized for M1/M2/M3 Macs
model = outlines.models.llamacpp(
    "./models/llama-3.1-8b.Q4_K_M.gguf",
    n_ctx=4096,
    n_gpu_layers=-1,  # Use Metal GPU acceleration
    use_mmap=True,    # Efficient memory mapping
    n_threads=8       # Use performance cores
)

vLLM (Production)

Basic Setup

import outlines

# Load model with vLLM
model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")

# Use with generator
generator = outlines.generate.json(model, YourModel)

Single GPU

model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    gpu_memory_utilization=0.9,  # Use 90% of GPU memory
    max_model_len=4096          # Max sequence length
)

Multi-GPU

# Tensor parallelism (split model across GPUs)
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,  # Use 4 GPUs
    gpu_memory_utilization=0.9
)

# Pipeline parallelism (rare, for very large models)
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-405B-Instruct",
    pipeline_parallel_size=8,  # 8-GPU pipeline
    tensor_parallel_size=4     # 4-GPU tensor split
    # Total: 32 GPUs
)

Quantization

# AWQ quantization (4-bit)
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    quantization="awq",
    dtype="float16"
)

# GPTQ quantization (4-bit)
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    quantization="gptq"
)

# SqueezeLLM quantization
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    quantization="squeezellm"
)

Advanced Configuration

model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
    max_model_len=8192,
    max_num_seqs=256,           # Max concurrent sequences
    max_num_batched_tokens=8192, # Max tokens per batch
    dtype="float16",
    trust_remote_code=True,
    enforce_eager=False,        # Use CUDA graphs (faster)
    swap_space=4                # CPU swap space (GB)
)

Batch Processing

# vLLM optimized for high-throughput batch processing
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    max_num_seqs=128  # Process 128 sequences in parallel
)

generator = outlines.generate.json(model, YourModel)

# Process many prompts efficiently: pass the whole list in one call so
# vLLM can batch them internally (a Python loop would run them one at a time)
prompts = ["prompt1", "prompt2", ..., "prompt100"]
results = generator(prompts)

OpenAI (Limited Support)

Basic Setup

import outlines

# Basic OpenAI support
model = outlines.models.openai("gpt-4o-mini", api_key="your-api-key")

# Use with generator
generator = outlines.generate.json(model, YourModel)
result = generator("Your prompt")

Configuration

model = outlines.models.openai(
    "gpt-4o-mini",
    api_key="your-api-key",  # Or set OPENAI_API_KEY env var
    max_tokens=2048,
    temperature=0.7
)

Available Models

# GPT-4o (latest)
model = outlines.models.openai("gpt-4o")

# GPT-4o Mini (cost-effective)
model = outlines.models.openai("gpt-4o-mini")

# GPT-4 Turbo
model = outlines.models.openai("gpt-4-turbo")

# GPT-3.5 Turbo
model = outlines.models.openai("gpt-3.5-turbo")

Note: OpenAI support is limited compared to local models. Some advanced features may not work.

Backend Comparison

Feature Matrix

| Feature               | Transformers | llama.cpp  | vLLM       | OpenAI          |
|-----------------------|--------------|------------|------------|-----------------|
| Structured Generation | Full         | Full       | Full       | ⚠️ Limited      |
| FSM Optimization      | Yes          | Yes        | Yes        | No              |
| GPU Support           | Yes          | Yes        | Yes        | N/A             |
| Multi-GPU             | Yes          | Yes        | Yes        | N/A             |
| Quantization          | Yes          | Yes        | Yes        | N/A             |
| High Throughput       | ⚠️ Medium    | ⚠️ Medium  | Excellent  | ⚠️ API-limited  |
| Setup Difficulty      | Easy         | Medium     | Medium     | Easy            |
| Cost                  | Hardware     | Hardware   | Hardware   | API usage       |

Performance Characteristics

Transformers:

  • Latency: 50-200ms (single request, GPU)
  • Throughput: 10-50 tokens/sec (depends on hardware)
  • Memory: 2-4GB per 1B parameters (FP16)
  • Best for: Development, small-scale deployment, flexibility

llama.cpp:

  • Latency: 30-150ms (single request)
  • Throughput: 20-150 tokens/sec (depends on quantization)
  • Memory: 0.5-2GB per 1B parameters (Q4-Q8)
  • Best for: CPU inference, Apple Silicon, edge deployment, low memory

vLLM:

  • Latency: 30-100ms (single request)
  • Throughput: 100-1000+ tokens/sec (batch processing)
  • Memory: 2-4GB per 1B parameters (FP16)
  • Best for: Production, high-throughput, batch processing, serving

OpenAI:

  • Latency: 200-500ms (API call)
  • Throughput: API rate limits
  • Memory: N/A (cloud-based)
  • Best for: Quick prototyping, no infrastructure

Memory Requirements

7B Model:

  • FP16: ~14GB
  • 8-bit: ~7GB
  • 4-bit: ~4GB
  • Q4_K_M (GGUF): ~4.5GB

13B Model:

  • FP16: ~26GB
  • 8-bit: ~13GB
  • 4-bit: ~7GB
  • Q4_K_M (GGUF): ~8GB

70B Model:

  • FP16: ~140GB (multi-GPU)
  • 8-bit: ~70GB (multi-GPU)
  • 4-bit: ~35GB (single A100/H100)
  • Q4_K_M (GGUF): ~40GB

Performance Tuning

Transformers Optimization

# Use FP16
model = outlines.models.transformers(
    "meta-llama/Llama-3.1-8B-Instruct",
    device="cuda",
    model_kwargs={"torch_dtype": "float16"}
)

# Use flash attention (2-4x faster)
# Note: the old `use_flash_attention_2` flag is deprecated; recent
# Transformers versions use `attn_implementation` instead.
model = outlines.models.transformers(
    "meta-llama/Llama-3.1-8B-Instruct",
    device="cuda",
    model_kwargs={
        "torch_dtype": "float16",
        "attn_implementation": "flash_attention_2"
    }
)

# Use 8-bit quantization (2x less memory)
model = outlines.models.transformers(
    "meta-llama/Llama-3.1-8B-Instruct",
    device="cuda",
    model_kwargs={
        "load_in_8bit": True,
        "device_map": "auto"
    }
)

llama.cpp Optimization

# Maximize GPU usage
model = outlines.models.llamacpp(
    "./models/model.Q4_K_M.gguf",
    n_gpu_layers=-1,  # All layers on GPU
    n_ctx=8192,
    n_batch=512       # Larger batch = faster
)

# Optimize for CPU (Apple Silicon)
model = outlines.models.llamacpp(
    "./models/model.Q4_K_M.gguf",
    n_ctx=4096,
    n_threads=8,      # Use all performance cores
    use_mmap=True
)

vLLM Optimization

# High throughput
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-8B-Instruct",
    gpu_memory_utilization=0.95,  # Use 95% of GPU
    max_num_seqs=256,             # High concurrency
    enforce_eager=False           # Use CUDA graphs
)

# Multi-GPU
model = outlines.models.vllm(
    "meta-llama/Llama-3.1-70B-Instruct",
    tensor_parallel_size=4,  # 4 GPUs
    gpu_memory_utilization=0.9
)

Production Deployment

Docker with vLLM

FROM vllm/vllm-openai:latest

# Install outlines
RUN pip install outlines

# Copy your code
COPY app.py /app/

# Run
CMD ["python", "/app/app.py"]

Environment Variables

# Transformers cache
export HF_HOME="/path/to/cache"
export TRANSFORMERS_CACHE="/path/to/cache"

# GPU selection
export CUDA_VISIBLE_DEVICES=0,1,2,3

# OpenAI API key
export OPENAI_API_KEY="sk-..."

# Disable tokenizers parallelism warning
export TOKENIZERS_PARALLELISM=false

Model Serving

# Simple HTTP server with vLLM
import outlines
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Load model once at startup
model = outlines.models.vllm("meta-llama/Llama-3.1-8B-Instruct")

class User(BaseModel):
    name: str
    age: int
    email: str

generator = outlines.generate.json(model, User)

@app.post("/extract")
def extract(text: str):
    result = generator(f"Extract user from: {text}")
    return result.model_dump()

Resources