Sync all skills and memories 2026-04-14 07:27
skills/mlops/models/clip/SKILL.md (new file, 256 lines)
@@ -0,0 +1,256 @@
---
name: clip
description: OpenAI's model connecting vision and language. Enables zero-shot image classification, image-text matching, and cross-modal retrieval. Trained on 400M image-text pairs. Use for image search, content moderation, or vision-language tasks without fine-tuning. Best for general-purpose image understanding.
version: 1.0.0
author: Orchestra Research
license: MIT
dependencies: [transformers, torch, pillow]
metadata:
  hermes:
    tags: [Multimodal, CLIP, Vision-Language, Zero-Shot, Image Classification, OpenAI, Image Search, Cross-Modal Retrieval, Content Moderation]
---

# CLIP - Contrastive Language-Image Pre-Training

OpenAI's model that understands images through natural language supervision.

## When to use CLIP

**Use when:**
- Zero-shot image classification (no training data needed)
- Image-text similarity/matching
- Semantic image search
- Content moderation (detecting NSFW or violent content)
- Visual question answering
- Cross-modal retrieval (image→text, text→image)

**Metrics:**
- **25,300+ GitHub stars**
- Trained on 400M image-text pairs
- Matches ResNet-50 on ImageNet (zero-shot)
- MIT License

**Use alternatives instead:**
- **BLIP-2**: Better captioning
- **LLaVA**: Vision-language chat
- **Segment Anything**: Image segmentation

## Quick start

### Installation

```bash
pip install git+https://github.com/openai/CLIP.git
pip install torch torchvision ftfy regex tqdm
```

### Zero-shot classification

```python
import torch
import clip
from PIL import Image

# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load and preprocess image
image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to(device)

# Define possible labels (tokenized once, reused for printing)
labels = ["a dog", "a cat", "a bird", "a car"]
text = clip.tokenize(labels).to(device)

# Compute similarity logits (scaled cosine similarity between the
# image and text embeddings) and turn them into probabilities
with torch.no_grad():
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# Print results
for label, prob in zip(labels, probs[0]):
    print(f"{label}: {prob:.2%}")
```
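
The frontmatter lists `transformers` as a dependency; the same zero-shot flow via the Hugging Face port looks like this (a sketch using the `openai/clip-vit-base-patch32` checkpoint, the HF counterpart of ViT-B/32):

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

labels = ["a dog", "a cat", "a bird", "a car"]
image = Image.open("photo.jpg")

# The processor handles tokenization and image preprocessing together
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)

for label, prob in zip(labels, probs[0]):
    print(f"{label}: {prob.item():.2%}")
```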

## Available models

```python
# Models (sorted by size)
models = [
    "RN50",       # ResNet-50
    "RN101",      # ResNet-101
    "ViT-B/32",   # Vision Transformer (recommended)
    "ViT-B/16",   # Better quality, slower
    "ViT-L/14",   # Best quality, slowest
]

model, preprocess = clip.load("ViT-B/32")
```

| Model | Parameters | Speed | Quality |
|-------|------------|-------|---------|
| RN50 | 102M | Fast | Good |
| ViT-B/32 | 151M | Medium | Better |
| ViT-L/14 | 428M | Slow | Best |
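
To check which checkpoint names your installed version of the package accepts, it exposes `clip.available_models()`:

```python
import clip

# Names returned here are the valid arguments to clip.load()
print(clip.available_models())
# e.g. ['RN50', 'RN101', ..., 'ViT-B/32', 'ViT-B/16', 'ViT-L/14']
```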

## Image-text similarity

```python
# Encode one image and one text prompt
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

# Normalize to unit length
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Cosine similarity
similarity = (image_features @ text_features.T).item()
print(f"Similarity: {similarity:.4f}")
```

## Semantic image search

```python
# Index images
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
image_embeddings = []

for img_path in image_paths:
    image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model.encode_image(image)
        embedding /= embedding.norm(dim=-1, keepdim=True)
    image_embeddings.append(embedding)

image_embeddings = torch.cat(image_embeddings)

# Search with text query
query = "a sunset over the ocean"
text_input = clip.tokenize([query]).to(device)
with torch.no_grad():
    text_embedding = model.encode_text(text_input)
    text_embedding /= text_embedding.norm(dim=-1, keepdim=True)

# Find most similar images
similarities = (text_embedding @ image_embeddings.T).squeeze(0)
top_k = similarities.topk(3)

for idx, score in zip(top_k.indices, top_k.values):
    print(f"{image_paths[idx]}: {score:.3f}")
```

## Content moderation

```python
# Define categories
categories = [
    "safe for work",
    "not safe for work",
    "violent content",
    "graphic content"
]

text = clip.tokenize(categories).to(device)

# Check image
with torch.no_grad():
    logits_per_image, _ = model(image, text)
    probs = logits_per_image.softmax(dim=-1)

# Get classification
max_idx = probs.argmax().item()
max_prob = probs[0, max_idx].item()

print(f"Category: {categories[max_idx]} ({max_prob:.2%})")
```

## Batch processing

```python
# Process multiple images
images = [preprocess(Image.open(f"img{i}.jpg")) for i in range(10)]
images = torch.stack(images).to(device)

with torch.no_grad():
    image_features = model.encode_image(images)
    image_features /= image_features.norm(dim=-1, keepdim=True)

# Batch text
texts = ["a dog", "a cat", "a bird"]
text_tokens = clip.tokenize(texts).to(device)

with torch.no_grad():
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Similarity matrix (10 images × 3 texts)
similarities = image_features @ text_features.T
print(similarities.shape)  # (10, 3)
```

## Integration with vector databases

```python
# Store CLIP embeddings in Chroma/FAISS
import chromadb

client = chromadb.Client()
collection = client.create_collection("image_embeddings")

# Add image embeddings (one flat vector per image)
for img_path, embedding in zip(image_paths, image_embeddings):
    collection.add(
        embeddings=[embedding.cpu().numpy().tolist()],
        metadatas=[{"path": img_path}],
        ids=[img_path]
    )

# Query with text (index [0] flattens the (1, 512) tensor to one vector)
query = "a sunset"
with torch.no_grad():
    text_embedding = model.encode_text(clip.tokenize([query]).to(device))
results = collection.query(
    query_embeddings=[text_embedding[0].cpu().numpy().tolist()],
    n_results=5
)
```
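
The comment above also names FAISS; a minimal sketch of the same index-and-search flow with FAISS, assuming the 512-dimensional ViT-B/32 embeddings and that image and text embeddings are already L2-normalized (as in the semantic search section), so inner product equals cosine similarity:

```python
import faiss
import numpy as np

dim = 512  # ViT-B/32 embedding size
index = faiss.IndexFlatIP(dim)  # exact inner-product index

# image_embeddings: normalized torch tensor of shape (N, 512)
index.add(image_embeddings.cpu().numpy().astype(np.float32))

# text_embedding: normalized torch tensor of shape (1, 512)
scores, indices = index.search(
    text_embedding.cpu().numpy().astype(np.float32), 3
)
for idx, score in zip(indices[0], scores[0]):
    print(f"{image_paths[idx]}: {score:.3f}")
```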

## Best practices

1. **Use ViT-B/32 for most cases** - Good speed/quality balance
2. **Normalize embeddings** - Required for cosine similarity
3. **Batch processing** - Much more efficient than one-at-a-time
4. **Cache embeddings** - Expensive to recompute (see the sketch below)
5. **Use descriptive labels** - Better zero-shot performance
6. **GPU recommended** - 10-50× faster
7. **Preprocess images** - Use the provided preprocess function
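
A minimal sketch of practice 4, caching embeddings to disk so they are computed once (the cache path and helper name are illustrative; assumes `model`, `preprocess`, and `device` from the quick start):

```python
from pathlib import Path

import torch
from PIL import Image

CACHE_PATH = Path("clip_image_embeddings.pt")  # illustrative location

def get_image_embeddings(image_paths):
    """Load cached embeddings if present, otherwise encode and save."""
    if CACHE_PATH.exists():
        return torch.load(CACHE_PATH)
    embeddings = []
    for img_path in image_paths:
        img = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
        with torch.no_grad():
            emb = model.encode_image(img)
            emb /= emb.norm(dim=-1, keepdim=True)
        embeddings.append(emb.cpu())
    embeddings = torch.cat(embeddings)
    torch.save(embeddings, CACHE_PATH)
    return embeddings
```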

## Performance

| Operation | CPU | GPU (V100) |
|-----------|-----|------------|
| Image encoding | ~200ms | ~20ms |
| Text encoding | ~50ms | ~5ms |
| Similarity compute | <1ms | <1ms |
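
These numbers vary with hardware, batch size, and model choice; a rough way to measure image-encoding latency on your own machine (a sketch assuming `model`, `image`, and `device` from the quick start):

```python
import time
import torch

# Warm-up run so one-time CUDA initialization doesn't skew the timing
with torch.no_grad():
    model.encode_image(image)

if device == "cuda":
    torch.cuda.synchronize()  # wait for queued GPU work before timing
start = time.perf_counter()
with torch.no_grad():
    model.encode_image(image)
if device == "cuda":
    torch.cuda.synchronize()
print(f"Image encoding: {(time.perf_counter() - start) * 1000:.1f} ms")
```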

## Limitations

1. **Not for fine-grained tasks** - Best for broad categories
2. **Requires descriptive text** - Vague labels perform poorly
3. **Inherits web-data biases** - Trained on uncurated internet pairs
4. **No bounding boxes** - Classifies the whole image only
5. **Limited spatial understanding** - Weak at positions and counting

## Resources

- **GitHub**: https://github.com/openai/CLIP ⭐ 25,300+
- **Paper**: https://arxiv.org/abs/2103.00020
- **Colab**: https://colab.research.google.com/github/openai/clip/
- **License**: MIT
skills/mlops/models/clip/references/applications.md (new file, 207 lines)
@@ -0,0 +1,207 @@
# CLIP Applications Guide

Practical applications and use cases for CLIP.

## Zero-shot image classification

```python
import torch
import clip
from PIL import Image

model, preprocess = clip.load("ViT-B/32")

# Define categories
categories = [
    "a photo of a dog",
    "a photo of a cat",
    "a photo of a bird",
    "a photo of a car",
    "a photo of a person"
]

# Prepare image and text
image = preprocess(Image.open("photo.jpg")).unsqueeze(0)
text = clip.tokenize(categories)

# Classify: the model returns scaled cosine-similarity logits
with torch.no_grad():
    logits_per_image, _ = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# Print results
for category, prob in zip(categories, probs[0]):
    print(f"{category}: {prob:.2%}")
```

## Semantic image search

```python
# Index images
image_database = []
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]

for img_path in image_paths:
    image = preprocess(Image.open(img_path)).unsqueeze(0)
    with torch.no_grad():
        features = model.encode_image(image)
        features /= features.norm(dim=-1, keepdim=True)
    image_database.append((img_path, features))

# Search with text
query = "a sunset over mountains"
text_input = clip.tokenize([query])

with torch.no_grad():
    text_features = model.encode_text(text_input)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Find matches
similarities = []
for img_path, img_features in image_database:
    similarity = (text_features @ img_features.T).item()
    similarities.append((img_path, similarity))

# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
for img_path, score in similarities[:3]:
    print(f"{img_path}: {score:.3f}")
```

## Content moderation

```python
# Define safety categories
categories = [
    "safe for work content",
    "not safe for work content",
    "violent or graphic content",
    "hate speech or offensive content",
    "spam or misleading content"
]

text = clip.tokenize(categories)

# Check image
with torch.no_grad():
    logits, _ = model(image, text)
    probs = logits.softmax(dim=-1)

# Get classification
max_idx = probs.argmax().item()
confidence = probs[0, max_idx].item()

if confidence > 0.7:
    print(f"Classified as: {categories[max_idx]} ({confidence:.2%})")
else:
    print(f"Uncertain classification (confidence: {confidence:.2%})")
```

## Image-to-text retrieval

```python
# Text database
captions = [
    "A beautiful sunset over the ocean",
    "A cute dog playing in the park",
    "A modern city skyline at night",
    "A delicious pizza with toppings"
]

# Encode captions
caption_features = []
for caption in captions:
    text = clip.tokenize([caption])
    with torch.no_grad():
        features = model.encode_text(text)
        features /= features.norm(dim=-1, keepdim=True)
    caption_features.append(features)

caption_features = torch.cat(caption_features)

# Find matching captions for image
with torch.no_grad():
    image_features = model.encode_image(image)
    image_features /= image_features.norm(dim=-1, keepdim=True)

similarities = (image_features @ caption_features.T).squeeze(0)
top_k = similarities.topk(3)

for idx, score in zip(top_k.indices, top_k.values):
    print(f"{captions[idx]}: {score:.3f}")
```

## Visual question answering

```python
# Score descriptive statements against the image
image = preprocess(Image.open("photo.jpg")).unsqueeze(0)

questions = [
    "a photo showing people",
    "a photo showing animals",
    "a photo taken indoors",
    "a photo taken outdoors",
    "a photo taken during daytime",
    "a photo taken at night"
]

text = clip.tokenize(questions)

with torch.no_grad():
    logits, _ = model(image, text)
    probs = logits.softmax(dim=-1)

# Softmax normalizes across all statements, so the probabilities are
# relative: rank them (or compare paired statements such as indoors
# vs. outdoors) instead of thresholding each one independently at 0.5
for question, prob in sorted(zip(questions, probs[0]), key=lambda x: -x[1]):
    print(f"{question}: {prob:.2%}")
```

## Image deduplication

```python
# Detect duplicate/similar images
def compute_similarity(img1_path, img2_path):
    img1 = preprocess(Image.open(img1_path)).unsqueeze(0)
    img2 = preprocess(Image.open(img2_path)).unsqueeze(0)

    with torch.no_grad():
        feat1 = model.encode_image(img1)
        feat2 = model.encode_image(img2)

    feat1 /= feat1.norm(dim=-1, keepdim=True)
    feat2 /= feat2.norm(dim=-1, keepdim=True)

    return (feat1 @ feat2.T).item()

# Check for duplicates
threshold = 0.95
image_pairs = [("img1.jpg", "img2.jpg"), ("img1.jpg", "img3.jpg")]

for img1, img2 in image_pairs:
    sim = compute_similarity(img1, img2)
    if sim > threshold:
        print(f"{img1} and {img2} are duplicates (similarity: {sim:.3f})")
```
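
The pairwise helper above re-encodes each image for every pair; for larger collections it is cheaper to encode every image once and compare all pairs with a single matrix product (a sketch reusing `model` and `preprocess` from above):

```python
# Encode every image once
paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
feats = []
for p in paths:
    img = preprocess(Image.open(p)).unsqueeze(0)
    with torch.no_grad():
        f = model.encode_image(img)
    feats.append(f / f.norm(dim=-1, keepdim=True))
feats = torch.cat(feats)  # (N, 512)

# All-pairs cosine similarity in one matmul
sim = feats @ feats.T  # (N, N)
threshold = 0.95
for i in range(len(paths)):
    for j in range(i + 1, len(paths)):
        if sim[i, j] > threshold:
            print(f"{paths[i]} and {paths[j]} look like duplicates ({sim[i, j]:.3f})")
```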

## Best practices

1. **Use descriptive labels** - "a photo of X" works better than just "X" (see the sketch below)
2. **Normalize embeddings** - Always normalize for cosine similarity
3. **Batch processing** - Process multiple images/texts together
4. **Cache embeddings** - Expensive to recompute
5. **Set appropriate thresholds** - Tune them on validation data
6. **Use GPU** - 10-50× faster than CPU
7. **Consider model size** - ViT-B/32 is a good default; ViT-L/14 for best quality
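
A small illustration of practice 1, wrapping bare class names in a prompt template before tokenizing (the template string is one common choice, not the only one; assumes `model`, `image`, and `clip` from the sections above):

```python
class_names = ["dog", "cat", "bird"]

# Wrap bare class names in a descriptive template
prompts = [f"a photo of a {name}" for name in class_names]
text = clip.tokenize(prompts)

with torch.no_grad():
    logits, _ = model(image, text)
    probs = logits.softmax(dim=-1)

best = probs.argmax().item()
print(f"Predicted: {class_names[best]} ({probs[0, best].item():.2%})")
```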

## Resources

- **Paper**: https://arxiv.org/abs/2103.00020
- **GitHub**: https://github.com/openai/CLIP
- **Colab**: https://colab.research.google.com/github/openai/clip/