# Artifacts & Model Registry Guide

Complete guide to data versioning and model management with W&B Artifacts.

## Table of Contents

- What are Artifacts
- Creating Artifacts
- Using Artifacts
- Model Registry
- Versioning & Lineage
- Artifact Types
- Advanced Patterns
- Collaboration Patterns
- Best Practices
- Resources

## What are Artifacts

Artifacts are versioned datasets, models, or files tracked with lineage.

**Key Features:**

- Automatic versioning (v0, v1, v2...)
- Lineage tracking (which runs produced/used artifacts)
- Efficient storage (deduplication)
- Collaboration (team-wide access)
- Aliases (`latest`, `best`, `production`)

**Common Use Cases:**

- Dataset versioning
- Model checkpoints
- Preprocessed data
- Evaluation results
- Configuration files

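To make these pieces concrete, here is a minimal sketch of the full round trip; the project name and file are placeholders, and the sections below cover each step in detail:

```python
import wandb

# Producer: log version v0 of a dataset artifact with an alias
with wandb.init(project="my-project") as run:
    art = wandb.Artifact('my-dataset', type='dataset')
    art.add_file('data.csv')
    run.log_artifact(art, aliases=['latest'])

# Consumer: resolve the alias and download the files
with wandb.init(project="my-project") as run:
    data_dir = run.use_artifact('my-dataset:latest').download()
```
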
## Creating Artifacts

### Basic Dataset Artifact

```python
import wandb

run = wandb.init(project="my-project")

# Create artifact
dataset = wandb.Artifact(
    name='training-data',
    type='dataset',
    description='ImageNet training split with augmentations',
    metadata={
        'size': '1.2M images',
        'format': 'JPEG',
        'resolution': '224x224'
    }
)

# Add files
dataset.add_file('data/train.csv')         # Single file
dataset.add_dir('data/images')             # Entire directory
dataset.add_reference('s3://bucket/data')  # Cloud reference

# Log artifact
run.log_artifact(dataset)
wandb.finish()
```

### Model Artifact

```python
import torch
import wandb

run = wandb.init(project="my-project")

# Train model
model = train_model()

# Save model
torch.save(model.state_dict(), 'model.pth')

# Create model artifact
model_artifact = wandb.Artifact(
    name='resnet50-classifier',
    type='model',
    description='ResNet50 trained on ImageNet',
    metadata={
        'architecture': 'ResNet50',
        'accuracy': 0.95,
        'loss': 0.15,
        'epochs': 50,
        'framework': 'PyTorch'
    }
)

# Add model file
model_artifact.add_file('model.pth')

# Add config
model_artifact.add_file('config.yaml')

# Log with aliases
run.log_artifact(model_artifact, aliases=['latest', 'best'])

wandb.finish()
```

### Preprocessed Data Artifact

```python
import pandas as pd
import wandb

run = wandb.init(project="nlp-project")

# Preprocess data
df = pd.read_csv('raw_data.csv')
df_processed = preprocess(df)
df_processed.to_csv('processed_data.csv', index=False)

# Create artifact
processed_data = wandb.Artifact(
    name='processed-text-data',
    type='dataset',
    metadata={
        'rows': len(df_processed),
        'columns': list(df_processed.columns),
        'preprocessing_steps': ['lowercase', 'remove_stopwords', 'tokenize']
    }
)

processed_data.add_file('processed_data.csv')

# Log artifact
run.log_artifact(processed_data)
run.finish()
```

## Using Artifacts

### Download and Use

```python
import pandas as pd
import wandb

run = wandb.init(project="my-project")

# Download artifact
artifact = run.use_artifact('training-data:latest')
artifact_dir = artifact.download()

# Use files
df = pd.read_csv(f'{artifact_dir}/train.csv')

# Train with artifact data
model = train_model(df)
```

### Use Specific Version

```python
# Use specific version
artifact_v2 = run.use_artifact('training-data:v2')

# Use alias
artifact_best = run.use_artifact('model:best')
artifact_prod = run.use_artifact('model:production')

# Use from another project
artifact = run.use_artifact('team/other-project/model:latest')
```

### Check Artifact Metadata

```python
artifact = run.use_artifact('training-data:latest')

# Access metadata
print(artifact.metadata)
print(f"Size: {artifact.metadata['size']}")

# Access version info
print(f"Version: {artifact.version}")
print(f"Created at: {artifact.created_at}")
print(f"Digest: {artifact.digest}")
```

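You can also enumerate an artifact's contents before downloading anything. A short sketch using the public API's `files()` iterator:

```python
artifact = run.use_artifact('training-data:latest')

# List contained files without downloading the whole artifact
for f in artifact.files():
    print(f.name, f.size)
```
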
## Model Registry

Link models to a central registry for governance and deployment.

### Create Model Registry

```python
# In W&B UI:
# 1. Go to "Registry" tab
# 2. Create new registry: "production-models"
# 3. Define stages: development, staging, production
```

### Link Model to Registry

```python
import wandb

run = wandb.init(project="training")

# Create model artifact
model_artifact = wandb.Artifact(
    name='sentiment-classifier',
    type='model',
    metadata={'accuracy': 0.94, 'f1': 0.92}
)

model_artifact.add_file('model.pth')

# Log artifact
run.log_artifact(model_artifact)

# Link to registry
run.link_artifact(
    model_artifact,
    'model-registry/production-models',
    aliases=['staging']  # Deploy to staging
)

wandb.finish()
```

### Promote Model in Registry

```python
import wandb

# Retrieve model from registry
api = wandb.Api()
artifact = api.artifact('model-registry/production-models/sentiment-classifier:staging')

# Promote to production
artifact.link('model-registry/production-models', aliases=['production'])

# Demote from production (replaces the artifact's alias list)
artifact.aliases = ['archived']
artifact.save()
```

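When promoting, you usually also want to drop the old stage alias. A sketch that edits the alias list in place, using the same placeholder paths as above:

```python
api = wandb.Api()
art = api.artifact('model-registry/production-models/sentiment-classifier:staging')

# Swap stage aliases, then persist the change
art.aliases.append('production')
art.aliases.remove('staging')
art.save()
```
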
### Use Model from Registry

```python
import torch
import wandb

run = wandb.init()

# Download production model
model_artifact = run.use_artifact(
    'model-registry/production-models/sentiment-classifier:production'
)

model_dir = model_artifact.download()

# Load and use. The checkpoint holds a state_dict, so rebuild the
# architecture first, then load the weights into it.
model = build_model()  # same architecture used during training
model.load_state_dict(torch.load(f'{model_dir}/model.pth'))
model.eval()
```

## Versioning & Lineage

### Automatic Versioning

```python
# First log: creates v0
run1 = wandb.init(project="my-project")
dataset_v0 = wandb.Artifact('my-dataset', type='dataset')
dataset_v0.add_file('data_v1.csv')
run1.log_artifact(dataset_v0)
run1.finish()

# Second log with same name: creates v1
run2 = wandb.init(project="my-project")
dataset_v1 = wandb.Artifact('my-dataset', type='dataset')
dataset_v1.add_file('data_v2.csv')  # Different content
run2.log_artifact(dataset_v1)
run2.finish()

# Third log with SAME content as v1: references v1 (no new version)
run3 = wandb.init(project="my-project")
dataset_v1_again = wandb.Artifact('my-dataset', type='dataset')
dataset_v1_again.add_file('data_v2.csv')  # Same content as v1
run3.log_artifact(dataset_v1_again)  # Still v1, no v2 created
run3.finish()
```

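Deduplication is content-based: every version has a digest, and logging identical content resolves to the existing version. A quick check via the public API (substitute your entity and project in the placeholder path):

```python
api = wandb.Api()
v1 = api.artifact('my-entity/my-project/my-dataset:v1')
latest = api.artifact('my-entity/my-project/my-dataset:latest')
print(latest.version, v1.digest == latest.digest)  # expect 'v1', True
```
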
### Track Lineage

```python
import torch
import wandb

# Training run
run = wandb.init(project="my-project")

# Use dataset (input)
dataset = run.use_artifact('training-data:v3')
data = load_data(dataset.download())

# Train model
model = train(data)

# Save model (output)
model_artifact = wandb.Artifact('trained-model', type='model')
torch.save(model.state_dict(), 'model.pth')
model_artifact.add_file('model.pth')
run.log_artifact(model_artifact)

# Lineage automatically tracked:
# training-data:v3 --> [run] --> trained-model:v0
```

### View Lineage Graph

```python
# In W&B UI:
# Artifacts → Select artifact → Lineage tab
# Shows:
# - Which runs produced this artifact
# - Which runs used this artifact
# - Parent/child artifacts
```

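Lineage can also be inspected programmatically via the public API's `logged_by()` and `used_by()`; the artifact path below is a placeholder:

```python
api = wandb.Api()
artifact = api.artifact('my-entity/my-project/trained-model:v0')

producer = artifact.logged_by()  # the run that created this artifact
consumers = artifact.used_by()   # runs that consumed it
print(producer.name, [r.name for r in consumers])
```
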
## Artifact Types

### Dataset Artifacts

```python
# Raw data
raw_data = wandb.Artifact('raw-data', type='dataset')
raw_data.add_dir('raw/')

# Processed data
processed_data = wandb.Artifact('processed-data', type='dataset')
processed_data.add_dir('processed/')

# Train/val/test splits
train_split = wandb.Artifact('train-split', type='dataset')
train_split.add_file('train.csv')

val_split = wandb.Artifact('val-split', type='dataset')
val_split.add_file('val.csv')
```

### Model Artifacts

```python
# Checkpoint during training
checkpoint = wandb.Artifact('checkpoint-epoch-10', type='model')
checkpoint.add_file('checkpoint_epoch_10.pth')

# Final model
final_model = wandb.Artifact('final-model', type='model')
final_model.add_file('model.pth')
final_model.add_file('tokenizer.json')

# Quantized model
quantized = wandb.Artifact('quantized-model', type='model')
quantized.add_file('model_int8.onnx')
```

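An alternative to naming a new artifact per checkpoint is to log each checkpoint as a new version of one artifact name, which keeps the whole training timeline in a single versioned lineage. A sketch, where `train_one_epoch`, `model`, and `num_epochs` are placeholders:

```python
run = wandb.init(project="my-project")

for epoch in range(num_epochs):
    train_one_epoch(model)
    torch.save(model.state_dict(), 'checkpoint.pth')

    # Same artifact name each time -> versions v0, v1, v2, ...
    ckpt = wandb.Artifact('training-checkpoint', type='model',
                          metadata={'epoch': epoch})
    ckpt.add_file('checkpoint.pth')
    run.log_artifact(ckpt, aliases=['latest'])
```
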
### Result Artifacts

```python
# Predictions
predictions = wandb.Artifact('test-predictions', type='predictions')
predictions.add_file('predictions.csv')

# Evaluation metrics
eval_results = wandb.Artifact('evaluation', type='evaluation')
eval_results.add_file('metrics.json')
eval_results.add_file('confusion_matrix.png')
```

## Advanced Patterns

### Incremental Artifacts

Build up an artifact file by file before logging it. Files are deduplicated by content, so unchanged files are not re-uploaded in later versions.

```python
run = wandb.init(project="my-project")

# Create artifact
dataset = wandb.Artifact('incremental-dataset', type='dataset')

# Add files incrementally
for i in range(100):
    filename = f'batch_{i}.csv'
    process_batch(i, filename)
    dataset.add_file(filename)

    # Log progress
    if (i + 1) % 10 == 0:
        print(f"Added {i + 1}/100 batches")

# Log complete artifact
run.log_artifact(dataset)
```

### Artifact Tables

Track structured data with W&B Tables.

```python
import wandb

run = wandb.init(project="my-project")

# Create table
table = wandb.Table(columns=["id", "image", "label", "prediction"])

for idx, (img, label, pred) in enumerate(zip(images, labels, predictions)):
    table.add_data(
        idx,
        wandb.Image(img),
        label,
        pred
    )

# Log as artifact
artifact = wandb.Artifact('predictions-table', type='predictions')
artifact.add(table, "predictions")
run.log_artifact(artifact)
```

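To consume the table later, `Artifact.get()` returns the logged object by the name it was added under. A sketch:

```python
run = wandb.init(project="my-project")
artifact = run.use_artifact('predictions-table:latest')

table = artifact.get("predictions")  # returns the logged wandb.Table
print(table.columns)
```
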
### Artifact References

Reference external data without copying it into W&B.

```python
# S3 reference
dataset = wandb.Artifact('s3-dataset', type='dataset')
dataset.add_reference('s3://my-bucket/data/', name='train')
dataset.add_reference('s3://my-bucket/labels/', name='labels')

# GCS reference
dataset.add_reference('gs://my-bucket/data/')

# HTTP reference
dataset.add_reference('https://example.com/data.zip')

# Local filesystem reference (for shared storage)
dataset.add_reference('file:///mnt/shared/data')
```

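For very large buckets, checksumming every referenced object can be slow. `add_reference` accepts a `checksum` flag to skip it, at the cost of W&B no longer detecting changes in the referenced data. A sketch:

```python
# Skip per-object checksums for a huge bucket (faster, but changes
# in the referenced data will no longer be detected)
dataset = wandb.Artifact('big-s3-dataset', type='dataset')
dataset.add_reference('s3://my-bucket/very-large-prefix/', checksum=False)
```
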
## Collaboration Patterns

### Team Dataset Sharing

```python
# Data engineer creates dataset
run = wandb.init(project="data-eng", entity="my-team")
dataset = wandb.Artifact('shared-dataset', type='dataset')
dataset.add_dir('data/')
run.log_artifact(dataset, aliases=['latest', 'production'])

# ML engineer uses dataset
run = wandb.init(project="ml-training", entity="my-team")
dataset = run.use_artifact('my-team/data-eng/shared-dataset:production')
data = load_data(dataset.download())
```

### Model Handoff

```python
# Training team
train_run = wandb.init(project="model-training", entity="ml-team")
model = train_model()
model_artifact = wandb.Artifact('nlp-model', type='model')
model_artifact.add_file('model.pth')
train_run.log_artifact(model_artifact)
train_run.link_artifact(model_artifact, 'model-registry/nlp-models', aliases=['candidate'])

# Evaluation team
eval_run = wandb.init(project="model-eval", entity="ml-team")
model_artifact = eval_run.use_artifact('model-registry/nlp-models/nlp-model:candidate')
metrics = evaluate_model(model_artifact)

if metrics['f1'] > 0.9:
    # Promote to production
    model_artifact.link('model-registry/nlp-models', aliases=['production'])
```

## Best Practices

### 1. Use Descriptive Names

```python
# ✅ Good: Descriptive names
wandb.Artifact('imagenet-train-augmented-v2', type='dataset')
wandb.Artifact('bert-base-sentiment-finetuned', type='model')

# ❌ Bad: Generic names
wandb.Artifact('dataset1', type='dataset')
wandb.Artifact('model', type='model')
```

### 2. Add Comprehensive Metadata

```python
model_artifact = wandb.Artifact(
    'production-model',
    type='model',
    description='ResNet50 classifier for product categorization',
    metadata={
        # Model info
        'architecture': 'ResNet50',
        'framework': 'PyTorch 2.0',
        'pretrained': True,

        # Performance
        'accuracy': 0.95,
        'f1_score': 0.93,
        'inference_time_ms': 15,

        # Training
        'epochs': 50,
        'dataset': 'imagenet',
        'num_samples': 1200000,

        # Business context
        'use_case': 'e-commerce product classification',
        'owner': 'ml-team@company.com',
        'approved_by': 'data-science-lead'
    }
)
```

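Structured metadata pays off when tooling can act on it. A sketch that gates promotion on a recorded metric; the artifact and registry paths are placeholders:

```python
api = wandb.Api()
art = api.artifact('my-entity/my-project/production-model:latest')

# Promote only if the recorded F1 score clears the bar
if art.metadata.get('f1_score', 0) >= 0.9:
    art.link('model-registry/production-models', aliases=['staging'])
```
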
### 3. Use Aliases for Deployment Stages

```python
# Development
run.log_artifact(model, aliases=['dev', 'latest'])

# Staging
run.log_artifact(model, aliases=['staging'])

# Production
run.log_artifact(model, aliases=['production', 'v1.2.0'])

# Archive old versions
api = wandb.Api()
old_artifact = api.artifact('model:production')
old_artifact.aliases = ['archived-v1.1.0']
old_artifact.save()
```

### 4. Track Data Lineage

```python
def create_training_pipeline():
    run = wandb.init(project="pipeline")

    # 1. Load raw data
    raw_data = run.use_artifact('raw-data:latest')
    raw_dir = raw_data.download()

    # 2. Preprocess
    processed = preprocess(raw_dir)
    processed_artifact = wandb.Artifact('processed-data', type='dataset')
    processed_artifact.add_file('processed.csv')
    run.log_artifact(processed_artifact)

    # 3. Train model
    model = train(processed)
    model_artifact = wandb.Artifact('trained-model', type='model')
    model_artifact.add_file('model.pth')
    run.log_artifact(model_artifact)

    # Lineage: raw-data → processed-data → trained-model
```

### 5. Efficient Storage

```python
# ✅ Good: Reference large files
large_dataset = wandb.Artifact('large-dataset', type='dataset')
large_dataset.add_reference('s3://bucket/huge-file.tar.gz')

# ❌ Bad: Upload giant files
# large_dataset.add_file('huge-file.tar.gz')  # Don't do this

# ✅ Good: Upload only metadata
metadata_artifact = wandb.Artifact('dataset-metadata', type='dataset')
metadata_artifact.add_file('metadata.json')  # Small file
```

## Resources

- **Artifacts Documentation**: https://docs.wandb.ai/guides/artifacts
- **Model Registry**: https://docs.wandb.ai/guides/model-registry
- **Best Practices**: https://wandb.ai/site/articles/versioning-data-and-models-in-ml