# Hugging Face Transformers Skill
## Quick Reference

| Task | Approach | Key Class |
|---|---|---|
| Text Generation | `pipeline("text-generation")` | `AutoModelForCausalLM` |
| Classification | `pipeline("text-classification")` | `AutoModelForSequenceClassification` |
| Embeddings | sentence-transformers | `SentenceTransformer` |
| NER | `pipeline("ner")` | `AutoModelForTokenClassification` |
| QA | `pipeline("question-answering")` | `AutoModelForQuestionAnswering` |
| Fine-tuning | `Trainer` API | `TrainingArguments` |
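Each row pairs the quick `pipeline(...)` entry point with the `Auto*` class it wraps. A minimal sketch of that correspondence (the model name is illustrative; it is the common sentiment-analysis default):

```python
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# High-level: the pipeline bundles model, tokenizer, and pre/post-processing
classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# Low-level: the same pieces loaded by hand
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
```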
## Installation

```bash
# Core transformers
pip install transformers torch

# With torch extras and accelerate
pip install "transformers[torch]" accelerate

# For embeddings
pip install sentence-transformers

# For quantization
pip install bitsandbytes

# For PEFT/LoRA
pip install peft

# For datasets
pip install datasets
```
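A quick sanity check that the core packages import cleanly (printed versions will vary):

```python
import torch
import transformers

print(transformers.__version__, torch.__version__)
```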
## Pipeline API (Fastest Start)

### Text Generation

```python
from transformers import pipeline

# Simple generation
generator = pipeline("text-generation", model="microsoft/DialoGPT-medium")
result = generator("Hello, how are you?", max_length=50, num_return_sequences=1)
print(result[0]["generated_text"])

# With a specific model for instruction following
generator = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    torch_dtype="auto"
)
messages = [{"role": "user", "content": "Explain transformers in 2 sentences"}]
response = generator(messages, max_new_tokens=100)
```
### Text Classification

```python
from transformers import pipeline

# Sentiment analysis
classifier = pipeline("sentiment-analysis")
result = classifier("I love this product!")
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Multi-label classification
classifier = pipeline(
    "text-classification",
    model="facebook/bart-large-mnli",
    top_k=None  # Return all labels with scores
)

# Zero-shot classification
classifier = pipeline("zero-shot-classification")
result = classifier(
    "This is a tutorial about machine learning",
    candidate_labels=["education", "politics", "business"]
)
```
### Named Entity Recognition

```python
from transformers import pipeline

ner = pipeline("ner", aggregation_strategy="simple")
text = "Apple CEO Tim Cook announced new products in Cupertino"
entities = ner(text)
for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.2f})")
# Apple: ORG (0.99)
# Tim Cook: PER (0.99)
# Cupertino: LOC (0.98)
```
### Question Answering

```python
from transformers import pipeline

qa = pipeline("question-answering")
context = """
Hugging Face is a company that develops tools for building machine learning
applications. It was founded in 2016 and is headquartered in New York City.
"""
question = "When was Hugging Face founded?"
result = qa(question=question, context=context)
# {'answer': '2016', 'score': 0.98, 'start': 89, 'end': 93}
```
### Summarization

```python
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = """Your long document text here..."""
summary = summarizer(text, max_length=130, min_length=30, do_sample=False)
```
## Model and Tokenizer Loading

### Basic Loading

```python
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

# Load tokenizer and model separately
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# For text generation models
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    torch_dtype="auto"
)
```
### Loading with Options

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.float16,               # Half precision
    device_map="auto",                       # Automatic GPU placement
    low_cpu_mem_usage=True,                  # Reduce RAM during loading
    trust_remote_code=True,                  # For custom architectures
    attn_implementation="flash_attention_2"  # If available
)

tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    padding_side="left",  # For batch generation
    use_fast=True         # Use the Rust tokenizer
)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token
```
### Offline Loading

```python
from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-uncased"
save_path = "./models/bert-base"

# Download and save for offline use
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

# Load offline
tokenizer = AutoTokenizer.from_pretrained(save_path, local_files_only=True)
model = AutoModel.from_pretrained(save_path, local_files_only=True)
```
## Embeddings with Sentence Transformers

### Basic Embeddings

```python
from sentence_transformers import SentenceTransformer

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Single text
embedding = model.encode("Hello, world!")
print(f"Dimension: {len(embedding)}")  # 384

# Batch encoding
sentences = ["First sentence", "Second sentence", "Third sentence"]
embeddings = model.encode(sentences, show_progress_bar=True)
```
### Semantic Similarity

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-mpnet-base-v2")

query = "How to learn Python?"
documents = [
    "Python tutorial for beginners",
    "Advanced JavaScript patterns",
    "Machine learning with Python",
    "Cooking recipes for dinner"
]

# Encode
query_embedding = model.encode(query, convert_to_tensor=True)
doc_embeddings = model.encode(documents, convert_to_tensor=True)

# Calculate cosine similarity
scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# Rank results
ranked = sorted(zip(documents, scores.tolist()), key=lambda x: x[1], reverse=True)
for doc, score in ranked:
    print(f"{score:.3f}: {doc}")
```
### Embedding Model Selection

| Model | Dim | Use Case |
|---|---|---|
| all-MiniLM-L6-v2 | 384 | Fast, general purpose |
| all-mpnet-base-v2 | 768 | Higher quality, balanced |
| bge-large-en-v1.5 | 1024 | State-of-the-art retrieval |
| multilingual-e5-large | 1024 | Multilingual support |
| nomic-embed-text-v1 | 768 | Long context (8K tokens) |
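The dimension matters when sizing a vector index; it is safer to read it off the loaded model than to trust a table:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")
print(model.get_sentence_embedding_dimension())  # 768
```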
### Optimized Embedding Generation

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")

large_text_list = ["..."]  # your corpus of texts

# Batch processing with optimal settings
embeddings = model.encode(
    large_text_list,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # For cosine similarity
)
```
## Text Generation (Advanced)

### Chat-Style Generation

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a haiku about coding"}
]

# Apply the model's chat template
input_ids = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True
).to(model.device)

# Generate
outputs = model.generate(
    input_ids,
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

# Decode only the newly generated tokens
response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
```
### Streaming Generation

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
inputs = tokenizer("Explain quantum computing:", return_tensors="pt").to(model.device)

generation_kwargs = {
    **inputs,
    "streamer": streamer,
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 0.7
}

# Run generation in a background thread; consume tokens as they arrive
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for text in streamer:
    print(text, end="", flush=True)
thread.join()
```
### Generation Parameters

```python
outputs = model.generate(
    input_ids,
    # Length control
    max_new_tokens=256,
    min_new_tokens=10,
    # Sampling strategy
    do_sample=True,           # Set False for greedy decoding
    temperature=0.8,          # Randomness (lower = more deterministic; must be > 0 when sampling)
    top_k=50,                 # Top-k sampling
    top_p=0.95,               # Nucleus sampling
    # Repetition control
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
    # Beam search (alternative to sampling)
    # num_beams=4,
    # early_stopping=True,
    # Other
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
```
## Quantization

### BitsAndBytes 4-bit Quantization

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NormalFloat4
    bnb_4bit_compute_dtype=torch.float16,  # Computation dtype
    bnb_4bit_use_double_quant=True         # Nested quantization
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
```
### 8-bit Quantization

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)
```
### GPTQ Models (Pre-quantized)

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a pre-quantized GPTQ model (requires the optimum and auto-gptq packages)
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-v0.1-GPTQ",
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-v0.1-GPTQ")
```
### Memory Comparison
| Model (7B) | Precision | VRAM | Quality |
|---|---|---|---|
| Full | FP32 | ~28GB | 100% |
| Half | FP16 | ~14GB | ~99% |
| 8-bit | INT8 | ~7GB | ~97% |
| 4-bit | NF4 | ~4GB | ~95% |
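The VRAM column follows a back-of-envelope rule: weights need roughly parameters × bytes per parameter, plus runtime overhead for activations and the KV-cache. A rough sketch (the 15% overhead factor is an assumption; real usage varies with context length and batch size):

```python
def estimate_vram_gb(params_billion: float, bits_per_param: float,
                     overhead: float = 1.15) -> float:
    """Rough VRAM estimate: weight bytes plus an assumed runtime overhead."""
    weight_bytes = params_billion * 1e9 * bits_per_param / 8
    return weight_bytes * overhead / 1e9

print(f"{estimate_vram_gb(7, 16):.0f} GB")  # 7B model in FP16 -> ~16 GB with overhead
print(f"{estimate_vram_gb(7, 4):.0f} GB")   # 7B model in NF4  -> ~4 GB
```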
## Fine-Tuning with Trainer

### Basic Fine-Tuning

```python
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb")

# Load model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

# Train
trainer.train()
```
### Custom Metrics

```python
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted")
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)
```
## PEFT and LoRA

### LoRA Fine-Tuning

```python
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_4bit=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# LoRA configuration
lora_config = LoraConfig(
    r=16,           # Rank
    lora_alpha=32,  # Alpha scaling
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06%
```
### Save and Load LoRA Adapters

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Save adapter only (small file)
model.save_pretrained("./lora-adapter")

# Load adapter onto the base model
base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
model = PeftModel.from_pretrained(base_model, "./lora-adapter")

# Merge adapter into the base model (optional)
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged-model")
```
### QLoRA (Quantized LoRA)

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load quantized model
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare for training
model = prepare_model_for_kbit_training(model)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)
```
## Model Selection from Hub

### Finding Models

```python
from huggingface_hub import HfApi, list_models

api = HfApi()

# Search for models
models = list_models(
    filter="text-generation",
    sort="downloads",
    direction=-1,
    limit=10
)
for model in models:
    print(f"{model.id}: {model.downloads:,} downloads")

# Get model info
model_info = api.model_info("meta-llama/Llama-2-7b-hf")
print(f"Tags: {model_info.tags}")
print(f"License: {model_info.cardData.get('license', 'Unknown')}")
```
### Recommended Models by Task
| Task | Small (<3B) | Medium (3-13B) | Large (>13B) |
|---|---|---|---|
| Chat | TinyLlama-1.1B | Phi-3-mini-4k | Llama-3-70B |
| Code | CodeGemma-2B | CodeLlama-7B | DeepSeek-Coder-33B |
| Embeddings | all-MiniLM-L6-v2 | bge-base-en-v1.5 | bge-large-en-v1.5 |
| Classification | DistilBERT | RoBERTa-base | DeBERTa-v3-large |
## Local Inference Optimization

### Batch Processing

```python
from transformers import pipeline
import torch

# Optimal batching for embeddings
pipe = pipeline(
    "feature-extraction",
    model="sentence-transformers/all-MiniLM-L6-v2",
    device=0 if torch.cuda.is_available() else -1
)

texts = ["text1", "text2", "text3"]  # ... your full list of texts
batch_size = 32
results = []
for i in range(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    embeddings = pipe(batch, truncation=True)
    results.extend(embeddings)
```
### torch.compile for Speed

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Compile for faster inference (PyTorch 2.0+)
model = torch.compile(model, mode="reduce-overhead")
```
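Note that `torch.compile` is lazy: the first forward pass triggers compilation and is slow, so warm the model up with a dummy input before benchmarking or serving traffic.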
### KV-Cache and Attention Optimization

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",
    torch_dtype="auto",
    attn_implementation="flash_attention_2"  # Faster attention
)

# For multi-turn generation, reuse past_key_values
outputs = model.generate(
    input_ids,
    max_new_tokens=50,
    use_cache=True,  # Enable KV-cache
    return_dict_in_generate=True
)
past_key_values = outputs.past_key_values

# Continue generation with the cached context; new_input_ids should contain
# the full running sequence (previous tokens plus the new turn)
new_outputs = model.generate(
    new_input_ids,
    past_key_values=past_key_values,
    max_new_tokens=50
)
```
### Memory-Efficient Inference

```python
from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    "large-model",  # placeholder model id
    device_map="auto",
    torch_dtype=torch.float16
)

# Gradient checkpointing saves memory during training (at some speed cost)
model.gradient_checkpointing_enable()

# Inference under no_grad avoids storing activations
with torch.no_grad():
    outputs = model.generate(input_ids, max_new_tokens=100)
```
## Best Practices

### 1. Choose the Right Model Size

```python
# Start small, scale up only if needed
MODELS_BY_USE_CASE = {
    "quick_prototype": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "production_chat": "microsoft/Phi-3-mini-4k-instruct",
    "code_generation": "codellama/CodeLlama-7b-hf",
    "embeddings": "sentence-transformers/all-MiniLM-L6-v2"
}
```
### 2. Always Set Device and Dtype

```python
import torch
from transformers import AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None
)
```
### 3. Handle Tokenizer Edge Cases

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Set pad token for batching
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# For left-padding in generation
tokenizer.padding_side = "left"
```
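To see why left padding matters, here is a sketch of batched generation: with right padding, pad tokens would sit between each prompt and its continuation, while left padding keeps every prompt flush against the generation boundary (the model choice is illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # illustrative; any causal LM works
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id)

prompts = ["The capital of France is", "2 + 2 equals"]
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=10,
                             pad_token_id=tokenizer.eos_token_id)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```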
### 4. Error Handling for Generation

```python
import torch

def safe_generate(model, tokenizer, prompt, **kwargs):
    # Use setdefault so caller-supplied kwargs don't collide with defaults
    kwargs.setdefault("max_new_tokens", 256)
    kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, **kwargs)
        response = tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
        return response.strip()
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        raise RuntimeError("GPU OOM - try a smaller batch or a quantized model")
```
### 5. Environment Configuration

```python
import os

# Set these before importing transformers

# Cache directory
os.environ["HF_HOME"] = "/path/to/cache"
os.environ["TRANSFORMERS_CACHE"] = "/path/to/models"

# Offline mode
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Disable telemetry
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# Token for gated models
os.environ["HF_TOKEN"] = "your_token_here"
```
## Common Patterns

### Embedding Service

```python
from sentence_transformers import SentenceTransformer
from functools import lru_cache
import numpy as np

class EmbeddingService:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed(self, texts: list[str], normalize: bool = True) -> np.ndarray:
        return self.model.encode(
            texts,
            normalize_embeddings=normalize,
            show_progress_bar=len(texts) > 100
        )

    @lru_cache(maxsize=10000)
    def embed_cached(self, text: str) -> tuple:
        return tuple(self.embed([text])[0])
```
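Illustrative usage (the output shape assumes the default all-MiniLM-L6-v2 model):

```python
service = EmbeddingService()
vectors = service.embed(["first text", "second text"])
print(vectors.shape)  # (2, 384)
```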
### LLM Wrapper

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class LocalLLM:
    def __init__(self, model_id: str, quantize: bool = True):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        load_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
        if quantize:
            from transformers import BitsAndBytesConfig
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate(self, prompt: str, max_tokens: int = 256, **kwargs) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                pad_token_id=self.tokenizer.eos_token_id,
                **kwargs
            )
        return self.tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:],
            skip_special_tokens=True
        )
```
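Illustrative usage (the model id is an example; quantized loading requires bitsandbytes and a CUDA GPU):

```python
llm = LocalLLM("microsoft/Phi-3-mini-4k-instruct", quantize=True)
print(llm.generate("Explain attention in one sentence.", max_tokens=64))
```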
## Troubleshooting

| Issue | Solution |
|---|---|
| CUDA OOM | Use 4-bit quantization or a smaller model |
| Slow generation | Enable `use_cache=True`, use Flash Attention |
| Truncated output | Increase `max_new_tokens` |
| Repetitive text | Set `repetition_penalty=1.1` |
| Model not found | Check `HF_TOKEN` for gated models |
| Wrong device | Explicitly set `device_map="auto"` |
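As a sketch of the first row in practice, one pattern is to catch the OOM at load time and retry with a 4-bit config (a hedged sketch; assumes bitsandbytes is installed, and whether the exception is catchable here depends on where the allocation fails):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_with_fallback(model_id: str):
    """Try a half-precision load first; fall back to 4-bit on CUDA OOM."""
    try:
        return AutoModelForCausalLM.from_pretrained(
            model_id, device_map="auto", torch_dtype=torch.float16
        )
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        return AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_4bit=True)
        )
```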