pytorch

SKILL.md

Using PyTorch

PyTorch is a deep learning framework with dynamic computation graphs, strong GPU acceleration, and Pythonic design. This skill covers practical patterns for building production-quality neural networks.

Table of Contents

Core Concepts

Tensors

import torch

# Create tensors
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
x = torch.zeros(3, 4)
x = torch.randn(3, 4)  # Normal distribution

# Device management
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)

# Operations (all return new tensors)
y = x + 1
y = x @ x.T  # Matrix multiplication
y = x.view(2, 6)  # Reshape

Autograd

# Enable gradient tracking
x = torch.randn(3, requires_grad=True)
y = x ** 2
loss = y.sum()

# Compute gradients
loss.backward()
print(x.grad)  # d(loss)/dx = 2*x

# Disable gradients for inference
with torch.no_grad():
    pred = model(x)

# Or use inference mode (more efficient)
with torch.inference_mode():
    pred = model(x)

Model Architecture

nn.Module Pattern

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

Common Layers

# Convolution
nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

# Normalization
nn.BatchNorm2d(num_features)
nn.LayerNorm(normalized_shape)

# Attention
nn.MultiheadAttention(embed_dim, num_heads)

# Recurrent
nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
nn.GRU(input_size, hidden_size, num_layers, batch_first=True)

Weight Initialization

def init_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=0.02)

model.apply(init_weights)

Training Loop

Standard Pattern

model = Model(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        # Optional: gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

    scheduler.step()

    # Validation
    model.eval()
    with torch.inference_mode():  # preferred over no_grad for pure inference
        for batch in val_loader:
            ...  # validation logic (forward pass, metrics)

Mixed Precision Training

# NOTE: torch.cuda.amp is deprecated since PyTorch 2.4; prefer
# torch.amp.autocast("cuda") and torch.amp.GradScaler("cuda")
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for batch in train_loader:
    inputs, targets = batch
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()

    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

Gradient Accumulation

# Requires setup from Mixed Precision Training above:
# scaler = GradScaler(), model, criterion, optimizer, device

accumulation_steps = 4

for i, batch in enumerate(train_loader):
    inputs, targets = batch
    inputs, targets = inputs.to(device), targets.to(device)

    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets) / accumulation_steps

    scaler.scale(loss).backward()

    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

# NOTE: if len(train_loader) is not divisible by accumulation_steps, the
# final partial window is never stepped — flush remaining gradients after
# the loop or drop the last incomplete batch group deliberately.

Data Loading

Dataset and DataLoader

from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        if self.transform:
            x = self.transform(x)
        return x, self.labels[idx]

train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,  # Faster GPU transfer
    drop_last=True,   # Consistent batch sizes
)

Collate Functions

def collate_fn(batch):
    """Custom batching for variable-length sequences."""
    inputs, targets = zip(*batch)
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    targets = torch.stack(targets)
    return inputs, targets

loader = DataLoader(dataset, collate_fn=collate_fn)

Performance Optimization

torch.compile (PyTorch 2.0+)

# Basic compilation
model = torch.compile(model)

# With options
model = torch.compile(
    model,
    mode="reduce-overhead",  # Options: default, reduce-overhead, max-autotune
    fullgraph=True,          # Enforce no graph breaks
)

# Compile specific functions
@torch.compile
def train_step(model, inputs, targets):
    outputs = model(inputs)
    return criterion(outputs, targets)

Compilation modes:

  • default: Good balance of compile time and speedup
  • reduce-overhead: Minimizes framework overhead, good for small models
  • max-autotune: Maximum performance, longer compile time

Memory Optimization

# Activation checkpointing (trade compute for memory)
from torch.utils.checkpoint import checkpoint

class Model(nn.Module):
    def forward(self, x):
        # Recompute activations during backward
        x = checkpoint(self.expensive_layer, x, use_reentrant=False)
        return self.output_layer(x)

# Clear cache
torch.cuda.empty_cache()

# Monitor memory
print(torch.cuda.memory_allocated() / 1e9, "GB")
print(torch.cuda.max_memory_allocated() / 1e9, "GB")

Distributed Training

DistributedDataParallel (DDP)

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

def train(rank, world_size):
    setup(rank, world_size)

    model = Model().to(rank)
    model = DDP(model, device_ids=[rank])

    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, sampler=sampler)

    for epoch in range(num_epochs):
        sampler.set_epoch(epoch)  # Important for shuffling
        # ... training loop

    cleanup()

# Launch with: torchrun --nproc_per_node=4 train.py

FullyShardedDataParallel (FSDP)

from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision

mp_policy = MixedPrecision(
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.bfloat16,
    buffer_dtype=torch.bfloat16,
)

model = FSDP(
    model,
    mixed_precision=mp_policy,
    use_orig_params=True,  # Required for torch.compile compatibility
)

Saving and Loading

Checkpoints

# Save
torch.save({
    "epoch": epoch,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "loss": loss,
}, "checkpoint.pt")

# Load (weights_only=True restricts unpickling to tensors/primitives and
# avoids arbitrary code execution; it is the default since PyTorch 2.6)
checkpoint = torch.load("checkpoint.pt", map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

Export for Deployment

# TorchScript (maintenance mode — consider torch.export for new deployments)
scripted = torch.jit.script(model)
scripted.save("model.pt")

# ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
)

Best Practices

  1. Always set model mode: Use model.train() and model.eval() appropriately
  2. Use inference_mode over no_grad: More efficient for inference
  3. Pin memory for GPU training: Set pin_memory=True in DataLoader
  4. Profile before optimizing: Use torch.profiler to find bottlenecks
  5. Prefer bfloat16 over float16: Better numerical stability on modern GPUs
  6. Use torch.compile: Significant speedups with minimal code changes
  7. Set deterministic mode for reproducibility:
    torch.manual_seed(42)
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    

References

See reference/ for detailed documentation:

  • training-patterns.md - Advanced training techniques
  • debugging.md - Debugging and profiling tools
Weekly Installs
8
GitHub Stars
17
First Seen
Feb 10, 2026
Installed on
github-copilot8
opencode7
codex7
gemini-cli7
antigravity6
kimi-cli6