Deep Learning Primer

Essential deep learning concepts for ML systems.

Neural Network Basics

import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.layers(x)
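
A quick shape check, with arbitrary placeholder dimensions:

model = MLP(input_dim=128, hidden_dim=256, output_dim=10)
logits = model(torch.randn(32, 128))  # batch of 32 samples -> output shape (32, 10)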

Activation Functions

Function    Use Case                 Output Range
ReLU        Hidden layers            [0, ∞)
LeakyReLU   Prevent dead neurons     (-∞, ∞)
GELU        Transformers             (-∞, ∞)
Sigmoid     Binary classification    (0, 1)
Softmax     Multi-class              (0, 1)
Tanh        Centered output          (-1, 1)
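
These exist both as modules (nn.ReLU, nn.GELU, ...) for use inside nn.Sequential and as functional calls. A small illustrative sketch (shapes are arbitrary):

x = torch.randn(4, 8)                                   # toy activations
hidden = torch.relu(x)                                  # hidden-layer nonlinearity
binary_prob = torch.sigmoid(torch.randn(4, 1))          # squashes to (0, 1)
class_probs = torch.softmax(torch.randn(4, 3), dim=-1)  # each row sums to 1

In practice the last two are usually folded into the loss (CrossEntropyLoss, BCEWithLogitsLoss below), so the model itself outputs raw logits.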

Loss Functions

# Classification
cross_entropy = nn.CrossEntropyLoss()   # expects raw logits and integer class targets
binary_ce = nn.BCEWithLogitsLoss()      # expects raw logits and float 0/1 targets

def focal_loss(probs, targets, gamma=2.0):
    # Binary focal loss: down-weights easy examples so training focuses on hard ones
    pt = torch.where(targets == 1, probs, 1 - probs)
    return -((1 - pt) ** gamma * torch.log(pt.clamp(min=1e-8))).mean()

# Regression
mse = nn.MSELoss()
mae = nn.L1Loss()
huber = nn.HuberLoss()

# Custom loss
import torch.nn.functional as F

def contrastive_loss(embeddings, labels, margin=1.0):
    # Pull same-class pairs together, push different-class pairs at least `margin` apart
    distances = torch.cdist(embeddings, embeddings)
    same_class = labels.unsqueeze(0) == labels.unsqueeze(1)
    loss = same_class * distances + (~same_class) * F.relu(margin - distances)
    return loss.mean()
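
For reference, CrossEntropyLoss takes logits of shape (batch, num_classes) and integer class indices; the shapes below are arbitrary:

logits = torch.randn(8, 5)              # (batch, num_classes), unnormalized scores
targets = torch.randint(0, 5, (8,))     # integer class labels in [0, 5)
loss = cross_entropy(logits, targets)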

Optimization

# Optimizers
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.01
)

# Learning rate scheduling
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=num_epochs
)

# Training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        loss = criterion(model(batch.x), batch.y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap gradient norm at 1.0
        optimizer.step()
    scheduler.step()  # decay the learning rate once per epoch

Regularization Techniques

# Dropout
nn.Dropout(p=0.5)

# Weight decay (decoupled L2 regularization via AdamW)
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.01)

# Batch normalization
nn.BatchNorm1d(num_features)

# Layer normalization (for transformers)
nn.LayerNorm(hidden_dim)

# Early stopping
class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_score = None

    def __call__(self, val_loss):
        if self.best_score is None or val_loss < self.best_score:
            self.best_score = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience
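
A minimal sketch of wiring this into validation; val_loader is assumed to exist, while model, criterion, and num_epochs carry over from the training loop above:

stopper = EarlyStopping(patience=5)
for epoch in range(num_epochs):
    # ... one training pass over dataloader, as in the Optimization section ...
    model.eval()
    with torch.no_grad():
        val_loss = sum(criterion(model(b.x), b.y).item() for b in val_loader) / len(val_loader)
    model.train()
    if stopper(val_loss):
        break  # validation loss has not improved for `patience` epochs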

Initialization

def init_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=0.02)

model.apply(init_weights)

Best Practices

  1. Start with Adam or AdamW, lr=1e-3, and tune from there
  2. Use learning rate warmup before the main decay schedule
  3. Apply gradient clipping (e.g. max norm 1.0) to stabilize training
  4. Monitor the training/validation gap to catch overfitting early
  5. Use mixed precision training for speed and memory savings (see the sketch below)
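
A rough sketch tying points 2, 3, and 5 to the earlier training loop. It assumes a CUDA device and reuses model, criterion, optimizer, dataloader, and num_epochs from above; warmup_steps is an arbitrary placeholder, and this per-step schedule replaces the per-epoch cosine scheduler from the Optimization section.

steps_per_epoch = len(dataloader)
warmup_steps = 500  # placeholder: tune for your dataset
scheduler = torch.optim.lr_scheduler.SequentialLR(
    optimizer,
    schedulers=[
        torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps),
        torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs * steps_per_epoch - warmup_steps),
    ],
    milestones=[warmup_steps],
)

scaler = torch.cuda.amp.GradScaler()
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():        # forward pass and loss in reduced precision
            loss = criterion(model(batch.x), batch.y)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)             # unscale so clipping sees true gradient norms
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()                       # per-step: linear warmup, then cosine decay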