PyTorch Cheat Sheet — Tensors, Models & Training — PyTorch R
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam Define differentiable computation graphs, train with autograd, scale to GPU.
Like building with LEGO blocks: tensors are blocks, nn.Module is a blueprint, optim.Optimizer applies the hammer. GPU just speeds up the hammering.
Common Patterns
import torch
# Create tensors
x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])  # from data, shape (2, 2)
y = torch.zeros(3, 4)  # zeros
z = torch.randn(2, 5)  # random normal
# Reshape and device
x_reshaped = x.reshape(4, 1)
x_gpu = x.to('cuda') if torch.cuda.is_available() else x.cpu()
# Operations
result = torch.matmul(x, x.T)  # matrix multiply
# Broadcasting needs compatible trailing dimensions: (2, 2) + (2,) works,
# whereas (2, 2) + (3, 4) raises a size-mismatch RuntimeError.
row = torch.tensor([10.0, 20.0])
result = x + row  # row is broadcast across both rows of x
result = x.sum(dim=1)  # reduce over columns -> shape (2,)
print(result.shape, result.dtype) torch.Size([2]) torch.float32 import torch
# Enable gradient tracking
x = torch.tensor([2.0, 3.0], requires_grad=True)
y = x ** 2
z = y.sum()
# Backprop. By default the graph's saved tensors are freed after one
# backward pass; retain_graph=True keeps them so the same graph can be
# backpropagated again below.
z.backward(retain_graph=True)
print(x.grad)  # [4.0, 6.0] — dz/dx = 2x
# Gradient accumulation (default behavior)
z.backward(retain_graph=True)  # grad is ADDED to existing x.grad
print(x.grad)  # [8.0, 12.0] — accumulated!
# Clear gradients before next backward
x.grad.zero_()
z.backward()  # last use of this graph, so it may be freed now
print(x.grad)  # [4.0, 6.0] — fresh
# Disable grad tracking (inference)
with torch.no_grad():
pred = model(x) # no gradient computation tensor([4., 6.]) import torch
import torch.nn as nn
class SimpleNN(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width of the hidden layer.
        output_size: Number of output values (raw scores, no activation).
        dropout_p: Probability of zeroing a hidden activation during training.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_p=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return the output of the second linear layer for input ``x``."""
        x = self.fc1(x)
        x = torch.relu(x)  # activation
        x = self.dropout(x)  # regularization
        x = self.fc2(x)
        return x
# Instantiate
model = SimpleNN(input_size=10, hidden_size=64, output_size=2)
print(sum(p.numel() for p in model.parameters())) # total params
# Move to device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device) 834 import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
# Dummy data
X_train = torch.randn(100, 10)
y_train = torch.randint(0, 2, (100,))
dataset = TensorDataset(X_train, y_train)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
# Model, loss, optimizer
model = nn.Sequential(
nn.Linear(10, 64),
nn.ReLU(),
nn.Linear(64, 2)
)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Training
num_epochs = 3
for epoch in range(num_epochs):
total_loss = 0
for X_batch, y_batch in loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
# Forward
logits = model(X_batch)
loss = criterion(logits, y_batch)
# Backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(loader):.4f}") Epoch 1/3, Loss: 0.6891
Epoch 2/3, Loss: 0.6821
Epoch 3/3, Loss: 0.6752 import torch
# Check GPU
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
print(f"Current GPU: {torch.cuda.current_device()}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")
# Move tensors and models to GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
x = torch.randn(1000, 1000).to(device)
model = model.to(device)
# Automatic Mixed Precision (AMP) for speed + memory
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
for epoch in range(num_epochs):
for X_batch, y_batch in loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
with autocast():
logits = model(X_batch)
loss = criterion(logits, y_batch)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Synchronize (for timing)
torch.cuda.synchronize() GPU available: True
GPU count: 1
Current GPU: 0 import torch
import torch.nn as nn
from pathlib import Path
model = nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Save model state (recommended)
torch.save(model.state_dict(), 'model_weights.pth')
model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
# Save entire model (includes architecture)
torch.save(model, 'model_full.pth')
# Loading a pickled nn.Module runs arbitrary pickle code, so it has to be
# opted into explicitly: weights_only defaults to True from PyTorch 2.6 and
# would reject this file. Only do this with files you trust.
model = torch.load('model_full.pth', weights_only=False)
# The reloaded model owns new parameter tensors; rebuild the optimizer so it
# updates them instead of the parameters of the discarded object.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Save checkpoint (training state)
checkpoint = {
    'epoch': 5,
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict(),
    'loss': 0.123,
}
torch.save(checkpoint, 'checkpoint.pth')
# Resume training (the checkpoint holds only tensors and plain Python values,
# so the restricted weights_only loader is sufficient)
checkpoint = torch.load('checkpoint.pth', weights_only=True)
model.load_state_dict(checkpoint['model_state'])
optimizer.load_state_dict(checkpoint['optimizer_state'])
epoch = checkpoint['epoch'] ✓ Model saved and loaded Key Parameters & Defaults
PyTorch Core Functions
| Function / Class | Key Parameters | Default / Notes |
|---|---|---|
| nn.Linear(in, out) | in_features, out_features, bias | bias=True. Applies y=xW^T+b. Always check input shape. |
| DataLoader(dataset, ...) | batch_size, shuffle, num_workers, pin_memory | batch_size=1, shuffle=False. Use pin_memory=True on GPU. num_workers>0 for speed. |
| optim.Adam(params, lr) | lr, betas=(0.9,0.999), weight_decay, eps | lr=0.001. Good default for most tasks. weight_decay=0 (no L2 by default). |
| nn.CrossEntropyLoss() | weight, ignore_index, reduction | reduction='mean'. Expects logits (not softmax). Built-in log_softmax. |
| model.train() / eval() | no params | train(): dropout/batchnorm active. eval(): deterministic mode. |
| torch.nn.functional.relu(x) | inplace | inplace=False. inplace=True saves memory but breaks gradients if misused. |
Common Errors & Fixes
RuntimeError: Expected all tensors to be on the same device Cause: Tensors on different devices (CPU vs CUDA) in same operation. Common: model on GPU, data on CPU.
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
X_batch = X_batch.to(device)
y_batch = y_batch.to(device)
# Or use a utility function
def to_device(batch, device):
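return [x.to(device) if isinstance(x, torch.Tensor) else x for x in batch] RuntimeError: leaf variable has been moved into the graph interior Cause: A leaf tensor created with requires_grad=True was modified in place (e.g. x[0] = value) while autograd was tracking it, so it is no longer a leaf when backward() runs. Calling backward() on a non-scalar output is a separate mistake, shown at the end of the example.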
# Wrong: modifying leaf after computation
x = torch.tensor([1.0], requires_grad=True)
x.data[0] = 2.0 # detach before modifying
# Right: use .detach() or .data
x_detached = x.detach()
x_detached[0] = 2.0
# Or call backward on final scalar loss, not intermediate
loss = criterion(output, target) # scalar
loss.backward() # correct
# Don't do: output.backward() if output is not scalar RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation Cause: Using inplace operations (+=, x.relu_(), x[i] = y) on tensors that require gradients and whose values autograd still needs for the backward pass.
import torch
# Wrong: inplace operations
x = torch.randn(5, requires_grad=True)
x += 1 # inplace, breaks gradients
x.relu_() # inplace relu
# Right: non-inplace
x = x + 1 # creates new tensor
x = torch.relu(x) # functional, not inplace
# Escape hatch: writes through .data are not tracked by autograd, so use with care
x.data += 1 # detaches from graph CUDA out of memory (OOM) Cause: Model or batch size too large for GPU memory. Accumulating gradients without clearing.
import torch
# Reduce batch size
batch_size = 8 # was 64
# Clear unused cache
torch.cuda.empty_cache()
# Use gradient accumulation instead of large batch
accum_steps = 4
for i, (X_batch, y_batch) in enumerate(loader):
logits = model(X_batch.to(device))
loss = criterion(logits, y_batch.to(device)) / accum_steps
loss.backward()
if (i + 1) % accum_steps == 0:
optimizer.step()
optimizer.zero_grad()
# Use mixed precision (AMP)
from torch.cuda.amp import autocast
with autocast():
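logits = model(X_batch) # FP16 compute ValueError: Expected input batch_size (32) to match target batch_size (16) Cause: The model output and the target disagree on the batch dimension, usually because of a reshape/view with a hard-coded batch size; this typically surfaces on the last batch, which is smaller than batch_size. DataLoader drop_last=False by default.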
from torch.utils.data import DataLoader
# Option 1: drop last batch if incomplete
loader = DataLoader(
dataset,
batch_size=16,
drop_last=True, # drop incomplete batches
shuffle=True
)
# Option 2: handle variable batch sizes in code
for X_batch, y_batch in loader:
# Code handles any batch_size
logits = model(X_batch) # works for 16 or 5 Complete minimal end-to-end training example
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
# 1. Data
X = torch.randn(200, 20)
y = (X[:, 0] + X[:, 1] > 0).long() # synthetic classification
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
# 2. Model
class Classifier(nn.Module):
    """Small MLP classifier: Linear -> ReLU -> Dropout -> Linear.

    The defaults reproduce the fixed 20 -> 64 -> 2 network with dropout 0.2.

    Args:
        in_features: Number of features in each input sample.
        hidden_features: Width of the hidden layer.
        num_classes: Number of output logits.
        dropout_p: Probability of zeroing a hidden activation during training.
    """

    def __init__(self, in_features=20, hidden_features=64, num_classes=2,
                 dropout_p=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Dropout(dropout_p),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return raw class logits for the batch ``x``."""
        return self.net(x)
model = Classifier()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# 3. Training
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
for epoch in range(10):
model.train()
epoch_loss = 0
for X_batch, y_batch in loader:
X_batch = X_batch.to(device)
y_batch = y_batch.to(device)
# Forward
logits = model(X_batch)
loss = criterion(logits, y_batch)
# Backward
optimizer.zero_grad()
loss.backward()
optimizer.step()
epoch_loss += loss.item()
print(f"Epoch {epoch+1:2d}/10 - Loss: {epoch_loss/len(loader):.4f}")
# 4. Inference
model.eval()
with torch.no_grad():
X_test = torch.randn(10, 20).to(device)
preds = model(X_test)
probs = torch.softmax(preds, dim=1)
print(f"\nTest predictions: {probs[:3]}") Production Gotchas
model.train() and model.eval() switch behavior. In eval(), dropout is disabled and batch norm uses running statistics. Forgetting to call model.eval() before inference gives different results each run. Batch norm stats (running_mean, running_var) must be computed on training data, not evaluation data.
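optimizer.zero_grad() must run exactly once per optimizer step, at any point after the previous step() and before the next backward(). Both common orders are correct: zero_grad → forward → loss → backward → step, or forward → loss → zero_grad → backward → step. What breaks training silently is calling zero_grad() between backward() and step() (the freshly computed gradients are discarded, so step() changes nothing) or never calling it (gradients keep accumulating across iterations).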
Buffers (like batch norm running_mean) aren't in model.parameters(). They're in model.buffers(). If you save only the tensors returned by model.parameters(), buffers are lost. Save model.state_dict() instead: it contains both the parameters and the persistent buffers.
When you load pretrained weights with load_state_dict(), requires_grad is NOT automatically set. If you load a model and freeze layers, you must explicitly set param.requires_grad = False after loading, or those params will still train.
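Multi-process data loading (num_workers > 0) on platforms that start worker processes with spawn (Windows, and macOS since Python 3.8) requires the code to be in an if __name__ == '__main__': block; otherwise every worker re-imports and re-runs the script, which raises an error or hangs. On Linux, where workers are forked, it usually works without the guard. Always guard DataLoader with main guard for portability.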
If you torch.save() the entire model object (not just state_dict), it pickles the class. If the class definition changes or is moved, loading fails. Always use state_dict() + architecture reconstruction for robustness. Only use torch.save(model) for quick prototyping.