On this page
article
PyTorch Training & Datasets
Train PyTorch models with custom datasets, DataLoader, training loops, GPU acceleration, and model checkpointing.
PyTorch’s explicit training loops give you full control over the training process. This chapter covers datasets, data loading, and production training patterns.
Custom Dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
class ImageFolderDataset(Dataset):
    """Image classification dataset read from a one-folder-per-class tree.

    Every immediate subdirectory of ``root_dir`` is one class. The
    subdirectories are sorted by name and numbered from 0; that number is
    the integer label of every image file found directly inside.

    Args:
        root_dir: Directory whose subdirectories each hold one class.
        transform: Optional callable applied to the RGB ``PIL.Image``
            before it is returned from ``__getitem__``.

    Attributes:
        samples: List of ``(file_path, label)`` tuples, ordered by class
            and then by file name.
    """

    # Matched case-insensitively, so "photo.JPG" is accepted as well.
    IMAGE_EXTENSIONS = (".jpg", ".png")

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        # Only directories define classes: a stray file in root_dir must
        # neither consume a label nor be passed to os.listdir().
        class_dirs = sorted(
            entry
            for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )

        # The class subdirectory has to be part of the stored path,
        # otherwise Image.open() would look for the file directly under
        # root_dir. File names are sorted so the sample order does not
        # depend on the order os.listdir() happens to return.
        self.samples = [
            (os.path.join(root_dir, subdir, fname), label)
            for label, subdir in enumerate(class_dirs)
            for fname in sorted(os.listdir(os.path.join(root_dir, subdir)))
            if fname.lower().endswith(self.IMAGE_EXTENSIONS)
        ]

    def __len__(self):
        """Return the number of image files that were indexed."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is converted to RGB and, when a transform was given,
        passed through it.
        """
        path, label = self.samples[idx]
        # convert() returns a new image, so the file handle opened by
        # Image.open() can be released as soon as the block exits.
        with Image.open(path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image, label
# Per-channel mean and standard deviation handed to Normalize
# (one value per RGB channel).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Applied in order: resize to 224x224, convert to a tensor, normalise.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]
transform = transforms.Compose(preprocessing_steps)

# Training images are read from data/train, one subdirectory per class.
dataset = ImageFolderDataset(root_dir="data/train", transform=transform)

# Shuffled batches of 32, loaded by four worker processes.
loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
Training Loop
import torch.nn as nn
import torch.optim as optim
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Build the network, then move its parameters to the chosen device.
model = Net()
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Multiply the learning rate by 0.1 after every 7 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=7,
    gamma=0.1,
)
def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one full pass over ``loader``.

    Every batch is moved to ``device``, run through the model, scored with
    ``criterion``, back-propagated, and followed by one ``optimizer`` step.

    Args:
        model: Network to train; switched to training mode first.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose gradients are reset before each batch.
        device: Device the batches are moved to.

    Returns:
        ``(loss, accuracy)``: the size-weighted sum of batch losses and the
        number of correct predictions, each divided by the number of labels
        seen.
    """
    model.train()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        batch_loss = criterion(logits, batch_y)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that unevenly sized
        # batches contribute proportionally (assumes the criterion returns
        # a per-batch mean).
        loss_sum += batch_loss.item() * batch_x.shape[0]

        # Predicted class = index of the largest score along dim 1.
        predictions = logits.max(dim=1).indices
        num_seen += batch_y.shape[0]
        num_correct += (predictions == batch_y).sum().item()

    return loss_sum / num_seen, num_correct / num_seen
def evaluate(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` without training it.

    The model is put into evaluation mode and all forward passes run with
    gradient tracking disabled.

    Args:
        model: Network to evaluate; left in evaluation mode afterwards.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(loss, accuracy)``: the size-weighted sum of batch losses and the
        number of correct predictions, each divided by the number of labels
        seen.
    """
    model.eval()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            # Weight by batch size so a smaller final batch does not skew
            # the average (assumes the criterion returns a per-batch mean).
            loss_sum += batch_loss.item() * batch_x.shape[0]

            # Predicted class = index of the largest score along dim 1.
            predictions = logits.max(dim=1).indices
            num_seen += batch_y.shape[0]
            num_correct += (predictions == batch_y).sum().item()

    return loss_sum / num_seen, num_correct / num_seen
Full Training Script
# Best validation accuracy seen so far; a checkpoint is written only when
# an epoch beats it.
best_acc = 0.0
num_epochs = 20

# NOTE: train_loader and val_loader are not defined in the snippets shown
# here; they are expected to be DataLoaders over the training and
# validation datasets.
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    # Advance the learning-rate schedule once per epoch, after the
    # optimizer steps taken inside train_epoch.
    scheduler.step()

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f" Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
    print(f" Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                # Without the scheduler state a resumed run would restart
                # the learning-rate schedule from its initial value.
                "scheduler_state_dict": scheduler.state_dict(),
                "best_acc": best_acc,
            },
            "best_model.pth",
        )
        print(f" Saved best model (acc: {best_acc:.4f})")
Loading Checkpoints
# map_location remaps the stored tensors onto `device`, so a checkpoint
# written on a GPU machine can also be restored on a CPU-only host.
checkpoint = torch.load("best_model.pth", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# Restore the learning-rate schedule when the checkpoint carries it;
# checkpoints without that key are still accepted.
if "scheduler_state_dict" in checkpoint:
    scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

# The stored epoch is the one that produced the checkpoint, so training
# resumes with the following epoch.
start_epoch = checkpoint["epoch"] + 1
Mixed Precision Training
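Automatic mixed precision speeds up training on modern GPUs: autocast runs eligible operations in lower precision, and GradScaler scales the loss so that small gradients do not underflow: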
from torch.cuda.amp import autocast, GradScaler
# A single scaler instance is reused for every iteration of the loop.
scaler = GradScaler()

for inputs, labels in loader:
    inputs = inputs.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()

    # Only the forward pass and the loss computation run under autocast.
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, labels)

    # Back-propagate the scaled loss, let the scaler drive the optimizer
    # step, then update the scaler for the next iteration.
    scaled_loss = scaler.scale(loss)
    scaled_loss.backward()
    scaler.step(optimizer)
    scaler.update()
Using Pretrained Models
from torchvision import models
# Start from ResNet-18 with its default pretrained weights.
pretrained_weights = models.ResNet18_Weights.DEFAULT
model = models.resnet18(weights=pretrained_weights)

# Freeze backbone: switch off gradients for every pretrained parameter.
for param in model.parameters():
    param.requires_grad = False

# Swap in a new classification layer sized for num_classes and make sure
# its parameters are the ones that receive gradients.
model.fc = nn.Linear(model.fc.in_features, num_classes)
for param in model.fc.parameters():
    param.requires_grad = True
PyTorch’s explicit training loops teach you exactly what happens during deep learning — knowledge that transfers to any framework.