How to build a CNN with PyTorch

Convolutional neural networks (CNNs) have transformed the field of computer vision, enabling machines to interpret visual data with remarkable accuracy. PyTorch is a popular deep learning framework that provides a powerful platform for building and training models. In this Answer, we will build a simple CNN with PyTorch and train it on the MNIST dataset for handwritten digit recognition.

Getting started with PyTorch

First, we'll install PyTorch in our Python environment using the following command:

pip3 install torch torchvision
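To confirm the installation succeeded, we can print the installed version from the command line (the exact version printed will vary by environment):

python3 -c "import torch; print(torch.__version__)"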

Once installed, we can follow these steps to build a CNN model:

  1. Import the necessary modules:

import torch # Import the PyTorch library
import torch.nn as nn # Import the neural network module
import torch.optim as optim # Import the optimization module
import torchvision # Import the torchvision library containing datasets, model architectures, etc.
import torchvision.transforms as transforms # Import the module for image transformations
  2. Define a simple CNN architecture using PyTorch’s nn.Module:

# Define a custom class
class SimpleCNN(nn.Module):
    def __init__(self):
        # Call the parent class constructor
        super(SimpleCNN, self).__init__()
        # Define the first convolutional layer with its parameters
        self.conv_layer1 = nn.Conv2d(in_channels=1, out_channels=16, padding=1, kernel_size=3, stride=1)
        # Define the second convolutional layer with its parameters
        self.conv_layer2 = nn.Conv2d(in_channels=16, out_channels=32, padding=1, kernel_size=3, stride=1)
        # Define the first fully connected layer
        self.fc_layer1 = nn.Linear(32 * 7 * 7, 128)
        # Define the second fully connected layer
        self.fc_layer2 = nn.Linear(128, 10)

    def forward(self, x):
        # Apply ReLU to the output of the first conv layer
        x = torch.relu(self.conv_layer1(x))
        # Perform max pooling operation
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        # Apply ReLU to the output of the second conv layer
        x = torch.relu(self.conv_layer2(x))
        # Perform max pooling again
        x = torch.max_pool2d(x, kernel_size=2, stride=2)
        # Flatten the output for the fully connected layers
        x = x.view(-1, 32 * 7 * 7)
        # Apply ReLU to the output of the first fully connected layer
        x = torch.relu(self.fc_layer1(x))
        # Apply the second fully connected layer
        x = self.fc_layer2(x)
        return x

The model architecture diagram for the code above is shown below:

Visual representation of the model architecture
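To see why fc_layer1 expects 32 * 7 * 7 input features, we can trace the spatial dimensions with a dummy MNIST-sized input. This is an optional sanity check, not part of the tutorial steps:

# Optional sanity check: each conv layer preserves the 28x28 size (padding=1, kernel=3, stride=1),
# and each 2x2 max pool halves it: 28 -> 14 -> 7
dummy = torch.randn(1, 1, 28, 28)  # A batch of one 1-channel 28x28 image
model = SimpleCNN()
out = torch.max_pool2d(torch.relu(model.conv_layer1(dummy)), kernel_size=2, stride=2)
print(out.shape)  # torch.Size([1, 16, 14, 14])
out = torch.max_pool2d(torch.relu(model.conv_layer2(out)), kernel_size=2, stride=2)
print(out.shape)  # torch.Size([1, 32, 7, 7]) -> 32 * 7 * 7 features after flattening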
  3. Load and preprocess the MNIST dataset using PyTorch’s built-in methods:

# Transformations to be applied to the data
transform_data = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
# Load the MNIST data and apply the transformations defined above
trainset_data = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform_data)
# DataLoader for the training dataset
train_loader = torch.utils.data.DataLoader(trainset_data, batch_size=64, shuffle=True)
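As a quick check, we can pull one mini-batch from the loader to confirm the tensor shapes and the normalized value range. Note that Normalize((0.5,), (0.5,)) maps pixel values from [0, 1] to roughly [-1, 1]:

# Optional: inspect one mini-batch from the DataLoader
images, labels = next(iter(train_loader))
print(images.shape)  # torch.Size([64, 1, 28, 28])
print(labels.shape)  # torch.Size([64])
print(images.min().item(), images.max().item())  # Roughly -1.0 to 1.0 after normalization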
  4. Initialize the CNN network, the cross-entropy loss function, and the Adam optimizer:

# Create an instance of the model
net = SimpleCNN()
# Define the loss function
criterion = nn.CrossEntropyLoss()
# Define the optimizer used for updating the parameters; Adam is used here
optimizer = optim.Adam(net.parameters(), lr=0.001)
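If a GPU is available, we can optionally move the model to it before training. This is a minimal sketch, not part of the original steps; the tensors in each batch would then need the same treatment:

# Optional: use a GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net.to(device)
# Inside the training loop, each batch would also need:
# inputs, labels = inputs.to(device), labels.to(device)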
  5. Train the CNN network:

num_epochs = 5
# Iterate over each epoch
for epoch in range(num_epochs):
    running_loss = 0.0
    # Iterate over batches
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad() # Reset gradients from the previous step
        outputs = net(inputs) # Forward pass
        loss = criterion(outputs, labels) # Calculate the loss
        loss.backward() # Backward pass
        optimizer.step() # Update the parameters (weights)
        running_loss += loss.item()
        if i % 100 == 99: # Print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
print('Training completed successfully.')
  6. Evaluate the CNN network to observe the model’s performance:

# Evaluate the model
correct = 0 # Initialize counters
total = 0
# Disable gradient computation for efficiency
with torch.no_grad():
    # Iterate over the batches
    for images, labels in train_loader:
        # Forward pass
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        # Count the number of correct predictions
        correct += (predicted == labels).sum().item()
# Calculate the accuracy
accuracy = 100 * correct / total
print('Accuracy of the model on the train dataset: {:.2f}%'.format(accuracy))
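Note that the loop above measures accuracy on the training data, which overstates how well the model generalizes. A minimal sketch for evaluating on the held-out MNIST test split instead, reusing the same transforms, could look like this:

# Sketch: evaluate on the MNIST test split instead of the training data
testset_data = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform_data)
test_loader = torch.utils.data.DataLoader(testset_data, batch_size=64, shuffle=False)

correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy on the test dataset: {:.2f}%'.format(100 * correct / total))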

During training, we iterate over mini-batches of data, compute the loss, perform backpropagation, and update the model parameters using the optimizer. The complete program is shown below:

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Define a CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv_layer1 = nn.Conv2d(in_channels=1, out_channels=16, padding=1, kernel_size=3, stride=1)
        self.conv_layer2 = nn.Conv2d(in_channels=16, out_channels=32, padding=1, kernel_size=3, stride=1)
        self.fc_layer1 = nn.Linear(32 * 7 * 7, 128)
        self.fc_layer2 = nn.Linear(128, 10)

    def forward(self, x):
        x = torch.relu(self.conv_layer1(x))
        x = torch.max_pool2d(x, stride=2, kernel_size=2)
        x = torch.relu(self.conv_layer2(x))
        x = torch.max_pool2d(x, stride=2, kernel_size=2)
        x = x.view(-1, 32 * 7 * 7)
        x = torch.relu(self.fc_layer1(x))
        x = self.fc_layer2(x)
        return x


# Load the MNIST dataset
transform_data = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
trainset_data = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform_data)
train_loader = torch.utils.data.DataLoader(trainset_data, batch_size=64, shuffle=True)

# Initialize the network
net = SimpleCNN()

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Train the network
num_epochs = 5
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

print('Training completed successfully.')


# Evaluate the model
correct = 0
total = 0
with torch.no_grad():
    for images, labels in train_loader:
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print('Accuracy of the model on the train dataset: {:.2f}%'.format(accuracy))


Complete code for building and training a CNN model
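Once training finishes, we may want to persist the learned weights for later use. A minimal sketch using PyTorch’s state_dict mechanism follows; the file name simple_cnn.pth is arbitrary:

# Save the trained weights (file name is arbitrary)
torch.save(net.state_dict(), 'simple_cnn.pth')

# Later: recreate the architecture and load the weights back
restored = SimpleCNN()
restored.load_state_dict(torch.load('simple_cnn.pth'))
restored.eval()  # Switch to evaluation mode for inference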

Conclusion

We covered the fundamentals of building a CNN with PyTorch and training it on the MNIST dataset. By understanding the architecture of CNNs and leveraging PyTorch’s powerful tools, we can create sophisticated deep learning models for various computer vision tasks.
