MNIST is the "hello world" of machine learning. The MNIST dataset is a collection of images of the digits 0..9 together with their labels. The images are of size 28x28. Let's take a few samples from the dataset to get a feel for what it looks like.
You can see that each image of a digit is associated with the number it depicts. The dataset is a list of (image of number, number) pairs. As usual, we are going to feed the neural network with the image (the left element of the pair) and its label (the right element). We will train a simple feed-forward network; call it Model0.
 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
class Args:
    """Plain attribute container for the experiment's hyper-parameters."""
    pass


args = Args()
args.batch_size = 32
# Use the GPU only when one is actually present; a hard-coded True makes
# every later ``.cuda()`` call fail on CPU-only machines.
args.cuda = torch.cuda.is_available()
args.lr = 0.001
args.momentum = 0.01
args.epochs = 10
args.log_interval = 10
# Extra DataLoader options are only passed when batches will be sent to the GPU.
kwargs = {}
if args.cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}

# Both splits go through the same preprocessing: convert to a tensor, then
# normalise with the given (mean,), (std,) pair
# (presumably the MNIST pixel statistics -- TODO confirm).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Only the training split requests a download; the test split reads the
# files from the same '../data' root.
train_set = datasets.MNIST('../data', train=True, download=True,
                           transform=mnist_transform)
test_set = datasets.MNIST('../data', train=False,
                          transform=mnist_transform)

train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=args.batch_size, shuffle=True, **kwargs)
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from PIL import Image
import pprint
import numpy 
# Show a num_of_samples x num_of_samples grid of test images and collect the
# matching labels in an array of the same layout.
num_of_samples = 5
fig = plt.figure(1, (8., 8.))
grid = ImageGrid(fig, 111,
                 nrows_ncols=(num_of_samples, num_of_samples),
                 axes_pad=0.1)
output = numpy.zeros(num_of_samples ** 2)

# A single batch is enough; it must hold at least num_of_samples ** 2 images
# (args.batch_size is 32, the grid needs 25).
data, target = next(iter(test_loader))
for j in range(num_of_samples ** 2):
    # data[j][0] is the first channel of the j-th image in the batch.
    grid[j].matshow(Image.fromarray(data[j][0].numpy()))
    output[j] = target[j]

# Lay the labels out like the image grid so the two can be compared cell by cell.
output = output.reshape(num_of_samples, num_of_samples)
print(output)
plt.show()
[[ 6. 9. 9. 5. 4.] [ 3. 6. 5. 0. 1.] [ 8. 1. 3. 6. 2.] [ 9. 4. 8. 8. 6.] [ 0. 6. 4. 2. 3.]]
You can see that each image of a digit is associated with the number it depicts. The dataset is a list of (image of number, number) pairs. As usual, we are going to feed the neural network with the image (the left element of the pair) and its label (the right element). We will train a simple feed-forward network; call it Model0.
# Every model takes a batch of flattened 28x28 images, shape (batch, 784), and
# returns per-class log-probabilities of shape (batch, 10).  The layers are
# chained without any activation in between.  Attribute names are kept stable
# because they are the keys of each model's state_dict.


class Model0(nn.Module):
    """Single linear layer: 784 -> 10."""

    def __init__(self):
        super(Model0, self).__init__()
        self.output_layer = nn.Linear(28*28, 10)

    def forward(self, x):
        x = self.output_layer(x)
        # dim=1 is the class dimension of the (batch, 10) output.
        return F.log_softmax(x, dim=1)


class Model1(nn.Module):
    """Two linear layers: 784 -> 5 -> 10."""

    def __init__(self):
        super(Model1, self).__init__()
        self.input_layer = nn.Linear(28*28, 5)
        self.output_layer = nn.Linear(5, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model2(nn.Module):
    """Two linear layers: 784 -> 6 -> 10."""

    def __init__(self):
        super(Model2, self).__init__()
        self.input_layer = nn.Linear(28*28, 6)
        self.output_layer = nn.Linear(6, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model3(nn.Module):
    """Two linear layers: 784 -> 7 -> 10."""

    def __init__(self):
        super(Model3, self).__init__()
        self.input_layer = nn.Linear(28*28, 7)
        self.output_layer = nn.Linear(7, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model4(nn.Module):
    """Two linear layers: 784 -> 8 -> 10."""

    def __init__(self):
        super(Model4, self).__init__()
        self.input_layer = nn.Linear(28*28, 8)
        self.output_layer = nn.Linear(8, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model5(nn.Module):
    """Two linear layers: 784 -> 9 -> 10."""

    def __init__(self):
        super(Model5, self).__init__()
        self.input_layer = nn.Linear(28*28, 9)
        self.output_layer = nn.Linear(9, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model6(nn.Module):
    """Two linear layers: 784 -> 10 -> 10."""

    def __init__(self):
        super(Model6, self).__init__()
        self.input_layer = nn.Linear(28*28, 10)
        self.output_layer = nn.Linear(10, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model7(nn.Module):
    """Two linear layers: 784 -> 100 -> 10."""

    def __init__(self):
        super(Model7, self).__init__()
        self.input_layer = nn.Linear(28*28, 100)
        self.output_layer = nn.Linear(100, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model8(nn.Module):
    """Three linear layers: 784 -> 100 -> 100 -> 10."""

    def __init__(self):
        super(Model8, self).__init__()
        self.input_layer = nn.Linear(28*28, 100)
        self.hidden_layer = nn.Linear(100, 100)
        self.output_layer = nn.Linear(100, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.hidden_layer(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model9(nn.Module):
    """Four linear layers: 784 -> 100 -> 100 -> 100 -> 10."""

    def __init__(self):
        super(Model9, self).__init__()
        self.input_layer = nn.Linear(28*28, 100)
        self.hidden_layer = nn.Linear(100, 100)
        self.hidden_layer1 = nn.Linear(100, 100)
        self.output_layer = nn.Linear(100, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.hidden_layer(x)
        x = self.hidden_layer1(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


class Model10(nn.Module):
    """Five linear layers: 784 -> 100 -> 100 -> 100 -> 100 -> 10."""

    def __init__(self):
        super(Model10, self).__init__()
        self.input_layer = nn.Linear(28*28, 100)
        self.hidden_layer = nn.Linear(100, 100)
        self.hidden_layer1 = nn.Linear(100, 100)
        self.hidden_layer2 = nn.Linear(100, 100)
        self.output_layer = nn.Linear(100, 10)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.hidden_layer(x)
        x = self.hidden_layer1(x)
        x = self.hidden_layer2(x)
        x = self.output_layer(x)
        return F.log_softmax(x, dim=1)


# And let's train it.
def train(epoch, model, print_every=10):
    """Train ``model`` on ``train_loader`` with SGD.

    Reads the module-level ``args`` (``lr``, ``momentum``, ``cuda``) and
    ``train_loader``.  A fresh optimizer is created on every call.

    Args:
        epoch: Number of full passes over ``train_loader`` to run.
        model: The ``nn.Module`` to optimise in place; it must return
            log-probabilities, since the loss is ``F.nll_loss``.
        print_every: A progress line is printed after every epoch whose
            index is a multiple of this value.
    """
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr, momentum=args.momentum)
    for i in range(epoch):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            if args.cuda:
                data, target = data.cuda(), target.cuda()

            # Flatten each image to a vector.  Use the real batch size rather
            # than args.batch_size so a shorter final batch still reshapes.
            data = data.view(data.size(0), -1)
            optimizer.zero_grad()
            output = model(data)

            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Reported once per selected epoch, using the values left over from
        # the last batch of that epoch.
        if i % print_every == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    i, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
# One instance per architecture.  The list order matters: index i is used
# below as the suffix of the checkpoint file for that model.
models = [Model0(), Model1(), Model2(), Model3(), Model4(), Model5(),
          Model6(), Model7(), Model8(), Model9(), Model10()]
if args.cuda:
    # train() and test() move every batch to the GPU when args.cuda is set,
    # so the parameters have to live there as well.
    for model in models:
        model.cuda()

for model in models:
    train(1000, model)

# NOTE(review): this replaces the weights trained just above with the ones
# stored on disk; nothing in this file writes these checkpoint files, so they
# must already exist -- confirm this is intended.
for i, model in enumerate(models):
    model.load_state_dict(torch.load('mnist_mlp_multiple_model{}.pth'.format(i)))
Let's see how our network predicts the images.
[[ 6. 2. 9. 1. 8.] [ 5. 6. 5. 7. 5.] [ 4. 8. 6. 3. 0.] [ 6. 1. 0. 9. 3.] [ 7. 2. 8. 4. 4.]]
Most of the predictions look right. Let's run this over the entire test dataset.
def test(model):
    """Evaluate ``model`` on ``test_loader`` and return its accuracy.

    Reads the module-level ``args.cuda`` and ``test_loader``.  Prints the
    average loss and the accuracy.

    Args:
        model: The ``nn.Module`` to evaluate; it must return
            log-probabilities, since the loss is ``F.nll_loss``.

    Returns:
        The percentage (0-100) of test samples classified correctly.
    """
    model.eval()
    test_loss = 0
    correct = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()

            # Flatten each image to a vector.
            data = data.view(data.size()[0], -1)
            output = model(data)
            test_loss += F.nll_loss(output, target).item()
            # Index of the largest log-probability is the predicted class.
            pred = output.max(1)[1]
            # .item() keeps `correct` a plain Python int.
            correct += pred.eq(target).sum().item()

    # F.nll_loss already averages within a batch, so divide by the number of
    # batches rather than the number of samples.
    test_loss /= len(test_loader)
    accuracy = 100. * correct / len(test_loader.dataset)
    print(' Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss,
        correct,
        len(test_loader.dataset),
        accuracy
    ))
    return accuracy
# Evaluate every model; accuracy[i] is the test accuracy (in percent) of models[i].
accuracy = [test(model) for model in models]
pprint.pprint(accuracy)

# Full 0-100% range.
plt.plot(range(len(accuracy)), accuracy, linewidth=1.0)
plt.axis([0, 10, 0, 100])
plt.show()

# Same curve, with the y-axis zoomed in to the 90-93% band.
plt.plot(range(len(accuracy)), accuracy, linewidth=1.0)
plt.axis([0, 10, 90, 93])
plt.show()



