mlflow-pytorch-training(Python)

MLflow PyTorch Notebook

This MLflow PyTorch notebook is based on MLflow's PyTorch TensorBoard tutorial.

  • This notebook demonstrates how to run PyTorch to fit a neural network on MNIST handwritten digit recognition data.
  • The run results are logged to an MLflow server.
  • Training metrics and weights in TensorFlow event format are logged locally and then uploaded to the MLflow run's artifact directory.
  • TensorBoard is started on the local log and then optionally on the uploaded log.

In this tutorial you:

  • Create a GPU-enabled cluster
  • Install the MLflow library on the cluster
  • Run a neural network on MNIST handwritten digit recognition data
  • View the results of training the network in the MLflow experiment UI
  • View the results of training the network in TensorBoard

Create a cluster and install MLflow on your cluster

  1. Create a GPU-enabled cluster specifying Python 3.
  2. If you are not running Databricks Runtime for Machine Learning, you must install the required library (or use the notebook-scoped %pip install sketched after this list):
    1. Create the required library: set Source to PyPI and enter mlflow[extras].
    2. Install the library into the cluster.
  3. Attach this notebook to the cluster.
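
If you prefer a notebook-scoped install to a cluster library, a single %pip cell also works. This is a minimal sketch, not part of the original instructions; the extra packages (torch, torchvision, tensorflow) are assumptions based on the imports used later in this notebook and are already included in Databricks Runtime for Machine Learning.

%pip install mlflow[extras] torch torchvision tensorflow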

Train an MNIST digit recognizer using PyTorch

import mlflow
# Trains using PyTorch and logs training metrics and weights in TensorFlow event format to the MLflow run's artifact directory. 
# This stores the TensorFlow events in MLflow for later access using TensorBoard.
#
# Code based on https://github.com/mlflow/mlflow/blob/master/example/tutorial/pytorch_tensorboard.py.
#

from __future__ import print_function
import os
import tempfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
from collections import namedtuple
import tensorflow as tf
import tensorflow.summary

# Container for the training hyperparameters
class Params(object):
    def __init__(self, batch_size, test_batch_size, epochs, lr, momentum, seed, cuda, log_interval):
        self.batch_size = batch_size
        self.test_batch_size = test_batch_size
        self.epochs = epochs
        self.lr = lr
        self.momentum = momentum
        self.seed = seed
        self.cuda = cuda
        self.log_interval = log_interval

# Configure args
args = Params(64, 1000, 10, 0.01, 0.5, 1, True, 200)

# Seed the RNG for reproducibility; use CUDA only when it is both requested and available
torch.manual_seed(args.seed)
cuda = args.cuda and torch.cuda.is_available()


kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)  # log-probabilities over the 10 digit classes

    def log_weights(self, step):
        # Log parameter histograms to the TensorBoard event files via the global writer.
        # Tensors are moved to the CPU so TensorFlow can convert them when training on GPU.
        with writer.as_default():
            tf.summary.histogram('weights/conv1/weight', self.conv1.weight.data.cpu(), step)
            tf.summary.histogram('weights/conv1/bias', self.conv1.bias.data.cpu(), step)
            tf.summary.histogram('weights/conv2/weight', self.conv2.weight.data.cpu(), step)
            tf.summary.histogram('weights/conv2/bias', self.conv2.bias.data.cpu(), step)
            tf.summary.histogram('weights/fc1/weight', self.fc1.weight.data.cpu(), step)
            tf.summary.histogram('weights/fc1/bias', self.fc1.bias.data.cpu(), step)
            tf.summary.histogram('weights/fc2/weight', self.fc2.weight.data.cpu(), step)
            tf.summary.histogram('weights/fc2/bias', self.fc2.bias.data.cpu(), step)

model = Model()
if cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

writer = None # Will be used to write TensorBoard events

def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data.item()))
            step = epoch * len(train_loader) + batch_idx
            log_scalar('train_loss', loss.data.item(), step)
            model.log_weights(step)

def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').data.item() # sum up batch loss
            pred = output.data.max(1)[1] # get the index of the max log-probability
            correct += pred.eq(target.data).cpu().sum().item()

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100.0 * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), test_accuracy))
    step = (epoch + 1) * len(train_loader)
    log_scalar('test_loss', test_loss, step)
    log_scalar('test_accuracy', test_accuracy, step)

def log_scalar(name, value, step):
    """Log a scalar value to both MLflow and TensorBoard"""
    with writer.as_default():
      tf.summary.scalar(name, value, step)
    mlflow.log_metric(name, value, step=step)

Create a TensorFlow summary writer and start an MLflow run

import mlflow.pytorch

with mlflow.start_run() as run:  
  # Log our parameters into mlflow
  for key, value in vars(args).items():
      mlflow.log_param(key, value)

  output_dir = tempfile.mkdtemp()
  print("Writing TensorFlow events locally to %s\n" % output_dir)
  writer = tf.summary.create_file_writer(output_dir)

  for epoch in range(1, args.epochs + 1):
      # print out active_run
      print("Active Run ID: %s, Epoch: %s \n" % (run.info.run_uuid, epoch))

      train(epoch)
      test(epoch)
      
  print("Uploading TensorFlow events as a run artifact.")
  mlflow.log_artifacts(output_dir, artifact_path="events")
Writing TensorFlow events locally to /tmp/tmp4qm_re9h

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 1
Train Epoch: 1 [0/60000 (0%)] Loss: 4.162675
Train Epoch: 1 [12800/60000 (21%)] Loss: 3.393654
Train Epoch: 1 [25600/60000 (43%)] Loss: 2.984073
Train Epoch: 1 [38400/60000 (64%)] Loss: 2.848104
Train Epoch: 1 [51200/60000 (85%)] Loss: 2.592036
Test set: Average loss: 4.9353, Accuracy: 9322/10000 (93%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 2
Train Epoch: 2 [0/60000 (0%)] Loss: 2.414659
Train Epoch: 2 [12800/60000 (21%)] Loss: 2.479717
Train Epoch: 2 [25600/60000 (43%)] Loss: 2.518646
Train Epoch: 2 [38400/60000 (64%)] Loss: 2.339756
Train Epoch: 2 [51200/60000 (85%)] Loss: 2.297640
Test set: Average loss: 4.7987, Accuracy: 9519/10000 (95%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 3
Train Epoch: 3 [0/60000 (0%)] Loss: 2.408521
Train Epoch: 3 [12800/60000 (21%)] Loss: 2.324917
Train Epoch: 3 [25600/60000 (43%)] Loss: 2.489070
Train Epoch: 3 [38400/60000 (64%)] Loss: 2.353824
Train Epoch: 3 [51200/60000 (85%)] Loss: 2.270601
Test set: Average loss: 4.7542, Accuracy: 9619/10000 (96%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 4
Train Epoch: 4 [0/60000 (0%)] Loss: 2.318603
Train Epoch: 4 [12800/60000 (21%)] Loss: 2.254439
Train Epoch: 4 [25600/60000 (43%)] Loss: 2.292812
Train Epoch: 4 [38400/60000 (64%)] Loss: 2.360202
Train Epoch: 4 [51200/60000 (85%)] Loss: 2.099524
Test set: Average loss: 4.7312, Accuracy: 9677/10000 (97%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 5
Train Epoch: 5 [0/60000 (0%)] Loss: 2.115560
Train Epoch: 5 [12800/60000 (21%)] Loss: 2.219702
Train Epoch: 5 [25600/60000 (43%)] Loss: 2.295865
Train Epoch: 5 [38400/60000 (64%)] Loss: 2.173573
Train Epoch: 5 [51200/60000 (85%)] Loss: 2.197725
Test set: Average loss: 4.7149, Accuracy: 9712/10000 (97%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 6
Train Epoch: 6 [0/60000 (0%)] Loss: 2.394348
Train Epoch: 6 [12800/60000 (21%)] Loss: 2.266938
Train Epoch: 6 [25600/60000 (43%)] Loss: 2.315325
Train Epoch: 6 [38400/60000 (64%)] Loss: 2.204606
Train Epoch: 6 [51200/60000 (85%)] Loss: 2.258469
Test set: Average loss: 4.7066, Accuracy: 9753/10000 (98%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 7
Train Epoch: 7 [0/60000 (0%)] Loss: 2.199848
Train Epoch: 7 [12800/60000 (21%)] Loss: 2.461301
Train Epoch: 7 [25600/60000 (43%)] Loss: 2.278232
Train Epoch: 7 [38400/60000 (64%)] Loss: 2.176651
Train Epoch: 7 [51200/60000 (85%)] Loss: 2.099115
Test set: Average loss: 4.6989, Accuracy: 9773/10000 (98%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 8
Train Epoch: 8 [0/60000 (0%)] Loss: 2.241166
Train Epoch: 8 [12800/60000 (21%)] Loss: 2.184432
Train Epoch: 8 [25600/60000 (43%)] Loss: 2.065749
Train Epoch: 8 [38400/60000 (64%)] Loss: 2.139455
Train Epoch: 8 [51200/60000 (85%)] Loss: 2.120107
Test set: Average loss: 4.6946, Accuracy: 9771/10000 (98%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 9
Train Epoch: 9 [0/60000 (0%)] Loss: 2.192689
Train Epoch: 9 [12800/60000 (21%)] Loss: 2.164002
Train Epoch: 9 [25600/60000 (43%)] Loss: 2.127155
Train Epoch: 9 [38400/60000 (64%)] Loss: 2.102258
Train Epoch: 9 [51200/60000 (85%)] Loss: 2.224101
Test set: Average loss: 4.6877, Accuracy: 9798/10000 (98%)

Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 10
Train Epoch: 10 [0/60000 (0%)] Loss: 2.188497
Train Epoch: 10 [12800/60000 (21%)] Loss: 2.123358
Train Epoch: 10 [25600/60000 (43%)] Loss: 2.137002
Train Epoch: 10 [38400/60000 (64%)] Loss: 2.098072
Train Epoch: 10 [51200/60000 (85%)] Loss: 2.169977
Test set: Average loss: 4.6824, Accuracy: 9807/10000 (98%)

Uploading TensorFlow events as a run artifact.
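
The run above only uploads the TensorBoard event files. If you also want the trained network stored with the run, MLflow's PyTorch flavor can log it as an artifact. A minimal sketch, to be added inside the with mlflow.start_run() block after the training loop; the artifact path "model" is an illustrative choice, not part of the original notebook.

  # Optional: save the trained network under the same run ("model" is an arbitrary artifact path)
  mlflow.pytorch.log_model(model, artifact_path="model")
  # The model can later be reloaded from the run by URI:
  # loaded_model = mlflow.pytorch.load_model("runs:/%s/model" % run.info.run_id)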

MLflow UI for the PyTorch MNIST Run
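
Besides browsing the experiment UI, the logged metrics can be read back programmatically with the MLflow client. A minimal sketch, assuming the run variable from the training cell above is still in scope:

from mlflow.tracking import MlflowClient

client = MlflowClient()
# Read back the metric series written by log_scalar(); each entry carries a step and a value.
for m in client.get_metric_history(run.info.run_id, "test_accuracy"):
    print(m.step, m.value)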

Start TensorBoard on local directory

%load_ext tensorboard
experiment_log_dir = output_dir
%tensorboard --logdir $experiment_log_dir

View the results in TensorBoard

Click the View TensorBoard link. It should look like the following:

TensorBoard for the PyTorch MNIST Run
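
The introduction also mentions optionally starting TensorBoard on the uploaded copy of the log. One way to do that, sketched here assuming the "events" artifact path used above and the run variable from the training cell (on newer MLflow versions, mlflow.artifacts.download_artifacts is the equivalent call), is to download the run's artifacts and point %tensorboard at the downloaded directory:

from mlflow.tracking import MlflowClient

# Download the "events" artifacts uploaded by mlflow.log_artifacts() above.
uploaded_events_dir = MlflowClient().download_artifacts(run.info.run_id, "events")
%tensorboard --logdir $uploaded_events_dir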