MLflow PyTorch Notebook

This MLflow PyTorch notebook is based on MLflow's PyTorch TensorBoard tutorial. In this tutorial you train an MNIST digit classifier using PyTorch, log the training metrics and model weights in TensorFlow event format to the MLflow run's artifact directory, and store the events in MLflow for later inspection with TensorBoard.

# Trains an MNIST digit classifier using PyTorch and logs training metrics and weights
# in TensorFlow event format to the MLflow run's artifact directory.
# This stores the TensorFlow events in MLflow for later access using TensorBoard.
#
# Code based on https://github.com/mlflow/mlflow/blob/master/example/tutorial/pytorch_tensorboard.py.
from __future__ import print_function
import tempfile

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

import tensorflow as tf

import mlflow


# Container for the training hyperparameters
class Params(object):
    def __init__(self, batch_size, test_batch_size, epochs, lr, momentum, seed, cuda, log_interval):
        self.batch_size = batch_size
        self.test_batch_size = test_batch_size
        self.epochs = epochs
        self.lr = lr
        self.momentum = momentum
        self.seed = seed
        self.cuda = cuda
        self.log_interval = log_interval


# Configure args
args = Params(64, 1000, 10, 0.01, 0.5, 1, True, 200)

cuda = args.cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)
if cuda:
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=args.test_batch_size, shuffle=True, **kwargs)


class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Normalize over the class dimension (dim=1), not the batch dimension
        return F.log_softmax(x, dim=1)

    def log_weights(self, step):
        # Write a histogram of each layer's weights and biases to the TensorBoard event file
        with writer.as_default():
            for name, param in self.named_parameters():
                tf.summary.histogram('weights/' + name.replace('.', '/'),
                                     param.data.cpu().numpy(), step=step)


model = Model()
if cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

writer = None  # Will be set to a tf.summary file writer inside the MLflow run


def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            step = epoch * len(train_loader) + batch_idx
            log_scalar('train_loss', loss.item(), step)
            model.log_weights(step)


def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            if cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.max(1)[1]  # get the index of the max log-probability
            correct += pred.eq(target).cpu().sum().item()
    test_loss /= len(test_loader.dataset)
    test_accuracy = 100.0 * correct / len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset), test_accuracy))
    step = (epoch + 1) * len(train_loader)
    log_scalar('test_loss', test_loss, step)
    log_scalar('test_accuracy', test_accuracy, step)


def log_scalar(name, value, step):
    """Log a scalar value to both MLflow and TensorBoard."""
    with writer.as_default():
        tf.summary.scalar(name, value, step=step)
    mlflow.log_metric(name, value, step=step)
import mlflow.pytorch

with mlflow.start_run() as run:
    # Log our parameters into mlflow
    for key, value in vars(args).items():
        mlflow.log_param(key, value)

    # Write the TensorBoard events to a local temporary directory
    output_dir = tempfile.mkdtemp()
    print("Writing TensorFlow events locally to %s\n" % output_dir)
    writer = tf.summary.create_file_writer(output_dir)

    for epoch in range(1, args.epochs + 1):
        # Print out the active run ID
        print("Active Run ID: %s, Epoch: %s \n" % (run.info.run_id, epoch))
        train(epoch)
        test(epoch)

    print("Uploading TensorFlow events as a run artifact.")
    mlflow.log_artifacts(output_dir, artifact_path="events")
Writing TensorFlow events locally to /tmp/tmp4qm_re9h
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 1
Train Epoch: 1 [0/60000 (0%)] Loss: 4.162675
Train Epoch: 1 [12800/60000 (21%)] Loss: 3.393654
Train Epoch: 1 [25600/60000 (43%)] Loss: 2.984073
Train Epoch: 1 [38400/60000 (64%)] Loss: 2.848104
Train Epoch: 1 [51200/60000 (85%)] Loss: 2.592036
Test set: Average loss: 4.9353, Accuracy: 9322/10000 (93%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 2
Train Epoch: 2 [0/60000 (0%)] Loss: 2.414659
Train Epoch: 2 [12800/60000 (21%)] Loss: 2.479717
Train Epoch: 2 [25600/60000 (43%)] Loss: 2.518646
Train Epoch: 2 [38400/60000 (64%)] Loss: 2.339756
Train Epoch: 2 [51200/60000 (85%)] Loss: 2.297640
Test set: Average loss: 4.7987, Accuracy: 9519/10000 (95%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 3
Train Epoch: 3 [0/60000 (0%)] Loss: 2.408521
Train Epoch: 3 [12800/60000 (21%)] Loss: 2.324917
Train Epoch: 3 [25600/60000 (43%)] Loss: 2.489070
Train Epoch: 3 [38400/60000 (64%)] Loss: 2.353824
Train Epoch: 3 [51200/60000 (85%)] Loss: 2.270601
Test set: Average loss: 4.7542, Accuracy: 9619/10000 (96%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 4
Train Epoch: 4 [0/60000 (0%)] Loss: 2.318603
Train Epoch: 4 [12800/60000 (21%)] Loss: 2.254439
Train Epoch: 4 [25600/60000 (43%)] Loss: 2.292812
Train Epoch: 4 [38400/60000 (64%)] Loss: 2.360202
Train Epoch: 4 [51200/60000 (85%)] Loss: 2.099524
Test set: Average loss: 4.7312, Accuracy: 9677/10000 (97%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 5
Train Epoch: 5 [0/60000 (0%)] Loss: 2.115560
Train Epoch: 5 [12800/60000 (21%)] Loss: 2.219702
Train Epoch: 5 [25600/60000 (43%)] Loss: 2.295865
Train Epoch: 5 [38400/60000 (64%)] Loss: 2.173573
Train Epoch: 5 [51200/60000 (85%)] Loss: 2.197725
Test set: Average loss: 4.7149, Accuracy: 9712/10000 (97%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 6
Train Epoch: 6 [0/60000 (0%)] Loss: 2.394348
Train Epoch: 6 [12800/60000 (21%)] Loss: 2.266938
Train Epoch: 6 [25600/60000 (43%)] Loss: 2.315325
Train Epoch: 6 [38400/60000 (64%)] Loss: 2.204606
Train Epoch: 6 [51200/60000 (85%)] Loss: 2.258469
Test set: Average loss: 4.7066, Accuracy: 9753/10000 (98%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 7
Train Epoch: 7 [0/60000 (0%)] Loss: 2.199848
Train Epoch: 7 [12800/60000 (21%)] Loss: 2.461301
Train Epoch: 7 [25600/60000 (43%)] Loss: 2.278232
Train Epoch: 7 [38400/60000 (64%)] Loss: 2.176651
Train Epoch: 7 [51200/60000 (85%)] Loss: 2.099115
Test set: Average loss: 4.6989, Accuracy: 9773/10000 (98%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 8
Train Epoch: 8 [0/60000 (0%)] Loss: 2.241166
Train Epoch: 8 [12800/60000 (21%)] Loss: 2.184432
Train Epoch: 8 [25600/60000 (43%)] Loss: 2.065749
Train Epoch: 8 [38400/60000 (64%)] Loss: 2.139455
Train Epoch: 8 [51200/60000 (85%)] Loss: 2.120107
Test set: Average loss: 4.6946, Accuracy: 9771/10000 (98%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 9
Train Epoch: 9 [0/60000 (0%)] Loss: 2.192689
Train Epoch: 9 [12800/60000 (21%)] Loss: 2.164002
Train Epoch: 9 [25600/60000 (43%)] Loss: 2.127155
Train Epoch: 9 [38400/60000 (64%)] Loss: 2.102258
Train Epoch: 9 [51200/60000 (85%)] Loss: 2.224101
Test set: Average loss: 4.6877, Accuracy: 9798/10000 (98%)
Active Run ID: fa0401ddf9334f0a9520f463100da825, Epoch: 10
Train Epoch: 10 [0/60000 (0%)] Loss: 2.188497
Train Epoch: 10 [12800/60000 (21%)] Loss: 2.123358
Train Epoch: 10 [25600/60000 (43%)] Loss: 2.137002
Train Epoch: 10 [38400/60000 (64%)] Loss: 2.098072
Train Epoch: 10 [51200/60000 (85%)] Loss: 2.169977
Test set: Average loss: 4.6824, Accuracy: 9807/10000 (98%)
Uploading TensorFlow events as a run artifact.
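Because the events were logged as run artifacts, they can be pulled back out of the MLflow artifact store and inspected with TensorBoard later. Below is a minimal sketch of that round trip; it assumes MLflow 2.x, where mlflow.artifacts.download_artifacts is available (older versions expose the same operation through MlflowClient().download_artifacts), and uses the Run ID printed in the output above.

import mlflow.artifacts

# Run ID taken from the "Active Run ID" lines printed during training;
# substitute the ID from your own run
run_id = "fa0401ddf9334f0a9520f463100da825"

# Download the "events" artifact directory from the MLflow artifact store
# to a local path (assumes MLflow 2.x)
local_dir = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="events")
print("TensorFlow events downloaded to %s" % local_dir)

# Inspect the downloaded events from a shell with:
#   tensorboard --logdir <local_dir>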