Serverless GPU compute: Onboard to Databricks Notebooks

Use this notebook to test out your first training run with Serverless GPU compute.

In this notebook you will:

Configure an onboarding notebook
Connect to one A10G GPU
Run the notebook cells to train your first model using Serverelss GPU compute on a notebook.

For this example, keep in mind the following topics:

Start up time may take up to 8 minutes.
Notebooks immediately begin using available compute after connected to a resource.
Connection to your compute auto-terminates after 60 minutes of inactivity.
Spark Connect UDF is not supported.

Single GPU Deep Learning Training using PyTorch for MNIST

This notebook illustrates training using PyTorch. This example runs on a single node, single GPU and walks you through how to:

Set up your checkpoint location
Prepare your single-node PyTorch code
Run single-node training with PyTorch
Load and use the model

Requirements

Databricks Runtime 7.0 ML or above.
HorovodRunner is designed to improve model training performance on clusters with multiple workers, but multiple workers are not required to run this notebook.

PYTORCH_DIR = '/Volumes/XXXXXXX/ml-genai/serverless_gpu_model/'

import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

# Specify training parameters
batch_size = 100
num_epochs = 3
momentum = 0.5
log_interval = 100

def train_one_epoch(model, device, data_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(data_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(data_loader) * len(data),
                100. * batch_idx / len(data_loader), loss.item()))

def save_checkpoint(log_dir, model, optimizer, epoch):
  filepath = log_dir + '/checkpoint-{epoch}.pth.tar'.format(epoch=epoch)
  state = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
  }
  torch.save(state, filepath)
  
def load_checkpoint(log_dir, epoch=num_epochs):
  filepath = log_dir + '/checkpoint-{epoch}.pth.tar'.format(epoch=epoch)
  return torch.load(filepath)

def create_log_dir():
  log_dir = os.path.join(PYTORCH_DIR, str(time()), 'MNISTDemo')
  os.makedirs(log_dir)
  return log_dir

import torch.optim as optim
from torchvision import datasets, transforms
from time import time
import os

single_node_log_dir = create_log_dir()
print("Log directory:", single_node_log_dir)

def train(learning_rate):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  train_dataset = datasets.MNIST(
    'data', 
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))
  data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  model = Net().to(device)

  optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

  for epoch in range(1, num_epochs + 1):
    train_one_epoch(model, device, data_loader, optimizer, epoch)
    save_checkpoint(single_node_log_dir, model, optimizer, epoch)

    
def test(log_dir):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  loaded_model = Net().to(device)
  
  checkpoint = load_checkpoint(log_dir)
  loaded_model.load_state_dict(checkpoint['model'])
  loaded_model.eval()

  test_dataset = datasets.MNIST(
    'data', 
    train=False,
    download=True,
    transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))
  data_loader = torch.utils.data.DataLoader(test_dataset)

  test_loss = 0
  for data, target in data_loader:
      data, target = data.to(device), target.to(device)
      output = loaded_model(data)
      test_loss += F.nll_loss(output, target)
  
  test_loss /= len(data_loader.dataset)
  print("Average test loss: {}".format(test_loss.item()))

train(learning_rate = 0.001)

test(single_node_log_dir)

serverless-gpu-compute-notebook

Serverless GPU compute: Onboard to Databricks Notebooks

Connect your notebook to an A10G

Single GPU Deep Learning Training using PyTorch for MNIST

Requirements

1. Set up checkpoint location

2. Prepare single-node code

Define a simple convolutional network

Configure single-node training

Create methods for saving and loading model checkpoints

3) Run single-node training with PyTorch

4) Load and use the model

Congratulations! You've completed your first notebook using Serverless GPU compute.