Models in Unity Catalog Example

This notebook illustrates how to use Models in Unity Catalog APIs to manage models. The notebook includes the following steps:

Track and log models with MLflow.
Register models to Unity Catalog.
Use the API to add descriptions to models and model versions.
Use aliases to deploy model versions.
Use the API to load model versions for inference.
Delete models.

This tutorial leverages features from MLflow 3.0. For more details, see "Get started with MLflow 3.0" in the Databricks documentation for your cloud (AWS, Azure, or GCP).

Requirements

This notebook requires a workspace that has been enabled for Unity Catalog. Your workspace must be attached to a Unity Catalog metastore that supports privilege inheritance. This is true for all metastores created after August 25, 2022.
The notebook must be attached to a cluster that has access to Unity Catalog and that is running Databricks Runtime for Machine Learning 13.3 LTS or above.
This notebook creates models in the main.default schema by default. This requires USE CATALOG privilege on the main catalog, plus CREATE MODEL and USE SCHEMA privileges on the main.default schema. You can change the catalog and schema used in this notebook, as long as you have the same privileges on the catalog and schema you specify.
This notebook uses MLflow 3.0, which requires installing mlflow version 3.0 or later.

# Upgrade to the latest MLflow version to use MLflow 3.0 features
%pip install mlflow>=3.0 --upgrade
dbutils.library.restartPython()

import mlflow

# Point the MLflow model registry client at Unity Catalog.
mlflow.set_registry_uri("databricks-uc")

# Three-level name (<catalog>.<schema>.<model>) of the model in Unity Catalog.
# Change the catalog and schema below if the model should live somewhere else.
CATALOG_NAME = "main"
SCHEMA_NAME = "default"
MODEL_NAME = ".".join([CATALOG_NAME, SCHEMA_NAME, "bike_share"])

import pandas as pd

# Load the hourly bike-sharing dataset into a pandas DataFrame; the first row is the header.
df = pd.read_csv("/databricks-datasets/bikeSharing/data-001/hour.csv", header=0)

# Keep only the feature columns plus the "cnt" column that is used as the target below.
df = df[["season", "yr", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed", "cnt"]]

# Render the DataFrame with the notebook's display() helper, as done for the other
# pandas DataFrame later in this notebook.
display(df)

from sklearn.model_selection import train_test_split

# The target is the "cnt" column; every other column is a feature. Both are cast to float64.
y = df["cnt"].astype("float64")
X = df.drop("cnt", axis=1).astype("float64")

# Use 60% of the rows for training and set the remainder aside.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)

# Divide the remainder evenly between the validation set and the test set.
X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)

import mlflow.sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

with mlflow.start_run() as run:

  # Fit a gradient boosting regressor with default hyperparameters on the training split.
  gradient_booster = GradientBoostingRegressor()
  gradient_booster.fit(X_train, y_train)

  # scikit-learn metrics expect the ground truth first and the predictions second.
  mse = mean_squared_error(y_val, gradient_booster.predict(X_val))
  print("Validation MSE: %d" % mse)
  mlflow.log_metric("mse", mse)

  # A single validation row, logged alongside the model as its input example.
  example_input = X_val.iloc[[0]]

  # To register the model to Unity Catalog, specify the `registered_model_name` parameter
  # of the `mlflow.sklearn.log_model()` function. This automatically creates a new model version.
  # All metrics of the model will be available in Unity Catalog. You can log additional metrics
  # to the model at any time by passing the `model_id` argument to `mlflow.log_metric()`; they
  # will also be available under the model version in Unity Catalog.
  mlflow.sklearn.log_model(
    sk_model=gradient_booster,
    name="sklearn-model",
    input_example=example_input,
    registered_model_name=MODEL_NAME
  )

from mlflow.tracking.client import MlflowClient

def get_latest_model_version(model_name):
  """Return the highest version number registered for ``model_name``.

  Args:
    model_name: Name of the registered model to look up.

  Returns:
    The largest version number found for the model, as an ``int``.

  Raises:
    ValueError: If the search returns no versions for ``model_name``.
  """
  client = MlflowClient()
  model_version_infos = client.search_model_versions(f"name = '{model_name}'")
  # Convert with int() so that versions are compared numerically.
  versions = [int(model_version_info.version) for model_version_info in model_version_infos]
  if not versions:
    # Fail with an explicit message instead of the opaque "max() arg is an empty sequence".
    raise ValueError(f"No model versions found for registered model '{model_name}'")
  return max(versions)

# Open a registry client and look up the latest version of the model.
client = MlflowClient()
latest_version = get_latest_model_version(MODEL_NAME)

# Attach a description to the registered model as a whole ...
client.update_registered_model(name=MODEL_NAME, description="Bike share model.")

# ... and a separate description to the latest model version.
client.update_model_version(
  description="This model version was built using the scikit-learn GradientBoostingRegressor.",
  version=latest_version,
  name=MODEL_NAME,
)

# Point the "Champion" alias at the latest version of the model.
client = MlflowClient()
latest_version = get_latest_model_version(model_name=MODEL_NAME)
client.set_registered_model_alias(name=MODEL_NAME, alias="Champion", version=latest_version)

import mlflow.pyfunc

# Load a model version by its explicit version number (version 1).
model_version_uri = f"models:/{MODEL_NAME}/1"
print(f"Loading registered model version from URI: '{model_version_uri}'")
model_version_1 = mlflow.pyfunc.load_model(model_version_uri)

# Load whichever model version the "Champion" alias currently points to.
model_champion_uri = f"models:/{MODEL_NAME}@Champion"
print(f"Loading registered model version from URI: '{model_champion_uri}'")
champion_model = mlflow.pyfunc.load_model(model_champion_uri)

from mlflow.tracking import MlflowClient

def load_and_predict(model_name, model_alias, new_data):
  """Load the model version behind an alias and run predictions with it.

  Args:
    model_name: Name of the registered model to load.
    model_alias: Alias identifying which version of the model to load.
    new_data: Input data passed to the loaded model's ``predict`` method.

  Returns:
    A pandas DataFrame wrapping the model's predictions. The predictions
    are also printed.
  """
  import pandas as pd

  # Build the URI from the `model_name` argument so the function honors its
  # parameter instead of silently using the notebook-level MODEL_NAME.
  model_uri = f"models:/{model_name}@{model_alias}"
  model = mlflow.pyfunc.load_model(model_uri)
  predictions = pd.DataFrame(model.predict(new_data))
  print(predictions)
  return predictions

# Predict on the validation features with the model version aliased "Champion".
gb_predictions = load_and_predict(model_name=MODEL_NAME, model_alias="Champion", new_data=X_val)

import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

with mlflow.start_run() as run:
  # Record the hyperparameter on the run before training.
  n_estimators = 300
  mlflow.log_param("n_estimators", n_estimators)

  rand_forest = RandomForestRegressor(n_estimators=n_estimators)
  rand_forest.fit(X_train, y_train)

  # scikit-learn metrics expect the ground truth first and the predictions second.
  mse = mean_squared_error(y_val, rand_forest.predict(X_val))
  print("Validation MSE: %d" % mse)
  mlflow.log_metric("mse", mse)

  # A single validation row, logged alongside the model as its input example.
  example_input = X_val.iloc[[0]]

  # Specify the `registered_model_name` parameter of the `mlflow.sklearn.log_model()`
  # function to register the model to Unity Catalog. This automatically
  # creates a new model version.
  mlflow.sklearn.log_model(
    sk_model=rand_forest,
    name="sklearn-model",
    input_example=example_input,
    registered_model_name=MODEL_NAME
  )

# Resolve the latest version once, so the description and the alias below are
# guaranteed to be applied to the same model version.
client = MlflowClient()
latest_version = get_latest_model_version(MODEL_NAME)

# Describe the new model version.
client.update_model_version(
  name=MODEL_NAME,
  version=latest_version,
  description="This model version was built using the scikit-learn RandomForestRegressor."
)

# Mark the new model version as the "Challenger".
client.set_registered_model_alias(MODEL_NAME, "Challenger", latest_version)

# Predict on the validation features with the "Challenger" model version.
rf_predictions = load_and_predict(MODEL_NAME, "Challenger", X_val)

# Turn the validation targets into a DataFrame and give all three frames a fresh
# 0..n-1 index so that their rows line up when placed side by side.
ground_truth = y_val.to_frame().reset_index(drop=True)
gb_predictions = gb_predictions.reset_index(drop=True)
rf_predictions = rf_predictions.reset_index(drop=True)

# Place both sets of predictions and the ground truth side by side in one DataFrame.
frames = [gb_predictions, rf_predictions, ground_truth]
combined_df = pd.concat(frames, axis=1)
combined_df.columns = ['gb_preds', 'rf_preds', 'ground_truth']

# Show the combined DataFrame.
display(combined_df)

from sklearn.metrics import mean_squared_error

# scikit-learn metrics expect the ground truth first and the predictions second.
mse_rf = mean_squared_error(combined_df['ground_truth'], combined_df['rf_preds'])
mse_gb = mean_squared_error(combined_df['ground_truth'], combined_df['gb_preds'])
print(f"Random Forest model mean squared error: {mse_rf}")
print(f"Gradient Booster model mean squared error: {mse_gb}")

# Move the "Champion" alias to the latest model version.
new_model_version = get_latest_model_version(model_name=MODEL_NAME)
client.set_registered_model_alias(MODEL_NAME, "Champion", new_model_version)

# The "Challenger" alias is no longer needed, so remove it from the model.
client.delete_registered_model_alias(alias="Challenger", name=MODEL_NAME)

# Delete version 1 of the model.
client.delete_model_version(name=MODEL_NAME, version=1)

# Delete the registered model itself, using a fresh registry client.
client = MlflowClient()
client.delete_registered_model(name=MODEL_NAME)

models-in-uc-example-mlflow-3

Models in Unity Catalog Example

Requirements

Configure MLflow client to access models in Unity Catalog

Load and pre-process dataset

Train, register, and deploy model

Add model and model version descriptions using the API

View the model in the UI

Deploy a model version for inference

Load model versions using the API

Make predictions using the champion model

Create and deploy a new model version

Add a description for the new model version

Mark new model version as Challenger and test the model

Compare the performance of the two model versions

Calculate the mean squared error for the predictions of each model relative to the ground truth

Deploy the new model version using the "Champion" alias

Remove a model alias

Delete model versions and models