import pandas
from mlflow import version
import pyarrow
import tensorflow as tf
import xgboost as xgb
print("Pandas version: %s" % pandas.__version__)
print("MLflow version: %s" % version.VERSION)
print("PyArrow version: %s" % pyarrow.__version__)
print("XGboost version: %s" % xgb.__version__)
print("TensorFlow version: %s" % tf.__version__)
# Downloading csv file from ggplot2's hosted dataset on github.
url = "https://raw.githubusercontent.com/tidyverse/ggplot2/4c678917/data-raw/diamonds.csv"
# Reading the csv into a pandas DataFrame
pd_df = pandas.read_csv(url)
pd_df
Pandas version: 0.23.2
MLflow version: 0.4.2
PyArrow version: 0.8.0
XGboost version: 0.71
TensorFlow version: 1.8.0
Out[1]:
carat cut color clarity ... price x y z
0 0.23 Ideal E SI2 ... 326 3.95 3.98 2.43
1 0.21 Premium E SI1 ... 326 3.89 3.84 2.31
2 0.23 Good E VS1 ... 327 4.05 4.07 2.31
3 0.29 Premium I VS2 ... 334 4.20 4.23 2.63
4 0.31 Good J SI2 ... 335 4.34 4.35 2.75
5 0.24 Very Good J VVS2 ... 336 3.94 3.96 2.48
6 0.24 Very Good I VVS1 ... 336 3.95 3.98 2.47
7 0.26 Very Good H SI1 ... 337 4.07 4.11 2.53
8 0.22 Fair E VS2 ... 337 3.87 3.78 2.49
9 0.23 Very Good H VS1 ... 338 4.00 4.05 2.39
10 0.30 Good J SI1 ... 339 4.25 4.28 2.73
11 0.23 Ideal J VS1 ... 340 3.93 3.90 2.46
12 0.22 Premium F SI1 ... 342 3.88 3.84 2.33
13 0.31 Ideal J SI2 ... 344 4.35 4.37 2.71
14 0.20 Premium E SI2 ... 345 3.79 3.75 2.27
15 0.32 Premium E I1 ... 345 4.38 4.42 2.68
16 0.30 Ideal I SI2 ... 348 4.31 4.34 2.68
17 0.30 Good J SI1 ... 351 4.23 4.29 2.70
18 0.30 Good J SI1 ... 351 4.23 4.26 2.71
19 0.30 Very Good J SI1 ... 351 4.21 4.27 2.66
20 0.30 Good I SI2 ... 351 4.26 4.30 2.71
21 0.23 Very Good E VS2 ... 352 3.85 3.92 2.48
22 0.23 Very Good H VS1 ... 353 3.94 3.96 2.41
23 0.31 Very Good J SI1 ... 353 4.39 4.43 2.62
24 0.31 Very Good J SI1 ... 353 4.44 4.47 2.59
25 0.23 Very Good G VVS2 ... 354 3.97 4.01 2.41
26 0.24 Premium I VS1 ... 355 3.97 3.94 2.47
27 0.30 Very Good J VS2 ... 357 4.28 4.30 2.67
28 0.23 Very Good D VS2 ... 357 3.96 3.97 2.40
29 0.23 Very Good F VS1 ... 357 3.96 3.99 2.42
... ... ... ... ... ... ... ... ... ...
53910 0.70 Premium E SI1 ... 2753 5.74 5.77 3.48
53911 0.57 Premium E IF ... 2753 5.43 5.38 3.23
53912 0.61 Premium F VVS1 ... 2753 5.48 5.40 3.36
53913 0.80 Good G VS2 ... 2753 5.84 5.81 3.74
53914 0.84 Good I VS1 ... 2753 5.94 5.90 3.77
53915 0.77 Ideal E SI2 ... 2753 5.84 5.86 3.63
53916 0.74 Good D SI1 ... 2753 5.71 5.74 3.61
53917 0.90 Very Good J SI1 ... 2753 6.12 6.09 3.86
53918 0.76 Premium I VS1 ... 2753 5.93 5.85 3.49
53919 0.76 Ideal I VVS1 ... 2753 5.89 5.87 3.66
53920 0.70 Very Good E VS2 ... 2755 5.57 5.61 3.49
53921 0.70 Very Good E VS2 ... 2755 5.59 5.65 3.53
53922 0.70 Very Good D VS1 ... 2755 5.67 5.58 3.55
53923 0.73 Ideal I VS2 ... 2756 5.80 5.84 3.57
53924 0.73 Ideal I VS2 ... 2756 5.82 5.84 3.59
53925 0.79 Ideal I SI1 ... 2756 5.95 5.97 3.67
53926 0.71 Ideal E SI1 ... 2756 5.71 5.73 3.54
53927 0.79 Good F SI1 ... 2756 6.06 6.13 3.54
53928 0.79 Premium E SI2 ... 2756 6.03 5.96 3.68
53929 0.71 Ideal G VS1 ... 2756 5.76 5.73 3.53
53930 0.71 Premium E SI1 ... 2756 5.79 5.74 3.49
53931 0.71 Premium F SI1 ... 2756 5.74 5.73 3.43
53932 0.70 Very Good E VS2 ... 2757 5.71 5.76 3.47
53933 0.70 Very Good E VS2 ... 2757 5.69 5.72 3.49
53934 0.72 Premium D SI1 ... 2757 5.69 5.73 3.58
53935 0.72 Ideal D SI1 ... 2757 5.75 5.76 3.50
53936 0.72 Good D SI1 ... 2757 5.69 5.75 3.61
53937 0.70 Very Good D SI1 ... 2757 5.66 5.68 3.56
53938 0.86 Premium H SI2 ... 2757 6.15 6.12 3.74
53939 0.75 Ideal D SI2 ... 2757 5.83 5.87 3.64
[53940 rows x 10 columns]
pd_df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat 53940 non-null float64
cut 53940 non-null object
color 53940 non-null object
clarity 53940 non-null object
depth 53940 non-null float64
table 53940 non-null float64
price 53940 non-null int64
x 53940 non-null float64
y 53940 non-null float64
z 53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB
# Conversion of qualitative values to quantitative values. Higher numbers equate to better quality.
pd_df['cut'] = pd_df['cut'].replace({'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4})
pd_df['color'] = pd_df['color'].replace({'J':0, 'I':1, 'H':2, 'G':3, 'F':4, 'E':5, 'D':6})
pd_df['clarity'] = pd_df['clarity'].replace({'I1':0, 'SI1':1, 'SI2':2, 'VS1':3, 'VS2':4, 'VVS1':5, 'VVS2':6, 'IF':7})
pd_df
Out[3]:
carat cut color clarity depth table price x y z
0 0.23 4 5 2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 3 5 1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 1 5 3 56.9 65.0 327 4.05 4.07 2.31
3 0.29 3 1 4 62.4 58.0 334 4.20 4.23 2.63
4 0.31 1 0 2 63.3 58.0 335 4.34 4.35 2.75
5 0.24 2 0 6 62.8 57.0 336 3.94 3.96 2.48
6 0.24 2 1 5 62.3 57.0 336 3.95 3.98 2.47
7 0.26 2 2 1 61.9 55.0 337 4.07 4.11 2.53
8 0.22 0 5 4 65.1 61.0 337 3.87 3.78 2.49
9 0.23 2 2 3 59.4 61.0 338 4.00 4.05 2.39
10 0.30 1 0 1 64.0 55.0 339 4.25 4.28 2.73
11 0.23 4 0 3 62.8 56.0 340 3.93 3.90 2.46
12 0.22 3 4 1 60.4 61.0 342 3.88 3.84 2.33
13 0.31 4 0 2 62.2 54.0 344 4.35 4.37 2.71
14 0.20 3 5 2 60.2 62.0 345 3.79 3.75 2.27
15 0.32 3 5 0 60.9 58.0 345 4.38 4.42 2.68
16 0.30 4 1 2 62.0 54.0 348 4.31 4.34 2.68
17 0.30 1 0 1 63.4 54.0 351 4.23 4.29 2.70
18 0.30 1 0 1 63.8 56.0 351 4.23 4.26 2.71
19 0.30 2 0 1 62.7 59.0 351 4.21 4.27 2.66
20 0.30 1 1 2 63.3 56.0 351 4.26 4.30 2.71
21 0.23 2 5 4 63.8 55.0 352 3.85 3.92 2.48
22 0.23 2 2 3 61.0 57.0 353 3.94 3.96 2.41
23 0.31 2 0 1 59.4 62.0 353 4.39 4.43 2.62
24 0.31 2 0 1 58.1 62.0 353 4.44 4.47 2.59
25 0.23 2 3 6 60.4 58.0 354 3.97 4.01 2.41
26 0.24 3 1 3 62.5 57.0 355 3.97 3.94 2.47
27 0.30 2 0 4 62.2 57.0 357 4.28 4.30 2.67
28 0.23 2 6 4 60.5 61.0 357 3.96 3.97 2.40
29 0.23 2 4 3 60.9 57.0 357 3.96 3.99 2.42
... ... ... ... ... ... ... ... ... ... ...
53910 0.70 3 5 1 60.5 58.0 2753 5.74 5.77 3.48
53911 0.57 3 5 7 59.8 60.0 2753 5.43 5.38 3.23
53912 0.61 3 4 5 61.8 59.0 2753 5.48 5.40 3.36
53913 0.80 1 3 4 64.2 58.0 2753 5.84 5.81 3.74
53914 0.84 1 1 3 63.7 59.0 2753 5.94 5.90 3.77
53915 0.77 4 5 2 62.1 56.0 2753 5.84 5.86 3.63
53916 0.74 1 6 1 63.1 59.0 2753 5.71 5.74 3.61
53917 0.90 2 0 1 63.2 60.0 2753 6.12 6.09 3.86
53918 0.76 3 1 3 59.3 62.0 2753 5.93 5.85 3.49
53919 0.76 4 1 5 62.2 55.0 2753 5.89 5.87 3.66
53920 0.70 2 5 4 62.4 60.0 2755 5.57 5.61 3.49
53921 0.70 2 5 4 62.8 60.0 2755 5.59 5.65 3.53
53922 0.70 2 6 3 63.1 59.0 2755 5.67 5.58 3.55
53923 0.73 4 1 4 61.3 56.0 2756 5.80 5.84 3.57
53924 0.73 4 1 4 61.6 55.0 2756 5.82 5.84 3.59
53925 0.79 4 1 1 61.6 56.0 2756 5.95 5.97 3.67
53926 0.71 4 5 1 61.9 56.0 2756 5.71 5.73 3.54
53927 0.79 1 4 1 58.1 59.0 2756 6.06 6.13 3.54
53928 0.79 3 5 2 61.4 58.0 2756 6.03 5.96 3.68
53929 0.71 4 3 3 61.4 56.0 2756 5.76 5.73 3.53
53930 0.71 3 5 1 60.5 55.0 2756 5.79 5.74 3.49
53931 0.71 3 4 1 59.8 62.0 2756 5.74 5.73 3.43
53932 0.70 2 5 4 60.5 59.0 2757 5.71 5.76 3.47
53933 0.70 2 5 4 61.2 59.0 2757 5.69 5.72 3.49
53934 0.72 3 6 1 62.7 59.0 2757 5.69 5.73 3.58
53935 0.72 4 6 1 60.8 57.0 2757 5.75 5.76 3.50
53936 0.72 1 6 1 63.1 55.0 2757 5.69 5.75 3.61
53937 0.70 2 6 1 62.8 60.0 2757 5.66 5.68 3.56
53938 0.86 3 2 2 61.0 58.0 2757 6.15 6.12 3.74
53939 0.75 4 6 2 62.2 55.0 2757 5.83 5.87 3.64
[53940 rows x 10 columns]
# Shuffling the dataset for more accurate training and testing results.
pd_df = pd_df.sample(frac=1).reset_index(drop=True)
# Splitting the data up so that 80% of the data is training data, 20% testing data.
training_data = pd_df[:int(pd_df.shape[0]*.8)]
testing_data = pd_df[int(pd_df.shape[0]*.8):]
print("We have %d training examples and %d test examples." % (training_data.shape[0], testing_data.shape[0]))
We have 43152 training examples and 10788 test examples.
import os
import tempfile
# Creating a temporary folder for storing our data.
temp = tempfile.mkdtemp()
# Defining paths for the data to be saved.
train = os.path.join(temp, "train_diamonds.parquet")
test = os.path.join(temp, "test_diamonds.parquet")
# Creating diamonds dataset parquet files.
training_data.to_parquet(train)
testing_data.to_parquet(test)
# The number of data points we want to predict on when calling mlflow pyfunc predict.
num_pred = 20
# This dataframe contains the price of test diamonds. Predictions can be compared with these actual values.
diamond_prices = pd_df["price"][:num_pred]
# For data we want to predict on, we will drop the price and keep only the feature columns.
predict = pd_df.drop(["price"], 1)
diamond_predict = predict[:num_pred]
os.listdir(temp)
Out[5]: ['train_diamonds.parquet', 'test_diamonds.parquet']
Model Selection with MLflow & mlflow-apps
This example notebook demonstrates how to use mlflow-apps, a collection of pluggable applications written with MLflow, to simplify ML model training & selection.
When tackling ML problems, it's often useful to experiment with a variety of different frameworks, comparing results to choose the best model for the job. MLflow makes this easy through a set of components accessible via CLIs & Python APIs:
In terms of the above, mlflow-apps is a collection of runnable MLflow projects that use the tracking & models APIs to simplify model training & comparison.
In this notebook, we'll use mlflow-apps to fit TensorFlow, scikit-learn, and XGBoost models on ggplot2's diamonds dataset, running each model training app through a single MLflow API call. We'll also cover data preprocessing and using MLflow to predict with the trained models.