import pandas
from mlflow import version
import pyarrow
import tensorflow as tf
import xgboost as xgb
print("Pandas version: %s" % pandas.__version__)
print("MLflow version: %s" % version.VERSION)
print("PyArrow version: %s" % pyarrow.__version__)
print("XGboost version: %s" % xgb.__version__)
print("TensorFlow version: %s" % tf.__version__)
# Download the diamonds CSV from the dataset hosted in ggplot2's GitHub repository.
url = "https://raw.githubusercontent.com/tidyverse/ggplot2/4c678917/data-raw/diamonds.csv"
# Read the CSV into a pandas DataFrame.
pd_df = pandas.read_csv(url)
pd_df
Pandas version: 0.23.2
MLflow version: 0.4.2
PyArrow version: 0.8.0
XGBoost version: 0.71
TensorFlow version: 1.8.0
Out[1]:
carat cut color clarity ... price x y z
0 0.23 Ideal E SI2 ... 326 3.95 3.98 2.43
1 0.21 Premium E SI1 ... 326 3.89 3.84 2.31
2 0.23 Good E VS1 ... 327 4.05 4.07 2.31
3 0.29 Premium I VS2 ... 334 4.20 4.23 2.63
4 0.31 Good J SI2 ... 335 4.34 4.35 2.75
5 0.24 Very Good J VVS2 ... 336 3.94 3.96 2.48
6 0.24 Very Good I VVS1 ... 336 3.95 3.98 2.47
7 0.26 Very Good H SI1 ... 337 4.07 4.11 2.53
8 0.22 Fair E VS2 ... 337 3.87 3.78 2.49
9 0.23 Very Good H VS1 ... 338 4.00 4.05 2.39
10 0.30 Good J SI1 ... 339 4.25 4.28 2.73
11 0.23 Ideal J VS1 ... 340 3.93 3.90 2.46
12 0.22 Premium F SI1 ... 342 3.88 3.84 2.33
13 0.31 Ideal J SI2 ... 344 4.35 4.37 2.71
14 0.20 Premium E SI2 ... 345 3.79 3.75 2.27
15 0.32 Premium E I1 ... 345 4.38 4.42 2.68
16 0.30 Ideal I SI2 ... 348 4.31 4.34 2.68
17 0.30 Good J SI1 ... 351 4.23 4.29 2.70
18 0.30 Good J SI1 ... 351 4.23 4.26 2.71
19 0.30 Very Good J SI1 ... 351 4.21 4.27 2.66
20 0.30 Good I SI2 ... 351 4.26 4.30 2.71
21 0.23 Very Good E VS2 ... 352 3.85 3.92 2.48
22 0.23 Very Good H VS1 ... 353 3.94 3.96 2.41
23 0.31 Very Good J SI1 ... 353 4.39 4.43 2.62
24 0.31 Very Good J SI1 ... 353 4.44 4.47 2.59
25 0.23 Very Good G VVS2 ... 354 3.97 4.01 2.41
26 0.24 Premium I VS1 ... 355 3.97 3.94 2.47
27 0.30 Very Good J VS2 ... 357 4.28 4.30 2.67
28 0.23 Very Good D VS2 ... 357 3.96 3.97 2.40
29 0.23 Very Good F VS1 ... 357 3.96 3.99 2.42
... ... ... ... ... ... ... ... ... ...
53910 0.70 Premium E SI1 ... 2753 5.74 5.77 3.48
53911 0.57 Premium E IF ... 2753 5.43 5.38 3.23
53912 0.61 Premium F VVS1 ... 2753 5.48 5.40 3.36
53913 0.80 Good G VS2 ... 2753 5.84 5.81 3.74
53914 0.84 Good I VS1 ... 2753 5.94 5.90 3.77
53915 0.77 Ideal E SI2 ... 2753 5.84 5.86 3.63
53916 0.74 Good D SI1 ... 2753 5.71 5.74 3.61
53917 0.90 Very Good J SI1 ... 2753 6.12 6.09 3.86
53918 0.76 Premium I VS1 ... 2753 5.93 5.85 3.49
53919 0.76 Ideal I VVS1 ... 2753 5.89 5.87 3.66
53920 0.70 Very Good E VS2 ... 2755 5.57 5.61 3.49
53921 0.70 Very Good E VS2 ... 2755 5.59 5.65 3.53
53922 0.70 Very Good D VS1 ... 2755 5.67 5.58 3.55
53923 0.73 Ideal I VS2 ... 2756 5.80 5.84 3.57
53924 0.73 Ideal I VS2 ... 2756 5.82 5.84 3.59
53925 0.79 Ideal I SI1 ... 2756 5.95 5.97 3.67
53926 0.71 Ideal E SI1 ... 2756 5.71 5.73 3.54
53927 0.79 Good F SI1 ... 2756 6.06 6.13 3.54
53928 0.79 Premium E SI2 ... 2756 6.03 5.96 3.68
53929 0.71 Ideal G VS1 ... 2756 5.76 5.73 3.53
53930 0.71 Premium E SI1 ... 2756 5.79 5.74 3.49
53931 0.71 Premium F SI1 ... 2756 5.74 5.73 3.43
53932 0.70 Very Good E VS2 ... 2757 5.71 5.76 3.47
53933 0.70 Very Good E VS2 ... 2757 5.69 5.72 3.49
53934 0.72 Premium D SI1 ... 2757 5.69 5.73 3.58
53935 0.72 Ideal D SI1 ... 2757 5.75 5.76 3.50
53936 0.72 Good D SI1 ... 2757 5.69 5.75 3.61
53937 0.70 Very Good D SI1 ... 2757 5.66 5.68 3.56
53938 0.86 Premium H SI2 ... 2757 6.15 6.12 3.74
53939 0.75 Ideal D SI2 ... 2757 5.83 5.87 3.64
[53940 rows x 10 columns]
pd_df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat 53940 non-null float64
cut 53940 non-null object
color 53940 non-null object
clarity 53940 non-null object
depth 53940 non-null float64
table 53940 non-null float64
price 53940 non-null int64
x 53940 non-null float64
y 53940 non-null float64
z 53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB
# Convert the categorical columns to ordinal integer codes, with higher numbers roughly corresponding to better quality.
pd_df['cut'] = pd_df['cut'].replace({'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4})
pd_df['color'] = pd_df['color'].replace({'J':0, 'I':1, 'H':2, 'G':3, 'F':4, 'E':5, 'D':6})
pd_df['clarity'] = pd_df['clarity'].replace({'I1':0, 'SI1':1, 'SI2':2, 'VS1':3, 'VS2':4, 'VVS1':5, 'VVS2':6, 'IF':7})
pd_df
Out[3]:
carat cut color clarity depth table price x y z
0 0.23 4 5 2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 3 5 1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 1 5 3 56.9 65.0 327 4.05 4.07 2.31
3 0.29 3 1 4 62.4 58.0 334 4.20 4.23 2.63
4 0.31 1 0 2 63.3 58.0 335 4.34 4.35 2.75
5 0.24 2 0 6 62.8 57.0 336 3.94 3.96 2.48
6 0.24 2 1 5 62.3 57.0 336 3.95 3.98 2.47
7 0.26 2 2 1 61.9 55.0 337 4.07 4.11 2.53
8 0.22 0 5 4 65.1 61.0 337 3.87 3.78 2.49
9 0.23 2 2 3 59.4 61.0 338 4.00 4.05 2.39
10 0.30 1 0 1 64.0 55.0 339 4.25 4.28 2.73
11 0.23 4 0 3 62.8 56.0 340 3.93 3.90 2.46
12 0.22 3 4 1 60.4 61.0 342 3.88 3.84 2.33
13 0.31 4 0 2 62.2 54.0 344 4.35 4.37 2.71
14 0.20 3 5 2 60.2 62.0 345 3.79 3.75 2.27
15 0.32 3 5 0 60.9 58.0 345 4.38 4.42 2.68
16 0.30 4 1 2 62.0 54.0 348 4.31 4.34 2.68
17 0.30 1 0 1 63.4 54.0 351 4.23 4.29 2.70
18 0.30 1 0 1 63.8 56.0 351 4.23 4.26 2.71
19 0.30 2 0 1 62.7 59.0 351 4.21 4.27 2.66
20 0.30 1 1 2 63.3 56.0 351 4.26 4.30 2.71
21 0.23 2 5 4 63.8 55.0 352 3.85 3.92 2.48
22 0.23 2 2 3 61.0 57.0 353 3.94 3.96 2.41
23 0.31 2 0 1 59.4 62.0 353 4.39 4.43 2.62
24 0.31 2 0 1 58.1 62.0 353 4.44 4.47 2.59
25 0.23 2 3 6 60.4 58.0 354 3.97 4.01 2.41
26 0.24 3 1 3 62.5 57.0 355 3.97 3.94 2.47
27 0.30 2 0 4 62.2 57.0 357 4.28 4.30 2.67
28 0.23 2 6 4 60.5 61.0 357 3.96 3.97 2.40
29 0.23 2 4 3 60.9 57.0 357 3.96 3.99 2.42
... ... ... ... ... ... ... ... ... ... ...
53910 0.70 3 5 1 60.5 58.0 2753 5.74 5.77 3.48
53911 0.57 3 5 7 59.8 60.0 2753 5.43 5.38 3.23
53912 0.61 3 4 5 61.8 59.0 2753 5.48 5.40 3.36
53913 0.80 1 3 4 64.2 58.0 2753 5.84 5.81 3.74
53914 0.84 1 1 3 63.7 59.0 2753 5.94 5.90 3.77
53915 0.77 4 5 2 62.1 56.0 2753 5.84 5.86 3.63
53916 0.74 1 6 1 63.1 59.0 2753 5.71 5.74 3.61
53917 0.90 2 0 1 63.2 60.0 2753 6.12 6.09 3.86
53918 0.76 3 1 3 59.3 62.0 2753 5.93 5.85 3.49
53919 0.76 4 1 5 62.2 55.0 2753 5.89 5.87 3.66
53920 0.70 2 5 4 62.4 60.0 2755 5.57 5.61 3.49
53921 0.70 2 5 4 62.8 60.0 2755 5.59 5.65 3.53
53922 0.70 2 6 3 63.1 59.0 2755 5.67 5.58 3.55
53923 0.73 4 1 4 61.3 56.0 2756 5.80 5.84 3.57
53924 0.73 4 1 4 61.6 55.0 2756 5.82 5.84 3.59
53925 0.79 4 1 1 61.6 56.0 2756 5.95 5.97 3.67
53926 0.71 4 5 1 61.9 56.0 2756 5.71 5.73 3.54
53927 0.79 1 4 1 58.1 59.0 2756 6.06 6.13 3.54
53928 0.79 3 5 2 61.4 58.0 2756 6.03 5.96 3.68
53929 0.71 4 3 3 61.4 56.0 2756 5.76 5.73 3.53
53930 0.71 3 5 1 60.5 55.0 2756 5.79 5.74 3.49
53931 0.71 3 4 1 59.8 62.0 2756 5.74 5.73 3.43
53932 0.70 2 5 4 60.5 59.0 2757 5.71 5.76 3.47
53933 0.70 2 5 4 61.2 59.0 2757 5.69 5.72 3.49
53934 0.72 3 6 1 62.7 59.0 2757 5.69 5.73 3.58
53935 0.72 4 6 1 60.8 57.0 2757 5.75 5.76 3.50
53936 0.72 1 6 1 63.1 55.0 2757 5.69 5.75 3.61
53937 0.70 2 6 1 62.8 60.0 2757 5.66 5.68 3.56
53938 0.86 3 2 2 61.0 58.0 2757 6.15 6.12 3.74
53939 0.75 4 6 2 62.2 55.0 2757 5.83 5.87 3.64
[53940 rows x 10 columns]
# Shuffle the dataset so the 80/20 split below is random rather than following the original row order.
pd_df = pd_df.sample(frac=1).reset_index(drop=True)
# Split the data so that 80% is training data and 20% is test data.
training_data = pd_df[:int(pd_df.shape[0]*.8)]
testing_data = pd_df[int(pd_df.shape[0]*.8):]
print("We have %d training examples and %d test examples." % (training_data.shape[0], testing_data.shape[0]))
We have 43152 training examples and 10788 test examples.
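As an aside, the shuffle-and-slice above is just a plain random 80/20 split; since scikit-learn is available in this environment (the linear regression app below produces an sklearn model), the same split could also be written with train_test_split. A minimal alternative sketch; note that the later cells slice pd_df directly, so they still rely on the in-place shuffle above:
from sklearn.model_selection import train_test_split

# Equivalent 80/20 random split; train_test_split shuffles the rows internally.
training_data, testing_data = train_test_split(pd_df, test_size=0.2, random_state=42)
training_data = training_data.reset_index(drop=True)
testing_data = testing_data.reset_index(drop=True)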
import os
import tempfile
# Creating a temporary folder for storing our data.
temp = tempfile.mkdtemp()
# Defining paths for the data to be saved.
train = os.path.join(temp, "train_diamonds.parquet")
test = os.path.join(temp, "test_diamonds.parquet")
# Creating diamonds dataset parquet files.
training_data.to_parquet(train)
testing_data.to_parquet(test)
# The number of data points we want to predict on when calling mlflow pyfunc predict.
num_pred = 20
# This Series holds the actual prices of the diamonds we will predict on, so model predictions can be compared against them.
diamond_prices = pd_df["price"][:num_pred]
# For the data we want to predict on, drop the price label and keep only the feature columns.
predict = pd_df.drop(columns=["price"])
diamond_predict = predict[:num_pred]
os.listdir(temp)
Out[5]: ['train_diamonds.parquet', 'test_diamonds.parquet']
import mlflow
linear_run = mlflow.projects.run(uri="https://github.com/mlflow/mlflow-apps.git#apps/linear-regression", parameters={"train":train, "test":test, "label-col": "price"})
=== Fetching project from https://github.com/mlflow/mlflow-apps.git#apps/linear-regression into /tmp/tmplo3pbflb ===
=== Creating conda environment mlflow-842ed82a6c8d95069a1b1ef3403e86dce70f465c ===
=== Created directory /tmp/tmp79jxha6u for downloading remote URIs passed to arguments of type 'path' ===
=== Running command 'source /databricks/conda/bin/activate mlflow-842ed82a6c8d95069a1b1ef3403e86dce70f465c && python main_linear.py /tmp/tmpwoeuig38/train_diamonds.parquet /tmp/tmpwoeuig38/test_diamonds.parquet 0.001 0.5 price' in run with ID '96771d893a5e46159d9f3b49bf9013e2' ===
=== Run (ID '96771d893a5e46159d9f3b49bf9013e2') succeeded ===
import numpy as np
from mlflow.sklearn import load_model
# Define helper for comparing model predictions to their expected values
def print_summary(predictions, expected):
    """Compares model predictions with expected values, printing a summary with absolute and percent differences."""
    print("=== Printing summary of model predictions ===")
    difference = np.absolute(np.subtract(expected, predictions))
    difference_percent = np.divide(difference, expected) * 100
    table = pandas.DataFrame({'Prediction': predictions, 'Actual': expected, 'Difference': difference, '% Difference': difference_percent})
    table = table[['Prediction', 'Actual', 'Difference', '% Difference']]
    print(table)
# Load our model back & print a summary of its predictions on test data
lr_model = load_model("model", run_id=linear_run.run_id)
linear_predictions = lr_model.predict(diamond_predict)
print_summary(predictions=linear_predictions, expected=diamond_prices)
/databricks/python/lib/python3.5/site-packages/sklearn/base.py:315: UserWarning: Trying to unpickle estimator ElasticNet from version 0.19.1 when using version 0.18.1. This might lead to breaking code or invalid results. Use at your own risk.
UserWarning)
=== Printing summary of model predictions ===
Prediction Actual Difference % Difference
0 11867.926167 16123 4255.073833 26.391328
1 -424.401805 855 1279.401805 149.637638
2 2119.883747 1895 224.883747 11.867216
3 -141.575642 594 735.575642 123.834283
4 3961.997654 3385 576.997654 17.045721
5 8672.338959 7526 1146.338959 15.231716
6 352.172667 1094 741.827333 67.808714
7 2961.745061 2226 735.745061 33.052339
8 1946.029484 1240 706.029484 56.937862
9 11203.717268 9414 1789.717268 19.011231
10 1930.262398 951 979.262398 102.971861
11 5708.454804 4454 1254.454804 28.164679
12 1110.784529 552 558.784529 101.229081
13 4582.507600 3740 842.507600 22.526941
14 4730.464998 4052 678.464998 16.743954
15 3792.290186 3282 510.290186 15.548147
16 1082.936332 680 402.936332 59.255343
17 154.618664 828 673.381336 81.326248
18 1245.914940 1002 243.914940 24.342808
19 5708.386194 5670 38.386194 0.677005
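The cells below call a small helper, print_run_metrics, whose definition is not shown in this section. A minimal sketch of what it could look like, assuming a newer MLflow release in which mlflow.tracking.MlflowClient exists and run.data.metrics is a plain dict (the 0.4.x tracking API differs):
from mlflow.tracking import MlflowClient

def print_run_metrics(run_id):
    """Fetch the metrics logged under an MLflow run and print them one per line."""
    client = MlflowClient()
    metrics = client.get_run(run_id).data.metrics  # assumed: dict of metric name -> value
    print("=== Metrics for run %s ===" % run_id)
    for name, value in sorted(metrics.items()):
        print("%s: %f" % (name, value))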
# Passing new values for alpha and l1-ratio.
linear_run2 = mlflow.projects.run(uri="https://github.com/mlflow/mlflow-apps.git#apps/linear-regression", parameters={"train":train, "test":test, "label-col": "price", "alpha":.0001, "l1-ratio":.1})
# Print metrics
print_run_metrics(linear_run2.run_id)
# Load the new model & make predictions.
linear_model2 = load_model("model", run_id=linear_run2.run_id)
linear_prediction2 = linear_model2.predict(diamond_predict)
print_summary(predictions=linear_prediction2, expected=diamond_prices)
=== Fetching project from https://github.com/mlflow/mlflow-apps.git#apps/linear-regression into /tmp/tmpkvkn0dc3 ===
=== Created directory /tmp/tmpi9w3viuk for downloading remote URIs passed to arguments of type 'path' ===
=== Running command 'source /databricks/conda/bin/activate mlflow-842ed82a6c8d95069a1b1ef3403e86dce70f465c && python main_linear.py /tmp/tmpwoeuig38/train_diamonds.parquet /tmp/tmpwoeuig38/test_diamonds.parquet 0.0001 0.1 price' in run with ID '2bc63a690d33486a9dc116c08cfce4d6' ===
=== Run (ID '2bc63a690d33486a9dc116c08cfce4d6') succeeded ===
=== Metrics for run 2bc63a690d33486a9dc116c08cfce4d6 ===
Test RMSE: 1280.865449
Test R2: 0.895606
Train R2: 0.894495
/databricks/python/lib/python3.5/site-packages/sklearn/base.py:315: UserWarning: Trying to unpickle estimator ElasticNet from version 0.19.1 when using version 0.18.1. This might lead to breaking code or invalid results. Use at your own risk.
UserWarning)
=== Printing summary of model predictions ===
Prediction Actual Difference % Difference
0 11903.878003 16123 4219.121997 26.168343
1 -421.876969 855 1276.876969 149.342336
2 2093.115020 1895 198.115020 10.454618
3 -112.503023 594 706.503023 118.939903
4 3923.555667 3385 538.555667 15.910064
5 8681.736429 7526 1155.736429 15.356583
6 334.424508 1094 759.575492 69.431032
7 2940.191156 2226 714.191156 32.084059
8 1966.802329 1240 726.802329 58.613091
9 11239.618949 9414 1825.618949 19.392596
10 1935.973932 951 984.973932 103.572443
11 5679.758085 4454 1225.758085 27.520388
12 1171.622328 552 619.622328 112.250422
13 4537.994270 3740 797.994270 21.336745
14 4689.399442 4052 637.399442 15.730490
15 3733.537074 3282 451.537074 13.757985
16 1122.745246 680 442.745246 65.109595
17 175.038405 828 652.961595 78.860096
18 1269.911501 1002 267.911501 26.737675
19 5655.342145 5670 14.657855 0.258516
# Launch training
gbt_run2 = mlflow.projects.run(uri="https://github.com/mlflow/mlflow-apps.git#apps/gbt-regression", parameters={"train":train, "test":test, "label-col": "price", "n-trees":500, "m-depth":5, "learning-rate":.1})
# Print metrics
print_run_metrics(gbt_run2.run_id)
# Loading the new model & making predictions
gbt_model2 = load_model("model", run_id=gbt_run2.run_id)
gbt_prediction2 = gbt_model2.predict(diamond_predict)
print_summary(gbt_prediction2, expected=diamond_prices)
=== Fetching project from https://github.com/mlflow/mlflow-apps.git#apps/gbt-regression into /tmp/tmpai1ssx_q ===
=== Creating conda environment mlflow-6876bc479f0d8935eada0e0b89f32993b9abf06a ===
=== Created directory /tmp/tmp6x_4ted0 for downloading remote URIs passed to arguments of type 'path' ===
=== Running command 'source /databricks/conda/bin/activate mlflow-6876bc479f0d8935eada0e0b89f32993b9abf06a && python main_gbt.py /tmp/tmpwoeuig38/train_diamonds.parquet /tmp/tmpwoeuig38/test_diamonds.parquet 500 5 0.1 rmse price' in run with ID '4582675484f94c659e10f293d6ff4d68' ===
=== Run (ID '4582675484f94c659e10f293d6ff4d68') succeeded ===
=== Metrics for run 4582675484f94c659e10f293d6ff4d68 ===
Test RMSE: 509.789048
Test R2: 0.983463
Train R2: 0.990626
=== Printing summary of model predictions ===
Prediction Actual Difference % Difference
0 15857.067383 16123 265.932617 1.649399
1 745.649292 855 109.350708 12.789556
2 1887.651245 1895 7.348755 0.387797
3 485.248230 594 108.751770 18.308379
4 3297.602539 3385 87.397461 2.581904
5 7381.205566 7526 144.794434 1.923923
6 994.682434 1094 99.317566 9.078388
7 1964.191772 2226 261.808228 11.761376
8 1074.228149 1240 165.771851 13.368698
9 9545.660156 9414 131.660156 1.398557
10 1134.098999 951 183.098999 19.253312
11 4556.489258 4454 102.489258 2.301061
12 582.466492 552 30.466492 5.519292
13 3664.408691 3740 75.591309 2.021158
14 4392.076660 4052 340.076660 8.392810
15 3171.805176 3282 110.194824 3.357551
16 711.498596 680 31.498596 4.632146
17 724.117004 828 103.882996 12.546256
18 1006.505310 1002 4.505310 0.449632
19 5721.875000 5670 51.875000 0.914903
# Launch training
dnn_run = mlflow.projects.run(uri="https://github.com/mlflow/mlflow-apps.git#apps/dnn-regression", parameters={"train":train, "test":test, "model-dir":temp, "label-col": "price", "hidden-units":"50,50", "steps":1000})
# Print metrics
# TODO: Update TF app to log same metrics as XGBoost & linear regression apps (R2 score)
print_run_metrics(dnn_run.run_id)
# Loading the new model & making predictions
dnn_model = mlflow.pyfunc.load_pyfunc("model", dnn_run.run_id)
dnn_prediction = dnn_model.predict(diamond_predict)
# TensorFlow estimator output requires some extra formatting to become a NumPy array.
dnn_prediction = np.array([value[0][0] for value in dnn_prediction.values])
print_summary(dnn_prediction, expected=diamond_prices)
=== Fetching project from https://github.com/mlflow/mlflow-apps.git#apps/dnn-regression into /tmp/tmpm96k36b_ ===
=== Creating conda environment mlflow-3aeddf0774950c8857851841ef8b38289de49425 ===
=== Created directory /tmp/tmpx8prtks8 for downloading remote URIs passed to arguments of type 'path' ===
=== Running command 'source /databricks/conda/bin/activate mlflow-3aeddf0774950c8857851841ef8b38289de49425 && python main_dnn.py /tmp/tmpwoeuig38 /tmp/tmpwoeuig38/train_diamonds.parquet /tmp/tmpwoeuig38/test_diamonds.parquet 50,50 1000 128 price' in run with ID '55fd296ed9c24011902cee464b8b95a9' ===
=== Run (ID '55fd296ed9c24011902cee464b8b95a9') succeeded ===
=== Metrics for run 55fd296ed9c24011902cee464b8b95a9 ===
Test RMSE: 3070.662795
Test R2: 0.400027
Train R2: -0.089750
INFO:tensorflow:Restoring parameters from b'/tmp/tmpwoeuig38/1534286876/variables/variables'
=== Printing summary of model predictions ===
Prediction Actual Difference % Difference
0 5771.968750 16123 10351.031250 64.200405
1 3372.881348 855 2517.881348 294.489047
2 4271.750000 1895 2376.750000 125.422164
3 2924.267578 594 2330.267578 392.300939
4 4492.222656 3385 1107.222656 32.709680
5 5904.715820 7526 1621.284180 21.542442
6 3930.275879 1094 2836.275879 259.257393
7 4748.329590 2226 2522.329590 113.312201
8 2329.298340 1240 1089.298340 87.846640
9 6145.951172 9414 3268.048828 34.714774
10 2722.435059 951 1771.435059 186.270774
11 4559.107910 4454 105.107910 2.359854
12 2110.821777 552 1558.821777 282.395250
13 5325.910156 3740 1585.910156 42.404015
14 4300.595703 4052 248.595703 6.135136
15 4574.995117 3282 1292.995117 39.396561
16 2351.775635 680 1671.775635 245.849358
17 2685.854248 828 1857.854248 224.378532
18 2445.909424 1002 1443.909424 144.102737
19 4091.577637 5670 1578.422363 27.838137
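With all three apps trained, the metrics logged for each run can also be pulled back from the tracking server and compared side by side. A minimal sketch, again assuming a newer MLflow release with MlflowClient; linear_run2, gbt_run2, and dnn_run are the run objects created above:
from mlflow.tracking import MlflowClient

client = MlflowClient()
runs = {
    "ElasticNet": linear_run2.run_id,
    "XGBoost GBT": gbt_run2.run_id,
    "TensorFlow DNN": dnn_run.run_id,
}
# One column of logged metrics per run for a side-by-side comparison.
comparison = pandas.DataFrame({name: client.get_run(run_id).data.metrics for name, run_id in runs.items()})
print(comparison)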
Model Selection with MLflow & mlflow-apps
This example notebook demonstrates how to use mlflow-apps, a collection of pluggable applications written with MLflow, to simplify ML model training & selection.
When tackling ML problems, it's often useful to experiment with a variety of different frameworks, comparing results to choose the best model for the job. MLflow makes this easy through a set of components accessible via CLIs & Python APIs: MLflow Tracking for logging parameters and metrics, MLflow Projects for packaging runnable training code, and MLflow Models for saving and loading trained models.
In terms of these components, mlflow-apps is a collection of runnable MLflow Projects that use the Tracking & Models APIs to simplify model training & comparison.
In this notebook, we'll use mlflow-apps to fit TensorFlow, scikit-learn, and XGBoost models on ggplot2's diamonds dataset, running each model training app through a single MLflow API call. We'll also cover data preprocessing and using MLflow to predict with the trained models.