MLflow: Train PySpark Model and Log in MLeap Format (Python)

MLflow Deployment: Train PySpark Model and Log in MLeap Format

This notebook walks through the process of:

  1. Training a PySpark pipeline model
  2. Saving the model in MLeap format with MLflow

The notebook contains the following sections:

Setup

  • Launch a Python 3 cluster
  • Install MLflow

Train a PySpark Pipeline model

  • Load pipeline training data
  • Define the PySpark Pipeline structure
  • Train the Pipeline model and log it within an MLflow run

Setup

  1. Create a cluster specifying Python 3, or ensure the cluster you are using runs Python 3.
  2. If you are running Databricks Runtime, uncomment and run Cmd 4 to install mlflow. If you are using Databricks Runtime ML, you can skip this step because the required libraries are already installed.
  3. Create a library with the Source option Maven Coordinate, using the fully qualified Maven artifact coordinate:
    • ml.combust.mleap:mleap-spark_2.11:0.13.0
  4. Install the library on the cluster.
  5. Attach this notebook to the cluster.
# Uncomment the next two lines if you are on Databricks Runtime (not Databricks Runtime ML)
#dbutils.library.installPyPI("mlflow[extras]")
#dbutils.library.restartPython()
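After installing and attaching the libraries, a quick sanity check can confirm that the required Python packages are importable. This is a minimal sketch using only the standard library; the function name is illustrative, not a Databricks or MLflow API:

```python
import importlib.util

def packages_available(names):
  # Map each package name to whether it can be imported in this environment
  return {name: importlib.util.find_spec(name) is not None for name in names}

status = packages_available(["mlflow", "pyspark"])
```

If either entry is False, revisit the install steps above before continuing.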

Train a PySpark Pipeline model

Load pipeline training data

Load the data that will be used to train the PySpark Pipeline model. The model is trained on the 20 Newsgroups dataset, which consists of articles from 20 Usenet newsgroups.

df = spark.read.parquet("/databricks-datasets/news20.binary/data-001/training").select("text", "topic")
df.cache()
display(df)

Define the PySpark Pipeline structure

Define a PySpark Pipeline that featurizes samples from the dataset and classifies them with a decision tree.

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, Tokenizer, HashingTF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Define pipeline components
labelIndexer = StringIndexer(inputCol="topic", outputCol="label", handleInvalid="keep")
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features")
dt = DecisionTreeClassifier()
 
# Construct a Pipeline object using the defined components
pipeline = Pipeline(stages=[labelIndexer, tokenizer, hashingTF, dt])
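The HashingTF stage above turns each token list into a fixed-length term-frequency vector via the hashing trick: every token is hashed into one of numFeatures buckets, and bucket counts become the features. The following is a minimal pure-Python sketch of that idea; note that Spark's HashingTF uses MurmurHash3 internally, while this sketch uses MD5 only to get a stable, reproducible hash:

```python
import hashlib
from collections import Counter

def hashing_tf(words, num_features=1000):
  # Hash each token into a bucket and count term frequencies per bucket.
  # Illustrative only: Spark's HashingTF uses MurmurHash3, not MD5.
  counts = Counter()
  for w in words:
    bucket = int(hashlib.md5(w.encode("utf-8")).hexdigest(), 16) % num_features
    counts[bucket] += 1
  return dict(counts)

tokens = "the quick brown fox jumps over the lazy dog the".split()
vec = hashing_tf(tokens, num_features=16)
```

Because the bucket index depends only on the token, repeated tokens ("the" above) always land in the same bucket; distinct tokens can collide, which is the usual trade-off of the hashing trick against an explicit vocabulary.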

Train the Pipeline model and log it within an MLflow run with MLeap flavor

Train the PySpark Pipeline on the 20 Newsgroups data that was loaded previously. The training process will execute within an MLflow run.

import mlflow
import mlflow.mleap
 
def fit_model():
  # Start a new MLflow run
  with mlflow.start_run():
    # Fit the model, performing cross-validation to improve accuracy
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [1000, 2000]).build()
    cv = CrossValidator(estimator=pipeline, evaluator=MulticlassClassificationEvaluator(), estimatorParamMaps=paramGrid)
    cvModel = cv.fit(df)
    model = cvModel.bestModel

    # Log the best model to the MLflow run in MLeap format
    mlflow.mleap.log_model(spark_model=model, sample_input=df, artifact_path="model")

# Train the PySpark Pipeline model within a new MLflow run
fit_model()
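ParamGridBuilder().addGrid(...).build() simply enumerates every combination of the candidate parameter values; CrossValidator then fits one model per combination per fold and keeps the best. A minimal pure-Python sketch of that enumeration (the dict-based grid and function name are illustrative, not the Spark API):

```python
from itertools import product

def build_param_grid(grid):
  # Enumerate every combination of candidate values, one dict per combination,
  # analogous to what ParamGridBuilder.build() produces as a list of param maps.
  names = list(grid)
  return [dict(zip(names, values)) for values in product(*(grid[n] for n in names))]

param_maps = build_param_grid({"hashingTF.numFeatures": [1000, 2000]})
```

With a single parameter and two candidate values, the grid has two param maps, so CrossValidator trains two models per fold (three folds by default) before refitting the best configuration on the full dataset.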