Serve a SparkML model

import mlflow
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Switch MLflow autologging off (disable=True). Presumably this is so that the
# only model recorded is the one logged explicitly with mlflow.spark.log_model
# further down — confirm against the intended tracking setup.
mlflow.autolog(disable=True)

# Column names used for the synthetic regression dataset.
X1, X2, Y, ERROR = "x1", "x2", "y", "error"

# Number of rows to simulate and the true coefficients of the linear model.
DATA_LEN = 10000
_beta1, _beta2 = 9, 6

# Two features and a noise term, each drawn from a standard normal.
# The draws happen in the order x1, x2, error.
x1, x2, error = (np.random.randn(DATA_LEN) for _ in range(3))

# Target is y = _beta1 * x1 + _beta2 * x2 + error. The noise term only feeds
# into y; it is not kept as a column of the resulting frame.
df = pd.DataFrame(
    {
        X1: x1,
        X2: x2,
        Y: _beta1 * x1 + _beta2 * x2 + error,
    }
)
# Bare expression: shows the frame when run as a notebook cell.
df

# Hand the pandas frame over to Spark. NOTE(review): `spark` is not defined in
# this snippet — presumably the SparkSession provided by the notebook runtime.
spark_df = spark.createDataFrame(df)

# Name of the assembled feature-vector column shared by both pipeline stages.
features_col = "features"

with mlflow.start_run() as run:
    # Stage 1: pack the raw feature columns into a single vector column.
    assembler = VectorAssembler(
        inputCols=[X1, X2],
        outputCol=features_col,
    )
    # Stage 2: linear regression of the label column on that vector.
    lr = LinearRegression(
        featuresCol=features_col,
        labelCol=Y,
    )
    # Chain both stages and fit them together on the Spark data.
    stages = [assembler, lr]
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(spark_df)
    # Log the fitted pipeline under the artifact path "model" and register it
    # in the model registry under the given name.
    mlflow.spark.log_model(
        model,
        "model",
        registered_model_name="spark_linear_regression",
    )

2023/04/28 18:58:38 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().
Successfully registered model 'spark_linear_regression'.
2023/04/28 18:59:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: spark_linear_regression, version 1
Created version '1' of model 'spark_linear_regression'.

Serve a SparkML model (Python)

Serve a SparkML model

Requirements