XGBoost regression with Spark DataFrames

import java.util.UUID.randomUUID
temp_path: String = /tmp/power_plant/23c86fb4-2744-4582-bdfe-f4bf7668199a
defined class PowerPlantTable
powerPlantData: Unit = ()

Table

import org.apache.spark.ml.feature.VectorAssembler
assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_e3255d0f3101, handleInvalid=error, numInputCols=4
xgbInput: org.apache.spark.sql.DataFrame = [AT: double, V: double ... 4 more fields]

Table

split20: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [AT: double, V: double ... 4 more fields]
split80: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [AT: double, V: double ... 4 more fields]
testSet: split20.type = [AT: double, V: double ... 4 more fields]
trainingSet: split80.type = [AT: double, V: double ... 4 more fields]

import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
xgbParam: scala.collection.immutable.Map[String,Any] = Map(num_workers -> 2, max_depth -> 6, objective -> reg:squarederror, num_round -> 10, eta -> 0.3)
xgbRegressor: ml.dmlc.xgboost4j.scala.spark.XGBoostRegressor = xgbr_8fbae9c09bc0

Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=51469, DMLC_NUM_WORKER=2}
xgbRegressionModel: ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel = xgbr_8fbae9c09bc0

predictions: org.apache.spark.sql.DataFrame = [AT: double, V: double ... 5 more fields]

Table

import org.apache.spark.ml.evaluation.RegressionEvaluator
evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = RegressionEvaluator: uid=regEval_83b65b551a4e, metricName=rmse, throughOrigin=false
rmse: Double = 13.450027850150057

Root mean squared error: 13.450027850150057

Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=55173, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=37075, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=58581, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=47463, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=43165, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=50885, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=47005, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=46371, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=37891, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=33205, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=52569, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=52333, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=47265, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=49411, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=46845, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=52399, DMLC_NUM_WORKER=2}
Tracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.168.5.140, DMLC_TRACKER_PORT=40821, DMLC_NUM_WORKER=2}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
paramGrid: Array[org.apache.spark.ml.param.ParamMap] = Array({ xgbr_8fbae9c09bc0-eta: 0.1, xgbr_8fbae9c09bc0-maxDepth: 4 }, { xgbr_8fbae9c09bc0-eta: 0.1, xgbr_8fbae9c09bc0-maxDepth: 7 }, { xgbr_8fbae9c09bc0-eta: 0.6, xgbr_8fbae9c09bc0-maxDepth: 4 }, { xgbr_8fbae9c09bc0-eta: 0.6, xgbr_8fbae9c09bc0-maxDepth: 7 })
cv: org.apache.spark.ml.tuning.CrossValidator = cv_04628ab83845
cvModel: org.apache.spark.ml.tuning.CrossValidatorModel = CrossValidatorModel: uid=cv_04628ab83845, bestModel=xgbr_8fbae9c09bc0, numFolds=4

res8: org.apache.spark.ml.param.ParamMap = { xgbr_8fbae9c09bc0-allowNonZeroForMissing: false, xgbr_8fbae9c09bc0-batchSize: 32768, xgbr_8fbae9c09bc0-dmlcWorkerConnectRetry: 5, xgbr_8fbae9c09bc0-eta: 0.6, xgbr_8fbae9c09bc0-evalMetric: rmse, xgbr_8fbae9c09bc0-featuresCol: features, xgbr_8fbae9c09bc0-handleInvalid: error, xgbr_8fbae9c09bc0-labelCol: label, xgbr_8fbae9c09bc0-maxDepth: 7, xgbr_8fbae9c09bc0-missing: NaN, xgbr_8fbae9c09bc0-nthread: 1, xgbr_8fbae9c09bc0-numRound: 10, xgbr_8fbae9c09bc0-numWorkers: 2, xgbr_8fbae9c09bc0-objective: reg:squarederror, xgbr_8fbae9c09bc0-predictionCol: prediction, xgbr_8fbae9c09bc0-rabitRingReduceThreshold: 32768, xgbr_8fbae9c09bc0-rabitTimeout: -1, xgbr_8fbae9c09bc0-seed: 0, xgbr_8fbae9c09bc0-trackerConf: TrackerConf(0,python,,), xgbr_8fbae9c09bc0-trainTestRatio: 1.0, xgbr_8fbae9c09bc0-treeLimit: 0, xgbr_8fbae9c09bc0-useExternalMemory: false }

predictions2: org.apache.spark.sql.DataFrame = [AT: double, V: double ... 5 more fields]
rmse2: Double = 3.1257189881608602

Root mean squared error: 3.1257189881608602

Table

res12: Boolean = true

xgboost-regression (Scala)

XGBoost regression with Spark DataFrames

Prepare data

Train the XGBoost model

Evaluate the model

Tune the model