Build a linear regression model to predict a city's median home price from its population.

# Use the Spark CSV datasource with options specifying:
# - First line of file is a header
# - Automatically infer the schema of the data
data = spark.read.csv("/databricks-datasets/samples/population-vs-price/data_geo.csv", header="true", inferSchema="true")
data.cache() # Cache data for faster reuse
data.count()
Out[1]: 294
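The cells that follow reference a DataFrame named vdata containing just the population and median-price columns, but the preprocessing cell that creates it is missing here. A minimal reconstruction follows; the source column names "2014 Population estimate" and "2015 median sales price" are assumptions about data_geo.csv's header row and may need adjusting:

from pyspark.sql.functions import col

# Keep only the two columns of interest, renamed for modeling.
# The source column names below are assumed, not confirmed.
vdata = data.select(
    col("2014 Population estimate").alias("population"),
    col("2015 median sales price").alias("label"),
).dropna()  # drop rows with missing values before fitting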
import pandas as pd
import matplotlib.pyplot as plt
# Convert to pandas for plotting
pandas_vdata = vdata.toPandas()
x = pandas_vdata.population
y = pandas_vdata.label
plt.style.use('classic')
plt.rcParams['lines.linewidth'] = 0  # suppress the line so loglog() only sets the scales
fig, ax = plt.subplots()
ax.loglog(x, y)  # establish log-log axes
plt.xlim(1.0e5, 1.0e7)
plt.ylim(5.0e1, 1.0e3)
ax.scatter(x, y, c="blue")
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

# MLlib estimators expect the input features packed into a single vector column.
stages = []
assembler = VectorAssembler(inputCols=["population"], outputCol="features")
stages += [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(vdata)
dataset = pipelineModel.transform(vdata)
# Keep relevant columns
selectedcols = ["features", "label"]
dataset.select(selectedcols).show()
+-----------+-----+
| features|label|
+-----------+-----+
| [212247.0]|162.9|
| [188226.0]|157.7|
| [194675.0]|122.5|
| [200481.0]|129.0|
|[1537058.0]|206.1|
| [527972.0]|178.1|
| [197706.0]|131.8|
| [346997.0]|685.7|
|[3928864.0]|434.7|
| [319504.0]|281.0|
| [485199.0]|275.8|
|[1381069.0]|510.3|
| [852469.0]|748.3|
|[1015785.0]|900.0|
| [105112.0]|442.2|
| [445830.0]|220.1|
| [663862.0]|338.1|
| [147612.0]|342.7|
| [124705.0]|202.6|
| [130282.0]|205.1|
+-----------+-----+
only showing top 20 rows
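Because the pipeline holds a single stage, the same result could be obtained by applying the assembler directly; a minimal equivalent sketch (dataset_direct is an illustrative name, not part of the original code):

# One-stage pipeline shortcut: transform with the assembler itself
dataset_direct = assembler.transform(vdata)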
# Create the estimator, then fit 2 models using different regularization parameters
from pyspark.ml.regression import LinearRegression
lr = LinearRegression()
modelA = lr.fit(dataset, {lr.regParam: 0.0})
modelB = lr.fit(dataset, {lr.regParam: 100.0})
print(">>>> ModelA intercept: %r, coefficient: %r" % (modelA.intercept, modelA.coefficients[0]))
print(">>>> ModelB intercept: %r, coefficient: %r" % (modelB.intercept, modelB.coefficients[0]))
>>>> ModelA intercept: 191.29427575139394, coefficient: 3.779789682338248e-05
>>>> ModelB intercept: 199.85112564667153, coefficient: 2.1603499483717156e-05
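As a quick sanity check, ModelA's prediction for the first city can be recomputed by hand from the intercept and coefficient printed above:

# intercept + coefficient * population for the first row (population = 212247)
pred = modelA.intercept + modelA.coefficients[0] * 212247
print(pred)  # ~199.3168, matching the first prediction shown below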
predictionsA = modelA.transform(dataset)
predictionsA.show()
+----------+-----+-----------+------------------+
|population|label| features| prediction|
+----------+-----+-----------+------------------+
| 212247|162.9| [212247.0]| 199.3167659584664|
| 188226|157.7| [188226.0]|198.40882267887193|
| 194675|122.5| [194675.0]|198.65258131548592|
| 200481|129.0| [200481.0]|198.87203590444247|
| 1537058|206.1|[1537058.0]|249.39183544694856|
| 527972|178.1| [527972.0]|211.25050693302884|
| 197706|131.8| [197706.0]| 198.7671467407576|
| 346997|685.7| [346997.0]| 204.4100325554172|
| 3928864|434.7|[3928864.0]|339.79707185649573|
| 319504|281.0| [319504.0]|203.37085497805194|
| 485199|275.8| [485199.0]|209.63377749220228|
| 1381069|510.3|[1381069.0]|243.49577931936597|
| 852469|748.3| [852469.0]|223.51581105852597|
| 1015785|900.0|[1015785.0]| 229.6888123761335|
| 105112|442.2| [105112.0]|195.26728828229332|
| 445830|220.1| [445830.0]|208.14571209216254|
| 663862|338.1| [663862.0]| 216.3868631323583|
| 147612|342.7| [147612.0]|196.87369889728708|
| 124705|202.6| [124705.0]|196.00786247475386|
| 130282|205.1| [130282.0]|196.21866134533786|
+----------+-----+-----------+------------------+
only showing top 20 rows
predictionsB = modelB.transform(dataset)
predictionsB.show()
+----------+-----+-----------+------------------+
|population|label| features| prediction|
+----------+-----+-----------+------------------+
| 212247|162.9| [212247.0]|204.43640360159205|
| 188226|157.7| [188226.0]|203.91746594049368|
| 194675|122.5| [194675.0]|204.05678690866418|
| 200481|129.0| [200481.0]|204.18221682666663|
| 1537058|206.1|[1537058.0]|233.05695735611485|
| 527972|178.1| [527972.0]|211.25716847608865|
| 197706|131.8| [197706.0]|204.12226711559933|
| 346997|685.7| [346997.0]|207.34747515702293|
| 3928864|434.7|[3928864.0]|284.72833704226645|
| 319504|281.0| [319504.0]| 206.7535301457171|
| 485199|275.8| [485199.0]| 210.3331219926716|
| 1381069|510.3|[1381069.0]| 229.6870490751493|
| 852469|748.3| [852469.0]| 218.2674392480564|
| 1015785|900.0|[1015785.0]|221.79563636973916|
| 105112|442.2| [105112.0]| 202.121912684404|
| 445830|220.1| [445830.0]|209.48261382149715|
| 663862|338.1| [663862.0]|214.19286802093097|
| 147612|342.7| [147612.0]| 203.040061412462|
| 124705|202.6| [124705.0]| 202.5451900497885|
| 130282|205.1| [130282.0]|202.66567276640916|
+----------+-----+-----------+------------------+
only showing top 20 rows
from pyspark.ml.evaluation import RegressionEvaluator
# Uses the default column names: "label" for the target, "prediction" for the model output
evaluator = RegressionEvaluator(metricName="rmse")
RMSE_modelA = evaluator.evaluate(predictionsA)
RMSE_modelB = evaluator.evaluate(predictionsB)
print("ModelA: Root Mean Squared Error = " + str(RMSE_modelA))
print("ModelB: Root Mean Squared Error = " + str(RMSE_modelB))
ModelA: Root Mean Squared Error = 128.60202684284758
ModelB: Root Mean Squared Error = 129.49630019270606
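The evaluator's output can be cross-checked by computing the RMSE directly from the label and prediction columns; a minimal sketch using Spark SQL functions:

from pyspark.sql import functions as F

# Square root of the mean squared residual; should match RMSE_modelA above
rmse_check = predictionsA.select(
    F.sqrt(F.avg((F.col("prediction") - F.col("label")) ** 2)).alias("rmse")
).first()["rmse"]
print(rmse_check)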
import databricks.koalas as ks

# Allow Koalas operations that combine frames backed by different Spark DataFrames;
# without this option, the concat below may raise an error.
ks.set_option('compute.ops_on_diff_frames', True)

# Combine the predictions made by each model into a Koalas DataFrame.
predA = predictionsA.select('prediction')
predB = predictionsB.select('prediction')
kdfA = predA.to_koalas().rename(columns={"prediction": "pred_ModelA"})
kdfB = predB.to_koalas().rename(columns={"prediction": "pred_ModelB"})
kdfpreds = ks.concat([kdfA, kdfB], axis=1)
# Create a new Koalas DataFrame containing the original population and price data.
selectedcols = ["population", "label"]
kdf = dataset.select(selectedcols).to_koalas()
# Combine the two DataFrames into one that contains the original data and the predictions from each model.
kdf2 = ks.concat([kdf, kdfpreds], axis=1)
display(kdf2)
import seaborn as sns

# Convert the combined Koalas DataFrame to pandas for plotting
pydf = kdf2.to_pandas()

f, ax = plt.subplots()
ax.set(xscale="log", yscale="log")
plt.xlim(1.0e5, 1.0e7)
plt.ylim(5.0e1, 1.0e3)
ax.scatter(x, y, c="blue")
# Overlay each model's fitted line: ModelB in green, ModelA in red
sns.regplot(x="population", y="pred_ModelB", data=pydf, color='g')
sns.regplot(x="population", y="pred_ModelA", data=pydf, color='r')
ax.set(ylabel="price")