Chart and Graph Types with Python
This notebook covers the various chart and graph types that are built into Databricks.
While Python is used to generate the test data displayed in the visualizations in this notebook, everything covered here about configuring these charts and graphs applies to notebooks in any language.

# Click on the Plot Options button to see how this pivot table was configured.
from pyspark.sql import Row
largePivotSeries = [Row(key="k_%03d" % (x % 200), series_grouping="group_%d" % (x % 3), value=x) for x in range(1, 5001)]
largePivotDataFrame = spark.createDataFrame(sc.parallelize(largePivotSeries))
largePivotDataFrame.createOrReplaceTempView("table_to_be_pivoted")
display(spark.sql("select * from table_to_be_pivoted"))
from pyspark.sql import Row
# Sales data broken out by category, product, and year; useful for bar, pie, and other aggregate charts.
salesEntryDataFrame = spark.createDataFrame(sc.parallelize([
Row(category="fruits_and_vegetables", product="apples", year=2012, salesAmount=100.50),
Row(category="fruits_and_vegetables", product="oranges", year=2012, salesAmount=100.75),
Row(category="fruits_and_vegetables", product="apples", year=2013, salesAmount=200.25),
Row(category="fruits_and_vegetables", product="oranges", year=2013, salesAmount=300.65),
Row(category="fruits_and_vegetables", product="apples", year=2014, salesAmount=300.65),
Row(category="fruits_and_vegetables", product="oranges", year=2015, salesAmount=100.35),
Row(category="butcher_shop", product="beef", year=2012, salesAmount=200.50),
Row(category="butcher_shop", product="chicken", year=2012, salesAmount=200.75),
Row(category="butcher_shop", product="pork", year=2013, salesAmount=400.25),
Row(category="butcher_shop", product="beef", year=2013, salesAmount=600.65),
Row(category="butcher_shop", product="beef", year=2014, salesAmount=600.65),
Row(category="butcher_shop", product="chicken", year=2015, salesAmount=200.35),
Row(category="misc", product="gum", year=2012, salesAmount=400.50),
Row(category="misc", product="cleaning_supplies", year=2012, salesAmount=400.75),
Row(category="misc", product="greeting_cards", year=2013, salesAmount=800.25),
Row(category="misc", product="kitchen_utensils", year=2013, salesAmount=1200.65),
Row(category="misc", product="cleaning_supplies", year=2014, salesAmount=1200.65),
Row(category="misc", product="cleaning_supplies", year=2015, salesAmount=400.35)
]))
salesEntryDataFrame.createOrReplaceTempView("test_sales_table")
display(spark.sql("select * from test_sales_table"))
from pyspark.sql import Row
# Per-state values used to demonstrate the US map plot.
stateDataFrame = spark.createDataFrame(sc.parallelize([
Row(state="MO", value=1), Row(state="MO", value=10),
Row(state="NH", value=4),
Row(state="MA", value=8),
Row(state="NY", value=4),
Row(state="CA", value=7)
]))
stateDataFrame.createOrReplaceTempView("test_state_table")
display(spark.sql("select * from test_state_table"))
from pyspark.sql import Row
# Numeric pairs (a, b) in two key series, used to demonstrate scatter plots.
scatterPlotDataFrame = spark.createDataFrame(sc.parallelize([
Row(key="k1", a=0.2, b=120, c=1), Row(key="k1", a=0.4, b=140, c=1), Row(key="k1", a=0.6, b=160, c=1), Row(key="k1", a=0.8, b=180, c=1),
Row(key="k2", a=0.2, b=220, c=1), Row(key="k2", a=0.4, b=240, c=1), Row(key="k2", a=0.6, b=260, c=1), Row(key="k2", a=0.8, b=280, c=1),
Row(key="k1", a=1.8, b=120, c=1), Row(key="k1", a=1.4, b=140, c=1), Row(key="k1", a=1.6, b=160, c=1), Row(key="k1", a=1.8, b=180, c=1),
Row(key="k2", a=1.8, b=220, c=2), Row(key="k2", a=1.4, b=240, c=2), Row(key="k2", a=1.6, b=260, c=2), Row(key="k2", a=1.8, b=280, c=2),
Row(key="k1", a=2.2, b=120, c=1), Row(key="k1", a=2.4, b=140, c=1), Row(key="k1", a=2.6, b=160, c=1), Row(key="k1", a=2.8, b=180, c=1),
Row(key="k2", a=2.2, b=220, c=3), Row(key="k2", a=2.4, b=240, c=3), Row(key="k2", a=2.6, b=260, c=3), Row(key="k2", a=2.8, b=280, c=3)
]))
display(scatterPlotDataFrame)
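# A scatter plot often prompts a correlation check. As a sketch, the Pearson
# correlation between columns a and b can be computed directly on the DataFrame:
print(scatterPlotDataFrame.stat.corr("a", "b"))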
from pyspark.sql import Row
# Hover over an entry in the histogram to read off the exact value plotted.
histogramDataFrame = spark.createDataFrame(sc.parallelize([
Row(key1="a", key2="x", val=0.2), Row(key1="a", key2="x", val=0.4), Row(key1="a", key2="x", val=0.6), Row(key1="a", key2="x", val=0.8), Row(key1="a", key2="x", val=1.0),
Row(key1="b", key2="z", val=0.2), Row(key1="b", key2="x", val=0.4), Row(key1="b", key2="x", val=0.6), Row(key1="b", key2="y", val=0.8), Row(key1="b", key2="x", val=1.0),
Row(key1="a", key2="x", val=0.2), Row(key1="a", key2="y", val=0.4), Row(key1="a", key2="x", val=0.6), Row(key1="a", key2="x", val=0.8), Row(key1="a", key2="x", val=1.0),
Row(key1="b", key2="x", val=0.2), Row(key1="b", key2="x", val=0.4), Row(key1="b", key2="x", val=0.6), Row(key1="b", key2="z", val=0.8), Row(key1="b", key2="x", val=1.0)]))
display(histogramDataFrame)
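# The plotted distribution can also be read numerically from the underlying
# RDD's histogram() method, which returns bucket boundaries and counts. A
# sketch over val; the five equal-width buckets are an arbitrary choice here:
buckets, counts = histogramDataFrame.rdd.map(lambda row: row.val).histogram(5)
print(buckets, counts)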
from pyspark.sql import Row
import random
# Hovering over a box displays its exact median value.
boxSeries = [Row(key="key_%01d" % (x % 2), grouping="group_%01d" % (x % 3), value=random.randint(0, x)) for x in range(1, 5001)]
boxSeriesDataFrame = spark.createDataFrame(sc.parallelize(boxSeries))
display(boxSeriesDataFrame)
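# The median shown on hover can be cross-checked with approxQuantile. A sketch
# computing the quartiles of value over all rows combined (ignoring the
# key/grouping split the plot uses); relativeError=0.0 requests exact quantiles:
print(boxSeriesDataFrame.approxQuantile("value", [0.25, 0.5, 0.75], 0.0))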