table = "bigquery-public-data.samples.shakespeare"
df = spark.read.format("bigquery").option("table",table).load()
df.show()
df.printSchema()
df.createOrReplaceTempView("words")
+---------+----------+-------+-----------+
| word|word_count| corpus|corpus_date|
+---------+----------+-------+-----------+
| LVII| 1|sonnets| 0|
| augurs| 1|sonnets| 0|
| dimm'd| 1|sonnets| 0|
| plagues| 1|sonnets| 0|
| treason| 1|sonnets| 0|
| surmise| 1|sonnets| 0|
| heed| 1|sonnets| 0|
|Unthrifty| 1|sonnets| 0|
| quality| 1|sonnets| 0|
| wherever| 1|sonnets| 0|
| C| 1|sonnets| 0|
| L| 1|sonnets| 0|
|imaginary| 1|sonnets| 0|
| H| 1|sonnets| 0|
| relief| 1|sonnets| 0|
| W| 1|sonnets| 0|
| V| 1|sonnets| 0|
| advised| 1|sonnets| 0|
| grey| 1|sonnets| 0|
| X| 1|sonnets| 0|
+---------+----------+-------+-----------+
only showing top 20 rows
root
|-- word: string (nullable = false)
|-- word_count: long (nullable = false)
|-- corpus: string (nullable = false)
|-- corpus_date: long (nullable = false)
table = "bigquery-public-data.samples.shakespeare"
tempLocation = "databricks_testing"
query = "SELECT count(1) FROM {table}".format(table=table)
# load the result of a SQL query on BigQuery into a DataFrame
df = spark.read.format("bigquery") \
.option("materializationDataset", tempLocation) \
.option("query", query) \
.load() \
.collect()
display(df)
from pyspark.sql import *
Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000)
employee2 = Employee("xiangrui", "meng", "no-reply@stanford.edu", 120000)
employee3 = Employee("matei", "zaharia", "no-reply@waterloo.edu", 140000)
employee4 = Employee("patrick", "wendell", "no-reply@princeton.edu", 160000)
employees = [employee1, employee2, employee3, employee4]
df = spark.createDataFrame(employees)
display(df)
df = spark.read.format("bigquery") \
.option("table", "bigquery-public-data.samples.github_nested") \
.load() \
.where("payload.pull_request.user.id > 500 and repository.url='https://github.com/bitcoin/bitcoin'") \
.select("payload.pull_request.user.url") \
.distinct() \
.sort("payload.pull_request.user.url") \
.take(3)
display(df)
Loading a Google BigQuery table into a DataFrame