// BigQuery table to read.
val table = "bigquery-public-data.samples.shakespeare"

// Read the table through the "bigquery" data source into a DataFrame.
val df = spark.read
  .format("bigquery")
  .option("table", table)
  .load()

// Print the leading rows, then the schema, to the cell output.
df.show()
df.printSchema()

// Register the DataFrame under the temp view name "words".
df.createOrReplaceTempView("words")
+---------+----------+-------+-----------+
| word|word_count| corpus|corpus_date|
+---------+----------+-------+-----------+
| LVII| 1|sonnets| 0|
| augurs| 1|sonnets| 0|
| dimm'd| 1|sonnets| 0|
| plagues| 1|sonnets| 0|
| treason| 1|sonnets| 0|
| surmise| 1|sonnets| 0|
| heed| 1|sonnets| 0|
|Unthrifty| 1|sonnets| 0|
| quality| 1|sonnets| 0|
| wherever| 1|sonnets| 0|
| C| 1|sonnets| 0|
| L| 1|sonnets| 0|
|imaginary| 1|sonnets| 0|
| H| 1|sonnets| 0|
| relief| 1|sonnets| 0|
| W| 1|sonnets| 0|
| V| 1|sonnets| 0|
| advised| 1|sonnets| 0|
| grey| 1|sonnets| 0|
| X| 1|sonnets| 0|
+---------+----------+-------+-----------+
only showing top 20 rows
root
|-- word: string (nullable = false)
|-- word_count: long (nullable = false)
|-- corpus: string (nullable = false)
|-- corpus_date: long (nullable = false)
table: String = bigquery-public-data.samples.shakespeare
df: org.apache.spark.sql.DataFrame = [word: string, word_count: bigint ... 2 more fields]
val table = "bigquery-public-data.samples.shakespeare"
// Value passed to the connector's "materializationDataset" option below.
val tempLocation = "databricks_testing"

// Load the result of a SQL query on BigQuery into a DataFrame.
// There is deliberately no trailing .collect(): collecting would turn `df` into an
// Array[Row] on the driver, while the intent here is a DataFrame handed to display().
val df = spark.read
  .format("bigquery")
  .option("materializationDataset", tempLocation)
  .option("query", s"SELECT count(1) FROM `${table}`")
  .load()

display(df)
// Record type for one employee row.
case class Employee(firstName: String, lastName: String, email: String, salary: Int)

// Sample employees, built through the case-class companion `apply` (no `new` needed).
val employee1 = Employee(
  firstName = "michael", lastName = "armbrust", email = "no-reply@berkeley.edu", salary = 100000)
val employee2 = Employee(
  firstName = "xiangrui", lastName = "meng", email = "no-reply@stanford.edu", salary = 120000)
val employee3 = Employee(
  firstName = "matei", lastName = "zaharia", email = "no-reply@waterloo.edu", salary = 140000)
val employee4 = Employee(
  firstName = "patrick", lastName = "wendell", email = "no-reply@princeton.edu", salary = 160000)

// Turn the local sequence into a DataFrame and render it.
val df = Seq(employee1, employee2, employee3, employee4).toDF()
display(df)
// Read the nested GitHub sample table, keep rows for the bitcoin repository whose
// pull-request user id is above 500, and return three distinct user URLs in sorted order.
val df = spark.read
  .format("bigquery")
  .option("table", "bigquery-public-data.samples.github_nested")
  .load()
  .filter("payload.pull_request.user.id > 500 and repository.url='https://github.com/bitcoin/bitcoin'")
  .select("payload.pull_request.user.url")
  .distinct()
  .as[String]
  .orderBy("payload.pull_request.user.url")
  .take(3) // take() materializes the result: `df` is an Array[String], not a DataFrame
df: Array[String] = Array(https://api.github.com/users/Diapolo, https://api.github.com/users/TheBlueMatt, https://api.github.com/users/ali1234)
Loading a Google BigQuery table into a DataFrame