Big Query Sample Notebook (Scala)

import com.google.cloud.spark.bigquery.BigQueryDataFrameReaderimport sqlContext.implicits._

import com.google.cloud.spark.bigquery.BigQueryDataFrameReader import sqlContext.implicits._

val table = "bigquery-public-data.samples.shakespeare" // load data from BigQueryval df = spark.read.format("bigquery").option("table", table).load()df.show()df.printSchema()df.createOrReplaceTempView("words")

+---------+----------+-------+-----------+ | word|word_count| corpus|corpus_date| +---------+----------+-------+-----------+ | LVII| 1|sonnets| 0| | augurs| 1|sonnets| 0| | dimm'd| 1|sonnets| 0| | plagues| 1|sonnets| 0| | treason| 1|sonnets| 0| | surmise| 1|sonnets| 0| | heed| 1|sonnets| 0| |Unthrifty| 1|sonnets| 0| | quality| 1|sonnets| 0| | wherever| 1|sonnets| 0| | C| 1|sonnets| 0| | L| 1|sonnets| 0| |imaginary| 1|sonnets| 0| | H| 1|sonnets| 0| | relief| 1|sonnets| 0| | W| 1|sonnets| 0| | V| 1|sonnets| 0| | advised| 1|sonnets| 0| | grey| 1|sonnets| 0| | X| 1|sonnets| 0| +---------+----------+-------+-----------+ only showing top 20 rows root |-- word: string (nullable = false) |-- word_count: long (nullable = false) |-- corpus: string (nullable = false) |-- corpus_date: long (nullable = false) table: String = bigquery-public-data.samples.shakespeare df: org.apache.spark.sql.DataFrame = [word: string, word_count: bigint ... 2 more fields]

// perform word countval wordCountDf = spark.sql("SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word ORDER BY word_count DESC LIMIT 10") display(wordCountDf)

val table = "bigquery-public-data.samples.shakespeare"val tempLocation = "databricks_testing" // load the result of a SQL query on BigQuery into a DataFrameval df =   spark.read.format("bigquery")  .option("materializationDataset", tempLocation)  .option("query", s"SELECT count(1) FROM `${table}`")  .load()  .collect() display(df)

val df = spark.read.format("bigquery").option("table", "Nested.nested").load()

df: org.apache.spark.sql.DataFrame = [id: bigint, name: string ... 1 more field]

case class Employee(firstName: String, lastName: String, email: String, salary: Int) // Create the Employeesval employee1 = new Employee("michael", "armbrust", "no-reply@berkeley.edu", 100000)val employee2 = new Employee("xiangrui", "meng", "no-reply@stanford.edu", 120000)val employee3 = new Employee("matei", "zaharia", "no-reply@waterloo.edu", 140000)val employee4 = new Employee("patrick", "wendell", "no-reply@princeton.edu", 160000) val df = Seq(employee1, employee2, employee3, employee4).toDF display(df)

// save to a BigQuery table named `Nested.employees`df.write  .format("bigquery")  .mode("overwrite")  .option("temporaryGcsBucket", "databricks-bigquery-temp")  .option("table", "Nested.employees")  .save()

// read the data we wrote to BigQuery back into a DataFrame to prove the write worked val readDF =   spark.read    .format("bigquery")    .option("table", "Nested.employees")    .load() display(readDF)

Nested filter pushdown and nested column pruning

This data source supports advanced filter pushdown and column pruning to reduce the amount of data transfer from BigQuery to Databricks. For example, advanced filter pushdown processes certain filters on BigQuery instead of Databricks, which helps reduce the amount of data transferred.

In this example, the payload.pull_request.user.id > 500 predicate is evaluated in BigQuery. Also, the payload.pull_request.user.url column is selected in BigQuery.

val df = spark.read.format("bigquery")  .option("table", "bigquery-public-data.samples.github_nested")  .load()  .where("payload.pull_request.user.id > 500 and repository.url='https://github.com/bitcoin/bitcoin'")  .select("payload.pull_request.user.url")  .distinct  .as[String]  .sort("payload.pull_request.user.url")  .take(3)

df: Array[String] = Array(https://api.github.com/users/Diapolo, https://api.github.com/users/TheBlueMatt, https://api.github.com/users/ali1234)

val df = spark.read.format("bigquery")   .option("table", "bigquery-public-data.samples.github_nested")   .load()   .where("payload.pages[0].html_url = 'https://github.com/clarkmoody/bitcoin/wiki/Home'")   .select("payload.pages")   .count()

df: Long = 1

val df =   spark.read.format("bigquery")  .option("table", "bigquery-public-data.samples.natality")  .load()  .filter("(father_age + mother_age) < 40")  .filter("state = 'WA'")  .filter("weight_pounds > 10") display(df)

source_year

year

month

day

wday

state

is_male

child_race

weight_pounds

plurality

apgar_1min

apgar_5min

mother_residence_state

mother_race

mother_age

gestation_weeks

lmp

mother_married

mother_birth_state

cigarette_use

cigarettes_per_day

alcohol_use

drinks_per_week

weight_gain_pounds

born_alive_alive

born_alive_dead

born_dead

ever_born

father_race

father_age

record_weight

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

1981

4

2

null

WA

false

1

10.37495404972

1

6

9

WA

1

20

42

06121910

true

OH

null

1

19

1

1986

11

20

null

WA

false

7

10.12583169366

1

8

9

WA

1

15

39

02201966

false

SC

null

0

1

7

19

1

1978

7

8

null

WA

true

1

10.74973989512

1

99

WA

1

20

42

09171987

true

WA

null

0

1

19

1

1980

3

17

null

WA

true

1

10.74973989512

1

7

9

WA

1

18

46

04271999

true

NJ

null

0

1

20

1

1983

1

null

WA

false

3

10.6262810284

1

7

9

WA

3

18

40

03241932

true

IA

null

0

1

21

1

1997

8

null

4

WA

true

9

10.0751253734

1

99

9

WA

1

19

41

10241996

true

Mexico

null

false

null

45

0

1

17

1

1974

3

26

null

WA

true

1

10.37495404972

1

null

WA

1

19

43

05301943

true

ID

null

0

1

20

2

1981

6

21

null

WA

false

1

10.00016820432

1

6

10

WA

1

18

44

08171910

true

OR

null

0

1

18

1

1992

1

null

5

WA

true

1

10.24929056038

1

8

9

WA

1

18

38

04131991

false

WA

null

false

null

61

0

1

18

1

2002

8

null

5

WA

true

9

10.06189763768

1

99

9

WA

1

20

39

11152001

false

MA

null

false

null

71

0

1

18

1

1992

5

null

2

WA

false

1

10.00016820432

1

2

8

WA

1

19

46

06151991

true

Mexico

null

false

null

99

0

1

0

2

1

20

1

1996

5

null

6

WA

false

9

10.81146932848

1

99

9

WA

1

19

39

08161995

true

WA

null

false

null

40

1

0

2

1

20

1

1980

12

23

null

WA

true

1

11.62497507526

1

4

WA

1

19

35

04191900

true

WA

null

0

1

2

1

19

1

1984

9

27

null

WA

true

1

11.06279630716

1

8

9

WA

1

18

40

12191943

true

WA

null

0

null

3

4

1

20

1

1994

1

null

2

WA

true

9

10.50061753906

1

8

9

WA

1

19

42

99999999

true

AK

null

9

19

1

2000

4

null

4

WA

true

9

10.56234697242

1

99

10

WA

3

17

40

06251999

false

MT

null

false

null

54

0

2

1

38

19

1

1983

12

31

null

WA

true

3

10.74973989512

1

8

9

WA

3

18

40

03251933

true

WA

null

0

1

19

1

Showing all 1028 rows.

Big Query Sample Notebook (Scala)(Scala)

Loading a Google BigQuery table into a DataFrame

Loading the result of a BigQuery SQL query into a DataFrame

Write the contents of a DataFrame to a BigQuery table

Advanced Features

Nested filter pushdown and nested column pruning

Array pushdown

Expression pushdown