AQE Demo - Databricks

-- Enable Adaptive Query Execution (AQE) for this session.
SET spark.sql.adaptive.enabled = true;

-- For demo purposes only.
-- Not necessary in real-life usage.
SET spark.sql.adaptive.coalescePartitions.minPartitionNum = 1;

%scala
// Delete the demo database's warehouse directory (second argument: recurse)
// so the tables below are rebuilt from a clean location.
dbutils.fs.rm("dbfs:/user/hive/warehouse/aqe_demo_db", true)

res1: Boolean = false

-- Set up a dedicated database for the demo and start from a clean slate.
CREATE DATABASE IF NOT EXISTS aqe_demo_db;
USE aqe_demo_db;

DROP TABLE IF EXISTS items;
DROP TABLE IF EXISTS sales;

-- Create "items" table.
-- One row per id in RANGE(30000000), each with a random integer price in [0, 1000).
CREATE TABLE items
USING parquet
AS
SELECT
    id AS i_item_id,
    CAST(rand() * 1000 AS INT) AS i_price
FROM RANGE(30000000);

-- Create "sales" table with skew.
-- Item with id 100 is in 80% of all sales.
CREATE TABLE sales
USING parquet
AS
SELECT
    CASE
        WHEN rand() < 0.8 THEN 100
        ELSE CAST(rand() * 30000000 AS INT)
    END AS s_item_id,
    CAST(rand() * 100 AS INT) AS s_quantity,
    -- Spread sale dates over the 360 days up to and including today.
    DATE_ADD(current_date(), - CAST(rand() * 360 AS INT)) AS s_date
FROM RANGE(1000000000);

OK

-- Get the sums of sales quantity grouped by sales date.
-- The grouped result is very small.
SELECT
    s_date,
    sum(s_quantity) AS q
FROM sales
GROUP BY s_date
ORDER BY q DESC;

-- Get total sales amount grouped by sales date for items with a price lower than 10.
-- The selectivity of the filter by price is not known in static planning, so the
-- initial plan opts for sort merge join.
-- But in fact, the "items" table after filtering is very small, so the query can
-- do a broadcast hash join instead.

-- Static explain shows the initial plan with sort merge join.
EXPLAIN FORMATTED
SELECT
    s_date,
    sum(s_quantity * i_price) AS total_sales
FROM sales
INNER JOIN items
    ON s_item_id = i_item_id
WHERE i_price < 10
GROUP BY s_date
ORDER BY total_sales DESC;

plan

1

== Physical Plan == AdaptiveSparkPlan (18) +- Sort (17) +- Exchange (16) +- HashAggregate (15) +- Exchange (14) +- HashAggregate (13) +- Project (12) +- SortMergeJoin Inner (11) :- Sort (5) : +- Exchange (4) : +- Project (3) : +- Filter (2) : +- Scan parquet aqe_demo_db.sales (1) +- Sort ...

Showing all 1 rows.

-- The runtime join strategy is changed to broadcast hash join.
SELECT
    s_date,
    sum(s_quantity * i_price) AS total_sales
FROM sales
INNER JOIN items
    ON s_item_id = i_item_id
WHERE i_price < 10
GROUP BY s_date
ORDER BY total_sales DESC;

-- Get the total sales amount grouped by sales date.
-- The partition in the "sales" table containing value "100" as "s_item_id" is
-- much larger than other partitions.
-- AQE splits the skewed partition into smaller partitions before joining the
-- "sales" table with the "items" table.
SELECT
    s_date,
    sum(s_quantity * i_price) AS total_sales
FROM sales
INNER JOIN items
    ON i_item_id = s_item_id
GROUP BY s_date
ORDER BY total_sales DESC;

AQE Demo(SQL)

Adaptive Query Execution Demo

Enable AQE

Create Tables

Dynamically Coalesce Shuffle Partitions

Dynamically Switch Join Strategies

Dynamically Optimize Skew Join