This example notebook shows you how to identify the source (notebook or job ID) of duplicate data in an append scenario.

import org.apache.spark.sql.functions._

// Generate 100 sample events with a seeded-random date in January 2018 and a
// deduplicated device ID, then append them to a partitioned Delta table.
// Because rand(5) is seeded, re-running this cell appends identical rows,
// which is the duplicate-data scenario investigated below.
val rawData = spark
  .read
  .json("/databricks-datasets/structured-streaming/events/")
  .drop("time")
  .withColumn("date", expr("cast(concat('2018-01-', cast(rand(5) * 30 as int) + 1) as date)"))
  .withColumn("deviceId", expr("cast(rand(5) * 100 as int)"))
  .dropDuplicates("deviceId")
  .limit(100)

rawData.write.mode("append").partitionBy("action", "date").format("delta").saveAsTable("default.events_tbl10")

display(rawData)
rawData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [action: string, date: date ... 1 more field]
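Appending the same deterministic dataset twice leaves the table with two identical sets of rows, written as separate Parquet files in the same partitions. To trace a duplicate back to the commit that wrote it, first find the physical files that contain the duplicated rows. A minimal sketch, assuming only the table created above: Spark's input_file_name() function returns the file backing each row, and the file names it reports are what the greps below search for in the Delta transaction log.

import org.apache.spark.sql.functions.{col, input_file_name}

// Confirm which rows are duplicated after the repeated append.
spark.table("default.events_tbl10")
  .groupBy("action", "date", "deviceId")
  .count()
  .filter(col("count") > 1)
  .show(false)

// Attach the physical Parquet file name to each row so a duplicated row
// can be matched to the files that contain it.
spark.table("default.events_tbl10")
  .withColumn("file", input_file_name())
  .select("action", "date", "deviceId", "file")
  .orderBy("deviceId")
  .show(false)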
%sh grep -r 'part-00000-c4ec83b7-b901-4e11-bdc6-5deacb715575.c000.snappy.parquet' /dbfs/user/hive/warehouse/events_tbl10/_delta_log
/dbfs/user/hive/warehouse/events_tbl10/_delta_log/00000000000000000001.json:{"add":{"path":"action=Open/date=2018-01-01/part-00000-c4ec83b7-b901-4e11-bdc6-5deacb715575.c000.snappy.parquet","partitionValues":{"action":"Open","date":"2018-01-01"},"size":538,"modificationTime":1624533982000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"deviceId\":0},\"maxValues\":{\"deviceId\":1},\"nullCount\":{\"deviceId\":0}}"}}
%sh grep -r 'part-00000-f1144779-3215-4ef2-be0a-5058d8326433.c000.snappy.parquet' /dbfs/user/hive/warehouse/events_tbl10/_delta_log
/dbfs/user/hive/warehouse/events_tbl10/_delta_log/00000000000000000000.json:{"add":{"path":"action=Open/date=2018-01-01/part-00000-f1144779-3215-4ef2-be0a-5058d8326433.c000.snappy.parquet","partitionValues":{"action":"Open","date":"2018-01-01"},"size":538,"modificationTime":1624533827000,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"deviceId\":0},\"maxValues\":{\"deviceId\":1},\"nullCount\":{\"deviceId\":0}}"}}
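The greps show that the two copies of the same logical data were added by separate commits: 00000000000000000000.json (table version 0) and 00000000000000000001.json (table version 1). Delta Lake's DESCRIBE HISTORY command lists these commits along with the user, notebook, and job that produced each one, which identifies the source of the duplicate append. A minimal sketch against the table from this example:

// On Databricks, the history output includes notebook and job columns,
// so each duplicate-producing version can be traced to its source.
display(spark.sql("DESCRIBE HISTORY default.events_tbl10"))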