Method #1 - Scala Spark - Create a Spark DataFrame from a JSON string
%scala
import scala.collection.mutable.ListBuffer
val json_content1 = "{'json_col1': 'hello', 'json_col2': 32}"
val json_content2 = "{'json_col1': 'hello', 'json_col2': 'world'}"
val json_seq = new ListBuffer[String]() // Add the JSON content to a list.
json_seq += json_content1
json_seq += json_content2
val json_ds = json_seq.toDS() // Create a Spark dataset from the list.
import scala.collection.mutable.ListBuffer
json_content1: String = {'json_col1': 'hello', 'json_col2': 32}
json_content2: String = {'json_col1': 'hello', 'json_col2': 'world'}
json_seq: scala.collection.mutable.ListBuffer[String] = ListBuffer({'json_col1': 'hello', 'json_col2': 32}, {'json_col1': 'hello', 'json_col2': 'world'})
json_ds: org.apache.spark.sql.Dataset[String] = [value: string]
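With the JSON strings collected into a Dataset[String], the natural next step is to parse them with the JSON reader, which infers a schema from the records (the single-quoted field names above are accepted because the reader's allowSingleQuotes option defaults to true). A minimal sketch of that step, reusing json_ds from the cell above:
%scala
val df = spark.read.json(json_ds) // Each JSON string becomes one row; the schema is inferred.
display(df)
Because json_col2 holds a number in one record and a string in the other, schema inference resolves it to a string column.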
%py
json_content1 = "{'json_col1': 'hello', 'json_col2': 32}"
json_content2 = "{'json_col1': 'hello', 'json_col2': 'world'}"
json_list = []
json_list.append(json_content1)
json_list.append(json_content2)
print(json_list)
["{'json_col1': 'hello', 'json_col2': 32}", "{'json_col1': 'hello', 'json_col2': 'world'}"]
%scala
import org.apache.spark.sql.functions._
val test_df = Seq(
  ("1", "{'json_col1': 'hello', 'json_col2': 32}", "1.0"),
  ("1", "{'json_col1': 'hello', 'json_col2': 'world'}", "1.0")
).toDF("row_number", "json_data", "token")
val row_rdd = test_df.select(col("json_data")).rdd // Selecting just the JSON column and converting it to RDD.
val string_rdd = row_rdd.map(_.mkString(",")) // Convert `RDD[Row]` to `RDD[String]`.
import org.apache.spark.sql.functions._
test_df: org.apache.spark.sql.DataFrame = [row_number: string, json_data: string ... 1 more field]
row_rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[235] at rdd at command-1945001:7
string_rdd: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[236] at map at command-1945001:8
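Once the JSON column has been flattened to an RDD[String], it can be parsed into a new DataFrame containing only the JSON fields. A minimal sketch (the parsed_df name is introduced here; on recent Spark versions this RDD overload of read.json is deprecated in favor of passing a Dataset[String], but it still works):
%scala
val parsed_df = spark.read.json(string_rdd) // Infer the schema from the extracted JSON strings.
parsed_df.printSchema()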
jsonDataDict = {"job_id":33100,"run_id":1048560,"number_in_job":1,"state":{"life_cycle_state":"PENDING","state_message":"Waiting for cluster"},"task":{"notebook_task":{"notebook_path":"/Users/user@databricks.com/path/test_notebook"}},"cluster_spec":{"new_cluster":{"spark_version":"4.3.x-scala2.11","attributes":{"type":"fixed_node","memory":"8g"},"enable_elastic_disk":"false","num_workers":1}},"cluster_instance":{"cluster_id":"0000-000000-wares10"},"start_time":1584689872601,"setup_duration":0,"execution_duration":0,"cleanup_duration":0,"creator_user_name":"user@databricks.com","run_name":"my test job","run_page_url":"https://testurl.databricks.com#job/33100/run/1","run_type":"SUBMIT_RUN"}
type(jsonDataDict)
Out[3]: dict
%py
import json
jsonData = json.dumps(jsonDataDict)     # Serialize the dictionary to a JSON string.
jsonDataList = []
jsonDataList.append(jsonData)           # Wrap the string in a list.
jsonRDD = sc.parallelize(jsonDataList)  # Distribute the list as an RDD of JSON strings.
df = spark.read.json(jsonRDD)           # Parse the JSON and infer the schema.
# View the schema of the resulting DataFrame.
df.printSchema()
root
|-- cleanup_duration: long (nullable = true)
|-- cluster_instance: struct (nullable = true)
| |-- cluster_id: string (nullable = true)
|-- cluster_spec: struct (nullable = true)
| |-- new_cluster: struct (nullable = true)
| | |-- attributes: struct (nullable = true)
| | | |-- memory: string (nullable = true)
| | | |-- type: string (nullable = true)
| | |-- enable_elastic_disk: string (nullable = true)
| | |-- num_workers: long (nullable = true)
| | |-- spark_version: string (nullable = true)
|-- creator_user_name: string (nullable = true)
|-- execution_duration: long (nullable = true)
|-- job_id: long (nullable = true)
|-- number_in_job: long (nullable = true)
|-- run_id: long (nullable = true)
|-- run_name: string (nullable = true)
|-- run_page_url: string (nullable = true)
|-- run_type: string (nullable = true)
|-- setup_duration: long (nullable = true)
|-- start_time: long (nullable = true)
|-- state: struct (nullable = true)
| |-- life_cycle_state: string (nullable = true)
| |-- state_message: string (nullable = true)
|-- task: struct (nullable = true)
| |-- notebook_task: struct (nullable = true)
| | |-- notebook_path: string (nullable = true)
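Nested fields in the inferred schema can be selected with dot notation. A minimal sketch of pulling a few of the fields shown above (the column choices are only illustrative):
%py
from pyspark.sql.functions import col

df.select(
    col("job_id"),
    col("run_name"),
    col("state.life_cycle_state").alias("life_cycle_state"),
    col("task.notebook_task.notebook_path").alias("notebook_path")
).show(truncate=False)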