read-json-files(Scala)
Loading...
// Write three newline-delimited JSON records (JSON Lines) to DBFS.
// The third record carries an extra key, so Spark's schema inference
// will merge `extra_key` into the `dict` struct for every row.
val singleLineJson = """
{"string":"string1","int":1,"array":[1,2,3],"dict": {"key": "value1"}}
{"string":"string2","int":2,"array":[2,4,6],"dict": {"key": "value2"}}
{"string":"string3","int":3,"array":[3,6,9],"dict": {"key": "value3", "extra_key": "extra_value3"}}
"""
dbutils.fs.put("/tmp/test.json", singleLineJson, true)
Wrote 243 bytes. res3: Boolean = true
%scala

// Read the newline-delimited JSON file; the schema is inferred from the
// records (one JSON object per line is the reader's default mode).
val testJsonData = spark.read.format("json").load("/tmp/test.json")

display(testJsonData)
 
array
dict
int
string
1
2
3
[1, 2, 3]
{"extra_key": null, "key": "value1"}
1
string1
[2, 4, 6]
{"extra_key": null, "key": "value2"}
2
string2
[3, 6, 9]
{"extra_key": "extra_value3", "key": "value3"}
3
string3

Showing all 3 rows.

%r 
library(SparkR)

# Load the same newline-delimited JSON file through SparkR.
# Named arguments make the source explicit; the schema is inferred.
testJsonData <- read.df(path = "/tmp/test.json", source = "json")

display(testJsonData)
 
array
dict
int
string
1
2
3
[1, 2, 3]
{"extra_key": null, "key": "value1"}
1
string1
[2, 4, 6]
{"extra_key": null, "key": "value2"}
2
string2
[3, 6, 9]
{"extra_key": "extra_value3", "key": "value3"}
3
string3

Showing all 3 rows.

%python

# Read the newline-delimited JSON file with the generic DataFrameReader
# form; the schema is inferred from the records.
testJsonData = spark.read.format("json").load("/tmp/test.json")

display(testJsonData)
 
array
dict
int
string
1
2
3
[1, 2, 3]
{"extra_key": null, "key": "value1"}
1
string1
[2, 4, 6]
{"extra_key": null, "key": "value2"}
2
string2
[3, 6, 9]
{"extra_key": "extra_value3", "key": "value3"}
3
string3

Showing all 3 rows.

%sql 
-- Expose the single-line JSON file as a temporary view backed by the
-- JSON data source; the schema is inferred when the view is created.
CREATE TEMPORARY VIEW jsonTable
  USING json
  OPTIONS (path = "/tmp/test.json")
OK
%sql SELECT jsonTable.* FROM jsonTable -- all rows/columns of the JSON-backed view
 
array
dict
int
string
1
2
3
[1, 2, 3]
{"extra_key": null, "key": "value1"}
1
string1
[2, 4, 6]
{"extra_key": null, "key": "value2"}
2
string2
[3, 6, 9]
{"extra_key": "extra_value3", "key": "value3"}
3
string3

Showing all 3 rows.

// Write a single JSON document — a top-level array of three objects —
// that spans multiple lines. Reading it back requires the reader's
// multiline option, unlike the JSON-Lines file above.
val multiLineJson = """[
    {"string":"string1","int":1,"array":[1,2,3],"dict": {"key": "value1"}},
    {"string":"string2","int":2,"array":[2,4,6],"dict": {"key": "value2"}},
    {
        "string": "string3",
        "int": 3,
        "array": [
            3,
            6,
            9
        ],
        "dict": {
            "key": "value3",
            "extra_key": "extra_value3"
        }
    }
]"""
dbutils.fs.put("/tmp/multi-line.json", multiLineJson, true)
Wrote 385 bytes. res6: Boolean = true
// multiline=true makes Spark parse the whole file as one JSON document
// (here an array of objects) instead of one record per line.
val mldf = spark.read
  .format("json")
  .option("multiline", "true")
  .load("/tmp/multi-line.json")
mldf.show(truncate = false)
+---------+----------------------+---+-------+ |array |dict |int|string | +---------+----------------------+---+-------+ |[1, 2, 3]|[, value1] |1 |string1| |[2, 4, 6]|[, value2] |2 |string2| |[3, 6, 9]|[extra_value3, value3]|3 |string3| +---------+----------------------+---+-------+ mldf: org.apache.spark.sql.DataFrame = [array: array<bigint>, dict: struct<extra_key: string, key: string> ... 2 more fields]
%sql 
-- Temporary view over the multi-line JSON document; multiline=true is
-- required so the data source parses the file as one JSON array rather
-- than one record per line.
CREATE TEMPORARY VIEW multiLineJsonTable
  USING json
  OPTIONS (path = "/tmp/multi-line.json", multiline = true)
OK
%sql
SELECT * FROM multiLineJsonTable -- same three rows the Scala reader produced
 
array
dict
int
string
1
2
3
[1, 2, 3]
{"extra_key": null, "key": "value1"}
1
string1
[2, 4, 6]
{"extra_key": null, "key": "value2"}
2
string2
[3, 6, 9]
{"extra_key": "extra_value3", "key": "value3"}
3
string3

Showing all 3 rows.