import org.apache.spark.sql.types._

// Schema for the ggplot2 diamonds CSV sample dataset.
// NOTE: `depth` is declared as IntegerType ON PURPOSE even though the file
// contains floating-point values — this cell demonstrates what happens when
// the declared schema does not match the underlying data.
val schema = StructType(Seq(
  StructField("_c0", IntegerType, nullable = true),
  StructField("carat", DoubleType, nullable = true),
  StructField("cut", StringType, nullable = true),
  StructField("color", StringType, nullable = true),
  StructField("clarity", StringType, nullable = true),
  // Intentionally wrong: the actual data holds floating-point numbers,
  // but the schema declares an integer.
  StructField("depth", IntegerType, nullable = true),
  StructField("table", DoubleType, nullable = true),
  StructField("price", IntegerType, nullable = true),
  StructField("x", DoubleType, nullable = true),
  StructField("y", DoubleType, nullable = true),
  StructField("z", DoubleType, nullable = true)
))

// Read the CSV with the mismatched schema applied.
val diamonds_with_wrong_schema = spark.read.format("csv")
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
import org.apache.spark.sql.types._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,IntegerType,true), StructField(carat,DoubleType,true), StructField(cut,StringType,true), StructField(color,StringType,true), StructField(clarity,StringType,true), StructField(depth,IntegerType,true), StructField(table,DoubleType,true), StructField(price,IntegerType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true))
diamonds_with_wrong_schema: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 9 more fields]
%sql -- Running a very similar query with a count(_c0) aggregate instead of "*" does return 0, contrary to the output of the query above. -- In this case, the depth column doesn't need to be touched. Therefore, the parsing and conversion errors do not occur and no rows are nullified. SELECT count(_c0) FROM diamonds_with_wrong_schema WHERE _c0 is null