import org.apache.spark.sql.types._

val schema = new StructType()
  .add("_c0", IntegerType, true)
  .add("carat", DoubleType, true)
  .add("cut", StringType, true)
  .add("color", StringType, true)
  .add("clarity", StringType, true)
  .add("depth", IntegerType, true) // The depth field is defined incorrectly. The actual data contains floating-point numbers, while the schema specifies an integer.
  .add("table", DoubleType, true)
  .add("price", IntegerType, true)
  .add("x", DoubleType, true)
  .add("y", DoubleType, true)
  .add("z", DoubleType, true)
  .add("_corrupt_record", StringType, true) // The schema contains a special column _corrupt_record, which does not exist in the data. This column captures rows that did not parse correctly.

val diamonds_with_wrong_schema = spark.read.format("csv")
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
import org.apache.spark.sql.types._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,IntegerType,true), StructField(carat,DoubleType,true), StructField(cut,StringType,true), StructField(color,StringType,true), StructField(clarity,StringType,true), StructField(depth,IntegerType,true), StructField(table,DoubleType,true), StructField(price,IntegerType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true), StructField(_corrupt_record,StringType,true))
diamonds_with_wrong_schema: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 10 more fields]
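PERMISSIVE is the default parse mode, so the read above behaves exactly as if the mode were set explicitly. A minimal equivalent sketch (not part of the original notebook; the variable name is illustrative):

val diamonds_permissive = spark.read.format("csv")
  .option("mode", "PERMISSIVE") // explicit, but identical to the default behavior
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")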
// The mistake in the user-specified schema causes any row with a non-integer value in the depth column to be nullified.
// There are some rows where the value of depth is an integer, e.g. 64.0. They are parsed and converted successfully.
// The _corrupt_record column shows the string with the original row data, which helps find the issue.
display(diamonds_with_wrong_schema)
// Since Spark 2.3, queries over raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named _corrupt_record by default).
// For example: spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count() and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
// Instead, you can cache or save the parsed results and then send the same query.
val badRows = diamonds_with_wrong_schema.filter($"_corrupt_record".isNotNull)
badRows.cache()
val numBadRows = badRows.count()
badRows.unpersist()
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 167.0 failed 4 times, most recent failure: Lost task 0.3 in stage 167.0 (TID 877, 10.97.245.56, executor 0): com.databricks.sql.io.FileReadException: Error while reading file dbfs:/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv.
val diamonds_with_wrong_schema_drop_malformed = spark.read.format("csv")
  .option("mode", "DROPMALFORMED")
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
diamonds_with_wrong_schema_drop_malformed: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 10 more fields]
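With DROPMALFORMED, rows that cannot be parsed against the schema are silently dropped instead of being nullified, so no _corrupt_record values are produced. A quick way to observe the effect is to count the surviving rows (a minimal sketch, not from the original notebook):

// Rows whose depth value is not an integer are removed during parsing,
// so this count is lower than the number of data rows in the CSV file.
val remainingRows = diamonds_with_wrong_schema_drop_malformed.count()
println(s"Rows remaining after DROPMALFORMED parsing: $remainingRows")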
val diamonds_with_wrong_schema_fail_fast = spark.read.format("csv")
  .option("mode", "FAILFAST")
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
diamonds_with_wrong_schema_fail_fast: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 10 more fields]
display(diamonds_with_wrong_schema_fail_fast)
SparkException: Job aborted due to stage failure: Task 0 in stage 170.0 failed 4 times, most recent failure: Lost task 0.3 in stage 170.0 (TID 882, 10.97.240.8, executor 1): com.databricks.sql.io.FileReadException: Error while reading file dbfs:/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.logFileNameAndThrow(FileScanRDD.scala:340)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:319)
at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:406)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:259)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.sql.execution.collect.UnsafeRowBatchUtils$.encodeUnsafeRows(UnsafeRowBatchUtils.scala:80)
at org.apache.spark.sql.execution.collect.Collector.$anonfun$processFunc$1(Collector.scala:187)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:144)
at org.apache.spark.scheduler.Task.run(Task.scala:117)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$9(Executor.scala:639)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1559)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:642)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:77)
at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$2(UnivocityParser.scala:411)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:292)
... 15 more
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.NumberFormatException: For input string: "61.5"
at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:308)
at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$parse$2(UnivocityParser.scala:253)
at org.apache.spark.sql.catalyst.csv.UnivocityParser$.$anonfun$parseIterator$1(UnivocityParser.scala:404)
at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:64)
... 21 more
Caused by: java.lang.NumberFormatException: For input string: "61.5"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:580)
at java.lang.Integer.parseInt(Integer.java:615)
at scala.collection.immutable.StringLike.toInt(StringLike.scala:304)
at scala.collection.immutable.StringLike.toInt$(StringLike.scala:304)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:33)
at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6(UnivocityParser.scala:156)
at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$6$adapted(UnivocityParser.scala:156)
at org.apache.spark.sql.catalyst.csv.UnivocityParser.nullSafeDatum(UnivocityParser.scala:237)
at org.apache.spark.sql.catalyst.csv.UnivocityParser.$anonfun$makeConverter$5(UnivocityParser.scala:156)
at org.apache.spark.sql.catalyst.csv.UnivocityParser.org$apache$spark$sql$catalyst$csv$UnivocityParser$$convert(UnivocityParser.scala:290)
... 24 more
Driver stacktrace:
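The root cause in every mode is the same: depth is declared as IntegerType while the file contains floating-point values such as 61.5. Below is a hedged sketch of a corrected schema, assuming DoubleType matches the data (variable names are illustrative); with it, no records are treated as malformed and the _corrupt_record column stays null.

val correctedSchema = new StructType()
  .add("_c0", IntegerType, true)
  .add("carat", DoubleType, true)
  .add("cut", StringType, true)
  .add("color", StringType, true)
  .add("clarity", StringType, true)
  .add("depth", DoubleType, true) // DoubleType matches the floating-point values actually present in the file
  .add("table", DoubleType, true)
  .add("price", IntegerType, true)
  .add("x", DoubleType, true)
  .add("y", DoubleType, true)
  .add("z", DoubleType, true)
  .add("_corrupt_record", StringType, true)

val diamonds_with_correct_schema = spark.read.format("csv")
  .option("header", "true")
  .schema(correctedSchema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")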