import org.apache.spark.sql.types._

// Schema for the ggplot2 diamonds CSV sample dataset.
// NOTE: `depth` is declared as IntegerType ON PURPOSE even though the file
// contains floating-point values — this cell demonstrates what happens when
// the declared schema does not match the underlying data.
val schema = StructType(Seq(
  StructField("_c0", IntegerType, nullable = true),
  StructField("carat", DoubleType, nullable = true),
  StructField("cut", StringType, nullable = true),
  StructField("color", StringType, nullable = true),
  StructField("clarity", StringType, nullable = true),
  // Intentionally wrong: the actual data holds floating-point numbers,
  // but the schema declares an integer.
  StructField("depth", IntegerType, nullable = true),
  StructField("table", DoubleType, nullable = true),
  StructField("price", IntegerType, nullable = true),
  StructField("x", DoubleType, nullable = true),
  StructField("y", DoubleType, nullable = true),
  StructField("z", DoubleType, nullable = true)
))

// Read the CSV with the mismatched schema applied.
val diamonds_with_wrong_schema = spark.read.format("csv")
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
import org.apache.spark.sql.types._
schema: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,IntegerType,true), StructField(carat,DoubleType,true), StructField(cut,StringType,true), StructField(color,StringType,true), StructField(clarity,StringType,true), StructField(depth,IntegerType,true), StructField(table,DoubleType,true), StructField(price,IntegerType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true))
diamonds_with_wrong_schema: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 9 more fields]
%sql -- Running a very similar query with a count(_c0) aggregate instead of "*" does return 0, contrary to the output of the query above. -- In this case, the depth column doesn't need to be touched. Therefore, the parsing and conversion errors do not occur and no rows are nullified. SELECT count(_c0) FROM diamonds_with_wrong_schema WHERE _c0 is null