read-csv-column-subset(Scala)
Loading...
import org.apache.spark.sql.types._

val schema = new StructType()
  .add("_c0",IntegerType,true)
  .add("carat",DoubleType,true)
  .add("cut",StringType,true)
  .add("color",StringType,true)
  .add("clarity",StringType,true)
  .add("depth",IntegerType,true) // The depth field is defined wrongly. The actual data contains floating point numbers, while the schema specifies an integer.
  .add("table",DoubleType,true)
  .add("price",IntegerType,true)
  .add("x",DoubleType,true)
  .add("y",DoubleType,true)
  .add("z",DoubleType,true)

val diamonds_with_wrong_schema = spark.read.format("csv")
  .option("header", "true")
  .schema(schema)
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
import org.apache.spark.sql.types._ schema: org.apache.spark.sql.types.StructType = StructType(StructField(_c0,IntegerType,true), StructField(carat,DoubleType,true), StructField(cut,StringType,true), StructField(color,StringType,true), StructField(clarity,StringType,true), StructField(depth,IntegerType,true), StructField(table,DoubleType,true), StructField(price,IntegerType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true)) diamonds_with_wrong_schema: org.apache.spark.sql.DataFrame = [_c0: int, carat: double ... 9 more fields]
// The mistake in the user-specified schema causes any row with a non-integer value in the depth column to be nullified.
// There are some rows, where the value of depth is an integer e.g. 64.0. They are parsed and converted successfully.
display(diamonds_with_wrong_schema)
10.23IdealESI2null553263.953.982.43
20.21PremiumESI1null613263.893.842.31
30.23GoodEVS1null653274.054.072.31
40.29PremiumIVS2null583344.24.232.63
50.31GoodJSI2null583354.344.352.75
60.24Very GoodJVVS2null573363.943.962.48
70.24Very GoodIVVS1null573363.953.982.47
80.26Very GoodHSI1null553374.074.112.53
90.22FairEVS2null613373.873.782.49
100.23Very GoodHVS1null6133844.052.39
110.3GoodJSI164553394.254.282.73
120.23IdealJVS1null563403.933.92.46
130.22PremiumFSI1null613423.883.842.33
140.31IdealJSI2null543444.354.372.71
150.2PremiumESI2null623453.793.752.27
160.32PremiumEI1null583454.384.422.68
170.3IdealISI262543484.314.342.68
180.3GoodJSI1null543514.234.292.7
190.3GoodJSI1null563514.234.262.71
200.3Very GoodJSI1null593514.214.272.66
210.3GoodISI2null563514.264.32.71
220.23Very GoodEVS2null553523.853.922.48
230.23Very GoodHVS161573533.943.962.41
240.31Very GoodJSI1null623534.394.432.62
250.31Very GoodJSI1null623534.444.472.59
260.23Very GoodGVVS2null583543.974.012.41
270.24PremiumIVS1null573553.973.942.47
280.3Very GoodJVS2null573574.284.32.67
290.23Very GoodDVS2null613573.963.972.4
300.23Very GoodFVS1null573573.963.992.42
310.23Very GoodFVS1605740244.032.41
320.23Very GoodFVS1null574024.044.062.42
330.23Very GoodEVS1null594023.974.012.42
340.23Very GoodEVS1null584024.014.062.4
350.23Very GoodDVS1null584023.923.962.44
360.23GoodFVS1null594024.064.082.37
370.23GoodEVS1null594023.833.852.46
380.31GoodHSI164544024.294.312.75
390.26Very GoodDVS2null594034.134.162.52
400.33IdealISI2null554034.494.512.78
410.33IdealISI2null564034.494.52.75
420.33IdealJSI1null564034.494.552.76
430.26GoodDVS2null564033.994.022.61
440.26GoodDVS1null634034.194.242.46
450.32GoodHSI2null564034.344.372.75
460.29PremiumFSI1null584034.244.262.65
470.32Very GoodHSI2null554034.354.422.71
480.32GoodHSI2null564034.364.382.79
490.25Very GoodEVS2null6040444.032.54
500.29Very GoodHSI2null604044.334.372.64
510.24Very GoodFSI1null614044.024.032.45
520.23IdealGVS1null544043.933.952.44
530.32IdealISI1null554044.454.482.72
540.22PremiumEVS2null584043.933.892.41
550.22PremiumDVS2null624043.913.882.31
560.3IdealISI261594054.34.332.63
570.3PremiumJSI2null614054.434.382.61
580.3Very GoodISI1null574054.254.282.67
590.3Very GoodISI163574054.284.322.71
600.3GoodISI1null554054.254.292.7
610.35IdealIVS1null575524.544.592.78
620.3PremiumDSI1null595524.234.272.66
630.3IdealDSI1null575524.294.322.69
640.3IdealDSI1null565524.34.332.68
650.42PremiumISI2null595524.784.842.96
660.28IdealGVVS2null565534.194.222.58
670.32IdealIVVS16255.35534.394.422.73
680.31Very GoodGSI1null575534.334.32.73
690.31PremiumGSI1null585534.354.322.68
700.24PremiumEVVS1null585534.014.032.44
710.24Very GoodDVVS1null605533.9742.45
720.3Very GoodHSI1null565544.294.272.7
730.3PremiumHSI1null595544.284.242.68
740.3PremiumHSI1null575544.294.252.67
750.3GoodHSI1null575544.284.262.72
760.26Very GoodFVVS2null605544.194.222.49
770.26Very GoodEVVS2null585544.154.232.51
780.26Very GoodDVVS2null545544.084.132.56
790.26Very GoodDVVS2null605544.014.052.53
800.26Very GoodEVVS1null595544.064.092.55
810.26Very GoodEVVS1null5955444.042.55
820.26Very GoodDVVS1null605544.034.122.53
830.26IdealEVVS2null585544.024.062.54
840.38IdealISI2null565544.654.672.87
850.26GoodEVVS1null605544.224.252.45
860.24PremiumGVVS1null595543.953.922.45
870.24PremiumHVVS1null585544.013.962.44
880.24PremiumHVVS1null595544.0242.44
890.24PremiumHVVS2null585544.074.042.46
900.32PremiumISI1null585544.354.332.73
910.7IdealESI1null5727575.75.723.57
920.86FairESI2null6927576.456.333.52
930.7IdealGVS2null5627575.75.673.5
940.71Very GoodEVS2null5727595.685.733.56
950.78Very GoodGSI2null5627595.815.853.72
960.7GoodEVS2null5827595.855.93.38
970.7GoodFVS1null6227595.715.763.4
980.96FairFSI2null6227596.275.954.07
990.73Very GoodESI1null5927605.775.783.56
1000.8PremiumHSI1null5827605.975.933.66

Showing the first 1000 rows.

// Reading a subset of columns that does not include the problematic depth column avoids the issue.
// There are no nullified rows.
display(diamonds_with_wrong_schema.select($"_c0", $"carat", $"clarity"))
10.23SI2
20.21SI1
30.23VS1
40.29VS2
50.31SI2
60.24VVS2
70.24VVS1
80.26SI1
90.22VS2
100.23VS1
110.3SI1
120.23VS1
130.22SI1
140.31SI2
150.2SI2
160.32I1
170.3SI2
180.3SI1
190.3SI1
200.3SI1
210.3SI2
220.23VS2
230.23VS1
240.31SI1
250.31SI1
260.23VVS2
270.24VS1
280.3VS2
290.23VS2
300.23VS1
310.23VS1
320.23VS1
330.23VS1
340.23VS1
350.23VS1
360.23VS1
370.23VS1
380.31SI1
390.26VS2
400.33SI2
410.33SI2
420.33SI1
430.26VS2
440.26VS1
450.32SI2
460.29SI1
470.32SI2
480.32SI2
490.25VS2
500.29SI2
510.24SI1
520.23VS1
530.32SI1
540.22VS2
550.22VS2
560.3SI2
570.3SI2
580.3SI1
590.3SI1
600.3SI1
610.35VS1
620.3SI1
630.3SI1
640.3SI1
650.42SI2
660.28VVS2
670.32VVS1
680.31SI1
690.31SI1
700.24VVS1
710.24VVS1
720.3SI1
730.3SI1
740.3SI1
750.3SI1
760.26VVS2
770.26VVS2
780.26VVS2
790.26VVS2
800.26VVS1
810.26VVS1
820.26VVS1
830.26VVS2
840.38SI2
850.26VVS1
860.24VVS1
870.24VVS1
880.24VVS1
890.24VVS2
900.32SI1
910.7SI1
920.86SI2
930.7VS2
940.71VS2
950.78SI2
960.7VS2
970.7VS1
980.96SI2
990.73SI1
1000.8SI1

Showing the first 1000 rows.

// However as soon as the depth column is included, the parsing and coversion fails for many rows. 
display(diamonds_with_wrong_schema.select($"_c0", $"carat", $"clarity", $"depth"))
10.23SI2null
20.21SI1null
30.23VS1null
40.29VS2null
50.31SI2null
60.24VVS2null
70.24VVS1null
80.26SI1null
90.22VS2null
100.23VS1null
110.3SI164
120.23VS1null
130.22SI1null
140.31SI2null
150.2SI2null
160.32I1null
170.3SI262
180.3SI1null
190.3SI1null
200.3SI1null
210.3SI2null
220.23VS2null
230.23VS161
240.31SI1null
250.31SI1null
260.23VVS2null
270.24VS1null
280.3VS2null
290.23VS2null
300.23VS1null
310.23VS160
320.23VS1null
330.23VS1null
340.23VS1null
350.23VS1null
360.23VS1null
370.23VS1null
380.31SI164
390.26VS2null
400.33SI2null
410.33SI2null
420.33SI1null
430.26VS2null
440.26VS1null
450.32SI2null
460.29SI1null
470.32SI2null
480.32SI2null
490.25VS2null
500.29SI2null
510.24SI1null
520.23VS1null
530.32SI1null
540.22VS2null
550.22VS2null
560.3SI261
570.3SI2null
580.3SI1null
590.3SI163
600.3SI1null
610.35VS1null
620.3SI1null
630.3SI1null
640.3SI1null
650.42SI2null
660.28VVS2null
670.32VVS162
680.31SI1null
690.31SI1null
700.24VVS1null
710.24VVS1null
720.3SI1null
730.3SI1null
740.3SI1null
750.3SI1null
760.26VVS2null
770.26VVS2null
780.26VVS2null
790.26VVS2null
800.26VVS1null
810.26VVS1null
820.26VVS1null
830.26VVS2null
840.38SI2null
850.26VVS1null
860.24VVS1null
870.24VVS1null
880.24VVS1null
890.24VVS2null
900.32SI1null
910.7SI1null
920.86SI2null
930.7VS2null
940.71VS2null
950.78SI2null
960.7VS2null
970.7VS1null
980.96SI2null
990.73SI1null
1000.8SI1null

Showing the first 1000 rows.

diamonds_with_wrong_schema.createOrReplaceTempView("diamonds_with_wrong_schema")
%sql
-- Reading all the columns hits the problem with the depth column. We can see a lot of nullified rows.
SELECT * FROM diamonds_with_wrong_schema WHERE _c0 is null
OK
%sql
-- Running a very similar query with a count(_c0) aggregate instead of "*" does return 0 contrary to the output of the query above.
-- In this case, the depth column doesn't need to be touched. Therefore, the parsing and coversion errors do not occur and no rows are nullified.
SELECT count(_c0) FROM diamonds_with_wrong_schema WHERE _c0 is null
0
%sql
-- The same applies for this similar query.
SELECT count(1) FROM diamonds_with_wrong_schema WHERE _c0 is null
0
%sql
-- The same applies for this similar query.
SELECT count(*) FROM diamonds_with_wrong_schema WHERE _c0 is null
0