Description
// package com.verifyfilter.example;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class ExcludeInTesting {

    public static void main(String[] args) {

        SparkSession spark = SparkSession.builder()
                .appName("ExcludeInTesting")
                .config("spark.some.config.option", "some-value")
                .getOrCreate();

        Dataset<Row> dataReadFromCSV = spark.read().format("com.databricks.spark.csv")
                .option("header", "true")
                .option("delimiter", "|")
                .option("inferSchema", "true")
                //.load("E:/resources/customer.csv"); // local path
                // below path is for the VM
                .load("/home/myproject/bda/home/bin/customer.csv");

        dataReadFromCSV.printSchema();
        dataReadFromCSV.show();

        // Adding an extra step of saving to a table and then loading it again
        dataReadFromCSV.write().mode(SaveMode.Overwrite).saveAsTable("customer");
        Dataset<Row> dataLoaded = spark.sql("select * from customer");

        // Gender EQ M
        Column genderCol = dataLoaded.col("Gender");
        Dataset<Row> onlyMaleDS = dataLoaded.where(genderCol.equalTo("M"));
        //Dataset<Row> onlyMaleDS = spark.sql("select count(*) from customer where Gender='M'");
        onlyMaleDS.show();
        System.out.println("The count of Male customers is: " + onlyMaleDS.count());
        System.out.println("*************************************");

        // Income in the list
        Object[] valuesArray = new Object[5];
        valuesArray[0] = 503.65;
        valuesArray[1] = 495.54;
        valuesArray[2] = 486.82;
        valuesArray[3] = 481.28;
        valuesArray[4] = 479.79;

        Column incomeCol = dataLoaded.col("Income");
        Dataset<Row> incomeMatchingSet = dataLoaded.where(incomeCol.isin(valuesArray));
        System.out.println("The count of customers satisfying Income is: " + incomeMatchingSet.count());
        System.out.println("*************************************");

        // Rows in onlyMaleDS that are not present in incomeMatchingSet
        Dataset<Row> maleExcptIncomeMatch = onlyMaleDS.except(incomeMatchingSet);
        System.out.println("The count of final customers is: " + maleExcptIncomeMatch.count());
        System.out.println("*************************************");
    }
}
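For reference, one way to narrow the problem down (a diagnostic sketch, not part of the attached application code) is to apply the same filters directly to dataReadFromCSV, skipping the saveAsTable/spark.sql round-trip; if the counts then agree between Windows and Linux, the table write/read step is implicated. This reuses dataReadFromCSV and valuesArray from the code above:

        // Hypothetical diagnostic variant: the same Gender/Income filters,
        // but applied to the freshly read DataFrame, without the table round-trip.
        Dataset<Row> onlyMaleDirect =
                dataReadFromCSV.where(dataReadFromCSV.col("Gender").equalTo("M"));
        Dataset<Row> incomeMatchDirect =
                dataReadFromCSV.where(dataReadFromCSV.col("Income").isin(valuesArray));
        System.out.println("Direct male count: " + onlyMaleDirect.count());
        System.out.println("Direct income-match count: " + incomeMatchDirect.count());
        System.out.println("Direct final count: " + onlyMaleDirect.except(incomeMatchDirect).count());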
When the above code is executed on Spark 2.3.0, it gives different results on Windows and Linux:
Windows: the code gives the correct dataset count, 148237.
Linux: the code gives a different dataset count, 129532.
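As a cross-check on these counts, the final step can also be expressed as a single filter instead of except(). This is only a sketch, reusing dataLoaded, genderCol, incomeCol, and valuesArray from the code above, and it is not strictly equivalent: Dataset.except() has EXCEPT DISTINCT semantics (duplicate rows are removed), and rows with a NULL Income are kept by except() but dropped by the negated isin filter.

        // Cross-check sketch: males whose Income is NOT in the list, as one filter.
        // Requires: import static org.apache.spark.sql.functions.not;
        // A mismatch with the except() count may also just indicate duplicate
        // or NULL-Income rows, not necessarily a filtering bug.
        Dataset<Row> maleNotMatching =
                dataLoaded.where(genderCol.equalTo("M").and(not(incomeCol.isin(valuesArray))));
        System.out.println("Filter-based final count: " + maleNotMatching.count());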
Additional information related to this bug:
1. Application Code (attached)
2. CSV file used(attached)
3. Windows spec
Windows 10, 64-bit OS
4. Linux spec (running on Oracle VM VirtualBox)
Specifications (as captured from VBox.log):
00:00:26.112908 VMMDev: Guest Additions information report: Version 5.0.32 r112930 '5.0.32_Ubuntu'
00:00:26.112996 VMMDev: Guest Additions information report: Interface = 0x00010004 osType = 0x00053100 (Linux >= 2.6, 64-bit)
5. Snapshots of output in both cases (attached)