Reported by brkyvz. Original description below:
The generated code used by `pmod` conflicts with DataFrameWriter.partitionBy
Quick repro:
import org.apache.spark.sql.functions._
case class Test(a: Int, b: String)
val ds = Seq(Test(0, "a"), Test(1, "b"), Test(1, "a")).toDS.createOrReplaceTempView("test")
sql(""" select * from test distribute by pmod(a, 2) """)
  .write
  .partitionBy("b")
  .mode("overwrite")
  .parquet("/tmp/repro")
You can also reproduce this by calling `repartition` with the `pmod` function instead of using `pmod` inside `distribute by` in SQL.
Example generated code (note that the local variable `r` is declared twice in the same method, at generated lines 045 and 055):
/* 025 */   public UnsafeRow apply(InternalRow i) {
/* 026 */     int value1 = 42;
/* 027 */
/* 028 */     boolean isNull2 = i.isNullAt(0);
/* 029 */     UTF8String value2 = isNull2 ? null : (i.getUTF8String(0));
/* 030 */     if (!isNull2) {
/* 031 */       value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value2.getBaseObject(), value2.getBaseOffset(), value2.numBytes(), value1);
/* 032 */     }
/* 033 */
/* 034 */
/* 035 */     int value4 = 42;
/* 036 */
/* 037 */     boolean isNull5 = i.isNullAt(1);
/* 038 */     UTF8String value5 = isNull5 ? null : (i.getUTF8String(1));
/* 039 */     if (!isNull5) {
/* 040 */       value4 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashUnsafeBytes(value5.getBaseObject(), value5.getBaseOffset(), value5.numBytes(), value4);
/* 041 */     }
/* 042 */
/* 043 */     int value3 = -1;
/* 044 */
/* 045 */     int r = value4 % 10;
/* 046 */     if (r < 0) {
/* 047 */       value3 = (r + 10) % 10;
/* 048 */     } else {
/* 049 */       value3 = r;
/* 050 */     }
/* 051 */     value1 = org.apache.spark.unsafe.hash.Murmur3_x86_32.hashInt(value3, value1);
/* 052 */
/* 053 */     int value = -1;
/* 054 */
/* 055 */     int r = value1 % 200;
/* 056 */     if (r < 0) {
/* 057 */       value = (r + 200) % 200;
/* 058 */     } else {
/* 059 */       value = r;
/* 060 */     }
/* 061 */     rowWriter.write(0, value);
/* 062 */     return result;
/* 063 */   }
/* 064 */ }
Issue Links
- relates to
SPARK-16489 Test harness to prevent expression code generation from reusing variable names
- Resolved
- links to