
Commit 8667c28

pgandhi committed
Merge branch 'master' of https://github.com/pgandhi999/spark into SPARK-25250
[SPARK-25250]: Upmerging with master to fix unit tests
2 parents: 5ad6efd + 4506dad

9 files changed (+86, -5 lines)

core/src/test/scala/org/apache/spark/ui/PagedTableSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ class PagedTableSuite extends SparkFunSuite {
       override def goButtonFormPath: String = ""
     }
 
-    assert(pagedTable.pageNavigation(1, 10, 1) === Nil)
+    assert((pagedTable.pageNavigation(1, 10, 1).head \\ "li").map(_.text.trim) === Seq("1"))
     assert(
       (pagedTable.pageNavigation(1, 10, 2).head \\ "li").map(_.text.trim) === Seq("1", "2", ">"))
     assert(
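Note: the updated assertion replaces the old expectation that a single page yields no navigation (Nil); pageNavigation now always renders a pager, so the test inspects its markup instead. As a minimal sketch of the XML traversal the assertion relies on, using scala.xml's descendant selector (the <ul> literal below is illustrative, not the exact markup pageNavigation emits):

    import scala.xml.Elem

    val nav: Elem = <ul><li>1</li><li>2</li><li>&gt;</li></ul>
    // \\ "li" selects every <li> descendant; .text.trim extracts its label.
    val labels = (nav \\ "li").map(_.text.trim)  // Seq("1", "2", ">")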

docs/sql-data-sources-load-save-functions.md

Lines changed: 44 additions & 0 deletions
@@ -82,6 +82,50 @@ To load a CSV file you can use:
 </div>
 </div>
 
+The extra options are also used during write operation.
+For example, you can control bloom filters and dictionary encodings for ORC data sources.
+The following ORC example will create bloom filter and use dictionary encoding only for `favorite_color`.
+For Parquet, there exists `parquet.enable.dictionary`, too.
+To find more detailed information about the extra ORC/Parquet options,
+visit the official Apache ORC/Parquet websites.
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% include_example manual_save_options_orc python/sql/datasource.py %}
+</div>
+
+<div data-lang="r" markdown="1">
+{% include_example manual_save_options_orc r/RSparkSQLExample.R %}
+</div>
+
+<div data-lang="sql" markdown="1">
+
+{% highlight sql %}
+CREATE TABLE users_with_options (
+  name STRING,
+  favorite_color STRING,
+  favorite_numbers array<integer>
+) USING ORC
+OPTIONS (
+  orc.bloom.filter.columns 'favorite_color',
+  orc.dictionary.key.threshold '1.0',
+  orc.column.encoding.direct 'name'
+)
+{% endhighlight %}
+
+</div>
+
+</div>
+
 ### Run SQL on files directly
 
 Instead of using read API to load a file into DataFrame and query it, you can also query that
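Note: the prose above mentions parquet.enable.dictionary but the commit adds no Parquet example. For completeness, a hedged sketch of the analogous Parquet write (not part of the commit; the output path is illustrative):

    usersDF.write.format("parquet")
      .option("parquet.enable.dictionary", "true")
      .save("users_with_options.parquet")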

examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java

Lines changed: 7 additions & 0 deletions
@@ -123,6 +123,13 @@ private static void runBasicDataSourceExample(SparkSession spark) {
       .option("header", "true")
       .load("examples/src/main/resources/people.csv");
     // $example off:manual_load_options_csv$
+    // $example on:manual_save_options_orc$
+    usersDF.write().format("orc")
+      .option("orc.bloom.filter.columns", "favorite_color")
+      .option("orc.dictionary.key.threshold", "1.0")
+      .option("orc.column.encoding.direct", "name")
+      .save("users_with_options.orc");
+    // $example off:manual_save_options_orc$
     // $example on:direct_sql$
     Dataset<Row> sqlDF =
       spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");

examples/src/main/python/sql/datasource.py

Lines changed: 9 additions & 0 deletions
@@ -57,6 +57,15 @@ def basic_datasource_example(spark):
         format="csv", sep=":", inferSchema="true", header="true")
     # $example off:manual_load_options_csv$
 
+    # $example on:manual_save_options_orc$
+    df = spark.read.orc("examples/src/main/resources/users.orc")
+    (df.write.format("orc")
+        .option("orc.bloom.filter.columns", "favorite_color")
+        .option("orc.dictionary.key.threshold", "1.0")
+        .option("orc.column.encoding.direct", "name")
+        .save("users_with_options.orc"))
+    # $example off:manual_save_options_orc$
+
     # $example on:write_sorting_and_bucketing$
     df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
     # $example off:write_sorting_and_bucketing$

examples/src/main/r/RSparkSQLExample.R

Lines changed: 5 additions & 1 deletion
@@ -114,10 +114,14 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet")
 
 
 # $example on:manual_load_options_csv$
-df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T)
+df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferSchema = TRUE, header = TRUE)
 namesAndAges <- select(df, "name", "age")
 # $example off:manual_load_options_csv$
 
+# $example on:manual_save_options_orc$
+df <- read.df("examples/src/main/resources/users.orc", "orc")
+write.orc(df, "users_with_options.orc", orc.bloom.filter.columns = "favorite_color", orc.dictionary.key.threshold = 1.0, orc.column.encoding.direct = "name")
+# $example off:manual_save_options_orc$
 
 # $example on:direct_sql$
 df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")

examples/src/main/resources/users.orc

547 bytes (binary file not shown)

examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala

Lines changed: 7 additions & 0 deletions
@@ -56,6 +56,13 @@ object SQLDataSourceExample {
       .option("header", "true")
       .load("examples/src/main/resources/people.csv")
     // $example off:manual_load_options_csv$
+    // $example on:manual_save_options_orc$
+    usersDF.write.format("orc")
+      .option("orc.bloom.filter.columns", "favorite_color")
+      .option("orc.dictionary.key.threshold", "1.0")
+      .option("orc.column.encoding.direct", "name")
+      .save("users_with_options.orc")
+    // $example off:manual_save_options_orc$
 
     // $example on:direct_sql$
     val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
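Note: reading the file back needs no special options, since the writer properties above only shape the on-disk layout (bloom filter streams and column encodings), not the schema:

    val readBack = spark.read.orc("users_with_options.orc")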

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala

Lines changed: 5 additions & 3 deletions
@@ -171,7 +171,7 @@ object RowEncoder {
 
     if (inputObject.nullable) {
       If(IsNull(inputObject),
-        Literal.create(null, inputType),
+        Literal.create(null, nonNullOutput.dataType),
         nonNullOutput)
     } else {
       nonNullOutput
 
@@ -187,7 +187,9 @@ object RowEncoder {
     val convertedField = if (field.nullable) {
       If(
         Invoke(inputObject, "isNullAt", BooleanType, Literal(index) :: Nil),
-        Literal.create(null, field.dataType),
+        // Because we strip UDTs, `field.dataType` can be different from `fieldValue.dataType`.
+        // We should use `fieldValue.dataType` here.
+        Literal.create(null, fieldValue.dataType),
         fieldValue
       )
     } else {
 
@@ -198,7 +200,7 @@ object RowEncoder {
 
     if (inputObject.nullable) {
       If(IsNull(inputObject),
-        Literal.create(null, inputType),
+        Literal.create(null, nonNullOutput.dataType),
         nonNullOutput)
     } else {
       nonNullOutput
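Note: the comment added in the middle hunk is the heart of the fix. RowEncoder strips user-defined types down to their underlying SQL type, so the serializer expression for a UDT field produces values of udt.sqlType while field.dataType is still the UDT itself; typing the null branch of the If with field.dataType therefore gave the two branches different types. A minimal sketch of that UDT/sqlType split (Point and PointUDT are hypothetical, and UserDefinedType is internal, private[spark] API, so this only compiles inside Spark's own packages):

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.types._

    class Point(val x: Double, val y: Double) extends Serializable

    class PointUDT extends UserDefinedType[Point] {
      // The SQL-level type the serializer actually emits: a struct, not the UDT.
      override def sqlType: DataType =
        new StructType().add("x", DoubleType).add("y", DoubleType)
      override def serialize(p: Point): Any = InternalRow(p.x, p.y)
      override def deserialize(datum: Any): Point = datum match {
        case row: InternalRow => new Point(row.getDouble(0), row.getDouble(1))
      }
      override def userClass: Class[Point] = classOf[Point]
    }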

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/RowEncoderSuite.scala

Lines changed: 8 additions & 0 deletions
@@ -273,6 +273,14 @@ class RowEncoderSuite extends CodegenInterpretedPlanTest {
     assert(e4.getMessage.contains("java.lang.String is not a valid external type"))
   }
 
+  test("SPARK-25791: Datatype of serializers should be accessible") {
+    val udtSQLType = new StructType().add("a", IntegerType)
+    val pythonUDT = new PythonUserDefinedType(udtSQLType, "pyUDT", "serializedPyClass")
+    val schema = new StructType().add("pythonUDT", pythonUDT, true)
+    val encoder = RowEncoder(schema)
+    assert(encoder.serializer(0).dataType == pythonUDT.sqlType)
+  }
+
   for {
     elementType <- Seq(IntegerType, StringType)
     containsNull <- Seq(true, false)
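Note: the new test exercises exactly the mismatch fixed above. Before the fix, the serializer's If had a null branch typed with the UDT and a value branch typed with its sqlType, and an If with disagreeing branches cannot report a consistent dataType. A sketch of how catalyst flags such a branch mismatch (the literals are arbitrary):

    import org.apache.spark.sql.catalyst.expressions.{If, Literal}
    import org.apache.spark.sql.types._

    // Branch types (int vs. double) differ, so the type check fails.
    val mismatched = If(Literal(true), Literal.create(null, IntegerType), Literal(1.0))
    assert(!mismatched.checkInputDataTypes().isSuccess)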
