
Commit b364990 (parent 6bd491e)

Add e2e test cases for codec.

6 files changed: +146 -1 lines


pom.xml

Lines changed: 7 additions & 0 deletions
@@ -300,6 +300,13 @@
         <enabled>false</enabled>
       </snapshots>
     </repository>
+
+    <repository>
+      <id>jitpack.io</id>
+      <url>https://jitpack.io</url>
+      <name>Jitpack.io repository</name>
+      <!-- needed for brotli-codec -->
+    </repository>
   </repositories>
   <pluginRepositories>
     <pluginRepository>
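
Note: JitPack builds Maven artifacts on demand from GitHub, mapping com.github.<user> coordinates to the matching repository, so the com.github.rdblue:brotli-codec dependency added below in sql/core/pom.xml resolves from https://jitpack.io rather than Maven Central; that is why the extra repository is needed.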

project/SparkBuild.scala

Lines changed: 3 additions & 1 deletion
@@ -274,7 +274,9 @@ object SparkBuild extends PomBuild {
       "gcs-maven-central-mirror" at "https://maven-central.storage-download.googleapis.com/maven2/",
       DefaultMavenRepository,
       Resolver.mavenLocal,
-      Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns)
+      Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns),
+      // needed for brotli-codec
+      "jitpack.io" at "https://jitpack.io"
     ),
     externalResolvers := resolvers.value,
     otherResolvers := SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))).value,
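
For reference outside Spark's build, the same wiring in a standalone build.sbt would look roughly like this (an illustrative sketch, not part of this commit):

// Sketch: JitPack resolver plus test-scoped dependency, mirroring the Maven change below.
resolvers += "jitpack.io" at "https://jitpack.io"
libraryDependencies += "com.github.rdblue" % "brotli-codec" % "0.1.1" % Test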

sql/core/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -178,6 +178,12 @@
       <artifactId>htmlunit-driver</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.github.rdblue</groupId>
+      <artifactId>brotli-codec</artifactId>
+      <version>0.1.1</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
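
With brotli-codec on the test classpath, Brotli becomes a usable Parquet compression codec. A minimal sketch of what the new tests exercise (the session setup and output path are illustrative, not from this commit):

import org.apache.spark.sql.SparkSession

// Hypothetical standalone run; the suites below set the session-wide
// spark.sql.parquet.compression.codec conf instead of a per-write option.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.range(100).toDF("id")
df.write
  .option("compression", "brotli") // requires brotli-codec on the classpath
  .parquet("/tmp/brotli-demo")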
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceCodecTest.scala (new file)

Lines changed: 66 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SQLTestUtils

abstract class DataSourceCodecTest extends QueryTest with SQLTestUtils {

  protected def dataSourceName: String
  protected val codecConfigName: String
  protected def availableCodecs: Seq[String]

  def testWithAllCodecs(name: String)(f: => Unit): Unit = {
    for (codec <- availableCodecs) {
      test(s"$name - data source $dataSourceName - codec: $codec") {
        withSQLConf(codecConfigName -> codec) {
          f
        }
      }
    }
  }

  testWithAllCodecs("write and read - single partition") {
    withTempPath { dir =>
      testData
        .repartition(1)
        .write
        .format(dataSourceName)
        .save(dir.getCanonicalPath)

      val df = spark.read.format(dataSourceName).load(dir.getCanonicalPath)
      checkAnswer(df, testData)
    }
  }

  testWithAllCodecs("write and read") {
    withTempPath { dir =>
      testData
        .repartition(5)
        .write
        .format(dataSourceName)
        .save(dir.getCanonicalPath)

      val df = spark.read.format(dataSourceName).load(dir.getCanonicalPath)
      checkAnswer(df, testData)
    }
  }
}
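
testWithAllCodecs expands into one test per codec, named like "write and read - data source parquet - codec: brotli", so a failing codec is pinpointed by test name. Extending the harness to another format needs only the three overrides; a hypothetical sketch for Avro (the external spark-avro module and its conf key are assumptions, not part of this commit):

// Hypothetical suite; "avro" requires the external spark-avro module,
// and spark.sql.avro.compression.codec is assumed as the conf key.
class AvroCodecTestSuite extends DataSourceCodecTest with SharedSparkSession {
  override def dataSourceName: String = "avro"
  override val codecConfigName = "spark.sql.avro.compression.codec"
  override protected def availableCodecs = Seq("uncompressed", "snappy", "deflate")
}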
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcCodecTestSuite.scala (new file)

Lines changed: 31 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.datasources.orc

import org.apache.spark.sql.execution.datasources.DataSourceCodecTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

class OrcCodecTestSuite extends DataSourceCodecTest with SharedSparkSession {

  override def dataSourceName: String = "orc"
  override val codecConfigName = SQLConf.ORC_COMPRESSION.key
  override protected def availableCodecs = Seq("none", "uncompressed", "snappy",
    "zlib", "zstd", "lz4", "lzo")
}
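
The conf this suite iterates can also be toggled by hand; one iteration of the loop looks roughly like this (a sketch, with spark and outputPath assumed to exist):

import org.apache.spark.sql.internal.SQLConf

// Equivalent to a single codec case of testWithAllCodecs for ORC.
spark.conf.set(SQLConf.ORC_COMPRESSION.key, "zstd") // spark.sql.orc.compression.codec
spark.range(10).write.orc(outputPath)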
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCodecTestSuite.scala (new file)

Lines changed: 33 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.datasources.parquet

import org.apache.spark.sql.execution.datasources.DataSourceCodecTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession

class ParquetCodecTestSuite extends DataSourceCodecTest with SharedSparkSession {

  override def dataSourceName: String = "parquet"
  override val codecConfigName = SQLConf.PARQUET_COMPRESSION.key
  // Exclude "lzo" because it is GPL-licensed and therefore not bundled with Hadoop.
  // TODO (SPARK-36669): Add "lz4" back after fixing it.
  override protected def availableCodecs: Seq[String] = Seq("none", "uncompressed", "snappy",
    "gzip", "brotli", "zstd")
}
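
Note that brotli-codec is declared with test scope, so Brotli round-trips are verified by these suites, but applications writing Brotli-compressed Parquet still need to provide the codec jar on their own runtime classpath.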
