apache · jerryshao · Aug 2, 2024 · Jun 19, 2024 · Jul 24, 2024 · Jul 31, 2024
diff --git a/.github/workflows/spark-integration-test.yml b/.github/workflows/spark-integration-test.yml
@@ -90,11 +90,17 @@ jobs:
       - name: Spark Integration Test
         id: integrationTest
         run: |
+          # Running the Spark integration tests and the Spark SQL regression tests in one Gradle command causes odd problems, so they are run as separate commands.
           if [ "${{ matrix.scala-version }}" == "2.12" ];then 
-            ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.3:test --tests "org.apache.gravitino.spark.connector.integration.test.**"
+            ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.3:test --tests "org.apache.gravitino.spark.connector.integration.test.sql.**"
+             ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.3:test --tests "org.apache.gravitino.spark.connector.integration.test.**" -PskipSparkSQLITs
           fi
-          ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.4:test --tests "org.apache.gravitino.spark.connector.integration.test.**"
-          ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.5:test --tests "org.apache.gravitino.spark.connector.integration.test.**"
+
+          ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.4:test --tests "org.apache.gravitino.spark.connector.integration.test.**" -PskipSparkSQLITs
+          ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.4:test --tests "org.apache.gravitino.spark.connector.integration.test.sql.**"
+
+          ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.5:test --tests "org.apache.gravitino.spark.connector.integration.test.**" -PskipSparkSQLITs
+          ./gradlew -PskipTests -PtestMode=${{ matrix.test-mode }} -PjdkVersion=${{ matrix.java-version }} -PscalaVersion=${{ matrix.scala-version }} -PskipDockerTests=false :spark-connector:spark-3.5:test --tests "org.apache.gravitino.spark.connector.integration.test.sql.**"
 
       - name: Upload integrate tests reports
         uses: actions/upload-artifact@v3

diff --git a/build.gradle.kts b/build.gradle.kts
@@ -471,6 +471,7 @@ tasks.rat {
     "integration-test/**/*.sql",
     "integration-test/**/*.txt",
     "docs/**/*.md",
+    "spark-connector/spark-common/src/test/resources/**",
     "web/.**",
     "web/next-env.d.ts",
     "web/dist/**/*",

diff --git a/.../src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/CatalogType.java b/.../src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/CatalogType.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+/**
+ * Catalog kinds produced by {@link #fromString(String)}. {@code UNKNOWN} is the fallback for a
+ * null or unrecognized name.
+ */
+public enum CatalogType {
+  HIVE,
+  ICEBERG,
+  UNKNOWN;
+
+  /**
+   * Maps a catalog name to its type, ignoring case.
+   *
+   * @param str the catalog name; may be null
+   * @return {@code HIVE} for "hive", {@code ICEBERG} for "lakehouse-iceberg", and {@code UNKNOWN}
+   *     for null or any other value
+   */
+  public static CatalogType fromString(String str) {
+    if (str == null) {
+      return UNKNOWN;
+    }
+    // NOTE(review): toLowerCase() without a Locale uses the JVM default locale; confirm whether
+    // Locale.ROOT is wanted so the match does not vary by environment.
+    switch (str.toLowerCase()) {
+      case "hive":
+        return HIVE;
+      case "lakehouse-iceberg":
+        return ICEBERG;
+      default:
+        return UNKNOWN;
+    }
-    switch (str.toLowerCase()) {
-      case "hive":
-        return HIVE;
-      case "lakehouse-iceberg":
-        return ICEBERG;
-      default:
-        return UNKNOWN;
-    }
+for (CatalogType type : CatalogType.values())
+    if (type.name.equals(str.toUpperCase())) {
+        //....
+    }
+}
-    switch (str.toLowerCase()) {
-      case "hive":
-        return HIVE;
-      case "lakehouse-iceberg":
-        return ICEBERG;
-      default:
-        return UNKNOWN;
-    }
+for (CatalogType type : CatalogType.values())
+    if (type.name.equals(str.toUpperCase())) {
+        //....
+    }
+}
+  }
+
+  /**
+   * Picks the parent's type unless the parent is {@code UNKNOWN}.
+   *
+   * @param parentCatalogType the preferred type; must not be null because {@code equals} is
+   *     invoked on it
+   * @param childCatalogType the type returned when the parent is {@code UNKNOWN}
+   * @return {@code childCatalogType} if the parent is {@code UNKNOWN}, otherwise {@code
+   *     parentCatalogType}
+   */
+  public static CatalogType merge(CatalogType parentCatalogType, CatalogType childCatalogType) {
+    if (parentCatalogType.equals(UNKNOWN)) {
+      return childCatalogType;
+    } else {
+      return parentCatalogType;
+    }
+  }
+}
diff --git a/.../src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/QueryOutput.java b/.../src/test/java/org/apache/gravitino/spark/connector/integration/test/sql/QueryOutput.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+import lombok.Getter;
+
+/**
+ * The result of executing one SQL statement: the SQL text, the schema string, and the output
+ * string. Field accessors are generated by Lombok's {@code @Getter}.
+ */
+@Getter
+public class QueryOutput {
+  private String sql;
+  private String schema;
+  private String output;
+
+  /**
+   * @param sql the SQL text
+   * @param schema the schema string for the query
+   * @param output the output string for the query
+   */
+  public QueryOutput(String sql, String schema, String output) {
+    this.sql = sql;
+    this.schema = schema;
+    this.output = output;
+  }
+
+  /**
+   * Renders the three fields in order, each on the line after its marker: {@code -- !query},
+   * {@code -- !query schema}, {@code -- !query output}. No newline follows the output.
+   */
+  @Override
+  public String toString() {
+    return "-- !query\n"
+        + sql
+        + "\n"
+        + "-- !query schema\n"
+        + schema
+        + "\n"
+        + "-- !query output\n"
+        + output;
+  }
+}
diff --git a/...st/java/org/apache/gravitino/spark/connector/integration/test/sql/SQLQueryTestHelper.java b/...st/java/org/apache/gravitino/spark/connector/integration/test/sql/SQLQueryTestHelper.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.spark.connector.integration.test.sql;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.execution.HiveResult;
+import org.apache.spark.sql.execution.SQLExecution;
+import org.apache.spark.sql.types.StructType;
+import scala.Option;
+import scala.collection.JavaConverters;
+
+/**
+ * Static helpers that run a SQL statement on a {@link SparkSession} and normalize the textual
+ * result so it can be compared across runs.
+ */
+public class SQLQueryTestHelper {
+
+  // Placeholder substituted for values that differ between runs.
+  private static final String notIncludedMsg = "[not included in comparison]";
+  // Canonical name of this class; used below to recognize paths in the output.
+  private static final String clsName = SQLQueryTestHelper.class.getCanonicalName();
+  // Catalog string of an empty StructType; returned as the schema when execution fails.
+  private static final String emptySchema = new StructType().catalogString();
+
+  /**
+   * Rewrites the run-dependent parts of one output line to fixed tokens: {@code #<digits>} becomes
+   * {@code #x}, {@code plan_id=<digits>} becomes {@code plan_id=x}, "Location" and "file:" paths
+   * containing this class's name are replaced by the placeholder plus {@code {warehouse_dir}},
+   * the "Created By", "Created Time", "Last Access" and "Partition Statistics" values are replaced
+   * by the placeholder, trailing whitespace is removed, and {@code *(<digits>) } becomes {@code *}.
+   *
+   * @param line one line of query output
+   * @return the normalized line
+   */
+  private static String replaceNotIncludedMsg(String line) {
+    // NOTE(review): clsName is concatenated into the patterns unquoted, so the dots in the class
+    // name match any character; confirm whether Pattern.quote is wanted.
+    line =
+        line.replaceAll("#\\d+", "#x")
+            .replaceAll("plan_id=\\d+", "plan_id=x")
+            .replaceAll(
+                "Location.*" + clsName + "/", "Location " + notIncludedMsg + "/{warehouse_dir}/")
+            .replaceAll("file:[^\\s,]*" + clsName, "file:" + notIncludedMsg + "/{warehouse_dir}")
+            .replaceAll("Created By.*", "Created By " + notIncludedMsg)
+            .replaceAll("Created Time.*", "Created Time " + notIncludedMsg)
+            .replaceAll("Last Access.*", "Last Access " + notIncludedMsg)
+            .replaceAll("Partition Statistics\t\\d+", "Partition Statistics\t" + notIncludedMsg)
+            .replaceAll("\\s+$", "")
+            .replaceAll("\\*\\(\\d+\\) ", "*");
+    return line;
+  }
+
+  /**
+   * Executes {@code sql} and returns its schema together with the normalized output lines.
+   *
+   * <p>The output strings come from {@code HiveResult.hiveResultString} on the executed plan,
+   * evaluated inside {@code SQLExecution.withNewExecutionId}. Each string is normalized, empty
+   * strings are dropped, and the remaining strings are sorted in natural order.
+   *
+   * @param session the Spark session to run the statement on
+   * @param sql the SQL statement
+   * @return a pair of the schema's catalog string and the sorted, normalized output lines
+   */
+  public static Pair<String, List<String>> getNormalizedResult(SparkSession session, String sql) {
+    Dataset<Row> df = session.sql(sql);
+    String schema = df.schema().catalogString();
+    List<String> answer =
+        SQLExecution.withNewExecutionId(
+            df.queryExecution(),
+            Option.apply(""),
+            () ->
+                JavaConverters.seqAsJavaList(
+                        HiveResult.hiveResultString(df.queryExecution().executedPlan()))
+                    .stream()
+                    .map(s -> replaceNotIncludedMsg(s))
+                    .filter(s -> !s.isEmpty())
+                    .collect(Collectors.toList()));
+
+    Collections.sort(answer);
+
+    return Pair.of(schema, answer);
+  }
+
+  /**
+   * Evaluates {@code result} and converts any failure into a fixed marker. Different Spark
+   * versions may produce different exceptions, so a failure is reported as the empty schema plus
+   * the single line {@code [SPARK_EXCEPTION]} rather than the exception text.
+   *
+   * @param result supplier that produces the schema and output lines
+   * @return the supplier's value, or the empty schema with {@code [SPARK_EXCEPTION]} if it throws
+   */
+  public static Pair<String, List<String>> handleExceptions(
+      Supplier<Pair<String, List<String>>> result) {
+    try {
+      return result.get();
+    } catch (Throwable e) {
+      // NOTE(review): Throwable also covers Errors such as OutOfMemoryError, and the cause is
+      // discarded; confirm that this breadth is intended.
+      return Pair.of(emptySchema, Arrays.asList("[SPARK_EXCEPTION]"));
+    }
+  }
+}