diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7a6c49f9135d0..b2b6a38916eeb 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -14,28 +14,6 @@ on: required: true jobs: - # This is on the top to give the most visibility in case of failures - hadoop-2: - name: Hadoop 2 build - runs-on: ubuntu-20.04 - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - hadoop-2-coursier- - - name: Install Java 8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - name: Build with SBT - run: | - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile - # Build: build Spark and run the tests for specified modules. build: name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" @@ -45,7 +23,7 @@ jobs: fail-fast: false matrix: java: - - 1.8 + - 8 hadoop: - hadoop3.2 hive: @@ -71,26 +49,26 @@ jobs: include: # Hive tests - modules: hive - java: 1.8 + java: 8 hadoop: hadoop3.2 hive: hive2.3 included-tags: org.apache.spark.tags.SlowHiveTest comment: "- slow tests" - modules: hive - java: 1.8 + java: 8 hadoop: hadoop3.2 hive: hive2.3 excluded-tags: org.apache.spark.tags.SlowHiveTest comment: "- other tests" # SQL tests - modules: sql - java: 1.8 + java: 8 hadoop: hadoop3.2 hive: hive2.3 included-tags: org.apache.spark.tags.ExtendedSQLTest comment: "- slow tests" - modules: sql - java: 1.8 + java: 8 hadoop: hadoop3.2 hive: hive2.3 excluded-tags: org.apache.spark.tags.ExtendedSQLTest @@ -123,16 +101,10 @@ jobs: build/zinc-* build/scala-* build/*.jar + ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-maven- - name: Cache Coursier local repository uses: actions/cache@v2 with: @@ -140,7 +112,7 @@ jobs: key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - - name: Install JDK ${{ matrix.java }} + - name: Install Java ${{ matrix.java }} uses: actions/setup-java@v1 with: java-version: ${{ matrix.java }} @@ -163,9 +135,7 @@ jobs: run: | # Hive and SQL tests become flaky when running in parallel as it's too intensive. 
if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi - mkdir -p ~/.m2 ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - rm -rf ~/.m2/repository/org/apache/spark - name: Upload test results to report if: always() uses: actions/upload-artifact@v2 @@ -218,16 +188,10 @@ jobs: build/zinc-* build/scala-* build/*.jar + ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: pyspark-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - pyspark-maven- - name: Cache Coursier local repository uses: actions/cache@v2 with: @@ -250,24 +214,22 @@ jobs: # Run the tests. - name: Run tests run: | - mkdir -p ~/.m2 ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" - rm -rf ~/.m2/repository/org/apache/spark - name: Upload test results to report if: always() uses: actions/upload-artifact@v2 with: - name: test-results-${{ matrix.modules }}--1.8-hadoop3.2-hive2.3 + name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: failure() uses: actions/upload-artifact@v2 with: - name: unit-tests-log-${{ matrix.modules }}--1.8-hadoop3.2-hive2.3 + name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3 path: "**/target/unit-tests.log" sparkr: - name: Build modules - sparkr + name: "Build modules: sparkr" runs-on: ubuntu-20.04 container: image: dongjoon/apache-spark-github-action-image:20201025 @@ -294,16 +256,10 @@ jobs: build/zinc-* build/scala-* build/*.jar + ~/.sbt key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} restore-keys: | build- - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: sparkr-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - sparkr-maven- - name: Cache Coursier local repository uses: actions/cache@v2 with: @@ -313,18 +269,16 @@ jobs: sparkr-coursier- - name: Run tests run: | - mkdir -p ~/.m2 # The followings are also used by `r-lib/actions/setup-r` to avoid # R issues at docker environment export TZ=UTC export _R_CHECK_SYSTEM_CLOCK_=FALSE ./dev/run-tests --parallelism 2 --modules sparkr - rm -rf ~/.m2/repository/org/apache/spark - name: Upload test results to report if: always() uses: actions/upload-artifact@v2 with: - name: test-results-sparkr--1.8-hadoop3.2-hive2.3 + name: test-results-sparkr--8-hadoop3.2-hive2.3 path: "**/target/test-reports/*.xml" # Static analysis, and documentation build @@ -334,17 +288,37 @@ jobs: steps: - name: Checkout Spark repository uses: actions/checkout@v2 + # Cache local repositories. Note that GitHub Actions cache has a 2G limit. 
+ - name: Cache Scala, SBT, Maven and Zinc + uses: actions/cache@v2 + with: + path: | + build/apache-maven-* + build/zinc-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v2 + with: + path: ~/.cache/coursier + key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + docs-coursier- - name: Cache Maven local repository uses: actions/cache@v2 with: path: ~/.m2/repository - key: docs-maven-repo-${{ hashFiles('**/pom.xml') }} + key: docs-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | docs-maven- - - name: Install JDK 1.8 + - name: Install Java 8 uses: actions/setup-java@v1 with: - java-version: 1.8 + java-version: 8 - name: Install Python 3.6 uses: actions/setup-python@v2 with: @@ -395,8 +369,8 @@ jobs: cd docs jekyll build - java11: - name: Java 11 build + java-11: + name: Java 11 build with Maven runs-on: ubuntu-20.04 steps: - name: Checkout Spark repository @@ -416,12 +390,12 @@ jobs: run: | export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" export MAVEN_CLI_OPTS="--no-transfer-progress" - mkdir -p ~/.m2 + # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414. ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=11 install rm -rf ~/.m2/repository/org/apache/spark scala-213: - name: Scala 2.13 build + name: Scala 2.13 build with SBT runs-on: ubuntu-20.04 steps: - name: Checkout Spark repository @@ -433,11 +407,32 @@ jobs: key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | scala-213-coursier- - - name: Install Java 11 + - name: Install Java 8 uses: actions/setup-java@v1 with: - java-version: 11 + java-version: 8 - name: Build with SBT run: | ./dev/change-scala-version.sh 2.13 ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Djava.version=11 -Pscala-2.13 compile test:compile + + hadoop-2: + name: Hadoop 2 build with SBT + runs-on: ubuntu-20.04 + steps: + - name: Checkout Spark repository + uses: actions/checkout@v2 + - name: Cache Coursier local repository + uses: actions/cache@v2 + with: + path: ~/.cache/coursier + key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + hadoop-2-coursier- + - name: Install Java 8 + uses: actions/setup-java@v1 + with: + java-version: 8 + - name: Build with SBT + run: | + ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 404a6968ea429..b927a6b96b810 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -292,6 +292,7 @@ exportMethods("%<=>%", "floor", "format_number", "format_string", + "from_avro", "from_csv", "from_json", "from_unixtime", @@ -416,6 +417,7 @@ exportMethods("%<=>%", "timestamp_seconds", "toDegrees", "toRadians", + "to_avro", "to_csv", "to_date", "to_json", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index bcd798a8c31e2..039d28a3a37b6 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -361,6 +361,50 @@ NULL #' } NULL +#' Avro processing functions for Column operations +#' +#' Avro processing functions defined for \code{Column}. 
+#' +#' @param x Column to compute on. +#' @param jsonFormatSchema character Avro schema in JSON string format +#' @param ... additional argument(s) passed as parser options. +#' @name column_avro_functions +#' @rdname column_avro_functions +#' @family avro functions +#' @note Avro is built-in but external data source module since Spark 2.4. +#' Please deploy the application as per +#' \href{https://spark.apache.org/docs/latest/sql-data-sources-avro.html#deploying}{ +#' the deployment section +#' } of "Apache Avro Data Source Guide". +#' @examples +#' \dontrun{ +#' df <- createDataFrame(iris) +#' schema <- paste( +#' c( +#' '{"type": "record", "namespace": "example.avro", "name": "Iris", "fields": [', +#' '{"type": ["double", "null"], "name": "Sepal_Length"},', +#' '{"type": ["double", "null"], "name": "Sepal_Width"},', +#' '{"type": ["double", "null"], "name": "Petal_Length"},', +#' '{"type": ["double", "null"], "name": "Petal_Width"},', +#' '{"type": ["string", "null"], "name": "Species"}]}' +#' ), +#' collapse="\\n" +#' ) +#' +#' df_serialized <- select( +#' df, +#' alias(to_avro(alias(struct(column("*")), "fields")), "payload") +#' ) +#' +#' df_deserialized <- select( +#' df_serialized, +#' from_avro(df_serialized$payload, schema) +#' ) +#' +#' head(df_deserialized) +#' } +NULL + #' @details #' \code{lit}: A new Column is created to represent the literal value. #' If the parameter is a Column, it is returned unchanged. @@ -4547,3 +4591,60 @@ setMethod("vector_to_array", ) column(jc) }) + +#' @details +#' \code{from_avro} Converts a binary column of Avro format into its corresponding catalyst value. +#' The specified schema must match the read data, otherwise the behavior is undefined: +#' it may fail or return arbitrary result. +#' To deserialize the data with a compatible and evolved schema, the expected Avro schema can be +#' set via the option avroSchema. +#' +#' @rdname column_avro_functions +#' @aliases from_avro from_avro,Column-method +#' @note from_avro since 3.1.0 +setMethod("from_avro", + signature(x = "characterOrColumn"), + function(x, jsonFormatSchema, ...) { + x <- if (is.character(x)) { + column(x) + } else { + x + } + + options <- varargsToStrEnv(...) + jc <- callJStatic( + "org.apache.spark.sql.avro.functions", "from_avro", + x@jc, + jsonFormatSchema, + options + ) + column(jc) + }) + +#' @details +#' \code{to_avro} Converts a column into binary of Avro format. 
+#' +#' @rdname column_avro_functions +#' @aliases to_avro to_avro,Column-method +#' @note to_avro since 3.1.0 +setMethod("to_avro", + signature(x = "characterOrColumn"), + function(x, jsonFormatSchema = NULL) { + x <- if (is.character(x)) { + column(x) + } else { + x + } + + jc <- if (is.null(jsonFormatSchema)) { + callJStatic("org.apache.spark.sql.avro.functions", "to_avro", x@jc) + } else { + callJStatic( + "org.apache.spark.sql.avro.functions", + "to_avro", + x@jc, + jsonFormatSchema + ) + } + column(jc) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e372ae27e315a..1fe6599bf1b97 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -950,7 +950,6 @@ setGeneric("current_date", function(x = "missing") { standardGeneric("current_da #' @name NULL setGeneric("current_timestamp", function(x = "missing") { standardGeneric("current_timestamp") }) - #' @rdname column_datetime_diff_functions #' @name NULL setGeneric("datediff", function(y, x) { standardGeneric("datediff") }) @@ -1015,6 +1014,10 @@ setGeneric("expr", function(x) { standardGeneric("expr") }) #' @name NULL setGeneric("flatten", function(x) { standardGeneric("flatten") }) +#' @rdname column_avro_functions +#' @name NULL +setGeneric("from_avro", function(x, ...) { standardGeneric("from_avro") }) + #' @rdname column_datetime_diff_functions #' @name NULL setGeneric("from_utc_timestamp", function(y, x) { standardGeneric("from_utc_timestamp") }) @@ -1388,6 +1391,10 @@ setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") }) #' @name timestamp_seconds setGeneric("timestamp_seconds", function(x) { standardGeneric("timestamp_seconds") }) +#' @rdname column_avro_functions +#' @name NULL +setGeneric("to_avro", function(x, ...) { standardGeneric("to_avro") }) + #' @rdname column_collection_functions #' @name NULL setGeneric("transform_keys", function(x, f) { standardGeneric("transform_keys") }) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 3a0d359e2ae79..45de1ef1bd3d1 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1841,6 +1841,32 @@ test_that("column functions", { ) }) +test_that("avro column functions", { + skip_if_not( + grepl("spark-avro", sparkR.conf("spark.jars", "")), + "spark-avro jar not present" + ) + + schema <- '{"namespace": "example.avro", + "type": "record", + "name": "User", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "favorite_color", "type": ["string", "null"]} + ] + }' + + c0 <- column("foo") + c1 <- from_avro(c0, schema) + expect_s4_class(c1, "Column") + c2 <- from_avro("foo", schema) + expect_s4_class(c2, "Column") + c3 <- to_avro(c1) + expect_s4_class(c3, "Column") + c4 <- to_avro(c1, schema) + expect_s4_class(c4, "Column") +}) + test_that("column binary mathfunctions", { lines <- c("{\"a\":1, \"b\":5}", "{\"a\":2, \"b\":6}", diff --git a/R/run-tests.sh b/R/run-tests.sh index 51ca7d600caf0..edc2b2b60b60e 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -23,7 +23,18 @@ FAILED=0 LOGFILE=$FWDIR/unit-tests.out rm -f $LOGFILE -SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE +SPARK_AVRO_JAR_PATH=$(find $FWDIR/../external/avro/ 
-name "spark-avro*jar" -print | egrep -v "tests.jar|test-sources.jar|sources.jar|javadoc.jar") + +if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then + SPARK_JARS=$SPARK_AVRO_JAR_PATH +fi + +if [ -z "$SPARK_JARS" ]; then + SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE +else + SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE +fi + FAILED=$((PIPESTATUS[0]||$FAILED)) NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)" diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java index 82810dacdad84..9a71cf593e28c 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/ChunkFetchRequestHandler.java @@ -88,12 +88,14 @@ public void processFetchRequest( logger.trace("Received req from {} to fetch block {}", getRemoteAddress(channel), msg.streamChunkId); } - long chunksBeingTransferred = streamManager.chunksBeingTransferred(); - if (chunksBeingTransferred >= maxChunksBeingTransferred) { - logger.warn("The number of chunks being transferred {} is above {}, close the connection.", - chunksBeingTransferred, maxChunksBeingTransferred); - channel.close(); - return; + if (maxChunksBeingTransferred < Long.MAX_VALUE) { + long chunksBeingTransferred = streamManager.chunksBeingTransferred(); + if (chunksBeingTransferred >= maxChunksBeingTransferred) { + logger.warn("The number of chunks being transferred {} is above {}, close the connection.", + chunksBeingTransferred, maxChunksBeingTransferred); + channel.close(); + return; + } } ManagedBuffer buf; try { diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java index f178928006902..4a30f8de07827 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java @@ -124,12 +124,14 @@ private void processStreamRequest(final StreamRequest req) { req.streamId); } - long chunksBeingTransferred = streamManager.chunksBeingTransferred(); - if (chunksBeingTransferred >= maxChunksBeingTransferred) { - logger.warn("The number of chunks being transferred {} is above {}, close the connection.", - chunksBeingTransferred, maxChunksBeingTransferred); - channel.close(); - return; + if (maxChunksBeingTransferred < Long.MAX_VALUE) { + long chunksBeingTransferred = streamManager.chunksBeingTransferred(); + if (chunksBeingTransferred >= maxChunksBeingTransferred) { + logger.warn("The number of 
chunks being transferred {} is above {}, close the connection.", + chunksBeingTransferred, maxChunksBeingTransferred); + channel.close(); + return; + } } ManagedBuffer buf; try { diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 4d765481eb836..09fa91655fba5 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -21,7 +21,6 @@ import java.util.{Properties, Timer, TimerTask} import scala.collection.JavaConverters._ import scala.concurrent.duration._ -import scala.language.postfixOps import scala.util.{Failure, Success => ScalaSuccess, Try} import org.apache.spark.annotation.{Experimental, Since} diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index c3152d9225107..cdec1982b4487 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -35,7 +35,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.io.CompressionCodec import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} -import org.apache.spark.scheduler.{ExecutorCacheTaskLocation, MapStatus} +import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.MetadataFetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockId} import org.apache.spark.util._ diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index d68015454de9d..0440a9de6ab31 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1542,8 +1542,8 @@ class SparkContext(config: SparkConf) extends Logging { val schemeCorrectedURI = uri.getScheme match { case null => new File(path).getCanonicalFile.toURI case "local" => - logWarning("File with 'local' scheme is not supported to add to file server, since " + - "it is already available on every node.") + logWarning(s"File with 'local' scheme $path is not supported to add to file server, " + + s"since it is already available on every node.") return case _ => uri } diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index d7a09b599794e..136da80d48dee 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -24,13 +24,8 @@ import java.nio.charset.StandardCharsets.UTF_8 import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal -import org.json4s.JsonAST._ -import org.json4s.JsonDSL._ -import org.json4s.jackson.JsonMethods.{compact, render} - import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{BUFFER_SIZE, EXECUTOR_CORES} @@ -86,6 +81,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( private val conf = SparkEnv.get.conf protected val bufferSize: Int = conf.get(BUFFER_SIZE) private val reuseWorker = conf.get(PYTHON_WORKER_REUSE) + protected val simplifiedTraceback: Boolean = false // All the Python functions should have the same exec, version 
and envvars. protected val envVars: java.util.Map[String, String] = funcs.head.funcs.head.envVars @@ -133,6 +129,9 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( if (reuseWorker) { envVars.put("SPARK_REUSE_WORKER", "1") } + if (simplifiedTraceback) { + envVars.put("SPARK_SIMPLIFIED_TRACEBACK", "1") + } // SPARK-30299 this could be wrong with standalone mode when executor // cores might not be correct because it defaults to all cores on the box. val execCores = execCoresProp.map(_.toInt).getOrElse(conf.get(EXECUTOR_CORES)) diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 5a6fa507963f0..dc2587a62ae40 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -17,8 +17,6 @@ package org.apache.spark.api.python -import java.nio.ByteOrder -import java.nio.charset.StandardCharsets import java.util.{ArrayList => JArrayList} import scala.collection.JavaConverters._ diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala index 20ab6fc2f348d..41c66024272b9 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala @@ -19,7 +19,6 @@ package org.apache.spark.api.r import java.io._ -import org.apache.spark._ import org.apache.spark.broadcast.Broadcast /** diff --git a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala index 17733d99cd5bc..d76fb7f9a20b3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/deploy/JsonProtocol.scala @@ -22,7 +22,6 @@ import org.json4s.JsonDSL._ import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, WorkerStateResponse} import org.apache.spark.deploy.master._ -import org.apache.spark.deploy.master.RecoveryState.MasterState import org.apache.spark.deploy.worker.ExecutorRunner import org.apache.spark.resource.{ResourceInformation, ResourceRequirement} @@ -208,7 +207,8 @@ private[deploy] object JsonProtocol { * master * `completeddrivers` a list of Json objects of [[DriverInfo]] of the completed drivers * of the master - * `status` status of the master, see [[MasterState]] + * `status` status of the master, + * see [[org.apache.spark.deploy.master.RecoveryState.MasterState]] */ def writeMasterState(obj: MasterStateResponse): JObject = { val aliveWorkers = obj.workers.filter(_.isAlive()) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 9a316e8c5b5a9..4b17661496808 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -391,6 +391,7 @@ private[spark] class SparkSubmit extends Logging { downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr) }.orNull args.files = renameResourcesToLocalFS(args.files, localFiles) + args.pyFiles = renameResourcesToLocalFS(args.pyFiles, localPyFiles) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala b/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala index c659d32d16314..57b05ff245258 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala +++ 
b/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala @@ -19,7 +19,6 @@ package org.apache.spark.deploy.history import scala.collection.mutable -import org.apache.spark.SparkContext import org.apache.spark.deploy.history.EventFilter.FilterStatistics import org.apache.spark.internal.Logging import org.apache.spark.scheduler._ diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index e1b0fc5e45d6e..e5341aff8ce66 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -21,7 +21,7 @@ import java.io.{File, FileNotFoundException, IOException} import java.lang.{Long => JLong} import java.nio.file.Files import java.util.{Date, NoSuchElementException, ServiceLoader} -import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, ExecutorService, TimeUnit} import java.util.zip.ZipOutputStream import scala.collection.JavaConverters._ diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala b/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala index 58714f16e8417..1b8c7ff26e9f5 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HybridStore.scala @@ -17,7 +17,6 @@ package org.apache.spark.deploy.history -import java.io.IOException import java.util.Collection import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicBoolean diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index a582a5d045855..cccd3da323774 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -22,9 +22,7 @@ import java.util.{Date, Locale} import java.util.concurrent.{ScheduledFuture, TimeUnit} import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} -import scala.collection.mutable import scala.util.Random -import scala.util.control.NonFatal import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.{ApplicationDescription, DriverDescription, ExecutorState, SparkHadoopUtil} diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala index 035f9d379471c..af94bd6d9e0f2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala @@ -18,7 +18,6 @@ package org.apache.spark.deploy.master.ui import java.net.{InetAddress, NetworkInterface, SocketException} -import java.util.Locale import javax.servlet.http.{HttpServlet, HttpServletRequest, HttpServletResponse} import org.apache.spark.deploy.DeployMessages.{DecommissionWorkersOnHosts, MasterStateResponse, RequestMasterState} diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index b2bc6b3b68007..6a1fd57873c3a 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ 
b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -17,7 +17,6 @@ package org.apache.spark.executor -import java.io.File import java.net.URL import java.nio.ByteBuffer import java.util.Locale diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 1a0ad566633da..f7246448959e9 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -22,7 +22,7 @@ import java.lang.Thread.UncaughtExceptionHandler import java.lang.management.ManagementFactory import java.net.{URI, URL} import java.nio.ByteBuffer -import java.util.Properties +import java.util.{Locale, Properties} import java.util.concurrent._ import java.util.concurrent.atomic.AtomicBoolean import javax.annotation.concurrent.GuardedBy @@ -110,7 +110,9 @@ private[spark] class Executor( .build() Executors.newCachedThreadPool(threadFactory).asInstanceOf[ThreadPoolExecutor] } - private val executorSource = new ExecutorSource(threadPool, executorId) + private val schemes = conf.get(EXECUTOR_METRICS_FILESYSTEM_SCHEMES) + .toLowerCase(Locale.ROOT).split(",").map(_.trim).filter(_.nonEmpty) + private val executorSource = new ExecutorSource(threadPool, executorId, schemes) // Pool used for threads that supervise task killing / cancellation private val taskReaperPool = ThreadUtils.newDaemonCachedThreadPool("Task reaper") // For tasks which are in the process of being killed, this map holds the most recently created diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala index 50207aeb3ef6b..d2765d061d662 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorSource.scala @@ -27,7 +27,10 @@ import org.apache.hadoop.fs.FileSystem import org.apache.spark.metrics.source.Source private[spark] -class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends Source { +class ExecutorSource( + threadPool: ThreadPoolExecutor, + executorId: String, + fileSystemSchemes: Array[String]) extends Source { private def fileStats(scheme: String) : Option[FileSystem.Statistics] = FileSystem.getAllStatistics.asScala.find(s => s.getScheme.equals(scheme)) @@ -70,7 +73,7 @@ class ExecutorSource(threadPool: ThreadPoolExecutor, executorId: String) extends }) // Gauge for file system stats of this executor - for (scheme <- Array("hdfs", "file")) { + for (scheme <- fileSystemSchemes) { registerFileSystemStat(scheme, "read_bytes", _.getBytesRead(), 0L) registerFileSystemStat(scheme, "write_bytes", _.getBytesWritten(), 0L) registerFileSystemStat(scheme, "read_ops", _.getReadOps(), 0) diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 2bb1290963f87..4bc49514fc5ad 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -271,6 +271,13 @@ package object config { .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("0") + private[spark] val EXECUTOR_METRICS_FILESYSTEM_SCHEMES = + ConfigBuilder("spark.executor.metrics.fileSystemSchemes") + .doc("The file system schemes to report in executor metrics.") + .version("3.1.0") + .stringConf + .createWithDefaultString("file,hdfs") + 
private[spark] val EXECUTOR_JAVA_OPTIONS = ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS) .withPrepended(SparkLauncher.EXECUTOR_DEFAULT_JAVA_OPTIONS) diff --git a/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala b/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala index 62fbc166167d3..cafb39ea82ad9 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala @@ -22,7 +22,7 @@ import scala.reflect.ClassTag import org.apache.spark.TaskContext import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.client.StreamCallbackWithID -import org.apache.spark.storage.{BlockId, ShuffleBlockId, StorageLevel} +import org.apache.spark.storage.{BlockId, StorageLevel} private[spark] trait BlockDataManager { diff --git a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala index c7f5a97e35612..635efc3e22628 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockTransferService.scala @@ -23,7 +23,6 @@ import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag -import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, BlockStoreClient, DownloadFileManager} import org.apache.spark.storage.{BlockId, EncryptedManagedBuffer, StorageLevel} diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index 806fbf52795bc..828849812bbd1 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -19,9 +19,7 @@ package org.apache.spark.network.netty import java.io.IOException import java.nio.ByteBuffer -import java.util import java.util.{HashMap => JHashMap, Map => JMap} -import java.util.concurrent.CompletableFuture import scala.collection.JavaConverters._ import scala.concurrent.{Future, Promise} @@ -35,11 +33,11 @@ import org.apache.spark.ExecutorDeadException import org.apache.spark.internal.config import org.apache.spark.network._ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} -import org.apache.spark.network.client.{RpcResponseCallback, TransportClient, TransportClientBootstrap, TransportClientFactory} +import org.apache.spark.network.client.{RpcResponseCallback, TransportClientBootstrap} import org.apache.spark.network.crypto.{AuthClientBootstrap, AuthServerBootstrap} import org.apache.spark.network.server._ import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, OneForOneBlockFetcher, RetryingBlockFetcher} -import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, GetLocalDirsForExecutors, LocalDirsForExecutors, UploadBlock, UploadBlockStream} +import org.apache.spark.network.shuffle.protocol.{UploadBlock, UploadBlockStream} import org.apache.spark.network.util.JavaUtils import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.serializer.JavaSerializer diff --git a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala 
b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala index 324cba5b4de42..f0239cdd9136d 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ParallelCollectionRDD.scala @@ -19,7 +19,6 @@ package org.apache.spark.rdd import java.io._ -import scala.Serializable import scala.collection.Map import scala.collection.immutable.NumericRange import scala.collection.mutable.ArrayBuffer diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index 576a83f6ab4d9..5093a12777ad3 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -20,7 +20,6 @@ package org.apache.spark.rdd import java.io.{FileNotFoundException, IOException} import java.util.concurrent.TimeUnit -import scala.collection.mutable import scala.reflect.ClassTag import scala.util.control.NonFatal diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala b/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala index 482d9e94c6dd9..22d10a975ad0f 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala @@ -20,7 +20,6 @@ package org.apache.spark.resource import scala.collection.mutable import org.apache.spark.SparkException -import org.apache.spark.util.collection.OpenHashMap /** * Trait used to help executor/worker allocate resources. @@ -40,7 +39,7 @@ trait ResourceAllocator { * can be a multiple, such that each address can be allocated up to [[slotsPerAddress]] * times. * - * TODO Use [[OpenHashMap]] instead to gain better performance. + * TODO Use [[org.apache.spark.util.collection.OpenHashMap]] instead to gain better performance. 
*/ private lazy val addressAvailabilityMap = { mutable.HashMap(resourceAddresses.map(_ -> slotsPerAddress): _*) diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 5a9435653920f..837b2d80aace6 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -29,8 +29,8 @@ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.resource.ResourceDiscoveryPlugin import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{CPUS_PER_TASK, EXECUTOR_CORES, RESOURCES_DISCOVERY_PLUGIN, SPARK_TASK_PREFIX} -import org.apache.spark.internal.config.Tests.{RESOURCES_WARNING_TESTING, SKIP_VALIDATE_CORES_TESTING} +import org.apache.spark.internal.config.{EXECUTOR_CORES, RESOURCES_DISCOVERY_PLUGIN, SPARK_TASK_PREFIX} +import org.apache.spark.internal.config.Tests.{RESOURCES_WARNING_TESTING} import org.apache.spark.util.Utils /** diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 57e219999b0d0..b939e40f3b60c 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -26,9 +26,6 @@ import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, Buffer, HashMap, HashSet} import scala.util.Random -import com.google.common.base.Ticker -import com.google.common.cache.CacheBuilder - import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.executor.ExecutorMetrics diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index eda1cb52d4abc..e084453be0789 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -22,7 +22,6 @@ import java.nio.ByteBuffer import org.apache.spark.TaskState.TaskState import org.apache.spark.resource.{ResourceInformation, ResourceProfile} import org.apache.spark.rpc.RpcEndpointRef -import org.apache.spark.scheduler.ExecutorDecommissionInfo import org.apache.spark.scheduler.ExecutorLossReason import org.apache.spark.util.SerializableBuffer diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index 72460180f5908..d9b8eddcf8cd0 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -22,11 +22,9 @@ import java.util.concurrent.ConcurrentHashMap import scala.collection.JavaConverters._ import org.apache.spark._ -import org.apache.spark.internal.{config, Logging} -import org.apache.spark.scheduler.MapStatus +import org.apache.spark.internal.Logging import org.apache.spark.shuffle._ -import org.apache.spark.shuffle.api.{ShuffleDataIO, ShuffleExecutorComponents} -import org.apache.spark.util.Utils +import org.apache.spark.shuffle.api.ShuffleExecutorComponents import org.apache.spark.util.collection.OpenHashSet /** diff --git 
a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 5c6543fe28a18..affa85b76cf19 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -22,8 +22,7 @@ import java.util.{List => JList} import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap -import org.apache.spark.{JobExecutionStatus, SparkConf, SparkException} -import org.apache.spark.resource.ResourceProfileManager +import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.status.api.v1 import org.apache.spark.ui.scope._ import org.apache.spark.util.Utils diff --git a/core/src/main/scala/org/apache/spark/status/KVUtils.scala b/core/src/main/scala/org/apache/spark/status/KVUtils.scala index 45348be5c98b9..c79f2dcd86533 100644 --- a/core/src/main/scala/org/apache/spark/status/KVUtils.scala +++ b/core/src/main/scala/org/apache/spark/status/KVUtils.scala @@ -21,7 +21,6 @@ import java.io.File import scala.annotation.meta.getter import scala.collection.JavaConverters._ -import scala.language.implicitConversions import scala.reflect.{classTag, ClassTag} import com.fasterxml.jackson.annotation.JsonInclude diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 3909c02c5bb1f..924601f92c5b8 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -55,7 +55,6 @@ import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.serializer.{SerializerInstance, SerializerManager} import org.apache.spark.shuffle.{MigratableResolver, ShuffleManager, ShuffleWriteMetricsReporter} -import org.apache.spark.shuffle.{ShuffleManager, ShuffleWriteMetricsReporter} import org.apache.spark.storage.BlockManagerMessages.{DecommissionBlockManager, ReplicateBlock} import org.apache.spark.storage.memory._ import org.apache.spark.unsafe.Platform diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala index 9699515c626bf..7a55039db1b60 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerDecommissioner.scala @@ -18,7 +18,6 @@ package org.apache.spark.storage import java.io.IOException -import java.util.concurrent.ExecutorService import java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ @@ -28,7 +27,7 @@ import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config -import org.apache.spark.shuffle.{MigratableResolver, ShuffleBlockInfo} +import org.apache.spark.shuffle.ShuffleBlockInfo import org.apache.spark.storage.BlockManagerMessages.ReplicateBlock import org.apache.spark.util.ThreadUtils diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index b8c5cbd121861..a7532a9870fae 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -33,7 +33,7 @@ 
import org.apache.spark.{MapOutputTrackerMaster, SparkConf} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.{config, Logging} import org.apache.spark.network.shuffle.ExternalBlockStoreClient -import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcCallContext, RpcEndpointAddress, RpcEndpointRef, RpcEnv} +import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcCallContext, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.{CoarseGrainedClusterMessages, CoarseGrainedSchedulerBackend} import org.apache.spark.storage.BlockManagerMessages._ diff --git a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala index c0a135e04bac5..a3a528cddee37 100644 --- a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala @@ -27,7 +27,6 @@ import org.apache.hadoop.fs.viewfs.ViewFileSystem import org.apache.hadoop.hdfs.DistributedFileSystem import org.apache.spark._ -import org.apache.spark.annotation.Private import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.HiveCatalogMetrics @@ -45,8 +44,6 @@ private[spark] object HadoopFSUtils extends Logging { * @param paths Input paths to list * @param hadoopConf Hadoop configuration * @param filter Path filter used to exclude leaf files from result - * @param isRootLevel Whether the input paths are at the root level, i.e., they are the root - * paths as opposed to nested paths encountered during recursive calls of this. * @param ignoreMissingFiles Ignore missing files that happen during recursive listing * (e.g., due to race conditions) * @param ignoreLocality Whether to fetch data locality info when listing leaf files. If false, @@ -57,11 +54,22 @@ private[spark] object HadoopFSUtils extends Logging { * @param parallelismMax The maximum parallelism for listing. If the number of input paths is * larger than this value, parallelism will be throttled to this value * to avoid generating too many tasks. - * @param filterFun Optional predicate on the leaf files. Files who failed the check will be - * excluded from the results * @return for each input path, the set of discovered files for the path */ def parallelListLeafFiles( + sc: SparkContext, + paths: Seq[Path], + hadoopConf: Configuration, + filter: PathFilter, + ignoreMissingFiles: Boolean, + ignoreLocality: Boolean, + parallelismThreshold: Int, + parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = { + parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = true, + ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax) + } + + private def parallelListLeafFilesInternal( sc: SparkContext, paths: Seq[Path], hadoopConf: Configuration, @@ -70,8 +78,7 @@ private[spark] object HadoopFSUtils extends Logging { ignoreMissingFiles: Boolean, ignoreLocality: Boolean, parallelismThreshold: Int, - parallelismMax: Int, - filterFun: Option[String => Boolean] = None): Seq[(Path, Seq[FileStatus])] = { + parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = { // Short-circuits parallel listing when serial listing is likely to be faster. 
if (paths.size <= parallelismThreshold) { @@ -85,8 +92,7 @@ private[spark] object HadoopFSUtils extends Logging { ignoreLocality = ignoreLocality, isRootPath = isRootLevel, parallelismThreshold = parallelismThreshold, - parallelismMax = parallelismMax, - filterFun = filterFun) + parallelismMax = parallelismMax) (path, leafFiles) } } @@ -126,58 +132,16 @@ private[spark] object HadoopFSUtils extends Logging { ignoreMissingFiles = ignoreMissingFiles, ignoreLocality = ignoreLocality, isRootPath = isRootLevel, - filterFun = filterFun, parallelismThreshold = Int.MaxValue, parallelismMax = 0) (path, leafFiles) }.iterator - }.map { case (path, statuses) => - val serializableStatuses = statuses.map { status => - // Turn FileStatus into SerializableFileStatus so we can send it back to the driver - val blockLocations = status match { - case f: LocatedFileStatus => - f.getBlockLocations.map { loc => - SerializableBlockLocation( - loc.getNames, - loc.getHosts, - loc.getOffset, - loc.getLength) - } - - case _ => - Array.empty[SerializableBlockLocation] - } - - SerializableFileStatus( - status.getPath.toString, - status.getLen, - status.isDirectory, - status.getReplication, - status.getBlockSize, - status.getModificationTime, - status.getAccessTime, - blockLocations) - } - (path.toString, serializableStatuses) }.collect() } finally { sc.setJobDescription(previousJobDescription) } - // turn SerializableFileStatus back to Status - statusMap.map { case (path, serializableStatuses) => - val statuses = serializableStatuses.map { f => - val blockLocations = f.blockLocations.map { loc => - new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) - } - new LocatedFileStatus( - new FileStatus( - f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime, - new Path(f.path)), - blockLocations) - } - (new Path(path), statuses) - } + statusMap.toSeq } // scalastyle:off argcount @@ -197,7 +161,6 @@ private[spark] object HadoopFSUtils extends Logging { ignoreMissingFiles: Boolean, ignoreLocality: Boolean, isRootPath: Boolean, - filterFun: Option[String => Boolean], parallelismThreshold: Int, parallelismMax: Int): Seq[FileStatus] = { @@ -245,19 +208,11 @@ private[spark] object HadoopFSUtils extends Logging { Array.empty[FileStatus] } - def doFilter(statuses: Array[FileStatus]) = filterFun match { - case Some(shouldFilterOut) => - statuses.filterNot(status => shouldFilterOut(status.getPath.getName)) - case None => - statuses - } - - val filteredStatuses = doFilter(statuses) val allLeafStatuses = { - val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory) + val (dirs, topLevelFiles) = statuses.partition(_.isDirectory) val nestedFiles: Seq[FileStatus] = contextOpt match { case Some(context) if dirs.size > parallelismThreshold => - parallelListLeafFiles( + parallelListLeafFilesInternal( context, dirs.map(_.getPath), hadoopConf = hadoopConf, @@ -265,7 +220,6 @@ private[spark] object HadoopFSUtils extends Logging { isRootLevel = false, ignoreMissingFiles = ignoreMissingFiles, ignoreLocality = ignoreLocality, - filterFun = filterFun, parallelismThreshold = parallelismThreshold, parallelismMax = parallelismMax ).flatMap(_._2) @@ -279,7 +233,6 @@ private[spark] object HadoopFSUtils extends Logging { ignoreMissingFiles = ignoreMissingFiles, ignoreLocality = ignoreLocality, isRootPath = false, - filterFun = filterFun, parallelismThreshold = parallelismThreshold, parallelismMax = parallelismMax) } @@ -289,8 +242,7 @@ private[spark] object HadoopFSUtils extends Logging { } val missingFiles = 
mutable.ArrayBuffer.empty[String] - val filteredLeafStatuses = doFilter(allLeafStatuses) - val resolvedLeafStatuses = filteredLeafStatuses.flatMap { + val resolvedLeafStatuses = allLeafStatuses.flatMap { case f: LocatedFileStatus => Some(f) @@ -339,22 +291,4 @@ private[spark] object HadoopFSUtils extends Logging { resolvedLeafStatuses } // scalastyle:on argcount - - /** A serializable variant of HDFS's BlockLocation. */ - private case class SerializableBlockLocation( - names: Array[String], - hosts: Array[String], - offset: Long, - length: Long) - - /** A serializable variant of HDFS's FileStatus. */ - private case class SerializableFileStatus( - path: String, - length: Long, - isDir: Boolean, - blockReplication: Short, - blockSize: Long, - modificationTime: Long, - accessTime: Long, - blockLocations: Array[SerializableBlockLocation]) } diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 78206c51c1028..d45dc937910d9 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -23,7 +23,6 @@ import java.util.concurrent.locks.ReentrantLock import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor, Future} import scala.concurrent.duration.{Duration, FiniteDuration} -import scala.language.higherKinds import scala.util.control.NonFatal import com.google.common.util.concurrent.ThreadFactoryBuilder diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 7f1f3a71acab8..b743ab6507117 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -28,7 +28,7 @@ import java.nio.channels.{Channels, FileChannel, WritableByteChannel} import java.nio.charset.StandardCharsets import java.nio.file.Files import java.security.SecureRandom -import java.util.{Arrays, Locale, Properties, Random, UUID} +import java.util.{Locale, Properties, Random, UUID} import java.util.concurrent._ import java.util.concurrent.TimeUnit.NANOSECONDS import java.util.zip.GZIPInputStream diff --git a/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala b/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala index fae6c4af1240c..e6d3377120e56 100644 --- a/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark import scala.concurrent.duration._ -import scala.language.implicitConversions import org.scalatest.concurrent.Eventually._ import org.scalatest.matchers.must.Matchers diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index e1d4eff0a62cb..e47181719a9db 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -21,7 +21,7 @@ import scala.collection.mutable import scala.concurrent.duration._ import org.mockito.ArgumentMatchers.any -import org.mockito.Mockito.{mock, verify, when} +import org.mockito.Mockito.{mock, when} import org.scalatest.{BeforeAndAfterAll, PrivateMethodTester} import org.scalatest.concurrent.Eventually._ diff --git 
a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala index ac39f022d5ca6..7d07af4d7246b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala @@ -23,10 +23,9 @@ import scala.io.{Codec, Source} import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.json4s.jackson.JsonMethods.parse -import org.apache.spark.{SparkConf, SparkFunSuite, Success} +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.history.EventLogTestHelper.writeEventsToRollingWriter -import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.status.ListenerEventsTestHelper._ diff --git a/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala index 35de457ec48ce..be83ec12f92f5 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala @@ -21,7 +21,6 @@ import java.io.DataOutputStream import java.net.{HttpURLConnection, URL} import java.nio.charset.StandardCharsets import java.util.Date -import javax.servlet.http.HttpServletResponse import scala.collection.mutable.HashMap @@ -32,7 +31,6 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.deploy.DeployMessages.{DecommissionWorkersOnHosts, KillDriverResponse, RequestKillDriver} import org.apache.spark.deploy.DeployTestUtils._ import org.apache.spark.deploy.master._ -import org.apache.spark.internal.config.UI import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv} diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 8e58beff74290..31049d104e63d 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.executor -import java.io.{Externalizable, File, ObjectInput, ObjectOutput} +import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.lang.Thread.UncaughtExceptionHandler import java.nio.ByteBuffer import java.util.Properties @@ -41,7 +41,6 @@ import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.TaskState.TaskState import org.apache.spark.broadcast.Broadcast -import org.apache.spark.deploy.{SimpleApplicationTest, SparkSubmitSuite} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.memory.TestMemoryManager @@ -53,7 +52,7 @@ import org.apache.spark.scheduler.{DirectTaskResult, FakeTask, ResultTask, Task, import org.apache.spark.serializer.{JavaSerializer, SerializerInstance, SerializerManager} import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockManager, BlockManagerId} -import org.apache.spark.util.{LongAccumulator, UninterruptibleThread, Utils} +import org.apache.spark.util.{LongAccumulator, UninterruptibleThread} class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar with 
Eventually with PrivateMethodTester { diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala index fab7aea6c47aa..f1d7053c34594 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala @@ -29,7 +29,6 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils /** * Tests the correctness of diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala index fa1a75d076051..182c3c09e0524 100644 --- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala @@ -24,7 +24,7 @@ import scala.reflect.ClassTag import scala.util.Random import org.mockito.ArgumentMatchers.any -import org.mockito.Mockito.{mock, times, verify, when} +import org.mockito.Mockito.{mock, when} import org.scalatest.BeforeAndAfterEach import org.scalatest.matchers.must.Matchers import org.scalatest.matchers.should.Matchers._ diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index 278a72a7192d8..e8e8682e20ed4 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -26,10 +26,8 @@ import org.json4s.{DefaultFormats, Extraction} import org.apache.spark.{LocalSparkContext, SparkConf, SparkException, SparkFunSuite} import org.apache.spark.TestUtils._ import org.apache.spark.internal.config._ -import org.apache.spark.internal.config.Tests._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ -import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.util.Utils class ResourceUtilsSuite extends SparkFunSuite diff --git a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala index e4ec62f8efc5b..b7ac9ecac2387 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala @@ -25,7 +25,6 @@ import org.scalatest.concurrent.Eventually import org.scalatest.time.SpanSugar._ import org.apache.spark._ -import org.apache.spark.internal.config import org.apache.spark.internal.config.Tests.TEST_NO_STAGE_RETRY class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext with Eventually { diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index 47e37fc55cefe..65d51e57ee308 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -188,7 +188,6 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo } test("extra resources from executor") { - 
import TestUtils._ val execCores = 3 val conf = new SparkConf() diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 99be1faab8b85..58aa246b7358f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -42,7 +42,7 @@ import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.shuffle.{FetchFailedException, MetadataFetchFailedException} import org.apache.spark.storage.{BlockId, BlockManagerId, BlockManagerMaster} -import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, CallSite, LongAccumulator, ThreadUtils, Utils} +import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, CallSite, LongAccumulator, Utils} class DAGSchedulerEventProcessLoopTester(dagScheduler: DAGScheduler) extends DAGSchedulerEventProcessLoop(dagScheduler) { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 0c60c42c054cf..b6a59c8bbd944 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.internal.config import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfile, TaskResourceRequests} import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ -import org.apache.spark.util.{Clock, ManualClock, SystemClock} +import org.apache.spark.util.{Clock, ManualClock} class FakeSchedulerBackend extends SchedulerBackend { def start(): Unit = {} diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index e01e278f60205..a760dda3897df 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -1768,7 +1768,6 @@ class TaskSetManagerSuite } test("TaskSetManager passes task resource along") { - import TestUtils._ sc = new SparkContext("local", "test") sc.conf.set(TASK_GPU_ID.amountConf, "2") diff --git a/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala index 4a92cbcb85847..1c2326db6dc99 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala @@ -19,14 +19,12 @@ package org.apache.spark.scheduler import java.util.concurrent.Semaphore -import scala.concurrent.TimeoutException import scala.concurrent.duration._ -import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite, - TestUtils} +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite, TestUtils} import org.apache.spark.internal.config import org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend -import org.apache.spark.util.{RpcUtils, SerializableBuffer, ThreadUtils} +import org.apache.spark.util.ThreadUtils class WorkerDecommissionSuite extends SparkFunSuite with LocalSparkContext { diff --git 
a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala index 9c0699bc981f8..d2bf385e10796 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.storage import java.util.Properties -import scala.concurrent.{Await, ExecutionContext, Future} +import scala.concurrent.{ExecutionContext, Future} import scala.language.implicitConversions import scala.reflect.ClassTag diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 48e0d218c0e5c..d02d7f862df80 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.ui -import java.util.Locale import javax.servlet.http.HttpServletRequest import scala.xml.Node diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 857749e84764d..20624c743bc22 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -18,8 +18,7 @@ package org.apache.spark.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataOutput, DataOutputStream, File, - FileOutputStream, InputStream, PrintStream, SequenceInputStream} -import java.lang.{Double => JDouble, Float => JFloat} + FileOutputStream, PrintStream, SequenceInputStream} import java.lang.reflect.Field import java.net.{BindException, ServerSocket, URI} import java.nio.{ByteBuffer, ByteOrder} diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 0e892a927906a..7da330dfe1fbf 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -42,11 +42,11 @@ jquery.dataTables.1.10.20.min.js jquery.mustache.js jsonFormatter.min.css jsonFormatter.min.js -.*avsc -.*txt -.*json -.*data -.*log +.*\.avsc +.*\.txt +.*\.json +.*\.data +.*\.log pyspark-coverage-site/* cloudpickle/* join.py @@ -98,17 +98,17 @@ local-1430917381535_2 DESCRIPTION NAMESPACE test_support/* -.*Rd +.*\.Rd help/* html/* INDEX .lintr gen-java.* -.*avpr -.*parquet +.*\.avpr +.*\.parquet spark-deps-.* -.*csv -.*tsv +.*\.csv +.*\.tsv .*\.sql .Rbuildignore META-INF/* @@ -125,3 +125,11 @@ application_1578436911597_0052 config.properties app-20200706201101-0003 py.typed +_metadata +_SUCCESS +part-00000 +.*\.res +flights_tiny.txt.1 +over1k +over10k +exported_table/* diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 index 8c1ab9e3c1cfe..bcf05506855c5 100644 --- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -7,7 +7,7 @@ activation/1.1.1//activation-1.1.1.jar aircompressor/0.10//aircompressor-0.10.jar algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar -antlr4-runtime/4.7.1//antlr4-runtime-4.7.1.jar +antlr4-runtime/4.8-1//antlr4-runtime-4.8-1.jar aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar aopalliance/1.0//aopalliance-1.0.jar apacheds-i18n/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar @@ -155,26 +155,26 @@ jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client/4.11.1//kubernetes-client-4.11.1.jar 
-kubernetes-model-admissionregistration/4.11.1//kubernetes-model-admissionregistration-4.11.1.jar -kubernetes-model-apiextensions/4.11.1//kubernetes-model-apiextensions-4.11.1.jar -kubernetes-model-apps/4.11.1//kubernetes-model-apps-4.11.1.jar -kubernetes-model-autoscaling/4.11.1//kubernetes-model-autoscaling-4.11.1.jar -kubernetes-model-batch/4.11.1//kubernetes-model-batch-4.11.1.jar -kubernetes-model-certificates/4.11.1//kubernetes-model-certificates-4.11.1.jar -kubernetes-model-common/4.11.1//kubernetes-model-common-4.11.1.jar -kubernetes-model-coordination/4.11.1//kubernetes-model-coordination-4.11.1.jar -kubernetes-model-core/4.11.1//kubernetes-model-core-4.11.1.jar -kubernetes-model-discovery/4.11.1//kubernetes-model-discovery-4.11.1.jar -kubernetes-model-events/4.11.1//kubernetes-model-events-4.11.1.jar -kubernetes-model-extensions/4.11.1//kubernetes-model-extensions-4.11.1.jar -kubernetes-model-metrics/4.11.1//kubernetes-model-metrics-4.11.1.jar -kubernetes-model-networking/4.11.1//kubernetes-model-networking-4.11.1.jar -kubernetes-model-policy/4.11.1//kubernetes-model-policy-4.11.1.jar -kubernetes-model-rbac/4.11.1//kubernetes-model-rbac-4.11.1.jar -kubernetes-model-scheduling/4.11.1//kubernetes-model-scheduling-4.11.1.jar -kubernetes-model-settings/4.11.1//kubernetes-model-settings-4.11.1.jar -kubernetes-model-storageclass/4.11.1//kubernetes-model-storageclass-4.11.1.jar +kubernetes-client/4.12.0//kubernetes-client-4.12.0.jar +kubernetes-model-admissionregistration/4.12.0//kubernetes-model-admissionregistration-4.12.0.jar +kubernetes-model-apiextensions/4.12.0//kubernetes-model-apiextensions-4.12.0.jar +kubernetes-model-apps/4.12.0//kubernetes-model-apps-4.12.0.jar +kubernetes-model-autoscaling/4.12.0//kubernetes-model-autoscaling-4.12.0.jar +kubernetes-model-batch/4.12.0//kubernetes-model-batch-4.12.0.jar +kubernetes-model-certificates/4.12.0//kubernetes-model-certificates-4.12.0.jar +kubernetes-model-common/4.12.0//kubernetes-model-common-4.12.0.jar +kubernetes-model-coordination/4.12.0//kubernetes-model-coordination-4.12.0.jar +kubernetes-model-core/4.12.0//kubernetes-model-core-4.12.0.jar +kubernetes-model-discovery/4.12.0//kubernetes-model-discovery-4.12.0.jar +kubernetes-model-events/4.12.0//kubernetes-model-events-4.12.0.jar +kubernetes-model-extensions/4.12.0//kubernetes-model-extensions-4.12.0.jar +kubernetes-model-metrics/4.12.0//kubernetes-model-metrics-4.12.0.jar +kubernetes-model-networking/4.12.0//kubernetes-model-networking-4.12.0.jar +kubernetes-model-policy/4.12.0//kubernetes-model-policy-4.12.0.jar +kubernetes-model-rbac/4.12.0//kubernetes-model-rbac-4.12.0.jar +kubernetes-model-scheduling/4.12.0//kubernetes-model-scheduling-4.12.0.jar +kubernetes-model-settings/4.12.0//kubernetes-model-settings-4.12.0.jar +kubernetes-model-storageclass/4.12.0//kubernetes-model-storageclass-4.12.0.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar libfb303/0.9.3//libfb303-0.9.3.jar libthrift/0.12.0//libthrift-0.12.0.jar @@ -195,7 +195,6 @@ objenesis/2.6//objenesis-2.6.jar okhttp/3.12.12//okhttp-3.12.12.jar okio/1.14.0//okio-1.14.0.jar opencsv/2.3//opencsv-2.3.jar -openshift-model/4.11.1//openshift-model-4.11.1.jar orc-core/1.5.12//orc-core-1.5.12.jar orc-mapreduce/1.5.12//orc-mapreduce-1.5.12.jar orc-shims/1.5.12//orc-shims-1.5.12.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 index fcb993033221e..cd274bef7045b 100644 --- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -7,7 +7,7 @@ 
activation/1.1.1//activation-1.1.1.jar aircompressor/0.10//aircompressor-0.10.jar algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar -antlr4-runtime/4.7.1//antlr4-runtime-4.7.1.jar +antlr4-runtime/4.8-1//antlr4-runtime-4.8-1.jar aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar arpack_combined_all/0.1//arpack_combined_all-0.1.jar arrow-format/2.0.0//arrow-format-2.0.0.jar @@ -125,26 +125,26 @@ jsr305/3.0.0//jsr305-3.0.0.jar jta/1.1//jta-1.1.jar jul-to-slf4j/1.7.30//jul-to-slf4j-1.7.30.jar kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar -kubernetes-client/4.11.1//kubernetes-client-4.11.1.jar -kubernetes-model-admissionregistration/4.11.1//kubernetes-model-admissionregistration-4.11.1.jar -kubernetes-model-apiextensions/4.11.1//kubernetes-model-apiextensions-4.11.1.jar -kubernetes-model-apps/4.11.1//kubernetes-model-apps-4.11.1.jar -kubernetes-model-autoscaling/4.11.1//kubernetes-model-autoscaling-4.11.1.jar -kubernetes-model-batch/4.11.1//kubernetes-model-batch-4.11.1.jar -kubernetes-model-certificates/4.11.1//kubernetes-model-certificates-4.11.1.jar -kubernetes-model-common/4.11.1//kubernetes-model-common-4.11.1.jar -kubernetes-model-coordination/4.11.1//kubernetes-model-coordination-4.11.1.jar -kubernetes-model-core/4.11.1//kubernetes-model-core-4.11.1.jar -kubernetes-model-discovery/4.11.1//kubernetes-model-discovery-4.11.1.jar -kubernetes-model-events/4.11.1//kubernetes-model-events-4.11.1.jar -kubernetes-model-extensions/4.11.1//kubernetes-model-extensions-4.11.1.jar -kubernetes-model-metrics/4.11.1//kubernetes-model-metrics-4.11.1.jar -kubernetes-model-networking/4.11.1//kubernetes-model-networking-4.11.1.jar -kubernetes-model-policy/4.11.1//kubernetes-model-policy-4.11.1.jar -kubernetes-model-rbac/4.11.1//kubernetes-model-rbac-4.11.1.jar -kubernetes-model-scheduling/4.11.1//kubernetes-model-scheduling-4.11.1.jar -kubernetes-model-settings/4.11.1//kubernetes-model-settings-4.11.1.jar -kubernetes-model-storageclass/4.11.1//kubernetes-model-storageclass-4.11.1.jar +kubernetes-client/4.12.0//kubernetes-client-4.12.0.jar +kubernetes-model-admissionregistration/4.12.0//kubernetes-model-admissionregistration-4.12.0.jar +kubernetes-model-apiextensions/4.12.0//kubernetes-model-apiextensions-4.12.0.jar +kubernetes-model-apps/4.12.0//kubernetes-model-apps-4.12.0.jar +kubernetes-model-autoscaling/4.12.0//kubernetes-model-autoscaling-4.12.0.jar +kubernetes-model-batch/4.12.0//kubernetes-model-batch-4.12.0.jar +kubernetes-model-certificates/4.12.0//kubernetes-model-certificates-4.12.0.jar +kubernetes-model-common/4.12.0//kubernetes-model-common-4.12.0.jar +kubernetes-model-coordination/4.12.0//kubernetes-model-coordination-4.12.0.jar +kubernetes-model-core/4.12.0//kubernetes-model-core-4.12.0.jar +kubernetes-model-discovery/4.12.0//kubernetes-model-discovery-4.12.0.jar +kubernetes-model-events/4.12.0//kubernetes-model-events-4.12.0.jar +kubernetes-model-extensions/4.12.0//kubernetes-model-extensions-4.12.0.jar +kubernetes-model-metrics/4.12.0//kubernetes-model-metrics-4.12.0.jar +kubernetes-model-networking/4.12.0//kubernetes-model-networking-4.12.0.jar +kubernetes-model-policy/4.12.0//kubernetes-model-policy-4.12.0.jar +kubernetes-model-rbac/4.12.0//kubernetes-model-rbac-4.12.0.jar +kubernetes-model-scheduling/4.12.0//kubernetes-model-scheduling-4.12.0.jar +kubernetes-model-settings/4.12.0//kubernetes-model-settings-4.12.0.jar +kubernetes-model-storageclass/4.12.0//kubernetes-model-storageclass-4.12.0.jar leveldbjni-all/1.8//leveldbjni-all-1.8.jar 
libfb303/0.9.3//libfb303-0.9.3.jar libthrift/0.12.0//libthrift-0.12.0.jar @@ -165,7 +165,6 @@ objenesis/2.6//objenesis-2.6.jar okhttp/3.12.12//okhttp-3.12.12.jar okio/1.14.0//okio-1.14.0.jar opencsv/2.3//opencsv-2.3.jar -openshift-model/4.11.1//openshift-model-4.11.1.jar orc-core/1.5.12//orc-core-1.5.12.jar orc-mapreduce/1.5.12//orc-mapreduce-1.5.12.jar orc-shims/1.5.12//orc-shims-1.5.12.jar diff --git a/docs/_config.yml b/docs/_config.yml index 3be9807f81082..cd341063a1f92 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -26,3 +26,15 @@ SCALA_VERSION: "2.12.10" MESOS_VERSION: 1.0.0 SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark +# Before a new release, we should apply a new `apiKey` for the new Spark documentation +# on https://docsearch.algolia.com/. Otherwise, after release, the search results are always based +# on the latest documentation(https://spark.apache.org/docs/latest/) even when visiting the +# documentation of previous releases. +DOCSEARCH_SCRIPT: | + docsearch({ + apiKey: 'b18ca3732c502995563043aa17bc6ecb', + indexName: 'apache_spark', + inputSelector: '#docsearch-input', + enhancedSearchInput: true, + debug: false // Set debug to true if you want to inspect the dropdown + }); diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 65af17ed2e4a1..de98f29acf3b7 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -187,13 +187,7 @@
{{ page.title }}
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index // to your search input and display its results in a dropdown UI. If you want to find more // details on how works DocSearch, check the docs of DocSearch. - docsearch({ - apiKey: 'b18ca3732c502995563043aa17bc6ecb', - indexName: 'apache_spark', - inputSelector: '#docsearch-input', - enhancedSearchInput: true, - debug: false // Set debug to true if you want to inspect the dropdown - }); + {{site.DOCSEARCH_SCRIPT}} diff --git a/docs/monitoring.md b/docs/monitoring.md index a07a113445981..15a6cbd910210 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -1175,6 +1175,8 @@ This is the component with the largest amount of instrumented metrics These metrics are exposed by Spark executors. - namespace=executor (metrics are of type counter or gauge) + - **notes:** + - `spark.executor.metrics.fileSystemSchemes` (default: `file,hdfs`) determines the exposed file system metrics. - bytesRead.count - bytesWritten.count - cpuTime.count diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 578ab90fedfca..80591bd08650a 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -734,6 +734,38 @@ See the [configuration page](configuration.html) for information on Spark config 2.1.0 + + spark.mesos.dispatcher.queue + (none) + + Set the name of the dispatcher queue to which the application is submitted. + The specified queue must be added to the dispatcher with spark.mesos.dispatcher.queue.[QueueName]. + If no queue is specified, then the application is submitted to the "default" queue with 0.0 priority. + + 3.1.0 + + + spark.mesos.dispatcher.queue.[QueueName] + 0.0 + + Add a new queue for submitted drivers with the specified priority. + Higher numbers indicate higher priority. + The user can specify multiple queues to define a workload management policy for queued drivers in the dispatcher. + A driver can then be submitted to a specific queue with spark.mesos.dispatcher.queue. + By default, the dispatcher has a single queue with 0.0 priority (cannot be overridden). + It is possible to implement a consistent and overall workload management policy throughout the lifecycle of drivers + by mapping priority queues to weighted Mesos roles, and by specifying a + spark.mesos.role along with a spark.mesos.dispatcher.queue when submitting an application. + For example, with the URGENT Mesos role: +
+    spark.mesos.dispatcher.queue.URGENT=1.0
+
+    spark.mesos.dispatcher.queue=URGENT
+    spark.mesos.role=URGENT
+    
+ + 3.1.0 + spark.mesos.gpus.max 0 diff --git a/docs/sql-data-sources-avro.md b/docs/sql-data-sources-avro.md index 69b165ed28bae..9ecc6eb91da5a 100644 --- a/docs/sql-data-sources-avro.md +++ b/docs/sql-data-sources-avro.md @@ -88,8 +88,6 @@ Kafka key-value record will be augmented with some metadata, such as the ingesti * If the "value" field that contains your data is in Avro, you could use `from_avro()` to extract your data, enrich it, clean it, and then push it downstream to Kafka again or write it out to a file. * `to_avro()` can be used to turn structs into Avro records. This method is particularly useful when you would like to re-encode multiple columns into a single one when writing data out to Kafka. -Both functions are currently only available in Scala, Java, and Python. -
{% highlight scala %} @@ -183,6 +181,38 @@ query = output\ .option("topic", "topic2")\ .start() +{% endhighlight %} +
+
+{% highlight r %} + +# `from_avro` requires Avro schema in JSON string format. +jsonFormatSchema <- paste0(readLines("examples/src/main/resources/user.avsc"), collapse=" ") + +df <- read.stream( + "kafka", + kafka.bootstrap.servers = "host1:port1,host2:port2", + subscribe = "topic1" +) + +# 1. Decode the Avro data into a struct; +# 2. Filter by column `favorite_color`; +# 3. Encode the column `name` in Avro format. + +output <- select( + filter( + select(df, alias(from_avro("value", jsonFormatSchema), "user")), + column("user.favorite_color") == "red" + ), + alias(to_avro("user.name"), "value") +) + +write.stream( + output, + "kafka", + kafka.bootstrap.servers = "host1:port1,host2:port2", + topic = "topic2" +) {% endhighlight %}
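As an illustrative aside, the same decode/filter/re-encode round trip can be sketched in Scala with the `from_avro`/`to_avro` helpers from `org.apache.spark.sql.avro.functions`. This is a minimal sketch rather than part of the patch: it reuses the `user.avsc` schema and the placeholder Kafka endpoints from the examples above, and the application name and checkpoint path are made-up placeholders.

{% highlight scala %}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro.functions.{from_avro, to_avro}

val spark = SparkSession.builder().appName("AvroKafkaRoundTrip").getOrCreate()
import spark.implicits._

// `from_avro` requires the Avro schema in JSON string format.
val jsonFormatSchema = new String(
  Files.readAllBytes(Paths.get("examples/src/main/resources/user.avsc")),
  StandardCharsets.UTF_8)

val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("subscribe", "topic1")
  .load()

// 1. Decode the binary `value` column into a struct;
// 2. Filter by column `favorite_color`;
// 3. Re-encode the column `name` in Avro format.
val output = df
  .select(from_avro($"value", jsonFormatSchema).as("user"))
  .where($"user.favorite_color" === "red")
  .select(to_avro($"user.name").as("value"))

val query = output.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host1:port1,host2:port2")
  .option("topic", "topic2")
  .option("checkpointLocation", "/tmp/avro-kafka-checkpoint") // placeholder path
  .start()
{% endhighlight %}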
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index c2b36033e318e..fd7208615a09f 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -61,6 +61,27 @@ Spark SQL has three kinds of type conversions: explicit casting, type coercion, When `spark.sql.ansi.enabled` is set to `true`, explicit casting by `CAST` syntax throws a runtime exception for illegal cast patterns defined in the standard, e.g. casts from a string to an integer. On the other hand, `INSERT INTO` syntax throws an analysis exception when the ANSI mode enabled via `spark.sql.storeAssignmentPolicy=ANSI`. +The type conversion of Spark ANSI mode follows the syntax rules of section 6.13 "cast specification" in [ISO/IEC 9075-2:2011 Information technology — Database languages - SQL — Part 2: Foundation (SQL/Foundation)"](https://www.iso.org/standard/53682.html), except it specially allows the following + straightforward type conversions which are disallowed as per the ANSI standard: +* NumericType <=> BooleanType +* StringType <=> BinaryType + + The valid combinations of target data type and source data type in a `CAST` expression are given by the following table. +“Y” indicates that the combination is syntactically valid without restriction and “N” indicates that the combination is not valid. + +| From\To | NumericType | StringType | DateType | TimestampType | IntervalType | BooleanType | BinaryType | ArrayType | MapType | StructType | +|-----------|---------|--------|------|-----------|----------|---------|--------|-------|-----|--------| +| NumericType | Y | Y | N | N | N | Y | N | N | N | N | +| StringType | Y | Y | Y | Y | Y | Y | Y | N | N | N | +| DateType | N | Y | Y | Y | N | N | N | N | N | N | +| TimestampType | N | Y | Y | Y | N | N | N | N | N | N | +| IntervalType | N | Y | N | N | Y | N | N | N | N | N | +| BooleanType | Y | Y | N | N | N | Y | N | N | N | N | +| BinaryType | Y | N | N | N | N | N | Y | N | N | N | +| ArrayType | N | N | N | N | N | N | N | Y | N | N | +| MapType | N | N | N | N | N | N | N | N | Y | N | +| StructType | N | N | N | N | N | N | N | N | N | Y | + Currently, the ANSI mode affects explicit casting and assignment casting only. In future releases, the behaviour of type coercion might change along with the other two type conversion rules. @@ -112,12 +133,14 @@ SELECT * FROM t; The behavior of some SQL functions can be different under ANSI mode (`spark.sql.ansi.enabled=true`). - `size`: This function returns null for null input. - `element_at`: This function throws `ArrayIndexOutOfBoundsException` if using invalid indices. + - `element_at`: This function throws `NoSuchElementException` if key does not exist in map. - `elt`: This function throws `ArrayIndexOutOfBoundsException` if using invalid indices. ### SQL Operators The behavior of some SQL operators can be different under ANSI mode (`spark.sql.ansi.enabled=true`). - `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices. + - `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map. 
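As a quick, informal illustration of the ANSI-mode behaviours listed above (not taken from the patch itself), the following spark-shell snippet contrasts the default setting with `spark.sql.ansi.enabled=true`; the literal values are arbitrary.

{% highlight scala %}
// Default (non-ANSI) mode: an ill-formed string-to-int cast yields NULL.
spark.conf.set("spark.sql.ansi.enabled", "false")
spark.sql("SELECT CAST('abc' AS INT)").show()   // prints a single NULL

// ANSI mode: the same cast fails at runtime, and a missing map key throws.
spark.conf.set("spark.sql.ansi.enabled", "true")
spark.sql("SELECT CAST('abc' AS INT)").show()   // runtime exception on the illegal cast
spark.sql("SELECT map(1, 'a')[5]").show()       // NoSuchElementException: key 5 is absent
{% endhighlight %}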
### SQL Keywords diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 86d00cac9485f..487cb27b93fe8 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -41,7 +41,6 @@ object DeveloperApiExample { .builder .appName("DeveloperApiExample") .getOrCreate() - import spark.implicits._ // Prepare training data. val training = spark.createDataFrame(Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala index 2845028dd0814..7a7501ee84526 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RankingMetricsExample.scala @@ -30,7 +30,6 @@ object RankingMetricsExample { .builder .appName("RankingMetricsExample") .getOrCreate() - import spark.implicits._ // $example on$ // Read in the ratings data val ratings = spark.read.textFile("data/mllib/sample_movielens_data.txt").rdd.map { line => diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala index fde281087c267..b17b86c08314b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala @@ -42,8 +42,6 @@ object SparkSQLExample { .config("spark.some.config.option", "some-value") .getOrCreate() - // For implicit conversions like converting RDDs to DataFrames - import spark.implicits._ // $example off:init_session$ runBasicDataFrameExample(spark) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala index c685c89f0dfc8..09c849960c1b5 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala +++ b/external/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -27,7 +27,7 @@ import org.apache.avro.Schema.Type._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.util.RandomUUIDGenerator import org.apache.spark.sql.types._ -import org.apache.spark.sql.types.Decimal.{maxPrecisionForBytes, minBytesForPrecision} +import org.apache.spark.sql.types.Decimal.minBytesForPrecision /** * This object contains method that are used to convert sparkSQL schemas to avro schemas and vice diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index c9c6bcecac14e..d3bfb716f515c 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.Filter import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA, UTC} import org.apache.spark.sql.execution.{FormattedMode, SparkPlan} -import org.apache.spark.sql.execution.datasources.{CommonFileDataSourceSuite, DataSource, FilePartition, PartitionedFile} +import 
org.apache.spark.sql.execution.datasources.{CommonFileDataSourceSuite, DataSource, FilePartition} import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala index 9ad083f1cfde5..a1b0f7d22216b 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.kafka010 -import org.apache.kafka.common.TopicPartition - import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala index 6599e7e0fe707..c25b8b4e510a0 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala @@ -19,12 +19,9 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import org.apache.kafka.clients.consumer.ConsumerConfig - import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT -import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql.SparkSession import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory} import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset, ReadAllAvailable, ReadLimit, ReadMaxRows, SupportsAdmissionControl} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 6d30bd2a6d2cd..adcc20c25cb5f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal -import org.apache.kafka.clients.consumer.{Consumer, ConsumerConfig, KafkaConsumer, OffsetAndTimestamp} +import org.apache.kafka.clients.consumer.{Consumer, ConsumerConfig, OffsetAndTimestamp} import org.apache.kafka.common.TopicPartition import org.apache.spark.SparkEnv @@ -33,10 +33,12 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.util.{UninterruptibleThread, UninterruptibleThreadRunner} /** - * This class uses Kafka's own [[KafkaConsumer]] API to read data offsets from Kafka. + * This class uses Kafka's own [[org.apache.kafka.clients.consumer.KafkaConsumer]] API to + * read data offsets from Kafka. * The [[ConsumerStrategy]] class defines which Kafka topics and partitions should be read * by this source. These strategies directly correspond to the different consumption options - * in. This class is designed to return a configured [[KafkaConsumer]] that is used by the + * in. 
This class is designed to return a configured + * [[org.apache.kafka.clients.consumer.KafkaConsumer]] that is used by the * [[KafkaSource]] to query for the offsets. See the docs on * [[org.apache.spark.sql.kafka010.ConsumerStrategy]] * for more details. @@ -50,7 +52,8 @@ private[kafka010] class KafkaOffsetReader( driverGroupIdPrefix: String) extends Logging { /** - * [[UninterruptibleThreadRunner]] ensures that all [[KafkaConsumer]] communication called in an + * [[UninterruptibleThreadRunner]] ensures that all + * [[org.apache.kafka.clients.consumer.KafkaConsumer]] communication called in an * [[UninterruptibleThread]]. In the case of streaming queries, we are already running in an * [[UninterruptibleThread]], however for batch mode this is not the case. */ diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala index 413a0c4de8bea..69a66e2209773 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala @@ -17,13 +17,10 @@ package org.apache.spark.sql.kafka010 -import org.apache.kafka.common.TopicPartition - import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Network.NETWORK_TIMEOUT import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.StructType diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 57879c7ca31cf..71ccb5f952f0a 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -18,11 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.io._ -import java.nio.charset.StandardCharsets -import org.apache.commons.io.IOUtils -import org.apache.kafka.clients.consumer.ConsumerConfig import org.apache.kafka.common.TopicPartition import org.apache.spark.SparkContext @@ -35,7 +31,6 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.connector.read.streaming import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit, ReadMaxRows, SupportsAdmissionControl} import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.kafka010.KafkaSource._ import org.apache.spark.sql.kafka010.KafkaSourceProvider._ import org.apache.spark.sql.types._ diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 748d623a0a32a..3ace0874674b6 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -30,7 +30,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.kafka010.KafkaConfigUpdater import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import 
org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} import org.apache.spark.sql.connector.read.{Batch, Scan, ScanBuilder} import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream} import org.apache.spark.sql.connector.write.{BatchWrite, LogicalWriteInfo, SupportsTruncate, WriteBuilder} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index fe783ffe53a3b..08f673455d729 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -1178,7 +1178,6 @@ class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase { } testWithUninterruptibleThread("minPartitions is supported") { - import testImplicits._ val topic = newTopic() val tp = new TopicPartition(topic, 0) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index c5f3086b38c99..43ed4a8378a8c 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -26,7 +26,6 @@ import javax.security.auth.login.Configuration import scala.collection.JavaConverters._ import scala.io.Source -import scala.util.Random import scala.util.control.NonFatal import com.google.common.io.Files @@ -38,13 +37,12 @@ import org.apache.hadoop.minikdc.MiniKdc import org.apache.hadoop.security.UserGroupInformation import org.apache.kafka.clients.CommonClientConfigs import org.apache.kafka.clients.admin._ -import org.apache.kafka.clients.consumer.KafkaConsumer import org.apache.kafka.clients.producer._ import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.config.SaslConfigs import org.apache.kafka.common.network.ListenerName import org.apache.kafka.common.security.auth.SecurityProtocol.{PLAINTEXT, SASL_PLAINTEXT} -import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer} +import org.apache.kafka.common.serialization.StringSerializer import org.apache.kafka.common.utils.SystemTime import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} import org.apache.zookeeper.server.auth.SASLAuthenticationProvider diff --git a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala b/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala index 307a69f9b84c5..bc790418decd3 100644 --- a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala +++ b/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala @@ -36,7 +36,7 @@ import org.apache.kafka.common.security.auth.SecurityProtocol.{SASL_PLAINTEXT, S import org.apache.kafka.common.security.scram.ScramLoginModule import org.apache.kafka.common.security.token.delegation.DelegationToken -import org.apache.spark.{SparkConf, SparkEnv} +import 
org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.Logging diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala index ac81f92f86109..c0724909bc350 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala +++ b/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala @@ -19,8 +19,6 @@ package org.apache.spark.streaming.kafka010.mocks import java.util.concurrent.{ScheduledFuture, TimeUnit} -import scala.collection.mutable.PriorityQueue - import kafka.utils.Scheduler import org.apache.kafka.common.utils.Time import org.jmock.lib.concurrent.DeterministicScheduler diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala index 11e949536f2b6..770eb2d89d522 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala @@ -21,7 +21,6 @@ import java.util.concurrent._ import scala.util.control.NonFatal import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason import org.apache.spark.internal.Logging import org.apache.spark.streaming.Duration diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala index 8c3931a1c87fd..e778d083b3f70 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala @@ -50,8 +50,6 @@ private[kinesis] class KinesisInputDStream[T: ClassTag]( val metricsEnabledDimensions: Set[String] ) extends ReceiverInputDStream[T](_ssc) { - import KinesisReadConfigurations._ - private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala index d26acf924c0a3..7bc86c4871cfb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/package.scala @@ -17,8 +17,6 @@ package org.apache.spark.ml -import org.apache.spark.ml.attribute.{Attribute, AttributeGroup} - /** * ==ML attributes== * diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 95f37671e1399..9191b3ec4bc2b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -67,6 +67,10 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR * This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. * Only supports L2 regularization currently. 
* + * Since 3.1.0, it supports stacking instances into blocks and using GEMV for + * better performance. + * The block size will be 1.0 MB, if param maxBlockSizeInMB is set 0.0 by default. + * */ @Since("2.2.0") class LinearSVC @Since("2.2.0") ( @@ -154,7 +158,7 @@ class LinearSVC @Since("2.2.0") ( /** * Sets the value of param [[maxBlockSizeInMB]]. - * Default is 0.0. + * Default is 0.0, then 1.0 MB will be chosen. * * @group expertSetParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a43ad466a7c80..057196dd67a52 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -50,7 +50,7 @@ import org.apache.spark.util.VersionUtils private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol with HasStandardization with HasWeightCol with HasThreshold with HasAggregationDepth - with HasBlockSize { + with HasMaxBlockSizeInMB { import org.apache.spark.ml.classification.LogisticRegression.supportedFamilyNames @@ -245,7 +245,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6, fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5, - aggregationDepth -> 2, blockSize -> 1) + aggregationDepth -> 2, maxBlockSizeInMB -> 0.0) protected def usingBoundConstrainedOptimization: Boolean = { isSet(lowerBoundsOnCoefficients) || isSet(upperBoundsOnCoefficients) || @@ -276,6 +276,10 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas * * This class supports fitting traditional logistic regression model by LBFGS/OWLQN and * bound (box) constrained logistic regression model by LBFGSB. + * + * Since 3.1.0, it supports stacking instances into blocks and using GEMV/GEMM for + * better performance. + * The block size will be 1.0 MB, if param maxBlockSizeInMB is set 0.0 by default. */ @Since("1.2.0") class LogisticRegression @Since("1.2.0") ( @@ -426,22 +430,13 @@ class LogisticRegression @Since("1.2.0") ( def setUpperBoundsOnIntercepts(value: Vector): this.type = set(upperBoundsOnIntercepts, value) /** - * Set block size for stacking input data in matrices. - * If blockSize == 1, then stacking will be skipped, and each vector is treated individually; - * If blockSize > 1, then vectors will be stacked to blocks, and high-level BLAS routines - * will be used if possible (for example, GEMV instead of DOT, GEMM instead of GEMV). - * Recommended size is between 10 and 1000. An appropriate choice of the block size depends - * on the sparsity and dim of input datasets, the underlying BLAS implementation (for example, - * f2jBLAS, OpenBLAS, intel MKL) and its configuration (for example, number of threads). - * Note that existing BLAS implementations are mainly optimized for dense matrices, if the - * input dataset is sparse, stacking may bring no performance gain, the worse is possible - * performance regression. - * Default is 1. + * Sets the value of param [[maxBlockSizeInMB]]. + * Default is 0.0, then 1.0 MB will be chosen. 
* * @group expertSetParam */ @Since("3.1.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) + def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value) private def assertBoundConstrainedOptimizationParamsValid( numCoefficientSets: Int, @@ -495,31 +490,24 @@ class LogisticRegression @Since("1.2.0") ( this } - override protected[spark] def train(dataset: Dataset[_]): LogisticRegressionModel = { - val handlePersistence = dataset.storageLevel == StorageLevel.NONE - train(dataset, handlePersistence) - } - protected[spark] def train( - dataset: Dataset[_], - handlePersistence: Boolean): LogisticRegressionModel = instrumented { instr => + dataset: Dataset[_]): LogisticRegressionModel = instrumented { instr => instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, labelCol, weightCol, featuresCol, predictionCol, rawPredictionCol, probabilityCol, regParam, elasticNetParam, standardization, threshold, thresholds, maxIter, - tol, fitIntercept, blockSize) + tol, fitIntercept, maxBlockSizeInMB) + + if (dataset.storageLevel != StorageLevel.NONE) { + instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + + s"then cached during training. Be careful of double caching!") + } val instances = extractInstances(dataset) .setName("training instances") - if (handlePersistence && $(blockSize) == 1) { - instances.persist(StorageLevel.MEMORY_AND_DISK) - } - - var requestedMetrics = Seq("mean", "std", "count") - if ($(blockSize) != 1) requestedMetrics +:= "numNonZeros" val (summarizer, labelSummarizer) = Summarizer - .getClassificationSummarizers(instances, $(aggregationDepth), requestedMetrics) + .getClassificationSummarizers(instances, $(aggregationDepth), Seq("mean", "std", "count")) val numFeatures = summarizer.mean.size val histogram = labelSummarizer.histogram @@ -547,14 +535,13 @@ class LogisticRegression @Since("1.2.0") ( instr.logNamedValue("lowestLabelWeight", labelSummarizer.histogram.min.toString) instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString) instr.logSumOfWeights(summarizer.weightSum) - if ($(blockSize) > 1) { - val scale = 1.0 / summarizer.count / numFeatures - val sparsity = 1 - summarizer.numNonzeros.toArray.map(_ * scale).sum - instr.logNamedValue("sparsity", sparsity.toString) - if (sparsity > 0.5) { - instr.logWarning(s"sparsity of input dataset is $sparsity, " + - s"which may hurt performance in high-level BLAS.") - } + + var actualBlockSizeInMB = $(maxBlockSizeInMB) + if (actualBlockSizeInMB == 0) { + // TODO: for Multinomial logistic regression, take numClasses into account + actualBlockSizeInMB = InstanceBlock.DefaultBlockSizeInMB + require(actualBlockSizeInMB > 0, "inferred actual BlockSizeInMB must > 0") + instr.logNamedValue("actualBlockSizeInMB", actualBlockSizeInMB.toString) } val isMultinomial = checkMultinomial(numClasses) @@ -584,7 +571,6 @@ class LogisticRegression @Since("1.2.0") ( } else { Vectors.dense(if (numClasses == 2) Double.PositiveInfinity else Double.NegativeInfinity) } - if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist() return createModel(dataset, numClasses, coefMatrix, interceptVec, Array(0.0)) } @@ -636,14 +622,9 @@ class LogisticRegression @Since("1.2.0") ( Note that the intercept in scaled space and original space is the same; as a result, no scaling is needed. 
*/ - val (allCoefficients, objectiveHistory) = if ($(blockSize) == 1) { - trainOnRows(instances, featuresStd, numClasses, initialCoefWithInterceptMatrix, - regularization, optimizer) - } else { - trainOnBlocks(instances, featuresStd, numClasses, initialCoefWithInterceptMatrix, - regularization, optimizer) - } - if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist() + val (allCoefficients, objectiveHistory) = + trainImpl(instances, actualBlockSizeInMB, featuresStd, numClasses, + initialCoefWithInterceptMatrix, regularization, optimizer) if (allCoefficients == null) { val msg = s"${optimizer.getClass.getName} failed." @@ -949,40 +930,9 @@ class LogisticRegression @Since("1.2.0") ( initialCoefWithInterceptMatrix } - private def trainOnRows( - instances: RDD[Instance], - featuresStd: Array[Double], - numClasses: Int, - initialCoefWithInterceptMatrix: Matrix, - regularization: Option[L2Regularization], - optimizer: FirstOrderMinimizer[BDV[Double], DiffFunction[BDV[Double]]]) = { - val bcFeaturesStd = instances.context.broadcast(featuresStd) - val getAggregatorFunc = new LogisticAggregator(bcFeaturesStd, numClasses, $(fitIntercept), - checkMultinomial(numClasses))(_) - - val costFun = new RDDLossFunction(instances, getAggregatorFunc, - regularization, $(aggregationDepth)) - val states = optimizer.iterations(new CachedDiffFunction(costFun), - new BDV[Double](initialCoefWithInterceptMatrix.toArray)) - - /* - Note that in Logistic Regression, the objective history (loss + regularization) - is log-likelihood which is invariant under feature standardization. As a result, - the objective history from optimizer is the same as the one in the original space. - */ - val arrayBuilder = mutable.ArrayBuilder.make[Double] - var state: optimizer.State = null - while (states.hasNext) { - state = states.next() - arrayBuilder += state.adjustedValue - } - bcFeaturesStd.destroy() - - (if (state == null) null else state.x.toArray, arrayBuilder.result) - } - - private def trainOnBlocks( + private def trainImpl( instances: RDD[Instance], + actualBlockSizeInMB: Double, featuresStd: Array[Double], numClasses: Int, initialCoefWithInterceptMatrix: Matrix, @@ -996,9 +946,11 @@ class LogisticRegression @Since("1.2.0") ( val func = StandardScalerModel.getTransformFunc(Array.empty, inverseStd, false, true) iter.map { case Instance(label, weight, vec) => Instance(label, weight, func(vec)) } } - val blocks = InstanceBlock.blokify(standardized, $(blockSize)) + + val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong + val blocks = InstanceBlock.blokifyWithMaxMemUsage(standardized, maxMemUsage) .persist(StorageLevel.MEMORY_AND_DISK) - .setName(s"training blocks (blockSize=${$(blockSize)})") + .setName(s"training blocks (blockSizeInMB=$actualBlockSizeInMB)") val getAggregatorFunc = new BlockLogisticAggregator(numFeatures, numClasses, $(fitIntercept), checkMultinomial(numClasses))(_) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VarianceThresholdSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VarianceThresholdSelector.scala index cd245dd723348..2c7186015d400 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/VarianceThresholdSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VarianceThresholdSelector.scala @@ -17,13 +17,10 @@ package org.apache.spark.ml.feature -import scala.collection.mutable.ArrayBuilder - import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Since import org.apache.spark.ml._ -import 
org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NominalAttribute} import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ @@ -31,7 +28,7 @@ import org.apache.spark.ml.stat.Summarizer import org.apache.spark.ml.util._ import org.apache.spark.sql._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.sql.types.StructType /** diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala index 6ff970cc72dfd..ac63024768d77 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/package.scala @@ -17,8 +17,6 @@ package org.apache.spark.ml -import org.apache.spark.ml.feature.{HashingTF, IDF, IDFModel, VectorAssembler} - /** * == Feature transformers == * diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/AFTAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/AFTAggregator.scala index 8a5d7fe34e7a0..fd59b4b71c41b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/AFTAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/AFTAggregator.scala @@ -18,8 +18,8 @@ package org.apache.spark.ml.optim.aggregator import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg._ -import org.apache.spark.ml.regression.AFTPoint /** * AFTAggregator computes the gradient and loss for a AFT loss function, @@ -108,7 +108,7 @@ import org.apache.spark.ml.regression.AFTPoint private[ml] class AFTAggregator( bcFeaturesStd: Broadcast[Array[Double]], fitIntercept: Boolean)(bcCoefficients: Broadcast[Vector]) - extends DifferentiableLossAggregator[AFTPoint, AFTAggregator] { + extends DifferentiableLossAggregator[Instance, AFTAggregator] { protected override val dim: Int = bcCoefficients.value.size @@ -116,10 +116,10 @@ private[ml] class AFTAggregator( * Add a new training data to this AFTAggregator, and update the loss and gradient * of the objective function. * - * @param data The AFTPoint representation for one data point to be added into this aggregator. + * @param data The Instance representation for one data point to be added into this aggregator. * @return This AFTAggregator object. */ - def add(data: AFTPoint): this.type = { + def add(data: Instance): this.type = { val coefficients = bcCoefficients.value.toArray val intercept = coefficients(dim - 2) // sigma is the scale parameter of the AFT model @@ -127,7 +127,7 @@ private[ml] class AFTAggregator( val xi = data.features val ti = data.label - val delta = data.censor + val delta = data.weight require(ti > 0.0, "The lifetime or label should be greater than 0.") @@ -176,7 +176,7 @@ private[ml] class AFTAggregator( */ private[ml] class BlockAFTAggregator( fitIntercept: Boolean)(bcCoefficients: Broadcast[Vector]) - extends DifferentiableLossAggregator[(Matrix, Array[Double], Array[Double]), + extends DifferentiableLossAggregator[InstanceBlock, BlockAFTAggregator] { protected override val dim: Int = bcCoefficients.value.size @@ -196,16 +196,13 @@ private[ml] class BlockAFTAggregator( * * @return This BlockAFTAggregator object. 
*/ - def add(block: (Matrix, Array[Double], Array[Double])): this.type = { - val (matrix, labels, censors) = block - require(matrix.isTransposed) - require(numFeatures == matrix.numCols, s"Dimensions mismatch when adding new " + - s"instance. Expecting $numFeatures but got ${matrix.numCols}.") - require(labels.forall(_ > 0.0), "The lifetime or label should be greater than 0.") - - val size = matrix.numRows - require(labels.length == size && censors.length == size) + def add(block: InstanceBlock): this.type = { + require(block.matrix.isTransposed) + require(numFeatures == block.numFeatures, s"Dimensions mismatch when adding new " + + s"instance. Expecting $numFeatures but got ${block.numFeatures}.") + require(block.labels.forall(_ > 0.0), "The lifetime or label should be greater than 0.") + val size = block.size val intercept = coefficientsArray(dim - 2) // sigma is the scale parameter of the AFT model val sigma = math.exp(coefficientsArray(dim - 1)) @@ -216,26 +213,30 @@ private[ml] class BlockAFTAggregator( } else { Vectors.zeros(size).toDense } - BLAS.gemv(1.0, matrix, linear, 1.0, vec) + BLAS.gemv(1.0, block.matrix, linear, 1.0, vec) // in-place convert margins to gradient scales // then, vec represents gradient scales + var localLossSum = 0.0 var i = 0 var sigmaGradSum = 0.0 while (i < size) { - val ti = labels(i) - val delta = censors(i) + val ti = block.getLabel(i) + // here use Instance.weight to store censor for convenience + val delta = block.getWeight(i) val margin = vec(i) val epsilon = (math.log(ti) - margin) / sigma val expEpsilon = math.exp(epsilon) - lossSum += delta * math.log(sigma) - delta * epsilon + expEpsilon + localLossSum += delta * math.log(sigma) - delta * epsilon + expEpsilon val multiplier = (delta - expEpsilon) / sigma vec.values(i) = multiplier sigmaGradSum += delta + multiplier * sigma * epsilon i += 1 } + lossSum += localLossSum + weightSum += size - matrix match { + block.matrix match { case dm: DenseMatrix => BLAS.nativeBLAS.dgemv("N", dm.numCols, dm.numRows, 1.0, dm.values, dm.numCols, vec.values, 1, 1.0, gradientSumArray, 1) @@ -249,7 +250,6 @@ private[ml] class BlockAFTAggregator( if (fitIntercept) gradientSumArray(dim - 2) += vec.values.sum gradientSumArray(dim - 1) += sigmaGradSum - weightSum += size this } diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala index b1990f7c60f64..3d72512563154 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HingeAggregator.scala @@ -162,24 +162,26 @@ private[ml] class BlockHingeAggregator( // in-place convert dotProducts to gradient scales // then, vec represents gradient scales + var localLossSum = 0.0 var i = 0 while (i < size) { val weight = block.getWeight(i) if (weight > 0) { - weightSum += weight // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x))) // Therefore the gradient is -(2y - 1)*x val label = block.getLabel(i) val labelScaled = label + label - 1.0 val loss = (1.0 - labelScaled * vec(i)) * weight if (loss > 0) { - lossSum += loss + localLossSum += loss val gradScale = -labelScaled * weight vec.values(i) = gradScale } else { vec.values(i) = 0.0 } } else { vec.values(i) = 0.0 } i += 1 } + lossSum += localLossSum + weightSum += block.weightIter.sum // predictions are all correct, no gradient signal if (vec.values.forall(_ == 0)) return this diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala index 59ecc038e5569..35582dbc990e6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala @@ -167,7 +167,6 @@ private[ml] class BlockHuberAggregator( protected override val dim: Int = bcParameters.value.size private val numFeatures = if (fitIntercept) dim - 2 else dim - 1 - private val sigma = bcParameters.value(dim - 1) private val intercept = if (fitIntercept) bcParameters.value(dim - 2) else 0.0 // make transient so we do not serialize between aggregation stages @transient private lazy val linear = Vectors.dense(bcParameters.value.toArray.take(numFeatures)) @@ -187,7 +186,9 @@ private[ml] class BlockHuberAggregator( s"instance weights ${block.weightIter.mkString("[", ",", "]")} has to be >= 0.0") if (block.weightIter.forall(_ == 0)) return this + val size = block.size + val sigma = bcParameters.value(dim - 1) // vec here represents margins or dotProducts val vec = if (fitIntercept) { @@ -200,23 +201,23 @@ private[ml] class BlockHuberAggregator( // in-place convert margins to multipliers // then, vec represents multipliers var sigmaGradSum = 0.0 + var localLossSum = 0.0 var i = 0 while (i < size) { val weight = block.getWeight(i) if (weight > 0) { - weightSum += weight val label = block.getLabel(i) val margin = vec(i) val linearLoss = label - margin if (math.abs(linearLoss) <= sigma * epsilon) { - lossSum += 0.5 * weight * (sigma + math.pow(linearLoss, 2.0) / sigma) + localLossSum += 0.5 * weight * (sigma + math.pow(linearLoss, 2.0) / sigma) val linearLossDivSigma = linearLoss / sigma val multiplier = -1.0 * weight * linearLossDivSigma vec.values(i) = multiplier sigmaGradSum += 0.5 * weight * (1.0 - math.pow(linearLossDivSigma, 2.0)) } else { - lossSum += 0.5 * weight * + localLossSum += 0.5 * weight * (sigma + 2.0 * epsilon * math.abs(linearLoss) - sigma * epsilon * epsilon) val sign = if (linearLoss >= 0) -1.0 else 1.0 val multiplier = weight * sign * epsilon @@ -226,6 +227,8 @@ private[ml] class BlockHuberAggregator( } else { vec.values(i) = 0.0 } i += 1 } + lossSum += localLossSum + weightSum += block.weightIter.sum block.matrix match { case dm: DenseMatrix => diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LeastSquaresAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LeastSquaresAggregator.scala index fa3bda00d802d..d5e1ea980840b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LeastSquaresAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LeastSquaresAggregator.scala @@ -267,9 +267,6 @@ private[ml] class BlockLeastSquaresAggregator( val offset = if (fitIntercept) labelMean / labelStd - sum else 0.0 (Vectors.dense(coefficientsArray), offset) } - // do not use tuple assignment above because it will circumvent the @transient tag - @transient private lazy val effectiveCoefficientsVec = effectiveCoefAndOffset._1 - @transient private lazy val offset = effectiveCoefAndOffset._2 /** * Add a new training instance block to this BlockLeastSquaresAggregator, and update the loss @@ -286,7 +283,9 @@ private[ml] class BlockLeastSquaresAggregator( s"instance weights ${block.weightIter.mkString("[", ",", "]")} has to be >= 0.0") if (block.weightIter.forall(_ == 0)) return this + val size = block.size + val 
(effectiveCoefficientsVec, offset) = effectiveCoefAndOffset // vec here represents diffs val vec = new DenseVector(Array.tabulate(size)(i => offset - block.getLabel(i) / labelStd)) @@ -294,16 +293,18 @@ private[ml] class BlockLeastSquaresAggregator( // in-place convert diffs to multipliers // then, vec represents multipliers + var localLossSum = 0.0 var i = 0 while (i < size) { val weight = block.getWeight(i) val diff = vec(i) - lossSum += weight * diff * diff / 2 - weightSum += weight + localLossSum += weight * diff * diff / 2 val multiplier = weight * diff vec.values(i) = multiplier i += 1 } + lossSum += localLossSum + weightSum += block.weightIter.sum val gradSumVec = new DenseVector(gradientSumArray) BLAS.gemv(1.0, block.matrix.transpose, vec, 1.0, gradSumVec) diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala index a331122776b5c..2496c789f8da6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/LogisticAggregator.scala @@ -466,24 +466,26 @@ private[ml] class BlockLogisticAggregator( // in-place convert margins to multiplier // then, vec represents multiplier + var localLossSum = 0.0 var i = 0 while (i < size) { val weight = block.getWeight(i) if (weight > 0) { - weightSum += weight val label = block.getLabel(i) val margin = vec(i) if (label > 0) { // The following is equivalent to log(1 + exp(margin)) but more numerically stable. - lossSum += weight * Utils.log1pExp(margin) + localLossSum += weight * Utils.log1pExp(margin) } else { - lossSum += weight * (Utils.log1pExp(margin) - margin) + localLossSum += weight * (Utils.log1pExp(margin) - margin) } val multiplier = weight * (1.0 / (1.0 + math.exp(margin)) - label) vec.values(i) = multiplier } else { vec.values(i) = 0.0 } i += 1 } + lossSum += localLossSum + weightSum += block.weightIter.sum // predictions are all correct, no gradient signal if (vec.values.forall(_ == 0)) return @@ -514,10 +516,11 @@ private[ml] class BlockLogisticAggregator( // mat here represents margins, shape: S X C val mat = DenseMatrix.zeros(size, numClasses) if (fitIntercept) { + val localCoefficientsArray = coefficientsArray val offset = numClasses * numFeatures var j = 0 while (j < numClasses) { - val intercept = coefficientsArray(offset + j) + val intercept = localCoefficientsArray(offset + j) var i = 0 while (i < size) { mat.update(i, j, intercept); i += 1 } j += 1 @@ -527,13 +530,13 @@ private[ml] class BlockLogisticAggregator( // in-place convert margins to multipliers // then, mat represents multipliers + var localLossSum = 0.0 var i = 0 val tmp = Array.ofDim[Double](numClasses) val interceptGradSumArr = if (fitIntercept) Array.ofDim[Double](numClasses) else null while (i < size) { val weight = block.getWeight(i) if (weight > 0) { - weightSum += weight val label = block.getLabel(i) var maxMargin = Double.NegativeInfinity @@ -566,15 +569,17 @@ private[ml] class BlockLogisticAggregator( } if (maxMargin > 0) { - lossSum += weight * (math.log(sum) - marginOfLabel + maxMargin) + localLossSum += weight * (math.log(sum) - marginOfLabel + maxMargin) } else { - lossSum += weight * (math.log(sum) - marginOfLabel) + localLossSum += weight * (math.log(sum) - marginOfLabel) } } else { var j = 0; while (j < numClasses) { mat.update(i, j, 0.0); j += 1 } } i += 1 } + lossSum += localLossSum + weightSum += block.weightIter.sum // mat 
(multipliers): S X C, dense N // mat.transpose (multipliers): C X S, dense T diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 0640fe355fdd6..2f6b9c1e11aac 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -111,8 +111,8 @@ private[shared] object SharedParamsCodeGen { isValid = "ParamValidators.gt(0)", isExpertParam = true), ParamDesc[Double]("maxBlockSizeInMB", "Maximum memory in MB for stacking input data " + "into blocks. Data is stacked within partitions. If more than remaining data size in a " + - "partition then it is adjusted to the data size. If 0, try to infer an appropriate " + - "value. Must be >= 0.", + "partition then it is adjusted to the data size. Default 0.0 represents choosing " + + "optimal value, depends on specific algorithm. Must be >= 0.", Some("0.0"), isValid = "ParamValidators.gtEq(0.0)", isExpertParam = true) ) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 2fbda45a9e97a..425bf91fd00ba 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -570,10 +570,10 @@ trait HasBlockSize extends Params { trait HasMaxBlockSizeInMB extends Params { /** - * Param for Maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.. + * Param for Maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0.. * @group expertParam */ - final val maxBlockSizeInMB: DoubleParam = new DoubleParam(this, "maxBlockSizeInMB", "Maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.", ParamValidators.gtEq(0.0)) + final val maxBlockSizeInMB: DoubleParam = new DoubleParam(this, "maxBlockSizeInMB", "Maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. 
Must be >= 0.", ParamValidators.gtEq(0.0)) setDefault(maxBlockSizeInMB, 0.0) diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index a0e5924a7ee3a..088f6a682be82 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.JsonDSL._ -import org.apache.spark.{Dependency, Partitioner, ShuffleDependency, SparkContext, SparkException} +import org.apache.spark.{Partitioner, SparkException} import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.{Estimator, Model} diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/TopByKeyAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/TopByKeyAggregator.scala index 517179c0eb9ae..ed41169070c59 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/TopByKeyAggregator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/TopByKeyAggregator.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.recommendation -import scala.language.implicitConversions import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.{Encoder, Encoders} diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 3870a71a91a20..4d214dc74ed8b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -27,7 +27,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging import org.apache.spark.ml.PredictorParams -import org.apache.spark.ml.feature.StandardScalerModel +import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg._ import org.apache.spark.ml.optim.aggregator._ import org.apache.spark.ml.optim.loss.RDDLossFunction @@ -47,8 +47,8 @@ import org.apache.spark.storage.StorageLevel * Params for accelerated failure time (AFT) regression. */ private[regression] trait AFTSurvivalRegressionParams extends PredictorParams - with HasMaxIter with HasTol with HasFitIntercept with HasAggregationDepth with HasBlockSize - with Logging { + with HasMaxIter with HasTol with HasFitIntercept with HasAggregationDepth + with HasMaxBlockSizeInMB with Logging { /** * Param for censor column name. @@ -92,7 +92,8 @@ private[regression] trait AFTSurvivalRegressionParams extends PredictorParams setDefault(censorCol -> "censor", quantileProbabilities -> Array(0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99), - fitIntercept -> true, maxIter -> 100, tol -> 1E-6, aggregationDepth -> 2, blockSize -> 1) + fitIntercept -> true, maxIter -> 100, tol -> 1E-6, aggregationDepth -> 2, + maxBlockSizeInMB -> 0.0) /** Checks whether the input has quantiles column name. */ private[regression] def hasQuantilesCol: Boolean = { @@ -127,6 +128,10 @@ private[regression] trait AFTSurvivalRegressionParams extends PredictorParams * (see * Accelerated failure time model (Wikipedia)) * based on the Weibull distribution of the survival time. + * + * Since 3.1.0, it supports stacking instances into blocks and using GEMV for + * better performance. 
+ * The block size will be 1.0 MB, if param maxBlockSizeInMB is set 0.0 by default. */ @Since("1.6.0") class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: String) @@ -184,55 +189,39 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) /** - * Set block size for stacking input data in matrices. - * If blockSize == 1, then stacking will be skipped, and each vector is treated individually; - * If blockSize > 1, then vectors will be stacked to blocks, and high-level BLAS routines - * will be used if possible (for example, GEMV instead of DOT, GEMM instead of GEMV). - * Recommended size is between 10 and 1000. An appropriate choice of the block size depends - * on the sparsity and dim of input datasets, the underlying BLAS implementation (for example, - * f2jBLAS, OpenBLAS, intel MKL) and its configuration (for example, number of threads). - * Note that existing BLAS implementations are mainly optimized for dense matrices, if the - * input dataset is sparse, stacking may bring no performance gain, the worse is possible - * performance regression. - * Default is 1. + * Sets the value of param [[maxBlockSizeInMB]]. + * Default is 0.0, then 1.0 MB will be chosen. * * @group expertSetParam */ @Since("3.1.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) - - /** - * Extract [[featuresCol]], [[labelCol]] and [[censorCol]] from input dataset, - * and put it in an RDD with strong types. - */ - protected[ml] def extractAFTPoints(dataset: Dataset[_]): RDD[AFTPoint] = { - dataset.select(col($(featuresCol)), col($(labelCol)).cast(DoubleType), - col($(censorCol)).cast(DoubleType)).rdd.map { - case Row(features: Vector, label: Double, censor: Double) => - AFTPoint(features, label, censor) - } - } + def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value) override protected def train( dataset: Dataset[_]): AFTSurvivalRegressionModel = instrumented { instr => instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, labelCol, featuresCol, censorCol, predictionCol, quantilesCol, - fitIntercept, maxIter, tol, aggregationDepth, blockSize) + fitIntercept, maxIter, tol, aggregationDepth, maxBlockSizeInMB) instr.logNamedValue("quantileProbabilities.size", $(quantileProbabilities).length) - val instances = extractAFTPoints(dataset) - .setName("training instances") - - if ($(blockSize) == 1 && dataset.storageLevel == StorageLevel.NONE) { - instances.persist(StorageLevel.MEMORY_AND_DISK) + if (dataset.storageLevel != StorageLevel.NONE) { + instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + + s"then cached during training. 
Be careful of double caching!") } - var requestedMetrics = Seq("mean", "std", "count") - if ($(blockSize) != 1) requestedMetrics +:= "numNonZeros" + val instances = dataset.select(col($(featuresCol)), col($(labelCol)).cast(DoubleType), + col($(censorCol)).cast(DoubleType)) + .rdd.map { case Row(features: Vector, label: Double, censor: Double) => + require(censor == 1.0 || censor == 0.0, "censor must be 1.0 or 0.0") + // AFT does not support instance weighting, + // here use Instance.weight to store censor for convenience + Instance(label, censor, features) + }.setName("training instances") + val summarizer = instances.treeAggregate( - Summarizer.createSummarizerBuffer(requestedMetrics: _*))( - seqOp = (c: SummarizerBuffer, v: AFTPoint) => c.add(v.features), + Summarizer.createSummarizerBuffer("mean", "std", "count"))( + seqOp = (c: SummarizerBuffer, i: Instance) => c.add(i.features), combOp = (c1: SummarizerBuffer, c2: SummarizerBuffer) => c1.merge(c2), depth = $(aggregationDepth) ) @@ -241,14 +230,12 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S val numFeatures = featuresStd.length instr.logNumFeatures(numFeatures) instr.logNumExamples(summarizer.count) - if ($(blockSize) > 1) { - val scale = 1.0 / summarizer.count / numFeatures - val sparsity = 1 - summarizer.numNonzeros.toArray.map(_ * scale).sum - instr.logNamedValue("sparsity", sparsity.toString) - if (sparsity > 0.5) { - instr.logWarning(s"sparsity of input dataset is $sparsity, " + - s"which may hurt performance in high-level BLAS.") - } + + var actualBlockSizeInMB = $(maxBlockSizeInMB) + if (actualBlockSizeInMB == 0) { + actualBlockSizeInMB = InstanceBlock.DefaultBlockSizeInMB + require(actualBlockSizeInMB > 0, "inferred actual BlockSizeInMB must > 0") + instr.logNamedValue("actualBlockSizeInMB", actualBlockSizeInMB.toString) } if (!$(fitIntercept) && (0 until numFeatures).exists { i => @@ -268,12 +255,8 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S */ val initialParameters = Vectors.zeros(numFeatures + 2) - val (rawCoefficients, objectiveHistory) = if ($(blockSize) == 1) { - trainOnRows(instances, featuresStd, optimizer, initialParameters) - } else { - trainOnBlocks(instances, featuresStd, optimizer, initialParameters) - } - if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist() + val (rawCoefficients, objectiveHistory) = + trainImpl(instances, actualBlockSizeInMB, featuresStd, optimizer, initialParameters) if (rawCoefficients == null) { val msg = s"${optimizer.getClass.getName} failed." 
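[Reviewer note, not part of the patch] The MB-valued param is turned into a byte budget via `(actualBlockSizeInMB * 1024L * 1024L).ceil.toLong` and handed to `InstanceBlock.blokifyWithMaxMemUsage`, whose body is not shown in this diff. Below is a rough, self-contained sketch of the intended behavior, assuming greedy per-partition packing and a simplified Instance with Array[Double] features (the real class works on ml.linalg vectors and estimates sparse and dense sizes differently):

```scala
// Self-contained stand-in; the real Instance lives in org.apache.spark.ml.feature.
final case class Instance(label: Double, weight: Double, features: Array[Double])

object BlockifySketch {
  // Rough per-row footprint in bytes: label + weight + dense feature values.
  private def approxSizeInBytes(inst: Instance): Long =
    8L + 8L + 8L * inst.features.length

  // Greedily pack consecutive rows of one partition into groups whose estimated
  // footprint stays under maxMemUsage; every group keeps at least one row, so a
  // single oversized row still forms its own block.
  def blokify(iter: Iterator[Instance], maxMemUsage: Long): Iterator[Seq[Instance]] =
    new Iterator[Seq[Instance]] {
      override def hasNext: Boolean = iter.hasNext
      override def next(): Seq[Instance] = {
        val buf = scala.collection.mutable.ArrayBuffer.empty[Instance]
        var used = 0L
        while (iter.hasNext && (buf.isEmpty || used < maxMemUsage)) {
          val inst = iter.next()
          buf += inst
          used += approxSizeInBytes(inst)
        }
        buf.toSeq
      }
    }

  def main(args: Array[String]): Unit = {
    val actualBlockSizeInMB = 1.0 // the value inferred when maxBlockSizeInMB is 0.0
    val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong
    val rows = Iterator.tabulate(100000)(i =>
      Instance(label = i.toDouble, weight = 1.0, features = Array.fill(10)(1.0)))
    println(s"packed into ${blokify(rows, maxMemUsage).size} blocks under $maxMemUsage bytes")
  }
}
```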
@@ -290,47 +273,24 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S new AFTSurvivalRegressionModel(uid, coefficients, intercept, scale) } - private def trainOnRows( - instances: RDD[AFTPoint], + private def trainImpl( + instances: RDD[Instance], + actualBlockSizeInMB: Double, featuresStd: Array[Double], optimizer: BreezeLBFGS[BDV[Double]], initialParameters: Vector): (Array[Double], Array[Double]) = { val bcFeaturesStd = instances.context.broadcast(featuresStd) - val getAggregatorFunc = new AFTAggregator(bcFeaturesStd, $(fitIntercept))(_) - val costFun = new RDDLossFunction(instances, getAggregatorFunc, None, $(aggregationDepth)) - - val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialParameters.asBreeze.toDenseVector) - - val arrayBuilder = mutable.ArrayBuilder.make[Double] - var state: optimizer.State = null - while (states.hasNext) { - state = states.next() - arrayBuilder += state.adjustedValue - } - bcFeaturesStd.destroy() - - (if (state != null) state.x.toArray else null, arrayBuilder.result) - } - private def trainOnBlocks( - instances: RDD[AFTPoint], - featuresStd: Array[Double], - optimizer: BreezeLBFGS[BDV[Double]], - initialParameters: Vector): (Array[Double], Array[Double]) = { - val bcFeaturesStd = instances.context.broadcast(featuresStd) - val blocks = instances.mapPartitions { iter => + val standardized = instances.mapPartitions { iter => val inverseStd = bcFeaturesStd.value.map { std => if (std != 0) 1.0 / std else 0.0 } val func = StandardScalerModel.getTransformFunc(Array.empty, inverseStd, false, true) - iter.grouped($(blockSize)).map { seq => - val matrix = Matrices.fromVectors(seq.map(point => func(point.features))) - val labels = seq.map(_.label).toArray - val censors = seq.map(_.censor).toArray - (matrix, labels, censors) - } + iter.map { case Instance(label, weight, vec) => Instance(label, weight, func(vec)) } } - blocks.persist(StorageLevel.MEMORY_AND_DISK) - .setName(s"training blocks (blockSize=${$(blockSize)})") + + val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong + val blocks = InstanceBlock.blokifyWithMaxMemUsage(standardized, maxMemUsage) + .persist(StorageLevel.MEMORY_AND_DISK) + .setName(s"training blocks (blockSizeInMB=$actualBlockSizeInMB)") val getAggregatorFunc = new BlockAFTAggregator($(fitIntercept))(_) val costFun = new RDDLossFunction(blocks, getAggregatorFunc, None, $(aggregationDepth)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 235a7f9b6ebd5..11a1984b0ab4c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -56,7 +56,7 @@ import org.apache.spark.util.VersionUtils.majorMinorVersion private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol with HasFitIntercept with HasStandardization with HasWeightCol with HasSolver - with HasAggregationDepth with HasLoss with HasBlockSize { + with HasAggregationDepth with HasLoss with HasMaxBlockSizeInMB { import LinearRegression._ @@ -107,7 +107,7 @@ private[regression] trait LinearRegressionParams extends PredictorParams setDefault(regParam -> 0.0, fitIntercept -> true, standardization -> true, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6, solver -> Auto, - aggregationDepth -> 2, loss -> SquaredError, 
epsilon -> 1.35, blockSize -> 1) + aggregationDepth -> 2, loss -> SquaredError, epsilon -> 1.35, maxBlockSizeInMB -> 0.0) override protected def validateAndTransformSchema( schema: StructType, @@ -175,6 +175,10 @@ private[regression] trait LinearRegressionParams extends PredictorParams * $$ * * + * Since 3.1.0, it supports stacking instances into blocks and using GEMV for + * better performance. + * The block size will be 1.0 MB, if param maxBlockSizeInMB is set 0.0 by default. + * * Note: Fitting with huber loss only supports none and L2 regularization. */ @Since("1.3.0") @@ -312,29 +316,26 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String def setEpsilon(value: Double): this.type = set(epsilon, value) /** - * Set block size for stacking input data in matrices. - * If blockSize == 1, then stacking will be skipped, and each vector is treated individually; - * If blockSize > 1, then vectors will be stacked to blocks, and high-level BLAS routines - * will be used if possible (for example, GEMV instead of DOT, GEMM instead of GEMV). - * Recommended size is between 10 and 1000. An appropriate choice of the block size depends - * on the sparsity and dim of input datasets, the underlying BLAS implementation (for example, - * f2jBLAS, OpenBLAS, intel MKL) and its configuration (for example, number of threads). - * Note that existing BLAS implementations are mainly optimized for dense matrices, if the - * input dataset is sparse, stacking may bring no performance gain, the worse is possible - * performance regression. - * Default is 1. + * Sets the value of param [[maxBlockSizeInMB]]. + * Default is 0.0, then 1.0 MB will be chosen. * * @group expertSetParam */ @Since("3.1.0") - def setBlockSize(value: Int): this.type = set(blockSize, value) + def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value) - override protected def train(dataset: Dataset[_]): LinearRegressionModel = instrumented { instr => + override protected def train( + dataset: Dataset[_]): LinearRegressionModel = instrumented { instr => instr.logPipelineStage(this) instr.logDataset(dataset) instr.logParams(this, labelCol, featuresCol, weightCol, predictionCol, solver, tol, elasticNetParam, fitIntercept, maxIter, regParam, standardization, aggregationDepth, loss, - epsilon, blockSize) + epsilon, maxBlockSizeInMB) + + if (dataset.storageLevel != StorageLevel.NONE) { + instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " + + s"then cached during training. Be careful of double caching!") + } // Extract the number of features before deciding optimization solver. 
val numFeatures = MetadataUtils.getNumFeatures(dataset, $(featuresCol)) @@ -348,35 +349,26 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val instances = extractInstances(dataset) .setName("training instances") - if (dataset.storageLevel == StorageLevel.NONE && $(blockSize) == 1) { - instances.persist(StorageLevel.MEMORY_AND_DISK) - } + val (summarizer, labelSummarizer) = Summarizer + .getRegressionSummarizers(instances, $(aggregationDepth), Seq("mean", "std", "count")) - var requestedMetrics = Seq("mean", "std", "count") - if ($(blockSize) != 1) requestedMetrics +:= "numNonZeros" - val (featuresSummarizer, ySummarizer) = Summarizer - .getRegressionSummarizers(instances, $(aggregationDepth), requestedMetrics) + val yMean = labelSummarizer.mean(0) + val rawYStd = labelSummarizer.std(0) - val yMean = ySummarizer.mean(0) - val rawYStd = ySummarizer.std(0) - - instr.logNumExamples(ySummarizer.count) + instr.logNumExamples(labelSummarizer.count) instr.logNamedValue(Instrumentation.loggerTags.meanOfLabels, yMean) instr.logNamedValue(Instrumentation.loggerTags.varianceOfLabels, rawYStd) - instr.logSumOfWeights(featuresSummarizer.weightSum) - if ($(blockSize) > 1) { - val scale = 1.0 / featuresSummarizer.count / numFeatures - val sparsity = 1 - featuresSummarizer.numNonzeros.toArray.map(_ * scale).sum - instr.logNamedValue("sparsity", sparsity.toString) - if (sparsity > 0.5) { - instr.logWarning(s"sparsity of input dataset is $sparsity, " + - s"which may hurt performance in high-level BLAS.") - } + instr.logSumOfWeights(summarizer.weightSum) + + var actualBlockSizeInMB = $(maxBlockSizeInMB) + if (actualBlockSizeInMB == 0) { + actualBlockSizeInMB = InstanceBlock.DefaultBlockSizeInMB + require(actualBlockSizeInMB > 0, "inferred actual BlockSizeInMB must > 0") + instr.logNamedValue("actualBlockSizeInMB", actualBlockSizeInMB.toString) } if (rawYStd == 0.0) { if ($(fitIntercept) || yMean == 0.0) { - if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist() return trainWithConstantLabel(dataset, instr, numFeatures, yMean) } else { require($(regParam) == 0.0, "The standard deviation of the label is zero. " + @@ -389,8 +381,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String // if y is constant (rawYStd is zero), then y cannot be scaled. In this case // setting yStd=abs(yMean) ensures that y is not scaled anymore in l-bfgs algorithm. 
val yStd = if (rawYStd > 0) rawYStd else math.abs(yMean) - val featuresMean = featuresSummarizer.mean.toArray - val featuresStd = featuresSummarizer.std.toArray + val featuresMean = summarizer.mean.toArray + val featuresStd = summarizer.std.toArray if (!$(fitIntercept) && (0 until numFeatures).exists { i => featuresStd(i) == 0.0 && featuresMean(i) != 0.0 }) { @@ -426,14 +418,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String Vectors.dense(Array.fill(dim)(1.0)) } - val (parameters, objectiveHistory) = if ($(blockSize) == 1) { - trainOnRows(instances, yMean, yStd, featuresMean, featuresStd, - initialValues, regularization, optimizer) - } else { - trainOnBlocks(instances, yMean, yStd, featuresMean, featuresStd, - initialValues, regularization, optimizer) - } - if (instances.getStorageLevel != StorageLevel.NONE) instances.unpersist() + val (parameters, objectiveHistory) = + trainImpl(instances, actualBlockSizeInMB, yMean, yStd, + featuresMean, featuresStd, initialValues, regularization, optimizer) if (parameters == null) { val msg = s"${optimizer.getClass.getName} failed." @@ -541,56 +528,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } } - private def trainOnRows( - instances: RDD[Instance], - yMean: Double, - yStd: Double, - featuresMean: Array[Double], - featuresStd: Array[Double], - initialValues: Vector, - regularization: Option[L2Regularization], - optimizer: FirstOrderMinimizer[BDV[Double], DiffFunction[BDV[Double]]]) = { - val bcFeaturesMean = instances.context.broadcast(featuresMean) - val bcFeaturesStd = instances.context.broadcast(featuresStd) - - val costFun = $(loss) match { - case SquaredError => - val getAggregatorFunc = new LeastSquaresAggregator(yStd, yMean, $(fitIntercept), - bcFeaturesStd, bcFeaturesMean)(_) - new RDDLossFunction(instances, getAggregatorFunc, regularization, $(aggregationDepth)) - case Huber => - val getAggregatorFunc = new HuberAggregator($(fitIntercept), $(epsilon), bcFeaturesStd)(_) - new RDDLossFunction(instances, getAggregatorFunc, regularization, $(aggregationDepth)) - } - - val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialValues.asBreeze.toDenseVector) - - /* - Note that in Linear Regression, the objective history (loss + regularization) returned - from optimizer is computed in the scaled space given by the following formula. -
- $$ - L &= 1/2n||\sum_i w_i(x_i - \bar{x_i}) / \hat{x_i} - (y - \bar{y}) / \hat{y}||^2 - + regTerms \\ - $$ -
- */ - val arrayBuilder = mutable.ArrayBuilder.make[Double] - var state: optimizer.State = null - while (states.hasNext) { - state = states.next() - arrayBuilder += state.adjustedValue - } - - bcFeaturesMean.destroy() - bcFeaturesStd.destroy() - - (if (state == null) null else state.x.toArray, arrayBuilder.result) - } - - private def trainOnBlocks( + private def trainImpl( instances: RDD[Instance], + actualBlockSizeInMB: Double, yMean: Double, yStd: Double, featuresMean: Array[Double], @@ -606,9 +546,11 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String val func = StandardScalerModel.getTransformFunc(Array.empty, inverseStd, false, true) iter.map { case Instance(label, weight, vec) => Instance(label, weight, func(vec)) } } - val blocks = InstanceBlock.blokify(standardized, $(blockSize)) + + val maxMemUsage = (actualBlockSizeInMB * 1024L * 1024L).ceil.toLong + val blocks = InstanceBlock.blokifyWithMaxMemUsage(standardized, maxMemUsage) .persist(StorageLevel.MEMORY_AND_DISK) - .setName(s"training blocks (blockSize=${$(blockSize)})") + .setName(s"training blocks (blockSizeInMB=$actualBlockSizeInMB)") val costFun = $(loss) match { case SquaredError => diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index 21eb17dfaacb3..75262ac4fe06b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -30,7 +30,6 @@ import org.apache.spark.mllib.regression._ import org.apache.spark.mllib.util.{DataValidators, Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession -import org.apache.spark.storage.StorageLevel /** * Classification model trained using Multinomial/Binary Logistic Regression. 
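[Reviewer note, not part of the patch] Each trainImpl standardizes features on the fly before blockification: every feature is multiplied by 1/std with no mean shift (so sparsity is preserved), and zero-variance features are mapped to 0.0. A small sketch of that transform on plain arrays, as a stand-in for the StandardScalerModel.getTransformFunc call the patch actually uses on ml.linalg vectors:

```scala
object StandardizeSketch {
  // Build the same kind of transform the trainImpl methods apply per partition:
  // scale by the inverse standard deviation, and send zero-std features to 0.0.
  def makeTransform(featuresStd: Array[Double]): Array[Double] => Array[Double] = {
    val inverseStd = featuresStd.map(std => if (std != 0) 1.0 / std else 0.0)
    features => {
      require(features.length == inverseStd.length, "dimension mismatch")
      Array.tabulate(features.length)(i => features(i) * inverseStd(i))
    }
  }

  def main(args: Array[String]): Unit = {
    val func = makeTransform(Array(2.0, 0.0, 0.5))
    // A zero-std feature is zeroed out rather than producing Infinity.
    println(func(Array(4.0, 7.0, 3.0)).mkString("[", ", ", "]")) // [2.0, 0.0, 6.0]
  }
}
```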
@@ -339,10 +338,8 @@ class LogisticRegressionWithLBFGS // Convert our input into a DataFrame val spark = SparkSession.builder().sparkContext(input.context).getOrCreate() val df = spark.createDataFrame(input.map(_.asML)) - // Determine if we should cache the DF - val handlePersistence = input.getStorageLevel == StorageLevel.NONE // Train our model - val mlLogisticRegressionModel = lr.train(df, handlePersistence) + val mlLogisticRegressionModel = lr.train(df) // convert the model val weights = Vectors.dense(mlLogisticRegressionModel.coefficients.toArray) createModel(weights, mlLogisticRegressionModel.intercept) diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 51a6ae3c7e49b..d0b282db1ece8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -593,8 +593,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(5) .setFamily("multinomial") val model = mlor.fit(dataset) - Seq(4, 16, 64).foreach { blockSize => - val model2 = mlor.setBlockSize(blockSize).fit(dataset) + Seq(0, 0.01, 0.1, 1, 2, 4).foreach { s => + val model2 = mlor.setMaxBlockSizeInMB(s).fit(dataset) assert(model.interceptVector ~== model2.interceptVector relTol 1e-6) assert(model.coefficientMatrix ~== model2.coefficientMatrix relTol 1e-6) } @@ -606,8 +606,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(5) .setFamily("binomial") val model = blor.fit(dataset) - Seq(4, 16, 64).foreach { blockSize => - val model2 = blor.setBlockSize(blockSize).fit(dataset) + Seq(0, 0.01, 0.1, 1, 2, 4).foreach { s => + val model2 = blor.setMaxBlockSizeInMB(s).fit(dataset) assert(model.intercept ~== model2.intercept relTol 1e-6) assert(model.coefficients ~== model2.coefficients relTol 1e-6) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/fpm/PrefixSpanSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/fpm/PrefixSpanSuite.scala index 2252151af306b..cc8982f338702 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/fpm/PrefixSpanSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/fpm/PrefixSpanSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.ml.fpm import org.apache.spark.ml.util.MLTest -import org.apache.spark.sql.DataFrame class PrefixSpanSuite extends MLTest { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala index 63ccfa3834624..e745e7f67df98 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala @@ -436,8 +436,8 @@ class AFTSurvivalRegressionSuite extends MLTest with DefaultReadWriteTest { .setQuantileProbabilities(quantileProbabilities) .setQuantilesCol("quantiles") val model = aft.fit(dataset) - Seq(4, 16, 64).foreach { blockSize => - val model2 = aft.setBlockSize(blockSize).fit(dataset) + Seq(0, 0.01, 0.1, 1, 2, 4).foreach { s => + val model2 = aft.setMaxBlockSizeInMB(s).fit(dataset) assert(model.coefficients ~== model2.coefficients relTol 1e-9) assert(model.intercept ~== model2.intercept relTol 1e-9) assert(model.scale ~== model2.scale relTol 1e-9) diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index a30c47293c543..a0e17a4b40fd2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -507,8 +507,6 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest val residualDeviancesR = Array(3.809296, 3.70055) - import GeneralizedLinearRegression._ - var idx = 0 val link = "log" val dataset = datasetPoissonLogWithZero @@ -790,8 +788,6 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest val expected = Seq(0.5108256, 0.1201443, 1.600000, 1.886792, 0.625, 0.530, -0.4700036, -0.6348783, 1.325782, 1.463641) - import GeneralizedLinearRegression._ - var idx = 0 for (family <- GeneralizedLinearRegression.supportedFamilyNames.sortWith(_ < _)) { for (useWeight <- Seq(false, true)) { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index fb70883bffc5f..b3098be0a36fb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -672,8 +672,8 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest with PMMLRe .setLoss(loss) .setMaxIter(3) val model = lir.fit(dataset) - Seq(4, 16, 64).foreach { blockSize => - val model2 = lir.setBlockSize(blockSize).fit(dataset) + Seq(0, 0.01, 0.1, 1, 2, 4).foreach { s => + val model2 = lir.setMaxBlockSizeInMB(s).fit(dataset) assert(model.intercept ~== model2.intercept relTol 1e-9) assert(model.coefficients ~== model2.coefficients relTol 1e-9) assert(model.scale ~== model2.scale relTol 1e-9) diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 68ba57c0d5fc8..e438a4135908e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -29,7 +29,6 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ import Summarizer._ - import SummaryBuilderImpl._ private case class ExpectedMetrics( mean: Vector, diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala index 2a83d0aaf9699..3ca6816ce7c0d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/RandomForestSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.ml.tree.impl import scala.annotation.tailrec import scala.collection.mutable -import scala.language.implicitConversions import org.apache.spark.SparkFunSuite import org.apache.spark.ml.classification.DecisionTreeClassificationModel diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala index dd0139b94f098..c5bf202a2d337 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/DefaultReadWriteTest.scala @@ 
-19,7 +19,6 @@ package org.apache.spark.ml.util import java.io.{File, IOException} -import org.json4s.JNothing import org.scalatest.Suite import org.apache.spark.{SparkException, SparkFunSuite} diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/PMMLReadWriteTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/PMMLReadWriteTest.scala index d2c4832b12bac..19e9fe4bdb30e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/util/PMMLReadWriteTest.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/util/PMMLReadWriteTest.scala @@ -23,10 +23,7 @@ import org.dmg.pmml.PMML import org.scalatest.Suite import org.apache.spark.SparkContext -import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.Dataset trait PMMLReadWriteTest extends TempDirectory { self: Suite => /** diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 56d41403f74cc..8f311bbf9f840 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -20,7 +20,6 @@ package org.apache.spark.mllib.clustering import java.util.{ArrayList => JArrayList} import breeze.linalg.{argmax, argtopk, max, DenseMatrix => BDM} -import org.scalatest.Assertions import org.apache.spark.SparkFunSuite import org.apache.spark.graphx.Edge diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 9d7177e0a149e..0e789821aa5f3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -22,7 +22,7 @@ import java.{util => ju} import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV} import org.apache.spark.{SparkException, SparkFunSuite} -import org.apache.spark.mllib.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vectors} +import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ diff --git a/pom.xml b/pom.xml index 25c6da7100056..3ae2e7420e154 100644 --- a/pom.xml +++ b/pom.xml @@ -164,6 +164,7 @@ 3.2.2 2.12.10 2.12 + -Ywarn-unused-import 2.0.0 --test @@ -189,7 +190,7 @@ 3.5.2 3.0.0 0.12.0 - 4.7.1 + 4.8-1 1.1 3.141.59 2.40.0 @@ -2537,6 +2538,7 @@ -deprecation -feature -explaintypes + ${scalac.arg.unused-imports} -target:jvm-1.8 @@ -3266,6 +3268,7 @@ 2.13.3 2.13 + -Wconf:cat=unused-imports:e diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst index 4039698d39958..9c9ff7fa7844b 100644 --- a/python/docs/source/getting_started/install.rst +++ b/python/docs/source/getting_started/install.rst @@ -48,7 +48,7 @@ If you want to install extra dependencies for a specific componenet, you can ins pip install pyspark[sql] -For PySpark with a different Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below: +For PySpark with/without a specific Hadoop version, you can install it by using ``HADOOP_VERSION`` environment variables as below: .. 
code-block:: bash @@ -68,8 +68,13 @@ It is recommended to use ``-v`` option in ``pip`` to track the installation and HADOOP_VERSION=2.7 pip install pyspark -v -Supported versions of Hadoop are ``HADOOP_VERSION=2.7`` and ``HADOOP_VERSION=3.2`` (default). -Note that this installation of PySpark with a different version of Hadoop is experimental. It can change or be removed between minor releases. +Supported values in ``HADOOP_VERSION`` are: + +- ``without``: Spark pre-built with user-provided Apache Hadoop +- ``2.7``: Spark pre-built for Apache Hadoop 2.7 +- ``3.2``: Spark pre-built for Apache Hadoop 3.2 and later (default) + +Note that this installation way of PySpark with/without a specific Hadoop version is experimental. It can change or be removed between minor releases. Using Conda diff --git a/python/docs/source/user_guide/arrow_pandas.rst b/python/docs/source/user_guide/arrow_pandas.rst index fe04315f87ad5..91d8155523391 100644 --- a/python/docs/source/user_guide/arrow_pandas.rst +++ b/python/docs/source/user_guide/arrow_pandas.rst @@ -341,8 +341,9 @@ Supported SQL Types .. currentmodule:: pyspark.sql.types -Currently, all Spark SQL data types are supported by Arrow-based conversion except :class:`MapType`, +Currently, all Spark SQL data types are supported by Arrow-based conversion except :class:`ArrayType` of :class:`TimestampType`, and nested :class:`StructType`. +:class: `MapType` is only supported when using PyArrow 2.0.0 and above. Setting Arrow Batch Size ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 8f13f3275cb5b..50882fc895d6c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -783,7 +783,7 @@ class LinearSVCTrainingSummary(LinearSVCSummary, _TrainingSummary): class _LogisticRegressionParams(_ProbabilisticClassifierParams, HasRegParam, HasElasticNetParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, HasAggregationDepth, - HasThreshold, HasBlockSize): + HasThreshold, HasMaxBlockSizeInMB): """ Params for :py:class:`LogisticRegression` and :py:class:`LogisticRegressionModel`. @@ -836,7 +836,7 @@ class _LogisticRegressionParams(_ProbabilisticClassifierParams, HasRegParam, def __init__(self, *args): super(_LogisticRegressionParams, self).__init__(*args) self._setDefault(maxIter=100, regParam=0.0, tol=1E-6, threshold=0.5, family="auto", - blockSize=1) + maxBlockSizeInMB=0.0) @since("1.4.0") def setThreshold(self, value): @@ -980,8 +980,8 @@ class LogisticRegression(_JavaProbabilisticClassifier, _LogisticRegressionParams LogisticRegressionModel... >>> blorModel.getProbabilityCol() 'newProbability' - >>> blorModel.getBlockSize() - 1 + >>> blorModel.getMaxBlockSizeInMB() + 0.0 >>> blorModel.setThreshold(0.1) LogisticRegressionModel... 
>>> blorModel.getThreshold() @@ -1047,7 +1047,7 @@ def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="p aggregationDepth=2, family="auto", lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, - blockSize=1): + maxBlockSizeInMB=0.0): """ __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ @@ -1057,7 +1057,7 @@ def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="p aggregationDepth=2, family="auto", \ lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, \ - blockSize=1): + maxBlockSizeInMB=0.0): If the threshold and thresholds Params are both set, they must be equivalent. """ super(LogisticRegression, self).__init__() @@ -1076,7 +1076,7 @@ def setParams(self, *, featuresCol="features", labelCol="label", predictionCol=" aggregationDepth=2, family="auto", lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, - blockSize=1): + maxBlockSizeInMB=0.0): """ setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ @@ -1085,7 +1085,7 @@ def setParams(self, *, featuresCol="features", labelCol="label", predictionCol=" aggregationDepth=2, family="auto", \ lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None, \ - blockSize=1): + maxBlockSizeInMB=0.0): Sets params for logistic regression. If the threshold and thresholds Params are both set, they must be equivalent. """ @@ -1181,11 +1181,11 @@ def setAggregationDepth(self, value): return self._set(aggregationDepth=value) @since("3.1.0") - def setBlockSize(self, value): + def setMaxBlockSizeInMB(self, value): """ - Sets the value of :py:attr:`blockSize`. + Sets the value of :py:attr:`maxBlockSizeInMB`. """ - return self._set(blockSize=value) + return self._set(maxBlockSizeInMB=value) class LogisticRegressionModel(_JavaProbabilisticClassificationModel, _LogisticRegressionParams, diff --git a/python/pyspark/ml/classification.pyi b/python/pyspark/ml/classification.pyi index 9f72d24f63117..4bde851bb1e0d 100644 --- a/python/pyspark/ml/classification.pyi +++ b/python/pyspark/ml/classification.pyi @@ -257,7 +257,7 @@ class _LogisticRegressionParams( HasWeightCol, HasAggregationDepth, HasThreshold, - HasBlockSize, + HasMaxBlockSizeInMB, ): threshold: Param[float] family: Param[str] @@ -305,7 +305,7 @@ class LogisticRegression( upperBoundsOnCoefficients: Optional[Matrix] = ..., lowerBoundsOnIntercepts: Optional[Vector] = ..., upperBoundsOnIntercepts: Optional[Vector] = ..., - blockSize: int = ... + maxBlockSizeInMB: float = ... ) -> None: ... def setParams( self, @@ -330,7 +330,7 @@ class LogisticRegression( upperBoundsOnCoefficients: Optional[Matrix] = ..., lowerBoundsOnIntercepts: Optional[Vector] = ..., upperBoundsOnIntercepts: Optional[Vector] = ..., - blockSize: int = ... + maxBlockSizeInMB: float = ... ) -> LogisticRegression: ... def setFamily(self, value: str) -> LogisticRegression: ... def setLowerBoundsOnCoefficients(self, value: Matrix) -> LogisticRegression: ... @@ -345,7 +345,7 @@ class LogisticRegression( def setStandardization(self, value: bool) -> LogisticRegression: ... def setWeightCol(self, value: str) -> LogisticRegression: ... 
def setAggregationDepth(self, value: int) -> LogisticRegression: ... - def setBlockSize(self, value: int) -> LogisticRegression: ... + def setMaxBlockSizeInMB(self, value: float) -> LogisticRegression: ... class LogisticRegressionModel( _JavaProbabilisticClassificationModel[Vector], diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 53d26972c4b4a..bcab51f76bd49 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -168,8 +168,8 @@ def get$Name(self): "adjusted to the size of this data.", None, "TypeConverters.toInt"), ("maxBlockSizeInMB", "maximum memory in MB for stacking input data into blocks. Data is " + "stacked within partitions. If more than remaining data size in a partition then it " + - "is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.", - "0.0", "TypeConverters.toFloat")] + "is adjusted to the data size. Default 0.0 represents choosing optimal value, depends " + + "on specific algorithm. Must be >= 0.", "0.0", "TypeConverters.toFloat")] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index cbef7386e2214..9311e4481e2b4 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -601,10 +601,10 @@ def getBlockSize(self): class HasMaxBlockSizeInMB(Params): """ - Mixin for param maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0. + Mixin for param maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. """ - maxBlockSizeInMB = Param(Params._dummy(), "maxBlockSizeInMB", "maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.", typeConverter=TypeConverters.toFloat) + maxBlockSizeInMB = Param(Params._dummy(), "maxBlockSizeInMB", "maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. 
Must be >= 0.", typeConverter=TypeConverters.toFloat) def __init__(self): super(HasMaxBlockSizeInMB, self).__init__() diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index d1a5852fd65bd..5ce484d964a5a 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -24,7 +24,7 @@ from pyspark.ml.base import _PredictorParams from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol, HasWeightCol, \ Param, Params, TypeConverters, HasMaxIter, HasTol, HasFitIntercept, HasAggregationDepth, \ - HasBlockSize, HasRegParam, HasSolver, HasStepSize, HasSeed, HasElasticNetParam, \ + HasMaxBlockSizeInMB, HasRegParam, HasSolver, HasStepSize, HasSeed, HasElasticNetParam, \ HasStandardization, HasLoss, HasVarianceCol from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \ _TreeEnsembleModel, _RandomForestParams, _GBTParams, _TreeRegressorParams @@ -87,7 +87,7 @@ class _JavaRegressionModel(RegressionModel, JavaPredictionModel, metaclass=ABCMe class _LinearRegressionParams(_PredictorParams, HasRegParam, HasElasticNetParam, HasMaxIter, HasTol, HasFitIntercept, HasStandardization, HasWeightCol, HasSolver, - HasAggregationDepth, HasLoss, HasBlockSize): + HasAggregationDepth, HasLoss, HasMaxBlockSizeInMB): """ Params for :py:class:`LinearRegression` and :py:class:`LinearRegressionModel`. @@ -107,7 +107,7 @@ class _LinearRegressionParams(_PredictorParams, HasRegParam, HasElasticNetParam, def __init__(self, *args): super(_LinearRegressionParams, self).__init__(*args) self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35, - blockSize=1) + maxBlockSizeInMB=0.0) @since("2.3.0") def getEpsilon(self): @@ -166,8 +166,8 @@ class LinearRegression(_JavaRegressor, _LinearRegressionParams, JavaMLWritable, LinearRegressionModel... 
>>> model.getMaxIter() 5 - >>> model.getBlockSize() - 1 + >>> model.getMaxBlockSizeInMB() + 0.0 >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> abs(model.predict(test0.head().features) - (-1.0)) < 0.001 True @@ -207,12 +207,12 @@ class LinearRegression(_JavaRegressor, _LinearRegressionParams, JavaMLWritable, def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, standardization=True, solver="auto", weightCol=None, aggregationDepth=2, - loss="squaredError", epsilon=1.35, blockSize=1): + loss="squaredError", epsilon=1.35, maxBlockSizeInMB=0.0): """ __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \ - loss="squaredError", epsilon=1.35, blockSize=1) + loss="squaredError", epsilon=1.35, maxBlockSizeInMB=0.0) """ super(LinearRegression, self).__init__() self._java_obj = self._new_java_obj( @@ -225,12 +225,12 @@ def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="p def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, standardization=True, solver="auto", weightCol=None, aggregationDepth=2, - loss="squaredError", epsilon=1.35, blockSize=1): + loss="squaredError", epsilon=1.35, maxBlockSizeInMB=0.0): """ setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \ - loss="squaredError", epsilon=1.35, blockSize=1) + loss="squaredError", epsilon=1.35, maxBlockSizeInMB=0.0) Sets params for linear regression. """ kwargs = self._input_kwargs @@ -307,11 +307,11 @@ def setLoss(self, value): return self._set(lossType=value) @since("3.1.0") - def setBlockSize(self, value): + def setMaxBlockSizeInMB(self, value): """ - Sets the value of :py:attr:`blockSize`. + Sets the value of :py:attr:`maxBlockSizeInMB`. """ - return self._set(blockSize=value) + return self._set(maxBlockSizeInMB=value) class LinearRegressionModel(_JavaRegressionModel, _LinearRegressionParams, GeneralJavaMLWritable, @@ -1683,7 +1683,7 @@ def evaluateEachIteration(self, dataset, loss): class _AFTSurvivalRegressionParams(_PredictorParams, HasMaxIter, HasTol, HasFitIntercept, - HasAggregationDepth, HasBlockSize): + HasAggregationDepth, HasMaxBlockSizeInMB): """ Params for :py:class:`AFTSurvivalRegression` and :py:class:`AFTSurvivalRegressionModel`. @@ -1710,7 +1710,7 @@ def __init__(self, *args): super(_AFTSurvivalRegressionParams, self).__init__(*args) self._setDefault(censorCol="censor", quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], - maxIter=100, tol=1E-6, blockSize=1) + maxIter=100, tol=1E-6, maxBlockSizeInMB=0.0) @since("1.6.0") def getCensorCol(self): @@ -1762,8 +1762,8 @@ class AFTSurvivalRegression(_JavaRegressor, _AFTSurvivalRegressionParams, 10 >>> aftsr.clear(aftsr.maxIter) >>> model = aftsr.fit(df) - >>> model.getBlockSize() - 1 + >>> model.getMaxBlockSizeInMB() + 0.0 >>> model.setFeaturesCol("features") AFTSurvivalRegressionModel... 
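The hunks above replace the row-count based `blockSize` parameter with `maxBlockSizeInMB` (a float, default 0.0 meaning the algorithm infers a block size). A minimal usage sketch against the patched Python API; the local SparkSession and the toy data are illustrative assumptions, not taken from the patch:

    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.regression import LinearRegression

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    df = spark.createDataFrame(
        [(1.0, Vectors.dense(0.0)), (0.0, Vectors.dense(1.0))],
        ["label", "features"])

    lr = LinearRegression(maxIter=5)
    print(lr.getMaxBlockSizeInMB())    # 0.0: let the algorithm choose a block size
    lr.setMaxBlockSizeInMB(0.25)       # cap stacked input blocks at roughly 0.25 MB
    model = lr.fit(df)

Unlike the old `blockSize` (a row count), the new parameter bounds the in-memory size of each stacked block per partition, which is why the setter now takes a float in megabytes.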
>>> model.predict(Vectors.dense(6.3)) @@ -1802,12 +1802,12 @@ class AFTSurvivalRegression(_JavaRegressor, _AFTSurvivalRegressionParams, def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), - quantilesCol=None, aggregationDepth=2, blockSize=1): + quantilesCol=None, aggregationDepth=2, maxBlockSizeInMB=0.0): """ __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \ quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \ - quantilesCol=None, aggregationDepth=2, blockSize=1) + quantilesCol=None, aggregationDepth=2, maxBlockSizeInMB=0.0) """ super(AFTSurvivalRegression, self).__init__() self._java_obj = self._new_java_obj( @@ -1820,12 +1820,12 @@ def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="p def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction", fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", quantileProbabilities=list([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]), - quantilesCol=None, aggregationDepth=2, blockSize=1): + quantilesCol=None, aggregationDepth=2, maxBlockSizeInMB=0.0): """ setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \ fitIntercept=True, maxIter=100, tol=1E-6, censorCol="censor", \ quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99], \ - quantilesCol=None, aggregationDepth=2, blockSize=1): + quantilesCol=None, aggregationDepth=2, maxBlockSizeInMB=0.0): """ kwargs = self._input_kwargs return self._set(**kwargs) @@ -1883,11 +1883,11 @@ def setAggregationDepth(self, value): return self._set(aggregationDepth=value) @since("3.1.0") - def setBlockSize(self, value): + def setMaxBlockSizeInMB(self, value): """ - Sets the value of :py:attr:`blockSize`. + Sets the value of :py:attr:`maxBlockSizeInMB`. """ - return self._set(blockSize=value) + return self._set(maxBlockSizeInMB=value) class AFTSurvivalRegressionModel(_JavaRegressionModel, _AFTSurvivalRegressionParams, diff --git a/python/pyspark/ml/regression.pyi b/python/pyspark/ml/regression.pyi index 991eb4f12ac85..5cb0e7a5092f7 100644 --- a/python/pyspark/ml/regression.pyi +++ b/python/pyspark/ml/regression.pyi @@ -24,7 +24,7 @@ from pyspark.ml import PredictionModel, Predictor from pyspark.ml.base import _PredictorParams from pyspark.ml.param.shared import ( HasAggregationDepth, - HasBlockSize, + HasMaxBlockSizeInMB, HasElasticNetParam, HasFeaturesCol, HasFitIntercept, @@ -86,7 +86,7 @@ class _LinearRegressionParams( HasSolver, HasAggregationDepth, HasLoss, - HasBlockSize, + HasMaxBlockSizeInMB, ): solver: Param[str] loss: Param[str] @@ -116,7 +116,7 @@ class LinearRegression( weightCol: Optional[str] = ..., aggregationDepth: int = ..., epsilon: float = ..., - blockSize: int = ... + maxBlockSizeInMB: float = ... ) -> None: ... def setParams( self, @@ -134,7 +134,7 @@ class LinearRegression( weightCol: Optional[str] = ..., aggregationDepth: int = ..., epsilon: float = ..., - blockSize: int = ... + maxBlockSizeInMB: float = ... ) -> LinearRegression: ... def setEpsilon(self, value: float) -> LinearRegression: ... def setMaxIter(self, value: int) -> LinearRegression: ... @@ -147,7 +147,7 @@ class LinearRegression( def setSolver(self, value: str) -> LinearRegression: ... 
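`AFTSurvivalRegression` gets the same rename as `LinearRegression` above. A short sketch with the renamed parameter; the five training rows are the usual survival-regression toy data and, like the chosen value 2.0, are only an assumption for illustration:

    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.regression import AFTSurvivalRegression

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    training = spark.createDataFrame([
        (1.218, 1.0, Vectors.dense(1.560, -0.605)),
        (2.949, 0.0, Vectors.dense(0.346, 2.158)),
        (3.627, 0.0, Vectors.dense(1.380, 0.231)),
        (0.273, 1.0, Vectors.dense(0.520, 1.151)),
        (4.199, 0.0, Vectors.dense(0.795, -0.226)),
    ], ["label", "censor", "features"])

    aft = AFTSurvivalRegression(maxBlockSizeInMB=2.0)   # pre-patch this was an integer blockSize
    model = aft.fit(training)
    print(model.getMaxBlockSizeInMB())                  # 2.0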
def setAggregationDepth(self, value: int) -> LinearRegression: ... def setLoss(self, value: str) -> LinearRegression: ... - def setBlockSize(self, value: int) -> LinearRegression: ... + def setMaxBlockSizeInMB(self, value: float) -> LinearRegression: ... class LinearRegressionModel( _JavaRegressionModel[Vector], @@ -522,7 +522,7 @@ class _AFTSurvivalRegressionParams( HasTol, HasFitIntercept, HasAggregationDepth, - HasBlockSize, + HasMaxBlockSizeInMB, ): censorCol: Param[str] quantileProbabilities: Param[List[float]] @@ -551,7 +551,7 @@ class AFTSurvivalRegression( quantileProbabilities: List[float] = ..., quantilesCol: Optional[str] = ..., aggregationDepth: int = ..., - blockSize: int = ... + maxBlockSizeInMB: float = ... ) -> None: ... def setParams( self, @@ -566,7 +566,7 @@ class AFTSurvivalRegression( quantileProbabilities: List[float] = ..., quantilesCol: Optional[str] = ..., aggregationDepth: int = ..., - blockSize: int = ... + maxBlockSizeInMB: float = ... ) -> AFTSurvivalRegression: ... def setCensorCol(self, value: str) -> AFTSurvivalRegression: ... def setQuantileProbabilities(self, value: List[float]) -> AFTSurvivalRegression: ... @@ -575,7 +575,7 @@ class AFTSurvivalRegression( def setTol(self, value: float) -> AFTSurvivalRegression: ... def setFitIntercept(self, value: bool) -> AFTSurvivalRegression: ... def setAggregationDepth(self, value: int) -> AFTSurvivalRegression: ... - def setBlockSize(self, value: int) -> AFTSurvivalRegression: ... + def setMaxBlockSizeInMB(self, value: float) -> AFTSurvivalRegression: ... class AFTSurvivalRegressionModel( _JavaRegressionModel[Vector], diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 86a88a5bf341e..4af5d1f484ee4 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -3527,7 +3527,7 @@ def schema_of_json(json, options={}): Parameters ---------- json : :class:`Column` or str - a JSON string or a string literal containing a JSON string. + a JSON string or a foldable string column containing a JSON string. options : dict, optional options to control parsing. accepts the same options as the JSON datasource @@ -3564,7 +3564,7 @@ def schema_of_csv(csv, options={}): Parameters ---------- csv : :class:`Column` or str - a CSV string or a string literal containing a CSV string. + a CSV string or a foldable string column containing a CSV string. options : dict, optional options to control parsing. accepts the same options as the CSV datasource diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 3456c12e59c09..d8a241417532e 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -22,7 +22,7 @@ from pyspark.sql.pandas.serializers import ArrowCollectSerializer from pyspark.sql.types import IntegralType from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \ - DoubleType, BooleanType, TimestampType, StructType, DataType + DoubleType, BooleanType, MapType, TimestampType, StructType, DataType from pyspark.traceback_utils import SCCallSiteSync @@ -100,7 +100,8 @@ def toPandas(self): # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled. if use_arrow: try: - from pyspark.sql.pandas.types import _check_series_localize_timestamps + from pyspark.sql.pandas.types import _check_series_localize_timestamps, \ + _convert_map_items_to_dict import pyarrow # Rename columns to avoid duplicated column names. 
tmp_column_names = ['col_{}'.format(i) for i in range(len(self.columns))] @@ -117,6 +118,9 @@ def toPandas(self): if isinstance(field.dataType, TimestampType): pdf[field.name] = \ _check_series_localize_timestamps(pdf[field.name], timezone) + elif isinstance(field.dataType, MapType): + pdf[field.name] = \ + _convert_map_items_to_dict(pdf[field.name]) return pdf else: return pd.DataFrame.from_records([], columns=self.columns) diff --git a/python/pyspark/sql/pandas/functions.py b/python/pyspark/sql/pandas/functions.py index 16462e8702a0b..750aa4b0e6c56 100644 --- a/python/pyspark/sql/pandas/functions.py +++ b/python/pyspark/sql/pandas/functions.py @@ -284,7 +284,6 @@ def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]: should be checked for accuracy by users. Currently, - :class:`pyspark.sql.types.MapType`, :class:`pyspark.sql.types.ArrayType` of :class:`pyspark.sql.types.TimestampType` and nested :class:`pyspark.sql.types.StructType` are currently not supported as output types. diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py index 73d36ee555fb5..2dcfdc1046049 100644 --- a/python/pyspark/sql/pandas/serializers.py +++ b/python/pyspark/sql/pandas/serializers.py @@ -117,7 +117,8 @@ def __init__(self, timezone, safecheck, assign_cols_by_name): self._assign_cols_by_name = assign_cols_by_name def arrow_to_pandas(self, arrow_column): - from pyspark.sql.pandas.types import _check_series_localize_timestamps + from pyspark.sql.pandas.types import _check_series_localize_timestamps, \ + _convert_map_items_to_dict import pyarrow # If the given column is a date type column, creates a series of datetime.date directly @@ -127,6 +128,8 @@ def arrow_to_pandas(self, arrow_column): if pyarrow.types.is_timestamp(arrow_column.type): return _check_series_localize_timestamps(s, self._timezone) + elif pyarrow.types.is_map(arrow_column.type): + return _convert_map_items_to_dict(s) else: return s @@ -147,7 +150,8 @@ def _create_batch(self, series): """ import pandas as pd import pyarrow as pa - from pyspark.sql.pandas.types import _check_series_convert_timestamps_internal + from pyspark.sql.pandas.types import _check_series_convert_timestamps_internal, \ + _convert_dict_to_map_items from pandas.api.types import is_categorical_dtype # Make input conform to [(series1, type1), (series2, type2), ...] if not isinstance(series, (list, tuple)) or \ @@ -160,6 +164,8 @@ def create_array(s, t): # Ensure timestamp series are in expected form for Spark internal representation if t is not None and pa.types.is_timestamp(t): s = _check_series_convert_timestamps_internal(s, self._timezone) + elif t is not None and pa.types.is_map(t): + s = _convert_dict_to_map_items(s) elif is_categorical_dtype(s.dtype): # Note: This can be removed once minimum pyarrow version is >= 0.16.1 s = s.astype(s.dtypes.categories.dtype) diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py index 67557120715ac..7e4d61b0d21b8 100644 --- a/python/pyspark/sql/pandas/types.py +++ b/python/pyspark/sql/pandas/types.py @@ -20,14 +20,15 @@ pandas instances during the type conversion. 
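Taken together, the conversion and serializer hunks above let the Arrow path carry `MapType` columns in both directions (`createDataFrame` from pandas and `toPandas`). A rough end-to-end sketch; it assumes pyarrow 2.0.0 or newer is installed, since the patch raises a TypeError for older versions, and the session and data are illustrative:

    import pandas as pd
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

    pdf = pd.DataFrame({"id": [0, 1], "m": [{"a": 1}, {"b": 2, "c": 3}]})
    df = spark.createDataFrame(pdf, schema="id long, m map<string,long>")  # dicts become map items
    roundtrip = df.toPandas()            # map items come back as Python dicts
    print(roundtrip["m"].tolist())       # [{'a': 1}, {'b': 2, 'c': 3}]

Per the checks added in to_arrow_type and from_arrow_type, map keys and values still may not be timestamps or structs.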
""" -from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \ - DoubleType, DecimalType, StringType, BinaryType, DateType, TimestampType, ArrayType, \ - StructType, StructField, BooleanType +from pyspark.sql.types import BooleanType, ByteType, ShortType, IntegerType, LongType, \ + FloatType, DoubleType, DecimalType, StringType, BinaryType, DateType, TimestampType, \ + ArrayType, MapType, StructType, StructField def to_arrow_type(dt): """ Convert Spark data type to pyarrow type """ + from distutils.version import LooseVersion import pyarrow as pa if type(dt) == BooleanType: arrow_type = pa.bool_() @@ -58,6 +59,13 @@ def to_arrow_type(dt): if type(dt.elementType) in [StructType, TimestampType]: raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) arrow_type = pa.list_(to_arrow_type(dt.elementType)) + elif type(dt) == MapType: + if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): + raise TypeError("MapType is only supported with pyarrow 2.0.0 and above") + if type(dt.keyType) in [StructType, TimestampType] or \ + type(dt.valueType) in [StructType, TimestampType]: + raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) + arrow_type = pa.map_(to_arrow_type(dt.keyType), to_arrow_type(dt.valueType)) elif type(dt) == StructType: if any(type(field.dataType) == StructType for field in dt): raise TypeError("Nested StructType not supported in conversion to Arrow") @@ -81,6 +89,8 @@ def to_arrow_schema(schema): def from_arrow_type(at): """ Convert pyarrow type to Spark data type. """ + from distutils.version import LooseVersion + import pyarrow as pa import pyarrow.types as types if types.is_boolean(at): spark_type = BooleanType() @@ -110,6 +120,12 @@ def from_arrow_type(at): if types.is_timestamp(at.value_type): raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) spark_type = ArrayType(from_arrow_type(at.value_type)) + elif types.is_map(at): + if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): + raise TypeError("MapType is only supported with pyarrow 2.0.0 and above") + if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type): + raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) + spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type)) elif types.is_struct(at): if any(types.is_struct(field.type) for field in at): raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at)) @@ -306,3 +322,23 @@ def _check_series_convert_timestamps_tz_local(s, timezone): `pandas.Series` where if it is a timestamp, has been converted to tz-naive """ return _check_series_convert_timestamps_localize(s, timezone, None) + + +def _convert_map_items_to_dict(s): + """ + Convert a series with items as list of (key, value), as made from an Arrow column of map type, + to dict for compatibility with non-arrow MapType columns. + :param s: pandas.Series of lists of (key, value) pairs + :return: pandas.Series of dictionaries + """ + return s.apply(lambda m: None if m is None else {k: v for k, v in m}) + + +def _convert_dict_to_map_items(s): + """ + Convert a series of dictionaries to list of (key, value) pairs to match expected data + for Arrow column of map type. 
+ :param s: pandas.Series of dictionaries + :return: pandas.Series of lists of (key, value) pairs + """ + return s.apply(lambda d: list(d.items()) if d is not None else None) diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index 55d5e9017b345..e764c42d88a31 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -21,13 +21,13 @@ import time import unittest import warnings +from distutils.version import LooseVersion from pyspark import SparkContext, SparkConf from pyspark.sql import Row, SparkSession from pyspark.sql.functions import udf from pyspark.sql.types import StructType, StringType, IntegerType, LongType, \ - FloatType, DoubleType, DecimalType, DateType, TimestampType, BinaryType, StructField, MapType, \ - ArrayType + FloatType, DoubleType, DecimalType, DateType, TimestampType, BinaryType, StructField, ArrayType from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message from pyspark.testing.utils import QuietTest @@ -114,9 +114,10 @@ def create_pandas_data_frame(self): return pd.DataFrame(data=data_dict) def test_toPandas_fallback_enabled(self): + ts = datetime.datetime(2015, 11, 1, 0, 30) with self.sql_conf({"spark.sql.execution.arrow.pyspark.fallback.enabled": True}): - schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) - df = self.spark.createDataFrame([({u'a': 1},)], schema=schema) + schema = StructType([StructField("a", ArrayType(TimestampType()), True)]) + df = self.spark.createDataFrame([([ts],)], schema=schema) with QuietTest(self.sc): with self.warnings_lock: with warnings.catch_warnings(record=True) as warns: @@ -129,10 +130,10 @@ def test_toPandas_fallback_enabled(self): self.assertTrue(len(user_warns) > 0) self.assertTrue( "Attempting non-optimization" in str(user_warns[-1])) - assert_frame_equal(pdf, pd.DataFrame({u'map': [{u'a': 1}]})) + assert_frame_equal(pdf, pd.DataFrame({"a": [[ts]]})) def test_toPandas_fallback_disabled(self): - schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) + schema = StructType([StructField("a", ArrayType(TimestampType()), True)]) df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): with self.warnings_lock: @@ -336,6 +337,62 @@ def test_toPandas_with_array_type(self): self.assertTrue(expected[r][e] == result_arrow[r][e] and result[r][e] == result_arrow[r][e]) + def test_createDataFrame_with_map_type(self): + map_data = [{"a": 1}, {"b": 2, "c": 3}, {}, None, {"d": None}] + + pdf = pd.DataFrame({"id": [0, 1, 2, 3, 4], "m": map_data}) + schema = "id long, m map" + + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): + df = self.spark.createDataFrame(pdf, schema=schema) + + if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): + with QuietTest(self.sc): + with self.assertRaisesRegex(Exception, "MapType.*only.*pyarrow 2.0.0"): + self.spark.createDataFrame(pdf, schema=schema) + else: + df_arrow = self.spark.createDataFrame(pdf, schema=schema) + + result = df.collect() + result_arrow = df_arrow.collect() + + self.assertEqual(len(result), len(result_arrow)) + for row, row_arrow in zip(result, result_arrow): + i, m = row + _, m_arrow = row_arrow + self.assertEqual(m, map_data[i]) + self.assertEqual(m_arrow, map_data[i]) + + def test_toPandas_with_map_type(self): + pdf = pd.DataFrame({"id": [0, 1, 2, 3], + "m": [{}, {"a": 1}, {"a": 1, "b": 2}, {"a": 1, "b": 2, 
"c": 3}]}) + + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): + df = self.spark.createDataFrame(pdf, schema="id long, m map") + + if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): + with QuietTest(self.sc): + with self.assertRaisesRegex(Exception, "MapType.*only.*pyarrow 2.0.0"): + df.toPandas() + else: + pdf_non, pdf_arrow = self._toPandas_arrow_toggle(df) + assert_frame_equal(pdf_arrow, pdf_non) + + def test_toPandas_with_map_type_nulls(self): + pdf = pd.DataFrame({"id": [0, 1, 2, 3, 4], + "m": [{"a": 1}, {"b": 2, "c": 3}, {}, None, {"d": None}]}) + + with self.sql_conf({"spark.sql.execution.arrow.pyspark.enabled": False}): + df = self.spark.createDataFrame(pdf, schema="id long, m map") + + if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): + with QuietTest(self.sc): + with self.assertRaisesRegex(Exception, "MapType.*only.*pyarrow 2.0.0"): + df.toPandas() + else: + pdf_non, pdf_arrow = self._toPandas_arrow_toggle(df) + assert_frame_equal(pdf_arrow, pdf_non) + def test_createDataFrame_with_int_col_names(self): import numpy as np pdf = pd.DataFrame(np.random.rand(4, 2)) @@ -345,26 +402,28 @@ def test_createDataFrame_with_int_col_names(self): self.assertEqual(pdf_col_names, df_arrow.columns) def test_createDataFrame_fallback_enabled(self): + ts = datetime.datetime(2015, 11, 1, 0, 30) with QuietTest(self.sc): with self.sql_conf({"spark.sql.execution.arrow.pyspark.fallback.enabled": True}): with warnings.catch_warnings(record=True) as warns: # we want the warnings to appear even if this test is run from a subclass warnings.simplefilter("always") df = self.spark.createDataFrame( - pd.DataFrame([[{u'a': 1}]]), "a: map") + pd.DataFrame({"a": [[ts]]}), "a: array") # Catch and check the last UserWarning. user_warns = [ warn.message for warn in warns if isinstance(warn.message, UserWarning)] self.assertTrue(len(user_warns) > 0) self.assertTrue( "Attempting non-optimization" in str(user_warns[-1])) - self.assertEqual(df.collect(), [Row(a={u'a': 1})]) + self.assertEqual(df.collect(), [Row(a=[ts])]) def test_createDataFrame_fallback_disabled(self): with QuietTest(self.sc): with self.assertRaisesRegexp(TypeError, 'Unsupported type'): self.spark.createDataFrame( - pd.DataFrame([[{u'a': 1}]]), "a: map") + pd.DataFrame({"a": [[datetime.datetime(2015, 11, 1, 0, 30)]]}), + "a: array") # Regression test for SPARK-23314 def test_timestamp_dst(self): diff --git a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py index f9a7dd69b61fb..4afc1dfcc1c6e 100644 --- a/python/pyspark/sql/tests/test_pandas_cogrouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_cogrouped_map.py @@ -176,9 +176,9 @@ def test_wrong_return_type(self): with QuietTest(self.sc): with self.assertRaisesRegexp( NotImplementedError, - 'Invalid return type.*MapType'): + 'Invalid return type.*ArrayType.*TimestampType'): left.groupby('id').cogroup(right.groupby('id')).applyInPandas( - lambda l, r: l, 'id long, v map') + lambda l, r: l, 'id long, v array') def test_wrong_args(self): left = self.data1 diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py index 93e37125eaa33..ee68b95fc478d 100644 --- a/python/pyspark/sql/tests/test_pandas_grouped_map.py +++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py @@ -26,7 +26,7 @@ window from pyspark.sql.types import IntegerType, DoubleType, ArrayType, BinaryType, ByteType, \ LongType, DecimalType, ShortType, FloatType, StringType, 
BooleanType, StructType, \ - StructField, NullType, MapType, TimestampType + StructField, NullType, TimestampType from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message from pyspark.testing.utils import QuietTest @@ -246,10 +246,10 @@ def test_wrong_return_type(self): with QuietTest(self.sc): with self.assertRaisesRegexp( NotImplementedError, - 'Invalid return type.*grouped map Pandas UDF.*MapType'): + 'Invalid return type.*grouped map Pandas UDF.*ArrayType.*TimestampType'): pandas_udf( lambda pdf: pdf, - 'id long, v map', + 'id long, v array', PandasUDFType.GROUPED_MAP) def test_wrong_args(self): @@ -276,7 +276,6 @@ def test_wrong_args(self): def test_unsupported_types(self): common_err_msg = 'Invalid return type.*grouped map Pandas UDF.*' unsupported_types = [ - StructField('map', MapType(StringType(), IntegerType())), StructField('arr_ts', ArrayType(TimestampType())), StructField('null', NullType()), StructField('struct', StructType([StructField('l', LongType())])), diff --git a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py index 451308927629b..2cbcf31f6e7b3 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py +++ b/python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py @@ -21,7 +21,7 @@ from pyspark.sql import Row from pyspark.sql.functions import array, explode, col, lit, mean, sum, \ udf, pandas_udf, PandasUDFType -from pyspark.sql.types import ArrayType, TimestampType, DoubleType, MapType +from pyspark.sql.types import ArrayType, TimestampType from pyspark.sql.utils import AnalysisException from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \ pandas_requirement_message, pyarrow_requirement_message @@ -159,7 +159,7 @@ def mean_and_std_udf(v): with QuietTest(self.sc): with self.assertRaisesRegexp(NotImplementedError, 'not supported'): - @pandas_udf(MapType(DoubleType(), DoubleType()), PandasUDFType.GROUPED_AGG) + @pandas_udf(ArrayType(TimestampType()), PandasUDFType.GROUPED_AGG) def mean_and_std_udf(v): return {v.mean(): v.std()} diff --git a/python/pyspark/sql/tests/test_pandas_udf_scalar.py b/python/pyspark/sql/tests/test_pandas_udf_scalar.py index 6d325c9085ce1..5da5d043ceca4 100644 --- a/python/pyspark/sql/tests/test_pandas_udf_scalar.py +++ b/python/pyspark/sql/tests/test_pandas_udf_scalar.py @@ -22,6 +22,7 @@ import unittest from datetime import date, datetime from decimal import Decimal +from distutils.version import LooseVersion from pyspark import TaskContext from pyspark.rdd import PythonEvalType @@ -379,6 +380,20 @@ def test_vectorized_udf_nested_struct(self): 'Invalid return type with scalar Pandas UDFs'): pandas_udf(lambda x: x, returnType=nested_type, functionType=udf_type) + def test_vectorized_udf_map_type(self): + data = [({},), ({"a": 1},), ({"a": 1, "b": 2},), ({"a": 1, "b": 2, "c": 3},)] + schema = StructType([StructField("map", MapType(StringType(), LongType()))]) + df = self.spark.createDataFrame(data, schema=schema) + for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: + if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): + with QuietTest(self.sc): + with self.assertRaisesRegex(Exception, "MapType.*not supported"): + pandas_udf(lambda x: x, MapType(StringType(), LongType()), udf_type) + else: + map_f = pandas_udf(lambda x: x, MapType(StringType(), LongType()), udf_type) + result = df.select(map_f(col('map'))) + self.assertEquals(df.collect(), 
result.collect()) + def test_vectorized_udf_complex(self): df = self.spark.range(10).select( col('id').cast('int').alias('a'), @@ -504,8 +519,8 @@ def test_vectorized_udf_wrong_return_type(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: with self.assertRaisesRegexp( NotImplementedError, - 'Invalid return type.*scalar Pandas UDF.*MapType'): - pandas_udf(lambda x: x, MapType(LongType(), LongType()), udf_type) + 'Invalid return type.*scalar Pandas UDF.*ArrayType.*TimestampType'): + pandas_udf(lambda x: x, ArrayType(TimestampType()), udf_type) def test_vectorized_udf_return_scalar(self): df = self.spark.range(10) @@ -577,8 +592,8 @@ def test_vectorized_udf_unsupported_types(self): for udf_type in [PandasUDFType.SCALAR, PandasUDFType.SCALAR_ITER]: with self.assertRaisesRegexp( NotImplementedError, - 'Invalid return type.*scalar Pandas UDF.*MapType'): - pandas_udf(lambda x: x, MapType(StringType(), IntegerType()), udf_type) + 'Invalid return type.*scalar Pandas UDF.*ArrayType.*TimestampType'): + pandas_udf(lambda x: x, ArrayType(TimestampType()), udf_type) with self.assertRaisesRegexp( NotImplementedError, 'Invalid return type.*scalar Pandas UDF.*ArrayType.StructType'): diff --git a/python/pyspark/util.py b/python/pyspark/util.py index 275a72b37be97..09c5963927456 100644 --- a/python/pyspark/util.py +++ b/python/pyspark/util.py @@ -16,10 +16,14 @@ # limitations under the License. # -import threading +import itertools +import os +import platform import re import sys +import threading import traceback +import types from py4j.clientserver import ClientServer @@ -76,6 +80,144 @@ def wrapper(*args, **kwargs): return wrapper +def walk_tb(tb): + while tb is not None: + yield tb + tb = tb.tb_next + + +def try_simplify_traceback(tb): + """ + Simplify the traceback. It removes the tracebacks in the current package, and only + shows the traceback that is related to the thirdparty and user-specified codes. + + Returns + ------- + TracebackType or None + Simplified traceback instance. It returns None if it fails to simplify. + + Notes + ----- + This keeps the tracebacks once it sees they are from a different file even + though the following tracebacks are from the current package. + + Examples + -------- + >>> import importlib + >>> import sys + >>> import traceback + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as tmp_dir: + ... with open("%s/dummy_module.py" % tmp_dir, "w") as f: + ... _ = f.write( + ... 'def raise_stop_iteration():\\n' + ... ' raise StopIteration()\\n\\n' + ... 'def simple_wrapper(f):\\n' + ... ' def wrapper(*a, **k):\\n' + ... ' return f(*a, **k)\\n' + ... ' return wrapper\\n') + ... f.flush() + ... spec = importlib.util.spec_from_file_location( + ... "dummy_module", "%s/dummy_module.py" % tmp_dir) + ... dummy_module = importlib.util.module_from_spec(spec) + ... spec.loader.exec_module(dummy_module) + >>> def skip_doctest_traceback(tb): + ... import pyspark + ... root = os.path.dirname(pyspark.__file__) + ... pairs = zip(walk_tb(tb), traceback.extract_tb(tb)) + ... for cur_tb, cur_frame in pairs: + ... if cur_frame.filename.startswith(root): + ... return cur_tb + + Regular exceptions should show the file name of the current package as below. + + >>> exc_info = None + >>> try: + ... fail_on_stopiteration(dummy_module.raise_stop_iteration)() + ... except Exception as e: + ... tb = sys.exc_info()[-1] + ... e.__cause__ = None + ... exc_info = "".join( + ... 
traceback.format_exception(type(e), e, tb)) + >>> print(exc_info) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + Traceback (most recent call last): + File ... + ... + File "/.../pyspark/util.py", line ... + ... + RuntimeError: ... + >>> "pyspark/util.py" in exc_info + True + + If the traceback is simplified with this method, it hides the current package file name: + + >>> exc_info = None + >>> try: + ... fail_on_stopiteration(dummy_module.raise_stop_iteration)() + ... except Exception as e: + ... tb = try_simplify_traceback(sys.exc_info()[-1]) + ... e.__cause__ = None + ... exc_info = "".join( + ... traceback.format_exception( + ... type(e), e, try_simplify_traceback(skip_doctest_traceback(tb)))) + >>> print(exc_info) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + RuntimeError: ... + >>> "pyspark/util.py" in exc_info + False + + In the case below, the traceback contains the current package in the middle. + In this case, it just hides the top occurrence only. + + >>> exc_info = None + >>> try: + ... fail_on_stopiteration(dummy_module.simple_wrapper( + ... fail_on_stopiteration(dummy_module.raise_stop_iteration)))() + ... except Exception as e: + ... tb = sys.exc_info()[-1] + ... e.__cause__ = None + ... exc_info_a = "".join( + ... traceback.format_exception(type(e), e, tb)) + ... exc_info_b = "".join( + ... traceback.format_exception( + ... type(e), e, try_simplify_traceback(skip_doctest_traceback(tb)))) + >>> exc_info_a.count("pyspark/util.py") + 2 + >>> exc_info_b.count("pyspark/util.py") + 1 + """ + if "pypy" in platform.python_implementation().lower(): + # Traceback modification is not supported with PyPy in PySpark. + return None + if sys.version_info[:2] < (3, 7): + # Traceback creation is not supported Python < 3.7. + # See https://bugs.python.org/issue30579. + return None + + import pyspark + + root = os.path.dirname(pyspark.__file__) + tb_next = None + new_tb = None + pairs = zip(walk_tb(tb), traceback.extract_tb(tb)) + last_seen = [] + + for cur_tb, cur_frame in pairs: + if not cur_frame.filename.startswith(root): + # Filter the stacktrace from the PySpark source itself. + last_seen = [(cur_tb, cur_frame)] + break + + for cur_tb, cur_frame in reversed(list(itertools.chain(last_seen, pairs))): + # Once we have seen the file names outside, don't skip. 
+ new_tb = types.TracebackType( + tb_next=tb_next, + tb_frame=cur_tb.tb_frame, + tb_lasti=cur_tb.tb_frame.f_lasti, + tb_lineno=cur_tb.tb_frame.f_lineno) + tb_next = new_tb + return new_tb + + def _print_missing_jar(lib_name, pkg_name, jar_name, spark_version): print(""" ________________________________________________________________________________________________ @@ -183,6 +325,8 @@ def __del__(self): if __name__ == "__main__": import doctest - (failure_count, test_count) = doctest.testmod() - if failure_count: - sys.exit(-1) + + if "pypy" not in platform.python_implementation().lower() and sys.version_info[:2] >= (3, 7): + (failure_count, test_count) = doctest.testmod() + if failure_count: + sys.exit(-1) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 704e96ba0666b..1b09d327a5dfe 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -44,7 +44,7 @@ from pyspark.sql.pandas.serializers import ArrowStreamPandasUDFSerializer, CogroupUDFSerializer from pyspark.sql.pandas.types import to_arrow_type from pyspark.sql.types import StructType -from pyspark.util import fail_on_stopiteration +from pyspark.util import fail_on_stopiteration, try_simplify_traceback from pyspark import shuffle pickleSer = PickleSerializer() @@ -607,17 +607,19 @@ def process(): # reuse. TaskContext._setTaskContext(None) BarrierTaskContext._setTaskContext(None) - except BaseException: + except BaseException as e: try: - exc_info = traceback.format_exc() - if isinstance(exc_info, bytes): - # exc_info may contains other encoding bytes, replace the invalid bytes and convert - # it back to utf-8 again - exc_info = exc_info.decode("utf-8", "replace").encode("utf-8") - else: - exc_info = exc_info.encode("utf-8") + exc_info = None + if os.environ.get("SPARK_SIMPLIFIED_TRACEBACK", False): + tb = try_simplify_traceback(sys.exc_info()[-1]) + if tb is not None: + e.__cause__ = None + exc_info = "".join(traceback.format_exception(type(e), e, tb)) + if exc_info is None: + exc_info = traceback.format_exc() + write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile) - write_with_length(exc_info, outfile) + write_with_length(exc_info.encode("utf-8"), outfile) except IOError: # JVM close the socket pass diff --git a/repl/src/test/scala-2.12/org/apache/spark/repl/Repl2Suite.scala b/repl/src/test/scala-2.12/org/apache/spark/repl/Repl2Suite.scala index 4ffa8beaf4740..90af9ec299efc 100644 --- a/repl/src/test/scala-2.12/org/apache/spark/repl/Repl2Suite.scala +++ b/repl/src/test/scala-2.12/org/apache/spark/repl/Repl2Suite.scala @@ -18,17 +18,12 @@ package org.apache.spark.repl import java.io._ -import java.nio.file.Files import scala.tools.nsc.interpreter.SimpleReader -import org.apache.log4j.{Level, LogManager, PropertyConfigurator} import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkContext, SparkFunSuite} -import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION class Repl2Suite extends SparkFunSuite with BeforeAndAfterAll { test("propagation of local properties") { diff --git a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala index 5428fa4ee9df7..f696e93e9cef2 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ExecutorClassLoaderSuite.scala @@ -28,7 +28,6 @@ import java.util.Collections import 
javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} import scala.io.Source -import scala.language.implicitConversions import com.google.common.io.Files import org.mockito.ArgumentMatchers.{any, anyString} diff --git a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 95d908cec5de0..6566d29d16e91 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -23,7 +23,7 @@ import java.nio.file.Files import org.apache.log4j.{Level, LogManager, PropertyConfigurator} import org.scalatest.BeforeAndAfterAll -import org.apache.spark.{SparkContext, SparkFunSuite} +import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 9ae48f4da8b05..edeb95fdba684 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -30,7 +30,7 @@ kubernetes - 4.11.1 + 4.12.0 diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala index e8bf8f9c9b505..7e5edd905781a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala @@ -261,12 +261,19 @@ private[spark] object KubernetesUtils extends Logging { isLocalDependency(Utils.resolveURI(resource)) } - def renameMainAppResource(resource: String, conf: SparkConf): String = { + def renameMainAppResource( + resource: String, + conf: Option[SparkConf] = None, + shouldUploadLocal: Boolean): String = { if (isLocalAndResolvable(resource)) { - SparkLauncher.NO_RESOURCE + if (shouldUploadLocal) { + uploadFileUri(resource, conf) + } else { + SparkLauncher.NO_RESOURCE + } } else { resource - } + } } def uploadFileUri(uri: String, conf: Option[SparkConf] = None): String = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala index 6503bc823ec0d..f5ba261c8f405 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala @@ -159,7 +159,7 @@ private[spark] class BasicDriverFeatureStep(conf: KubernetesDriverConf) KUBERNETES_DRIVER_SUBMIT_CHECK.key -> "true", MEMORY_OVERHEAD_FACTOR.key -> overheadFactor.toString) // try upload local, resolvable files to a hadoop compatible file system - Seq(JARS, FILES).foreach { key => + Seq(JARS, FILES, SUBMIT_PYTHON_FILES).foreach { key => val value = conf.get(key).filter(uri => KubernetesUtils.isLocalAndResolvable(uri)) val resolved = KubernetesUtils.uploadAndTransformFileUris(value, Some(conf.sparkConf)) if (resolved.nonEmpty) { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala index ebe44855f1d0d..d49381ba897d4 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStep.scala @@ -62,7 +62,11 @@ private[spark] class DriverCommandFeatureStep(conf: KubernetesDriverConf) } private def configureForJava(pod: SparkPod, res: String): SparkPod = { - val driverContainer = baseDriverContainer(pod, res).build() + // re-write primary resource, app jar is also added to spark.jars by default in SparkSubmit + // no uploading takes place here + val newResName = KubernetesUtils + .renameMainAppResource(resource = res, shouldUploadLocal = false) + val driverContainer = baseDriverContainer(pod, newResName).build() SparkPod(pod.pod, driverContainer) } @@ -73,7 +77,10 @@ private[spark] class DriverCommandFeatureStep(conf: KubernetesDriverConf) .withValue(conf.get(PYSPARK_MAJOR_PYTHON_VERSION)) .build()) - val pythonContainer = baseDriverContainer(pod, res) + // re-write primary resource to be the remote one and upload the related file + val newResName = KubernetesUtils + .renameMainAppResource(res, Option(conf.sparkConf), true) + val pythonContainer = baseDriverContainer(pod, newResName) .addAllToEnv(pythonEnvs.asJava) .build() @@ -88,7 +95,7 @@ private[spark] class DriverCommandFeatureStep(conf: KubernetesDriverConf) private def baseDriverContainer(pod: SparkPod, resource: String): ContainerBuilder = { // re-write primary resource, app jar is also added to spark.jars by default in SparkSubmit val resolvedResource = if (conf.mainAppResource.isInstanceOf[JavaMainAppResource]) { - KubernetesUtils.renameMainAppResource(resource, conf.sparkConf) + KubernetesUtils.renameMainAppResource(resource, Option(conf.sparkConf), false) } else { resource } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/EnvSecretsFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/EnvSecretsFeatureStep.scala index d78f04dcc40e6..222e19c5e20f1 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/EnvSecretsFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/EnvSecretsFeatureStep.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.k8s.features import scala.collection.JavaConverters._ -import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder, HasMetadata} +import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder} import org.apache.spark.deploy.k8s.{KubernetesConf, SparkPod} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/MountSecretsFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/MountSecretsFeatureStep.scala index f4e1a3a326729..9de7686c8a9c0 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/MountSecretsFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/MountSecretsFeatureStep.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.deploy.k8s.features -import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder, VolumeBuilder, VolumeMountBuilder} 
+import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder, VolumeBuilder, VolumeMountBuilder} import org.apache.spark.deploy.k8s.{KubernetesConf, SparkPod} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index c029b248f7ea4..863cb28bc827c 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -35,7 +35,6 @@ import org.apache.spark.deploy.k8s.KubernetesUtils.addOwnerReference import org.apache.spark.internal.Logging import org.apache.spark.internal.config.DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT import org.apache.spark.resource.ResourceProfile -import org.apache.spark.scheduler.cluster.SchedulerBackendUtils import org.apache.spark.util.{Clock, Utils} private[spark] class ExecutorPodsAllocator( diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala index cc5c2f4b6325d..151e98ba17e3b 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -25,7 +25,6 @@ import io.fabric8.kubernetes.client.Config import org.apache.spark.SparkContext import org.apache.spark.deploy.k8s.{KubernetesConf, KubernetesUtils, SparkKubernetesClientFactory} import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} import org.apache.spark.util.{SystemClock, ThreadUtils} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/Fabric8Aliases.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/Fabric8Aliases.scala index 23055813a9786..5b36bd144d0f9 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/Fabric8Aliases.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/Fabric8Aliases.scala @@ -17,7 +17,7 @@ package org.apache.spark.deploy.k8s import io.fabric8.kubernetes.api.model.{ConfigMap, ConfigMapList, DoneableConfigMap, DoneablePod, HasMetadata, Pod, PodList} -import io.fabric8.kubernetes.client.{Watch, Watcher} +import io.fabric8.kubernetes.client.Watch import io.fabric8.kubernetes.client.dsl.{FilterWatchListDeletable, MixedOperation, NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable, PodResource, Resource} object Fabric8Aliases { @@ -25,9 +25,9 @@ object Fabric8Aliases { type CONFIG_MAPS = MixedOperation[ ConfigMap, ConfigMapList, DoneableConfigMap, Resource[ConfigMap, DoneableConfigMap]] type LABELED_PODS = FilterWatchListDeletable[ - Pod, PodList, java.lang.Boolean, Watch, Watcher[Pod]] + Pod, PodList, java.lang.Boolean, Watch] type LABELED_CONFIG_MAPS = FilterWatchListDeletable[ - ConfigMap, ConfigMapList, java.lang.Boolean, Watch, Watcher[ConfigMap]] + ConfigMap, ConfigMapList, 
java.lang.Boolean, Watch] type SINGLE_POD = PodResource[Pod, DoneablePod] type RESOURCE_LIST = NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable[ HasMetadata, Boolean] diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala index 83d9481e6f2b0..0567f32c23134 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesTestConf.scala @@ -21,7 +21,6 @@ import io.fabric8.kubernetes.api.model.Pod import org.apache.spark.SparkConf import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit.{JavaMainAppResource, MainAppResource} /** diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesUtilsSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesUtilsSuite.scala index 7c231586af935..ef57a4b861508 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesUtilsSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/KubernetesUtilsSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.k8s import scala.collection.JavaConverters._ -import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, PodBuilder} +import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder} import org.apache.spark.SparkFunSuite diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/PodBuilderSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/PodBuilderSuite.scala index 26bd317de8ec6..4d4c4baeb12c0 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/PodBuilderSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/PodBuilderSuite.scala @@ -26,7 +26,6 @@ import org.mockito.Mockito.{mock, never, verify, when} import scala.collection.JavaConverters._ import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} -import org.apache.spark.deploy.k8s._ import org.apache.spark.internal.config.ConfigEntry abstract class PodBuilderSuite extends SparkFunSuite { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStepSuite.scala index 6a7366e9c6b7a..a44d465e35087 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverCommandFeatureStepSuite.scala @@ -20,11 +20,8 @@ import scala.collection.JavaConverters._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s._ -import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit._ -import org.apache.spark.internal.config._ -import org.apache.spark.util.Utils class DriverCommandFeatureStepSuite extends SparkFunSuite { diff --git 
a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala index 18afd10395566..413371d056b26 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/DriverServiceFeatureStepSuite.scala @@ -25,7 +25,6 @@ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s.{KubernetesTestConf, SparkPod} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.JavaMainAppResource import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.util.ManualClock diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/HadoopConfDriverFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/HadoopConfDriverFeatureStepSuite.scala index e1c01dbdc7358..c078e69b8a14b 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/HadoopConfDriverFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/HadoopConfDriverFeatureStepSuite.scala @@ -27,7 +27,6 @@ import io.fabric8.kubernetes.api.model.ConfigMap import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.JavaMainAppResource import org.apache.spark.util.{SparkConfWithEnv, Utils} class HadoopConfDriverFeatureStepSuite extends SparkFunSuite { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/KerberosConfDriverFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/KerberosConfDriverFeatureStepSuite.scala index 41ca3a94ce7a7..094fcb39782f4 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/KerberosConfDriverFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/KerberosConfDriverFeatureStepSuite.scala @@ -26,14 +26,13 @@ import com.google.common.io.Files import io.fabric8.kubernetes.api.model.{ConfigMap, Secret} import org.apache.commons.codec.binary.Base64 import org.apache.hadoop.io.Text -import org.apache.hadoop.security.{Credentials, UserGroupInformation} +import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.k8s._ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.JavaMainAppResource import org.apache.spark.internal.config._ import org.apache.spark.util.Utils diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala index bbb89fd0a1c24..95ee37e3daa41 100644 --- 
a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/MountVolumesFeatureStepSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.k8s.features import scala.collection.JavaConverters._ -import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.SparkFunSuite import org.apache.spark.deploy.k8s._ class MountVolumesFeatureStepSuite extends SparkFunSuite { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index 528b755c41605..8401f7102ad8e 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -29,7 +29,7 @@ import org.mockito.stubbing.Answer import org.scalatest.BeforeAndAfter import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{KubernetesExecutorConf, KubernetesExecutorSpec, SparkPod} +import org.apache.spark.deploy.k8s.{KubernetesExecutorConf, KubernetesExecutorSpec} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.Fabric8Aliases._ diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 5274c0579eb05..258d3dfc3df9d 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -28,7 +28,7 @@ 1.3.0 - 4.11.1 + 4.12.0 kubernetes-integration-tests diff --git a/resource-managers/kubernetes/integration-tests/src/test/resources/log-config-test-log4j.properties b/resource-managers/kubernetes/integration-tests/src/test/resources/log-config-test-log4j.properties new file mode 100644 index 0000000000000..d3e13d8542ba1 --- /dev/null +++ b/resource-managers/kubernetes/integration-tests/src/test/resources/log-config-test-log4j.properties @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This log4j config file is for integration test SparkConfPropagateSuite. 
+log4j.rootCategory=DEBUG, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c: %m%n diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala index e712b95cdbcea..8f6e9cd8af740 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DepsTestsSuite.scala @@ -30,6 +30,7 @@ import org.scalatest.time.{Minutes, Span} import org.apache.spark.SparkException import org.apache.spark.deploy.k8s.integrationtest.DepsTestsSuite.{DEPS_TIMEOUT, FILE_CONTENTS, HOST_PATH} import org.apache.spark.deploy.k8s.integrationtest.KubernetesSuite.{INTERVAL, MinikubeTag, TIMEOUT} +import org.apache.spark.deploy.k8s.integrationtest.Utils.getExamplesJarName import org.apache.spark.deploy.k8s.integrationtest.backend.minikube.Minikube private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => @@ -120,16 +121,18 @@ private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => .endSpec() .build() - kubernetesTestComponents + // try until the service from a previous test is deleted + Eventually.eventually(TIMEOUT, INTERVAL) (kubernetesTestComponents .kubernetesClient .services() - .create(minioService) + .create(minioService)) - kubernetesTestComponents + // try until the stateful set of a previous test is deleted + Eventually.eventually(TIMEOUT, INTERVAL) (kubernetesTestComponents .kubernetesClient .apps() .statefulSets() - .create(minioStatefulSet) + .create(minioStatefulSet)) } private def deleteMinioStorage(): Unit = { @@ -138,47 +141,52 @@ private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => .apps() .statefulSets() .withName(cName) + .withGracePeriod(0) .delete() kubernetesTestComponents .kubernetesClient .services() .withName(svcName) + .withGracePeriod(0) .delete() } test("Launcher client dependencies", k8sTestTag, MinikubeTag) { - val packages = if (Utils.isHadoop3) { - "org.apache.hadoop:hadoop-aws:3.2.0" - } else { - "com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.6" - } - val fileName = Utils.createTempFile(FILE_CONTENTS, HOST_PATH) - try { - setupMinioStorage() - val minioUrlStr = getServiceUrl(svcName) - val minioUrl = new URL(minioUrlStr) - val minioHost = minioUrl.getHost - val minioPort = minioUrl.getPort - val examplesJar = Utils.getExamplesJarAbsolutePath(sparkHomeDir) - sparkAppConf - .set("spark.hadoop.fs.s3a.access.key", ACCESS_KEY) - .set("spark.hadoop.fs.s3a.secret.key", SECRET_KEY) - .set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") - .set("spark.hadoop.fs.s3a.endpoint", s"$minioHost:$minioPort") - .set("spark.kubernetes.file.upload.path", s"s3a://$BUCKET") - .set("spark.files", s"$HOST_PATH/$fileName") - .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") - .set("spark.jars.packages", packages) - .set("spark.driver.extraJavaOptions", "-Divy.cache.dir=/tmp -Divy.home=/tmp") - createS3Bucket(ACCESS_KEY, SECRET_KEY, minioUrlStr) + tryDepsTest({ + val fileName = Utils.createTempFile(FILE_CONTENTS, HOST_PATH) + 
sparkAppConf.set("spark.files", s"$HOST_PATH/$fileName") + val examplesJar = Utils.getTestFileAbsolutePath(getExamplesJarName(), sparkHomeDir) runSparkRemoteCheckAndVerifyCompletion(appResource = examplesJar, appArgs = Array(fileName), timeout = Option(DEPS_TIMEOUT)) - } finally { - // make sure this always runs - deleteMinioStorage() - } + }) + } + + test("Launcher python client dependencies using a zip file", k8sTestTag, MinikubeTag) { + val inDepsFile = Utils.getTestFileAbsolutePath("py_container_checks.py", sparkHomeDir) + val outDepsFile = s"${inDepsFile.substring(0, inDepsFile.lastIndexOf("."))}.zip" + Utils.createZipFile(inDepsFile, outDepsFile) + testPythonDeps(outDepsFile) + } + + private def testPythonDeps(depsFile: String): Unit = { + tryDepsTest({ + val pySparkFiles = Utils.getTestFileAbsolutePath("pyfiles.py", sparkHomeDir) + setPythonSparkConfProperties(sparkAppConf) + runSparkApplicationAndVerifyCompletion( + appResource = pySparkFiles, + mainClass = "", + expectedLogOnCompletion = Seq( + "Python runtime version check is: True", + "Python environment version check is: True", + "Python runtime version check for executor is: True"), + appArgs = Array("python3"), + driverPodChecker = doBasicDriverPyPodCheck, + executorPodChecker = doBasicExecutorPyPodCheck, + appLocator = appLocator, + isJVM = false, + pyFiles = Option(depsFile)) }) } private def extractS3Key(data: String, key: String): String = { @@ -222,6 +230,48 @@ private[spark] trait DepsTestsSuite { k8sSuite: KubernetesSuite => url } } + + private def getServiceHostAndPort(minioUrlStr : String) : (String, Int) = { + val minioUrl = new URL(minioUrlStr) + (minioUrl.getHost, minioUrl.getPort) + } + + private def setCommonSparkConfPropertiesForS3Access( + conf: SparkAppConf, + minioUrlStr: String): Unit = { + val (minioHost, minioPort) = getServiceHostAndPort(minioUrlStr) + val packages = if (Utils.isHadoop3) { + "org.apache.hadoop:hadoop-aws:3.2.0" + } else { + "com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.6" + } + conf.set("spark.hadoop.fs.s3a.access.key", ACCESS_KEY) + .set("spark.hadoop.fs.s3a.secret.key", SECRET_KEY) + .set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") + .set("spark.hadoop.fs.s3a.endpoint", s"$minioHost:$minioPort") + .set("spark.kubernetes.file.upload.path", s"s3a://$BUCKET") + .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") + .set("spark.jars.packages", packages) + .set("spark.driver.extraJavaOptions", "-Divy.cache.dir=/tmp -Divy.home=/tmp") + } + + private def setPythonSparkConfProperties(conf: SparkAppConf): Unit = { + sparkAppConf.set("spark.kubernetes.container.image", pyImage) + .set("spark.kubernetes.pyspark.pythonVersion", "3") + } + + private def tryDepsTest(runTest: => Unit): Unit = { + try { + setupMinioStorage() + val minioUrlStr = getServiceUrl(svcName) + createS3Bucket(ACCESS_KEY, SECRET_KEY, minioUrlStr) + setCommonSparkConfPropertiesForS3Access(sparkAppConf, minioUrlStr) + runTest + } finally { + // make sure this always runs + deleteMinioStorage() + } + } } private[spark] object DepsTestsSuite { diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala index f1d8217e31b71..cc226b341916d 100644 --- 
a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesSuite.scala @@ -41,10 +41,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ class KubernetesSuite extends SparkFunSuite - with BeforeAndAfterAll with BeforeAndAfter with BasicTestsSuite with SecretsTestsSuite - with PythonTestsSuite with ClientModeTestsSuite with PodTemplateSuite with PVTestsSuite - with DepsTestsSuite with DecommissionSuite with RTestsSuite with Logging with Eventually - with Matchers { + with BeforeAndAfterAll with BeforeAndAfter with BasicTestsSuite with SparkConfPropagateSuite + with SecretsTestsSuite with PythonTestsSuite with ClientModeTestsSuite with PodTemplateSuite + with PVTestsSuite with DepsTestsSuite with DecommissionSuite with RTestsSuite with Logging + with Eventually with Matchers { import KubernetesSuite._ diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala index af980f0494369..0bf01e6b66427 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/KubernetesTestComponents.scala @@ -21,7 +21,6 @@ import java.util.UUID import scala.collection.JavaConverters._ import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer import io.fabric8.kubernetes.client.DefaultKubernetesClient import org.scalatest.concurrent.Eventually diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala new file mode 100644 index 0000000000000..6d15201d19796 --- /dev/null +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/SparkConfPropagateSuite.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.integrationtest + +import java.io.{BufferedWriter, File, FileWriter} +import java.net.URL + +import scala.io.{BufferedSource, Source} + +import io.fabric8.kubernetes.api.model._ + +import org.apache.spark.internal.config + +private[spark] trait SparkConfPropagateSuite { k8sSuite: KubernetesSuite => + import KubernetesSuite.{k8sTestTag, SPARK_PI_MAIN_CLASS} + + test("Verify logging configuration is picked from the provided SPARK_CONF_DIR/log4j.properties", + k8sTestTag) { + val loggingConfigFileName = "log-config-test-log4j.properties" + val loggingConfURL: URL = this.getClass.getClassLoader.getResource(loggingConfigFileName) + assert(loggingConfURL != null, "Logging configuration file not available.") + + val content = Source.createBufferedSource(loggingConfURL.openStream()).getLines().mkString("\n") + val logConfFilePath = s"${sparkHomeDir.toFile}/conf/log4j.properties" + + try { + val writer = new BufferedWriter(new FileWriter(logConfFilePath)) + writer.write(content) + writer.close() + + sparkAppConf.set("spark.driver.extraJavaOptions", "-Dlog4j.debug") + + runSparkApplicationAndVerifyCompletion( + appResource = containerLocalSparkDistroExamplesJar, + mainClass = SPARK_PI_MAIN_CLASS, + expectedLogOnCompletion = (Seq("DEBUG", + s"log4j: Reading configuration from URL file:/opt/spark/conf/log4j.properties", + "Pi is roughly 3")), + appArgs = Array.empty[String], + driverPodChecker = doBasicDriverPodCheck, + executorPodChecker = doBasicExecutorPodCheck, + appLocator = appLocator, + isJVM = true) + } finally { + new File(logConfFilePath).delete() + } + } +} diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala index 9bcd6e9503532..ee44cb5f85835 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/Utils.scala @@ -16,21 +16,21 @@ */ package org.apache.spark.deploy.k8s.integrationtest -import java.io.{Closeable, File, PrintWriter} +import java.io.{Closeable, File, FileInputStream, FileOutputStream, PrintWriter} import java.nio.file.{Files, Path} import java.util.concurrent.CountDownLatch +import java.util.zip.{ZipEntry, ZipOutputStream} import scala.collection.JavaConverters._ -import scala.util.Try import io.fabric8.kubernetes.client.dsl.ExecListener import okhttp3.Response +import org.apache.commons.compress.utils.IOUtils import org.apache.commons.io.output.ByteArrayOutputStream import org.apache.hadoop.util.VersionInfo import org.apache.spark.{SPARK_VERSION, SparkException} import org.apache.spark.internal.Logging -import org.apache.spark.util.{Utils => SparkUtils} object Utils extends Logging { @@ -114,23 +114,22 @@ object Utils extends Logging { filename } - def getExamplesJarAbsolutePath(sparkHomeDir: Path): String = { - val jarName = getExamplesJarName() - val jarPathsFound = Files + def getTestFileAbsolutePath(fileName: String, sparkHomeDir: Path): String = { + val filePathsFound = Files .walk(sparkHomeDir) .filter(Files.isRegularFile(_)) - .filter((f: Path) => {f.toFile.getName == jarName}) + .filter((f: Path) => {f.toFile.getName == fileName}) // we should not have more than one here under current test build dir // we only need one though - val jarPath = 
jarPathsFound + val filePath = filePathsFound .iterator() .asScala .map(_.toAbsolutePath.toString) .toArray .headOption - jarPath match { - case Some(jar) => jar - case _ => throw new SparkException(s"No valid $jarName file was found " + + filePath match { + case Some(file) => file + case _ => throw new SparkException(s"No valid $fileName file was found " + s"under spark home test dir ${sparkHomeDir.toAbsolutePath}!") } } @@ -138,4 +137,16 @@ object Utils extends Logging { def isHadoop3(): Boolean = { VersionInfo.getVersion.startsWith("3") } + + def createZipFile(inFile: String, outFile: String): Unit = { + val fileToZip = new File(inFile) + val fis = new FileInputStream(fileToZip) + val fos = new FileOutputStream(outFile) + val zipOut = new ZipOutputStream(fos) + val zipEntry = new ZipEntry(fileToZip.getName) + zipOut.putNextEntry(zipEntry) + IOUtils.copy(fis, zipOut) + IOUtils.closeQuietly(fis) + IOUtils.closeQuietly(zipOut) + } } diff --git a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/backend/cloud/KubeConfigBackend.scala b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/backend/cloud/KubeConfigBackend.scala index be1834c0b5dea..0fbed4a220e68 100644 --- a/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/backend/cloud/KubeConfigBackend.scala +++ b/resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/backend/cloud/KubeConfigBackend.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.deploy.k8s.integrationtest.backend.cloud -import java.nio.file.Paths - import io.fabric8.kubernetes.client.{Config, DefaultKubernetesClient} import io.fabric8.kubernetes.client.utils.Utils import org.apache.commons.lang3.StringUtils diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala index e1c0d18b73a2b..bd42f6f05655f 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/config.scala @@ -113,6 +113,16 @@ package object config { .stringConf .createOptional + private[spark] val DISPATCHER_QUEUE = + ConfigBuilder("spark.mesos.dispatcher.queue") + .doc("Set the name of the dispatcher queue to which the application is submitted. " + + "The specified queue must be added to the dispatcher " + + "with \"spark.mesos.dispatcher.queue.[QueueName]\". If no queue is specified, then " + + "the application is submitted to the \"default\" queue with 0.0 priority.") + .version("3.1.0") + .stringConf + .createWithDefaultString("default") + private[spark] val DRIVER_LABELS = ConfigBuilder("spark.mesos.driver.labels") .doc("Mesos labels to add to the driver. Labels are free-form key-value pairs. 
Key-value " + diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala index b8c64a28c72cd..97ef153177674 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/DriverPage.scala @@ -22,7 +22,7 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node import org.apache.spark.deploy.Command -import org.apache.spark.deploy.mesos.MesosDriverDescription +import org.apache.spark.deploy.mesos.{config, MesosDriverDescription} import org.apache.spark.scheduler.cluster.mesos.{MesosClusterRetryState, MesosClusterSubmissionState} import org.apache.spark.ui.{UIUtils, WebUIPage} @@ -153,6 +153,13 @@ private[ui] class DriverPage(parent: MesosClusterUI) extends WebUIPage("driver") Memory{driver.mem} + + Queue + + {driver.conf.get( + "spark.mesos.dispatcher.queue", config.DISPATCHER_QUEUE.defaultValueString)} + + Submitted{UIUtils.formatDate(driver.submissionDate)} diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala index 772906397546c..5c62ddb37684d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterPage.scala @@ -23,7 +23,7 @@ import scala.xml.Node import org.apache.mesos.Protos.TaskStatus -import org.apache.spark.deploy.mesos.MesosDriverDescription +import org.apache.spark.deploy.mesos.{config, MesosDriverDescription} import org.apache.spark.deploy.mesos.config._ import org.apache.spark.scheduler.cluster.mesos.MesosClusterSubmissionState import org.apache.spark.ui.{UIUtils, WebUIPage} @@ -36,7 +36,7 @@ private[mesos] class MesosClusterPage(parent: MesosClusterUI) extends WebUIPage( val driverHeader = Seq("Driver ID") val historyHeader = historyServerURL.map(url => Seq("History")).getOrElse(Nil) - val submissionHeader = Seq("Submit Date", "Main Class", "Driver Resources") + val submissionHeader = Seq("Queue", "Submit Date", "Main Class", "Driver Resources") val sandboxHeader = Seq("Sandbox") val queuedHeaders = driverHeader ++ submissionHeader @@ -69,6 +69,10 @@ private[mesos] class MesosClusterPage(parent: MesosClusterUI) extends WebUIPage( val id = submission.submissionId {id} + + {submission.conf.get( + "spark.mesos.dispatcher.queue", config.DISPATCHER_QUEUE.defaultValueString)} + {UIUtils.formatDate(submission.submissionDate)} {submission.command.mainClass} cpus: {submission.cores}, mem: {submission.mem} @@ -99,6 +103,10 @@ private[mesos] class MesosClusterPage(parent: MesosClusterUI) extends WebUIPage( {id} {historyCol} + + {state.driverDescription.conf.get( + "spark.mesos.dispatcher.queue", config.DISPATCHER_QUEUE.defaultValueString)} + {UIUtils.formatDate(state.driverDescription.submissionDate)} {state.driverDescription.command.mainClass} cpus: {state.driverDescription.cores}, mem: {state.driverDescription.mem} diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala index c0cdcda14291f..e260fb8e25f4c 100644 --- 
a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/mesos/ui/MesosClusterUI.scala @@ -20,7 +20,6 @@ package org.apache.spark.deploy.mesos.ui import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler import org.apache.spark.ui.{SparkUI, WebUI} -import org.apache.spark.ui.JettyUtils._ /** * UI that displays driver results from the [[org.apache.spark.deploy.mesos.MesosClusterDispatcher]] diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 39168a5e3c7a5..b18737cf6126d 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -21,6 +21,7 @@ import java.io.File import java.util.{Collections, Date, List => JList} import scala.collection.JavaConverters._ +import scala.collection.immutable import scala.collection.mutable import scala.collection.mutable.ArrayBuffer @@ -131,6 +132,8 @@ private[spark] class MesosClusterScheduler( private val queuedCapacity = conf.get(config.MAX_DRIVERS) private val retainedDrivers = conf.get(config.RETAINED_DRIVERS) private val maxRetryWaitTime = conf.get(config.CLUSTER_RETRY_WAIT_MAX_SECONDS) + private val queues: immutable.Map[String, Float] = + conf.getAllWithPrefix("spark.mesos.dispatcher.queue.").map(t => (t._1, t._2.toFloat)).toMap private val schedulerState = engineFactory.createEngine("scheduler") private val stateLock = new Object() // Keyed by submission id @@ -144,7 +147,19 @@ private[spark] class MesosClusterScheduler( // state of the tasks from Mesos. Keyed by task Id. 
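As context for the ordering change just below: the weights read via getAllWithPrefix("spark.mesos.dispatcher.queue.") pair with the per-submission "spark.mesos.dispatcher.queue" setting documented earlier in this patch. A minimal sketch of how the two sides line up (queue names and weights here are illustrative only, they are not mandated by the patch):

  // Illustrative sketch, not part of this patch.
  import org.apache.spark.SparkConf

  // Dispatcher side: declare weighted queues.
  val dispatcherConf = new SparkConf()
    .set("spark.mesos.dispatcher.queue.ROUTINE", "1.0")
    .set("spark.mesos.dispatcher.queue.URGENT", "2.0")

  // Submission side: route a driver to a declared queue. An unknown queue name fails with
  // NoSuchElementException; omitting the setting uses the "default" queue at priority 0.0,
  // so URGENT drivers are dequeued before ROUTINE ones, which in turn beat default-queue drivers.
  val submissionConf = Map("spark.mesos.dispatcher.queue" -> "URGENT")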
private val pendingRecover = new mutable.HashMap[String, AgentID]() // Stores all the submitted drivers that hasn't been launched, keyed by submission id - private val queuedDrivers = new ArrayBuffer[MesosDriverDescription]() + // and sorted by priority, then by submission date + private val driverOrdering = new Ordering[MesosDriverDescription] { + override def compare(x: MesosDriverDescription, y: MesosDriverDescription): Int = { + val xp = getDriverPriority(x) + val yp = getDriverPriority(y) + if (xp != yp) { + xp compare yp + } else { + y.submissionDate.compareTo(x.submissionDate) + } + } + } + private val queuedDrivers = new mutable.TreeSet[MesosDriverDescription]()(driverOrdering.reverse) // All supervised drivers that are waiting to retry after termination, keyed by submission id private val pendingRetryDrivers = new ArrayBuffer[MesosDriverDescription]() private val queuedDriversState = engineFactory.createEngine("driverQueue") @@ -374,6 +389,16 @@ private[spark] class MesosClusterScheduler( s"${frameworkId}-${desc.submissionId}${retries}" } + private[mesos] def getDriverPriority(desc: MesosDriverDescription): Float = { + val defaultQueueName = config.DISPATCHER_QUEUE.defaultValueString + val queueName = desc.conf.get("spark.mesos.dispatcher.queue", defaultQueueName) + if (queueName != defaultQueueName) { + queues.getOrElse(queueName, throw new NoSuchElementException(queueName)) + } else { + 0.0f + } + } + private def getDriverTaskId(desc: MesosDriverDescription): String = { val sId = desc.submissionId desc.retryState.map(state => sId + s"${RETRY_SEP}${state.retries.toString}").getOrElse(sId) @@ -710,7 +735,7 @@ private[spark] class MesosClusterScheduler( } private def copyBuffer( - buffer: ArrayBuffer[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = { + buffer: TraversableOnce[MesosDriverDescription]): ArrayBuffer[MesosDriverDescription] = { val newBuffer = new ArrayBuffer[MesosDriverDescription](buffer.size) buffer.copyToBuffer(newBuffer) newBuffer @@ -827,13 +852,13 @@ private[spark] class MesosClusterScheduler( status: Int): Unit = {} private def removeFromQueuedDrivers(subId: String): Boolean = { - val index = queuedDrivers.indexWhere(_.submissionId == subId) - if (index != -1) { - queuedDrivers.remove(index) + val matchOption = queuedDrivers.find(_.submissionId == subId) + if (matchOption.isEmpty) { + false + } else { + queuedDrivers.remove(matchOption.get) queuedDriversState.expunge(subId) true - } else { - false } } diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index bbe1ff495d8a6..efcef09132f5b 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -39,7 +39,7 @@ import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.mesos.MesosExternalBlockStoreClient import org.apache.spark.resource.ResourceProfile -import org.apache.spark.rpc.{RpcEndpointAddress, RpcEndpointRef} +import org.apache.spark.rpc.RpcEndpointAddress import org.apache.spark.scheduler.{ExecutorProcessLost, TaskSchedulerImpl} import 
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.util.Utils diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala index 981b8e9df1747..a5a2611be3765 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackendUtil.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler.cluster.mesos -import org.apache.mesos.Protos.{ContainerInfo, Environment, Image, NetworkInfo, Parameter, Secret, +import org.apache.mesos.Protos.{ContainerInfo, Image, NetworkInfo, Parameter, Secret, TaskState => MesosTaskState, Volume} import org.apache.mesos.Protos.ContainerInfo.{DockerInfo, MesosInfo} import org.apache.mesos.Protos.Environment.Variable diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index 2be8835f77e36..b5a360167679e 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -29,10 +29,10 @@ import scala.util.control.NonFatal import com.google.common.base.Splitter import com.google.common.io.Files import org.apache.mesos.{MesosSchedulerDriver, Protos, Scheduler, SchedulerDriver} -import org.apache.mesos.Protos.{SlaveID => AgentID, TaskState => MesosTaskState, _} +import org.apache.mesos.Protos.{TaskState => MesosTaskState, _} import org.apache.mesos.Protos.FrameworkInfo.Capability import org.apache.mesos.Protos.Resource.ReservationInfo -import org.apache.mesos.protobuf.{ByteString, GeneratedMessageV3} +import org.apache.mesos.protobuf.GeneratedMessageV3 import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.TaskState diff --git a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala index 5ff7f99aadb2f..146a135afd795 100644 --- a/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala +++ b/resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala @@ -603,6 +603,136 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi assert(scheduler.getDriverCommandValue(driverDesc) == expectedCmd) } + test("SPARK-23499: Test dispatcher priority queue with non float value") { + val conf = new SparkConf() + conf.set("spark.mesos.dispatcher.queue.ROUTINE", "1.0") + conf.set("spark.mesos.dispatcher.queue.URGENT", "abc") + conf.set("spark.mesos.dispatcher.queue.EXCEPTIONAL", "3.0") + assertThrows[NumberFormatException] { + setScheduler(conf.getAll.toMap) + } + } + + test("SPARK-23499: Get driver priority") { + val conf = new SparkConf() + conf.set("spark.mesos.dispatcher.queue.ROUTINE", "1.0") + conf.set("spark.mesos.dispatcher.queue.URGENT", "2.0") + conf.set("spark.mesos.dispatcher.queue.EXCEPTIONAL", "3.0") + 
setScheduler(conf.getAll.toMap) + + val mem = 1000 + val cpu = 1 + + // Test queue not declared in scheduler + var desc = new MesosDriverDescription("d1", "jar", mem, cpu, true, + command, + Map("spark.mesos.dispatcher.queue" -> "dummy"), + "s1", + new Date()) + + assertThrows[NoSuchElementException] { + scheduler.getDriverPriority(desc) + } + + // Test with no specified queue + desc = new MesosDriverDescription("d1", "jar", mem, cpu, true, + command, + Map[String, String](), + "s2", + new Date()) + + assert(scheduler.getDriverPriority(desc) == 0.0f) + + // Test with "default" queue specified + desc = new MesosDriverDescription("d1", "jar", mem, cpu, true, + command, + Map("spark.mesos.dispatcher.queue" -> "default"), + "s3", + new Date()) + + assert(scheduler.getDriverPriority(desc) == 0.0f) + + // Test queue declared in scheduler + desc = new MesosDriverDescription("d1", "jar", mem, cpu, true, + command, + Map("spark.mesos.dispatcher.queue" -> "ROUTINE"), + "s4", + new Date()) + + assert(scheduler.getDriverPriority(desc) == 1.0f) + + // Test other queue declared in scheduler + desc = new MesosDriverDescription("d1", "jar", mem, cpu, true, + command, + Map("spark.mesos.dispatcher.queue" -> "URGENT"), + "s5", + new Date()) + + assert(scheduler.getDriverPriority(desc) == 2.0f) + } + + test("SPARK-23499: Can queue drivers with priority") { + val conf = new SparkConf() + conf.set("spark.mesos.dispatcher.queue.ROUTINE", "1.0") + conf.set("spark.mesos.dispatcher.queue.URGENT", "2.0") + conf.set("spark.mesos.dispatcher.queue.EXCEPTIONAL", "3.0") + setScheduler(conf.getAll.toMap) + + val mem = 1000 + val cpu = 1 + + val response0 = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", 100, 1, true, command, + Map("spark.mesos.dispatcher.queue" -> "ROUTINE"), "s0", new Date())) + assert(response0.success) + + val response1 = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", 100, 1, true, command, + Map[String, String](), "s1", new Date())) + assert(response1.success) + + val response2 = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", 100, 1, true, command, + Map("spark.mesos.dispatcher.queue" -> "EXCEPTIONAL"), "s2", new Date())) + assert(response2.success) + + val response3 = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", 100, 1, true, command, + Map("spark.mesos.dispatcher.queue" -> "URGENT"), "s3", new Date())) + assert(response3.success) + + val state = scheduler.getSchedulerState() + val queuedDrivers = state.queuedDrivers.toList + assert(queuedDrivers(0).submissionId == response2.submissionId) + assert(queuedDrivers(1).submissionId == response3.submissionId) + assert(queuedDrivers(2).submissionId == response0.submissionId) + assert(queuedDrivers(3).submissionId == response1.submissionId) + } + + test("SPARK-23499: Can queue drivers with negative priority") { + val conf = new SparkConf() + conf.set("spark.mesos.dispatcher.queue.LOWER", "-1.0") + setScheduler(conf.getAll.toMap) + + val mem = 1000 + val cpu = 1 + + val response0 = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", 100, 1, true, command, + Map("spark.mesos.dispatcher.queue" -> "LOWER"), "s0", new Date())) + assert(response0.success) + + val response1 = scheduler.submitDriver( + new MesosDriverDescription("d1", "jar", 100, 1, true, command, + Map[String, String](), "s1", new Date())) + assert(response1.success) + + val state = scheduler.getSchedulerState() + val queuedDrivers = state.queuedDrivers.toList + assert(queuedDrivers(0).submissionId == 
response1.submissionId) + assert(queuedDrivers(1).submissionId == response0.submissionId) + } + private def launchDriverTask(addlSparkConfVars: Map[String, String]): List[TaskInfo] = { setScheduler() val mem = 1000 diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala index d9262bbac6586..ede39063cf1bd 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ExecutorRunnable.scala @@ -19,12 +19,11 @@ package org.apache.spark.deploy.yarn import java.io.File import java.nio.ByteBuffer -import java.util.{Collections, Locale} +import java.util.Collections import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, ListBuffer} -import org.apache.hadoop.HadoopIllegalArgumentException import org.apache.hadoop.fs.Path import org.apache.hadoop.io.DataOutputBuffer import org.apache.hadoop.security.UserGroupInformation @@ -40,7 +39,6 @@ import org.apache.spark.{SecurityManager, SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.JavaUtils -import org.apache.spark.resource.ResourceProfile import org.apache.spark.util.Utils private[yarn] class ExecutorRunnable( diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala index 5640f7ede33df..7ac5beac76e20 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/LocalityPreferredContainerPlacementStrategy.scala @@ -21,7 +21,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, Set} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.yarn.api.records.{ContainerId, Resource} +import org.apache.hadoop.yarn.api.records.ContainerId import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.spark.SparkConf diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 0273de10993eb..09766bf97d8f3 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -26,7 +26,6 @@ import org.apache.hadoop.yarn.api.records.{ApplicationAccessType, ContainerId, P import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.spark.{SecurityManager, SparkConf} -import org.apache.spark.internal.config._ import org.apache.spark.launcher.YarnCommandBuilderUtils import org.apache.spark.resource.ExecutorResourceRequest import org.apache.spark.util.Utils diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala index 0c3d080cca254..d000287cb7a96 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala +++ 
b/resource-managers/yarn/src/main/scala/org/apache/spark/launcher/YarnCommandBuilderUtils.scala @@ -17,8 +17,6 @@ package org.apache.spark.launcher -import scala.collection.JavaConverters._ -import scala.collection.mutable.ListBuffer import scala.util.Properties /** diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala index 7f8dd590545c6..5b762f606112c 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala @@ -29,7 +29,6 @@ import org.scalatest.matchers.should.Matchers._ import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.util.{ResetSystemProperties, Utils} diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java new file mode 100644 index 0000000000000..cdfa082ced317 --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/MetadataColumn.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.connector.catalog; + +import org.apache.spark.annotation.Evolving; +import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.types.DataType; + +/** + * Interface for a metadata column. + *
<p>
+ * A metadata column can expose additional metadata about a row. For example, rows from Kafka can + * use metadata columns to expose a message's topic, partition number, and offset. + *
<p>
+ * A metadata column could also be the result of a transform applied to a value in the row. For + * example, a partition value produced by bucket(id, 16) could be exposed by a metadata column. In + * this case, {@link #transform()} should return a non-null {@link Transform} that produced the + * metadata column's values. + */ +@Evolving +public interface MetadataColumn { + /** + * The name of this metadata column. + * + * @return a String name + */ + String name(); + + /** + * The data type of values in this metadata column. + * + * @return a {@link DataType} + */ + DataType dataType(); + + /** + * @return whether values produced by this metadata column may be null + */ + default boolean isNullable() { + return true; + } + + /** + * Documentation for this metadata column, or null. + * + * @return a documentation String + */ + default String comment() { + return null; + } + + /** + * The {@link Transform} used to produce this metadata column from data rows, or null. + * + * @return a {@link Transform} used to produce the column's values, or null if there isn't one + */ + default Transform transform() { + return null; + } +} diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java new file mode 100644 index 0000000000000..208abfc302582 --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/catalog/SupportsMetadataColumns.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.connector.catalog; + +import org.apache.spark.annotation.Evolving; +import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +/** + * An interface for exposing data columns for a table that are not in the table schema. For example, + * a file source could expose a "file" column that contains the path of the file that contained each + * row. + *
<p>
+ * The columns returned by {@link #metadataColumns()} may be passed as {@link StructField} in + * requested projections. Sources that implement this interface and column projection using + * {@link SupportsPushDownRequiredColumns} must accept metadata fields passed to + * {@link SupportsPushDownRequiredColumns#pruneColumns(StructType)}. + *
<p>
+ * If a table column and a metadata column have the same name, the metadata column will never be + * requested. It is recommended that Table implementations reject data column names that conflict + * with metadata column names. + */ +@Evolving +public interface SupportsMetadataColumns extends Table { + /** + * Metadata columns that are supported by this {@link Table}. *
<p>
+ * The columns returned by this method may be passed as {@link StructField} in requested + * projections using {@link SupportsPushDownRequiredColumns#pruneColumns(StructType)}. + *
<p>
+ * If a table column and a metadata column have the same name, the metadata column will never be + * requested and is ignored. It is recommended that Table implementations reject data column names + * that conflict with metadata column names. + * + * @return an array of {@link MetadataColumn} + */ + MetadataColumn[] metadataColumns(); +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SQLConfHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SQLConfHelper.scala new file mode 100644 index 0000000000000..cee35cdb8d840 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SQLConfHelper.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst + +import org.apache.spark.sql.internal.SQLConf + +/** + * Trait for getting the active SQLConf. + */ +trait SQLConfHelper { + + /** + * The active config object within the current scope. + * See [[SQLConf.get]] for more information. + */ + def conf: SQLConf = SQLConf.get +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index c65e181181e83..53c7f17ee6b2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -30,7 +30,6 @@ import org.apache.spark.sql.catalyst.expressions.objects._ import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} -import org.apache.spark.util.Utils /** @@ -894,10 +893,6 @@ trait ScalaReflection extends Logging { import universe._ - // The Predef.Map is scala.collection.immutable.Map. - // Since the map values can be mutable, we explicitly import scala.collection.Map at here. - import scala.collection.Map - /** * Any codes calling `scala.reflect.api.Types.TypeApi.<:<` should be wrapped by this method to * clean up the Scala reflection garbage automatically. 
Otherwise, it will leak some objects to diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 690d66bec890d..8d95d8cf49d45 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -58,15 +58,14 @@ import org.apache.spark.util.Utils */ object SimpleAnalyzer extends Analyzer( new CatalogManager( - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true), FakeV2SessionCatalog, new SessionCatalog( new InMemoryCatalog, - EmptyFunctionRegistry, - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)) { + EmptyFunctionRegistry) { override def createDatabase(dbDefinition: CatalogDatabase, ignoreIfExists: Boolean): Unit = {} - }), - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)) + })) { + override def resolver: Resolver = caseSensitiveResolution +} object FakeV2SessionCatalog extends TableCatalog { private def fail() = throw new UnsupportedOperationException @@ -130,10 +129,8 @@ object AnalysisContext { * Provides a logical query plan analyzer, which translates [[UnresolvedAttribute]]s and * [[UnresolvedRelation]]s into fully typed objects using information in a [[SessionCatalog]]. */ -class Analyzer( - override val catalogManager: CatalogManager, - conf: SQLConf) - extends RuleExecutor[LogicalPlan] with CheckAnalysis with LookupCatalog { +class Analyzer(override val catalogManager: CatalogManager) + extends RuleExecutor[LogicalPlan] with CheckAnalysis with LookupCatalog with SQLConfHelper { private val v1SessionCatalog: SessionCatalog = catalogManager.v1SessionCatalog @@ -144,10 +141,8 @@ class Analyzer( override def isView(nameParts: Seq[String]): Boolean = v1SessionCatalog.isView(nameParts) // Only for tests. - def this(catalog: SessionCatalog, conf: SQLConf) = { - this( - new CatalogManager(conf, FakeV2SessionCatalog, catalog), - conf) + def this(catalog: SessionCatalog) = { + this(new CatalogManager(FakeV2SessionCatalog, catalog)) } def executeAndCheck(plan: LogicalPlan, tracker: QueryPlanningTracker): LogicalPlan = { @@ -226,6 +221,7 @@ class Analyzer( ResolveRelations :: ResolveTables :: ResolvePartitionSpec :: + AddMetadataColumns :: ResolveReferences :: ResolveCreateNamedStruct :: ResolveDeserializer :: @@ -921,6 +917,29 @@ class Analyzer( } } + /** + * Adds metadata columns to output for child relations when nodes are missing resolved attributes. + * + * References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]], + * but the relation's output does not include the metadata columns until the relation is replaced + * using [[DataSourceV2Relation.withMetadataColumns()]]. Unless this rule adds metadata to the + * relation's output, the analyzer will detect that nothing produces the columns. + * + * This rule only adds metadata columns when a node is resolved but is missing input from its + * children. This ensures that metadata columns are not added to the plan unless they are used. By + * checking only resolved nodes, this ensures that * expansion is already done so that metadata + * columns are not accidentally selected by *. 
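To make the rule described above concrete, a hedged sketch of a v2 source exposing a metadata column through the MetadataColumn and SupportsMetadataColumns interfaces added earlier in this diff (the table and column names are hypothetical; the remaining Table methods are stubbed):

  // Illustrative sketch, not part of this patch; names are hypothetical.
  import java.util
  import org.apache.spark.sql.connector.catalog.{MetadataColumn, SupportsMetadataColumns, TableCapability}
  import org.apache.spark.sql.types.{DataType, IntegerType, StructType}

  class ExampleTable extends SupportsMetadataColumns {
    override def name(): String = "example"
    override def schema(): StructType = new StructType().add("id", IntegerType)
    override def capabilities(): util.Set[TableCapability] = util.Collections.emptySet()
    override def metadataColumns(): Array[MetadataColumn] = Array(
      new MetadataColumn {
        override def name(): String = "_partition"        // hypothetical metadata column
        override def dataType(): DataType = IntegerType
        override def comment(): String = "partition the row came from"
      })
  }

  // With such a table resolved as t, a query like SELECT id, _partition FROM t resolves
  // _partition from metadataOutput, and this rule swaps in
  // DataSourceV2Relation.withMetadataColumns() so the scan actually produces the column,
  // while SELECT * FROM t keeps returning only the declared schema.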
+ */ + object AddMetadataColumns extends Rule[LogicalPlan] { + def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp { + case node if node.resolved && node.children.nonEmpty && node.missingInput.nonEmpty => + node resolveOperatorsUp { + case rel: DataSourceV2Relation => + rel.withMetadataColumns() + } + } + } + /** * Resolve table relations with concrete relations from v2 catalog. * diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index 6eed152e6dd77..47a45b0e529c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala index d3bb72badeb13..deeb8215d22c6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveCatalogs.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, LookupCatalog, TableCatalog, TableChange} /** * Resolves catalogs from the multi-part identifiers in SQL statements, and convert the statements diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index 814ea8c9768ae..7dcc6a81b48cd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -304,7 +304,7 @@ object UnsupportedOperationChecker extends Logging { case LeftAnti => if (right.isStreaming) { - throwError("Left anti joins with a streaming DataFrame/Dataset " + + throwError(s"$LeftAnti joins with a streaming DataFrame/Dataset " + "on the right are not supported") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala index 51eb3d033ddc4..2fa6bf0acea67 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala @@ -21,7 +21,6 @@ import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.expressions._ import 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataType /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala index d8062744a4264..9234b58eb9f6e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala @@ -16,10 +16,10 @@ */ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, ListQuery, TimeZoneAwareExpression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataType /** @@ -47,10 +47,7 @@ object ResolveTimeZone extends Rule[LogicalPlan] { * Mix-in trait for constructing valid [[Cast]] expressions. */ trait CastSupport { - /** - * Configuration used to create a valid cast expression. - */ - def conf: SQLConf + self: SQLConfHelper => /** * Create a Cast expression with the session local time zone. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala index 83acfb8d4a71c..98bd84fb94bd6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/v2ResolutionPlans.scala @@ -18,11 +18,10 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.catalog.CatalogFunction import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan} -import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, SupportsNamespaces, Table, TableCatalog} +import org.apache.spark.sql.catalyst.plans.logical.LeafNode +import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, Table, TableCatalog} /** * Holds the name of a namespace that has yet to be looked up in a catalog. It will be resolved to diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala index 65601640fa044..06de023098a1c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.Alias import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.internal.SQLConf /** * This file defines view types and analysis rules related to views. @@ -54,8 +53,6 @@ import org.apache.spark.sql.internal.SQLConf * completely resolved during the batch of Resolution. 
*/ object EliminateView extends Rule[LogicalPlan] with CastSupport { - override def conf: SQLConf = SQLConf.get - override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { // The child has the different output attributes with the View operator. Adds a Project over // the child of the view. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala index db930cf7890e6..5643bf8b3a9b7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalog.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.catalog -import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException, NoSuchPartitionException, NoSuchTableException} +import org.apache.spark.sql.catalyst.analysis.{FunctionAlreadyExistsException, NoSuchDatabaseException, NoSuchFunctionException, NoSuchTableException} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.types.StructType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala index c00d51dc3df1f..17ab6664df75c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala @@ -61,10 +61,11 @@ class SessionCatalog( externalCatalogBuilder: () => ExternalCatalog, globalTempViewManagerBuilder: () => GlobalTempViewManager, functionRegistry: FunctionRegistry, - conf: SQLConf, hadoopConf: Configuration, parser: ParserInterface, - functionResourceLoader: FunctionResourceLoader) extends Logging { + functionResourceLoader: FunctionResourceLoader, + cacheSize: Int = SQLConf.get.tableRelationCacheSize, + cacheTTL: Long = SQLConf.get.metadataCacheTTL) extends SQLConfHelper with Logging { import SessionCatalog._ import CatalogTypes.TablePartitionSpec @@ -77,18 +78,21 @@ class SessionCatalog( () => externalCatalog, () => new GlobalTempViewManager(conf.getConf(GLOBAL_TEMP_DATABASE)), functionRegistry, - conf, new Configuration(), new CatalystSqlParser(), - DummyFunctionResourceLoader) + DummyFunctionResourceLoader, + conf.tableRelationCacheSize, + conf.metadataCacheTTL) + } + + // For testing only. + def this(externalCatalog: ExternalCatalog, functionRegistry: FunctionRegistry) = { + this(externalCatalog, functionRegistry, SQLConf.get) } // For testing only. 
def this(externalCatalog: ExternalCatalog) = { - this( - externalCatalog, - new SimpleFunctionRegistry, - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)) + this(externalCatalog, new SimpleFunctionRegistry) } lazy val externalCatalog = externalCatalogBuilder() @@ -136,9 +140,6 @@ class SessionCatalog( } private val tableRelationCache: Cache[QualifiedTableName, LogicalPlan] = { - val cacheSize = conf.tableRelationCacheSize - val cacheTTL = conf.metadataCacheTTL - var builder = CacheBuilder.newBuilder() .maximumSize(cacheSize) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala index 9c93691ca3b41..ee7216e93ebb5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala @@ -28,7 +28,7 @@ import org.apache.commons.lang3.StringUtils import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, SQLConfHelper, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Cast, ExprId, Literal} import org.apache.spark.sql.catalyst.plans.logical._ @@ -177,8 +177,7 @@ case class CatalogTablePartition( case class BucketSpec( numBuckets: Int, bucketColumnNames: Seq[String], - sortColumnNames: Seq[String]) { - def conf: SQLConf = SQLConf.get + sortColumnNames: Seq[String]) extends SQLConfHelper { if (numBuckets <= 0 || numBuckets > conf.bucketingMaxBuckets) { throw new AnalysisException( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index b61c4b8d065f2..4cd649b07a5c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * A collection of implicit conversions that create a DSL for constructing catalyst data structures. 
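BucketSpec now obtains `conf` from the same SQLConfHelper mixin, but its validation is unchanged: the bucket count must be positive and no larger than `conf.bucketingMaxBuckets`. A hedged usage sketch with made-up column names:

    import org.apache.spark.sql.catalyst.catalog.BucketSpec

    // Accepted: a positive bucket count within the configured maximum.
    val spec = BucketSpec(numBuckets = 8, bucketColumnNames = Seq("id"), sortColumnNames = Seq("ts"))

    // Rejected at construction time with an AnalysisException, per the check shown above:
    // BucketSpec(numBuckets = 0, bucketColumnNames = Seq("id"), sortColumnNames = Nil)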
@@ -102,6 +103,10 @@ package object dsl { def like(other: Expression, escapeChar: Char = '\\'): Expression = Like(expr, other, escapeChar) def rlike(other: Expression): Expression = RLike(expr, other) + def likeAll(others: Expression*): Expression = + LikeAll(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) + def notLikeAll(others: Expression*): Expression = + NotLikeAll(expr, others.map(_.eval(EmptyRow).asInstanceOf[UTF8String])) def contains(other: Expression): Expression = Contains(expr, other) def startsWith(other: Expression): Expression = StartsWith(expr, other) def endsWith(other: Expression): Expression = EndsWith(expr, other) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 3d5c1855f6975..9ab38044e6a88 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.catalyst.encoders -import java.io.ObjectInputStream - import scala.reflect.ClassTag import scala.reflect.runtime.universe.{typeTag, TypeTag} @@ -33,7 +31,7 @@ import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Initial import org.apache.spark.sql.catalyst.optimizer.{ReassignLambdaVariableID, SimplifyCasts} import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LeafNode, LocalRelation} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DataType, ObjectType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{ObjectType, StringType, StructField, StructType} import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 4af12d61e86d9..5afc308e52ead 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.expressions -import java.math.{BigDecimal => JavaBigDecimal} import java.time.ZoneId import java.util.Locale import java.util.concurrent.TimeUnit._ @@ -25,6 +24,7 @@ import java.util.concurrent.TimeUnit._ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion} +import org.apache.spark.sql.catalyst.expressions.Cast.{forceNullable, resolvableNullability} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.util._ @@ -258,13 +258,18 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit def dataType: DataType + /** + * Returns true iff we can cast `from` type to `to` type. 
+ */ + def canCast(from: DataType, to: DataType): Boolean + override def toString: String = { val ansi = if (ansiEnabled) "ansi_" else "" s"${ansi}cast($child as ${dataType.simpleString})" } override def checkInputDataTypes(): TypeCheckResult = { - if (Cast.canCast(child.dataType, dataType)) { + if (canCast(child.dataType, dataType)) { TypeCheckResult.TypeCheckSuccess } else { TypeCheckResult.TypeCheckFailure( @@ -1753,6 +1758,12 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String copy(timeZoneId = Option(timeZoneId)) override protected val ansiEnabled: Boolean = SQLConf.get.ansiEnabled + + override def canCast(from: DataType, to: DataType): Boolean = if (ansiEnabled) { + AnsiCast.canCast(from, to) + } else { + Cast.canCast(from, to) + } } /** @@ -1770,6 +1781,110 @@ case class AnsiCast(child: Expression, dataType: DataType, timeZoneId: Option[St copy(timeZoneId = Option(timeZoneId)) override protected val ansiEnabled: Boolean = true + + override def canCast(from: DataType, to: DataType): Boolean = AnsiCast.canCast(from, to) +} + +object AnsiCast { + /** + * As per section 6.13 "cast specification" in "Information technology — Database languages " + + * "- SQL — Part 2: Foundation (SQL/Foundation)": + * If the is a , then the valid combinations of TD and SD + * in a are given by the following table. “Y” indicates that the + * combination is syntactically valid without restriction; “M” indicates that the combination + * is valid subject to other Syntax Rules in this Sub- clause being satisfied; and “N” indicates + * that the combination is not valid: + * SD TD + * EN AN C D T TS YM DT BO UDT B RT CT RW + * EN Y Y Y N N N M M N M N M N N + * AN Y Y Y N N N N N N M N M N N + * C Y Y Y Y Y Y Y Y Y M N M N N + * D N N Y Y N Y N N N M N M N N + * T N N Y N Y Y N N N M N M N N + * TS N N Y Y Y Y N N N M N M N N + * YM M N Y N N N Y N N M N M N N + * DT M N Y N N N N Y N M N M N N + * BO N N Y N N N N N Y M N M N N + * UDT M M M M M M M M M M M M M N + * B N N N N N N N N N M Y M N N + * RT M M M M M M M M M M M M N N + * CT N N N N N N N N N M N N M N + * RW N N N N N N N N N N N N N M + * + * Where: + * EN = Exact Numeric + * AN = Approximate Numeric + * C = Character (Fixed- or Variable-Length, or Character Large Object) + * D = Date + * T = Time + * TS = Timestamp + * YM = Year-Month Interval + * DT = Day-Time Interval + * BO = Boolean + * UDT = User-Defined Type + * B = Binary (Fixed- or Variable-Length or Binary Large Object) + * RT = Reference type + * CT = Collection type + * RW = Row type + * + * Spark's ANSI mode follows the syntax rules, except it specially allow the following + * straightforward type conversions which are disallowed as per the SQL standard: + * - Numeric <=> Boolean + * - String <=> Binary + */ + def canCast(from: DataType, to: DataType): Boolean = (from, to) match { + case (fromType, toType) if fromType == toType => true + + case (NullType, _) => true + + case (StringType, _: BinaryType) => true + + case (StringType, BooleanType) => true + case (_: NumericType, BooleanType) => true + + case (StringType, TimestampType) => true + case (DateType, TimestampType) => true + + case (StringType, _: CalendarIntervalType) => true + + case (StringType, DateType) => true + case (TimestampType, DateType) => true + + case (_: NumericType, _: NumericType) => true + case (StringType, _: NumericType) => true + case (BooleanType, _: NumericType) => true + + case (_: NumericType, StringType) => true + case (_: DateType, StringType) => true + case (_: 
TimestampType, StringType) => true + case (_: CalendarIntervalType, StringType) => true + case (BooleanType, StringType) => true + case (BinaryType, StringType) => true + + case (ArrayType(fromType, fn), ArrayType(toType, tn)) => + canCast(fromType, toType) && + resolvableNullability(fn || forceNullable(fromType, toType), tn) + + case (MapType(fromKey, fromValue, fn), MapType(toKey, toValue, tn)) => + canCast(fromKey, toKey) && + (!forceNullable(fromKey, toKey)) && + canCast(fromValue, toValue) && + resolvableNullability(fn || forceNullable(fromValue, toValue), tn) + + case (StructType(fromFields), StructType(toFields)) => + fromFields.length == toFields.length && + fromFields.zip(toFields).forall { + case (fromField, toField) => + canCast(fromField.dataType, toField.dataType) && + resolvableNullability( + fromField.nullable || forceNullable(fromField.dataType, toField.dataType), + toField.nullable) + } + + case (udt1: UserDefinedType[_], udt2: UserDefinedType[_]) if udt2.acceptsType(udt1) => true + + case _ => false + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 35b192cc5544a..1d23953484046 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -24,9 +24,7 @@ import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.TreeNode -import org.apache.spark.sql.catalyst.util.toPrettySQL import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -63,7 +61,8 @@ import org.apache.spark.sql.types._ * functions. * - [[NamedExpression]]: An [[Expression]] that is named. * - [[TimeZoneAwareExpression]]: A common base trait for time zone aware expressions. - * - [[SubqueryExpression]]: A base interface for expressions that contain a [[LogicalPlan]]. + * - [[SubqueryExpression]]: A base interface for expressions that contain a + * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. * * - [[LeafExpression]]: an expression that has no child. * - [[UnaryExpression]]: an expression that has one child. 
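A hedged illustration of what the stricter table buys: type combinations that legacy casting accepts but the ANSI rules reject are now refused at analysis time through `canCast`. The expected results below follow directly from the pattern match above:

    import org.apache.spark.sql.catalyst.expressions.{AnsiCast, Cast}
    import org.apache.spark.sql.types._

    Cast.canCast(TimestampType, LongType)      // true  - legacy casting permits it
    AnsiCast.canCast(TimestampType, LongType)  // false - not a valid combination in the ANSI table
    AnsiCast.canCast(StringType, IntegerType)  // true  - allowed, though it may still fail at runtime
    AnsiCast.canCast(StringType, BinaryType)   // true  - one of the documented extensions to the standard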
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala index a2daec0b1ade1..91c9457af7de3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedMutableProjection.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp +import org.apache.spark.sql.internal.SQLConf /** @@ -33,6 +34,15 @@ class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mutable def this(expressions: Seq[Expression], inputSchema: Seq[Attribute]) = this(bindReferences(expressions, inputSchema)) + private[this] val subExprEliminationEnabled = SQLConf.get.subexpressionEliminationEnabled + private[this] lazy val runtime = + new SubExprEvaluationRuntime(SQLConf.get.subexpressionEliminationCacheMaxEntries) + private[this] val exprs = if (subExprEliminationEnabled) { + runtime.proxyExpressions(expressions) + } else { + expressions + } + private[this] val buffer = new Array[Any](expressions.size) override def initialize(partitionIndex: Int): Unit = { @@ -76,11 +86,15 @@ class InterpretedMutableProjection(expressions: Seq[Expression]) extends Mutable }.toArray override def apply(input: InternalRow): InternalRow = { + if (subExprEliminationEnabled) { + runtime.setInput(input) + } + var i = 0 while (i < validExprs.length) { - val (expr, ordinal) = validExprs(i) + val (_, ordinal) = validExprs(i) // Store the result into buffer first, to make the projection atomic (needed by aggregation) - buffer(ordinal) = expr.eval(input) + buffer(ordinal) = exprs(ordinal).eval(input) i += 1 } i = 0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedSafeProjection.scala index 70789dac1d87a..0e71892db666b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedSafeProjection.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.aggregate.NoOp import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, GenericArrayData, MapData} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -30,6 +31,15 @@ import org.apache.spark.sql.types._ */ class InterpretedSafeProjection(expressions: Seq[Expression]) extends Projection { + private[this] val subExprEliminationEnabled = SQLConf.get.subexpressionEliminationEnabled + private[this] lazy val runtime = + new SubExprEvaluationRuntime(SQLConf.get.subexpressionEliminationCacheMaxEntries) + private[this] val exprs = if (subExprEliminationEnabled) { + runtime.proxyExpressions(expressions) + } else { + expressions + } + private[this] val mutableRow = new SpecificInternalRow(expressions.map(_.dataType)) private[this] val exprsWithWriters = expressions.zipWithIndex.filter { @@ -49,7 +59,7 @@ class 
InterpretedSafeProjection(expressions: Seq[Expression]) extends Projection } } } - (e, f) + (exprs(i), f) } private def generateSafeValueConverter(dt: DataType): Any => Any = dt match { @@ -97,6 +107,10 @@ class InterpretedSafeProjection(expressions: Seq[Expression]) extends Projection } override def apply(row: InternalRow): InternalRow = { + if (subExprEliminationEnabled) { + runtime.setInput(row) + } + var i = 0 while (i < exprsWithWriters.length) { val (expr, writer) = exprsWithWriters(i) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedUnsafeProjection.scala index 39a16e917c4a5..f3ca4f06cd372 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/InterpretedUnsafeProjection.scala @@ -20,6 +20,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{UnsafeArrayWriter, UnsafeRowWriter, UnsafeWriter} import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{UserDefinedType, _} import org.apache.spark.unsafe.Platform @@ -33,6 +34,15 @@ import org.apache.spark.unsafe.Platform class InterpretedUnsafeProjection(expressions: Array[Expression]) extends UnsafeProjection { import InterpretedUnsafeProjection._ + private[this] val subExprEliminationEnabled = SQLConf.get.subexpressionEliminationEnabled + private[this] lazy val runtime = + new SubExprEvaluationRuntime(SQLConf.get.subexpressionEliminationCacheMaxEntries) + private[this] val exprs = if (subExprEliminationEnabled) { + runtime.proxyExpressions(expressions) + } else { + expressions.toSeq + } + /** Number of (top level) fields in the resulting row. */ private[this] val numFields = expressions.length @@ -63,17 +73,21 @@ class InterpretedUnsafeProjection(expressions: Array[Expression]) extends Unsafe } override def initialize(partitionIndex: Int): Unit = { - expressions.foreach(_.foreach { + exprs.foreach(_.foreach { case n: Nondeterministic => n.initialize(partitionIndex) case _ => }) } override def apply(row: InternalRow): UnsafeRow = { + if (subExprEliminationEnabled) { + runtime.setInput(row) + } + // Put the expression results in the intermediate row. 
var i = 0 while (i < numFields) { - values(i) = expressions(i).eval(row) + values(i) = exprs(i).eval(row) i += 1 } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ProjectionOverSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ProjectionOverSchema.scala index 6f1d9d065ab1a..241c761624b76 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ProjectionOverSchema.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ProjectionOverSchema.scala @@ -55,8 +55,8 @@ case class ProjectionOverSchema(schema: StructType) { getProjection(child).map { projection => MapKeys(projection) } case MapValues(child) => getProjection(child).map { projection => MapValues(projection) } - case GetMapValue(child, key) => - getProjection(child).map { projection => GetMapValue(projection, key) } + case GetMapValue(child, key, failOnError) => + getProjection(child).map { projection => GetMapValue(projection, key, failOnError) } case GetStructFieldObject(child, field: StructField) => getProjection(child).map(p => (p, p.dataType)).map { case (projection, projSchema: StructType) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 6e2bd96784b94..0a69d5aa6b9ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType, UserDefinedType} +import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType} import org.apache.spark.util.Utils /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SelectedField.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SelectedField.scala index adcc4be10687e..f2acb75ea6ac4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SelectedField.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SelectedField.scala @@ -91,7 +91,7 @@ object SelectedField { } val newField = StructField(field.name, newFieldDataType, field.nullable) selectField(child, Option(ArrayType(struct(newField), containsNull))) - case GetMapValue(child, _) => + case GetMapValue(child, _, _) => // GetMapValue does not select a field from a struct (i.e. prune the struct) so it can't be // the top-level extractor. However it can be part of an extractor chain. val MapType(keyType, _, valueContainsNull) = child.dataType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SubExprEvaluationRuntime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SubExprEvaluationRuntime.scala new file mode 100644 index 0000000000000..3189d81289903 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SubExprEvaluationRuntime.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.expressions + +import java.util.IdentityHashMap + +import scala.collection.JavaConverters._ + +import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} +import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.types.DataType + +/** + * This class helps subexpression elimination for interpreted evaluation + * such as `InterpretedUnsafeProjection`. It maintains an evaluation cache. + * This class wraps `ExpressionProxy` around given expressions. The `ExpressionProxy` + * intercepts expression evaluation and loads from the cache first. + */ +class SubExprEvaluationRuntime(cacheMaxEntries: Int) { + // The id assigned to `ExpressionProxy`. `SubExprEvaluationRuntime` will use assigned ids of + // `ExpressionProxy` to decide the equality when loading from cache. `SubExprEvaluationRuntime` + // won't be use by multi-threads so we don't need to consider concurrency here. + private var proxyExpressionCurrentId = 0 + + private[sql] val cache: LoadingCache[ExpressionProxy, ResultProxy] = CacheBuilder.newBuilder() + .maximumSize(cacheMaxEntries) + .build( + new CacheLoader[ExpressionProxy, ResultProxy]() { + override def load(expr: ExpressionProxy): ResultProxy = { + ResultProxy(expr.proxyEval(currentInput)) + } + }) + + private var currentInput: InternalRow = null + + def getEval(proxy: ExpressionProxy): Any = try { + cache.get(proxy).result + } catch { + // Cache.get() may wrap the original exception. See the following URL + // http://google.github.io/guava/releases/14.0/api/docs/com/google/common/cache/ + // Cache.html#get(K,%20java.util.concurrent.Callable) + case e @ (_: UncheckedExecutionException | _: ExecutionError) => + throw e.getCause + } + + /** + * Sets given input row as current row for evaluating expressions. This cleans up the cache + * too as new input comes. + */ + def setInput(input: InternalRow = null): Unit = { + currentInput = input + cache.invalidateAll() + } + + /** + * Recursively replaces expression with its proxy expression in `proxyMap`. + */ + private def replaceWithProxy( + expr: Expression, + proxyMap: IdentityHashMap[Expression, ExpressionProxy]): Expression = { + if (proxyMap.containsKey(expr)) { + proxyMap.get(expr) + } else { + expr.mapChildren(replaceWithProxy(_, proxyMap)) + } + } + + /** + * Finds subexpressions and wraps them with `ExpressionProxy`. 
+ */ + def proxyExpressions(expressions: Seq[Expression]): Seq[Expression] = { + val equivalentExpressions: EquivalentExpressions = new EquivalentExpressions + + expressions.foreach(equivalentExpressions.addExprTree(_)) + + val proxyMap = new IdentityHashMap[Expression, ExpressionProxy] + + val commonExprs = equivalentExpressions.getAllEquivalentExprs.filter(_.size > 1) + commonExprs.foreach { e => + val expr = e.head + val proxy = ExpressionProxy(expr, proxyExpressionCurrentId, this) + proxyExpressionCurrentId += 1 + + proxyMap.putAll(e.map(_ -> proxy).toMap.asJava) + } + + // Only adding proxy if we find subexpressions. + if (!proxyMap.isEmpty) { + expressions.map(replaceWithProxy(_, proxyMap)) + } else { + expressions + } + } +} + +/** + * A proxy for an catalyst `Expression`. Given a runtime object `SubExprEvaluationRuntime`, + * when this is asked to evaluate, it will load from the evaluation cache in the runtime first. + */ +case class ExpressionProxy( + child: Expression, + id: Int, + runtime: SubExprEvaluationRuntime) extends Expression { + + final override def dataType: DataType = child.dataType + final override def nullable: Boolean = child.nullable + final override def children: Seq[Expression] = child :: Nil + + // `ExpressionProxy` is for interpreted expression evaluation only. So cannot `doGenCode`. + final override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = + throw new UnsupportedOperationException(s"Cannot generate code for expression: $this") + + def proxyEval(input: InternalRow = null): Any = child.eval(input) + + override def eval(input: InternalRow = null): Any = runtime.getEval(this) + + override def equals(obj: Any): Boolean = obj match { + case other: ExpressionProxy => this.id == other.id + case _ => false + } + + override def hashCode(): Int = this.id.hashCode() +} + +/** + * A simple wrapper for holding `Any` in the cache of `SubExprEvaluationRuntime`. + */ +case class ResultProxy(result: Any) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 9aa827a58d87a..1ff4a93cf0acd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -38,9 +38,8 @@ import org.apache.spark.metrics.source.CodegenMetrics import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData, MapData, SQLOrderingUtil} +import org.apache.spark.sql.catalyst.util.{ArrayData, MapData, SQLOrderingUtil} import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS -import org.apache.spark.sql.catalyst.util.DateTimeUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform @@ -1555,8 +1554,8 @@ object CodeGenerator extends Logging { } /** - * Generates code creating a [[UnsafeArrayData]] or [[GenericArrayData]] based on - * given parameters. + * Generates code creating a [[UnsafeArrayData]] or + * [[org.apache.spark.sql.catalyst.util.GenericArrayData]] based on given parameters. 
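The interpreted projections above only engage this machinery when `SQLConf.get.subexpressionEliminationEnabled` is set. A hedged, self-contained sketch of the proxy flow against the new class; the column layout and expressions are invented for illustration:

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.expressions._
    import org.apache.spark.sql.types.IntegerType

    val runtime = new SubExprEvaluationRuntime(cacheMaxEntries = 100)
    val a = BoundReference(0, IntegerType, nullable = false)
    val b = BoundReference(1, IntegerType, nullable = false)

    // `a + b` appears in both expressions, so both occurrences are wrapped in one ExpressionProxy.
    val proxied = runtime.proxyExpressions(Seq(Add(a, b), Multiply(Add(a, b), Literal(2))))

    val row = InternalRow(1, 2)
    runtime.setInput(row)        // switch to the new input row and invalidate the cache
    proxied.map(_.eval(row))     // Add(a, b) is evaluated once; the second use is served from the cache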
* * @param arrayName name of the array to create * @param elementType data type of the elements in source array diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala index 7404030b661c8..c246d07f189b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratePredicate.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.expressions.codegen -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala index 070570d8f20b2..27b1f89f70870 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala @@ -17,12 +17,8 @@ package org.apache.spark.sql.catalyst.expressions.codegen -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} import org.apache.spark.sql.types.StructType -import org.apache.spark.unsafe.Platform abstract class UnsafeRowJoiner { def join(row1: UnsafeRow, row2: UnsafeRow): UnsafeRow diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index ee98ebf5a8a50..0765bfdd78fa6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1911,7 +1911,9 @@ case class ArrayPosition(left: Expression, right: Expression) If `spark.sql.ansi.enabled` is set to true, it throws ArrayIndexOutOfBoundsException for invalid indices. - _FUNC_(map, key) - Returns value for given key, or NULL if the key is not contained in the map + _FUNC_(map, key) - Returns value for given key. The function returns NULL + if the key is not contained in the map and `spark.sql.ansi.enabled` is set to false. + If `spark.sql.ansi.enabled` is set to true, it throws NoSuchElementException instead. 
""", examples = """ Examples: @@ -1931,6 +1933,9 @@ case class ElementAt( @transient private lazy val mapKeyType = left.dataType.asInstanceOf[MapType].keyType + @transient private lazy val mapValueContainsNull = + left.dataType.asInstanceOf[MapType].valueContainsNull + @transient private lazy val arrayContainsNull = left.dataType.asInstanceOf[ArrayType].containsNull @transient private lazy val ordering: Ordering[Any] = TypeUtils.getInterpretedOrdering(mapKeyType) @@ -1989,7 +1994,7 @@ case class ElementAt( override def nullable: Boolean = left.dataType match { case _: ArrayType => computeNullabilityFromArray(left, right, failOnError, nullability) - case _: MapType => true + case _: MapType => if (failOnError) mapValueContainsNull else true } override def nullSafeEval(value: Any, ordinal: Any): Any = doElementAt(value, ordinal) @@ -2022,7 +2027,7 @@ case class ElementAt( } } case _: MapType => - (value, ordinal) => getValueEval(value, ordinal, mapKeyType, ordering) + (value, ordinal) => getValueEval(value, ordinal, mapKeyType, ordering, failOnError) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -2069,7 +2074,7 @@ case class ElementAt( """.stripMargin }) case _: MapType => - doGetValueGenCode(ctx, ev, left.dataType.asInstanceOf[MapType]) + doGetValueGenCode(ctx, ev, left.dataType.asInstanceOf[MapType], failOnError) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala index 363d388692c9f..767650d022200 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeExtractors.scala @@ -336,7 +336,12 @@ trait GetArrayItemUtil { trait GetMapValueUtil extends BinaryExpression with ImplicitCastInputTypes { // todo: current search is O(n), improve it. 
- def getValueEval(value: Any, ordinal: Any, keyType: DataType, ordering: Ordering[Any]): Any = { + def getValueEval( + value: Any, + ordinal: Any, + keyType: DataType, + ordering: Ordering[Any], + failOnError: Boolean): Any = { val map = value.asInstanceOf[MapData] val length = map.numElements() val keys = map.keyArray() @@ -352,14 +357,24 @@ trait GetMapValueUtil extends BinaryExpression with ImplicitCastInputTypes { } } - if (!found || values.isNullAt(i)) { + if (!found) { + if (failOnError) { + throw new NoSuchElementException(s"Key $ordinal does not exist.") + } else { + null + } + } else if (values.isNullAt(i)) { null } else { values.get(i, dataType) } } - def doGetValueGenCode(ctx: CodegenContext, ev: ExprCode, mapType: MapType): ExprCode = { + def doGetValueGenCode( + ctx: CodegenContext, + ev: ExprCode, + mapType: MapType, + failOnError: Boolean): ExprCode = { val index = ctx.freshName("index") val length = ctx.freshName("length") val keys = ctx.freshName("keys") @@ -368,12 +383,22 @@ trait GetMapValueUtil extends BinaryExpression with ImplicitCastInputTypes { val values = ctx.freshName("values") val keyType = mapType.keyType val nullCheck = if (mapType.valueContainsNull) { - s" || $values.isNullAt($index)" + s"""else if ($values.isNullAt($index)) { + ${ev.isNull} = true; + } + """ } else { "" } + val keyJavaType = CodeGenerator.javaType(keyType) nullSafeCodeGen(ctx, ev, (eval1, eval2) => { + val keyNotFoundBranch = if (failOnError) { + s"""throw new NoSuchElementException("Key " + $eval2 + " does not exist.");""" + } else { + s"${ev.isNull} = true;" + } + s""" final int $length = $eval1.numElements(); final ArrayData $keys = $eval1.keyArray(); @@ -390,9 +415,9 @@ trait GetMapValueUtil extends BinaryExpression with ImplicitCastInputTypes { } } - if (!$found$nullCheck) { - ${ev.isNull} = true; - } else { + if (!$found) { + $keyNotFoundBranch + } $nullCheck else { ${ev.value} = ${CodeGenerator.getValue(values, dataType, index)}; } """ @@ -405,9 +430,14 @@ trait GetMapValueUtil extends BinaryExpression with ImplicitCastInputTypes { * * We need to do type checking here as `key` expression maybe unresolved. */ -case class GetMapValue(child: Expression, key: Expression) +case class GetMapValue( + child: Expression, + key: Expression, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends GetMapValueUtil with ExtractValue with NullIntolerant { + def this(child: Expression, key: Expression) = this(child, key, SQLConf.get.ansiEnabled) + @transient private lazy val ordering: Ordering[Any] = TypeUtils.getInterpretedOrdering(keyType) @@ -442,10 +472,10 @@ case class GetMapValue(child: Expression, key: Expression) // todo: current search is O(n), improve it. 
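A hedged sketch of the behaviour change at the expression level, using literal-only input so that `eval` needs no input row:

    import org.apache.spark.sql.catalyst.expressions.{GetMapValue, Literal}
    import org.apache.spark.sql.types._

    val m = Literal.create(Map("a" -> 1), MapType(StringType, IntegerType))

    GetMapValue(m, Literal("b"), failOnError = false).eval()  // null (legacy behaviour)
    GetMapValue(m, Literal("b"), failOnError = true).eval()   // throws NoSuchElementException: Key b does not exist.
    GetMapValue(m, Literal("a"), failOnError = true).eval()   // 1, lookups of existing keys are unaffected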
override def nullSafeEval(value: Any, ordinal: Any): Any = { - getValueEval(value, ordinal, keyType, ordering) + getValueEval(value, ordinal, keyType, ordering, failOnError) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - doGetValueGenCode(ctx, ev, child.dataType.asInstanceOf[MapType]) + doGetValueGenCode(ctx, ev, child.dataType.asInstanceOf[MapType], failOnError) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index 9fef8e9415e72..4454afb6c099b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicReference import scala.collection.mutable import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion, UnresolvedAttribute, UnresolvedException} +import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion, UnresolvedException} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index f440534745ba1..53d6394d0d1f1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -18,14 +18,11 @@ package org.apache.spark.sql.catalyst.expressions import scala.collection.immutable.TreeSet -import scala.collection.mutable import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReference -import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LeafNode, LogicalPlan, Project} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index c9dd7c7acddde..b4d9921488d5f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -20,10 +20,12 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Locale import java.util.regex.{Matcher, MatchResult, Pattern} +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.commons.text.StringEscapeUtils +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen._ @@ -178,6 
+180,88 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) } } +/** + * Optimized version of LIKE ALL, when all pattern values are literal. + */ +abstract class LikeAllBase extends UnaryExpression with ImplicitCastInputTypes with NullIntolerant { + + protected def patterns: Seq[UTF8String] + + protected def isNotLikeAll: Boolean + + override def inputTypes: Seq[DataType] = StringType :: Nil + + override def dataType: DataType = BooleanType + + override def nullable: Boolean = true + + private lazy val hasNull: Boolean = patterns.contains(null) + + private lazy val cache = patterns.filterNot(_ == null) + .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) + + private lazy val matchFunc = if (isNotLikeAll) { + (p: Pattern, inputValue: String) => !p.matcher(inputValue).matches() + } else { + (p: Pattern, inputValue: String) => p.matcher(inputValue).matches() + } + + override def eval(input: InternalRow): Any = { + val exprValue = child.eval(input) + if (exprValue == null) { + null + } else { + if (cache.forall(matchFunc(_, exprValue.toString))) { + if (hasNull) null else true + } else { + false + } + } + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val eval = child.genCode(ctx) + val patternClass = classOf[Pattern].getName + val javaDataType = CodeGenerator.javaType(child.dataType) + val pattern = ctx.freshName("pattern") + val valueArg = ctx.freshName("valueArg") + val patternCache = ctx.addReferenceObj("patternCache", cache.asJava) + + val checkNotMatchCode = if (isNotLikeAll) { + s"$pattern.matcher($valueArg.toString()).matches()" + } else { + s"!$pattern.matcher($valueArg.toString()).matches()" + } + + ev.copy(code = + code""" + |${eval.code} + |boolean ${ev.isNull} = false; + |boolean ${ev.value} = true; + |if (${eval.isNull}) { + | ${ev.isNull} = true; + |} else { + | $javaDataType $valueArg = ${eval.value}; + | for ($patternClass $pattern: $patternCache) { + | if ($checkNotMatchCode) { + | ${ev.value} = false; + | break; + | } + | } + | if (${ev.value} && $hasNull) ${ev.isNull} = true; + |} + """.stripMargin) + } +} + +case class LikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { + override def isNotLikeAll: Boolean = false +} + +case class NotLikeAll(child: Expression, patterns: Seq[UTF8String]) extends LikeAllBase { + override def isNotLikeAll: Boolean = true +} + // scalastyle:off line.contains.tab @ExpressionDescription( usage = "str _FUNC_ regexp - Returns true if `str` matches `regexp`, or false otherwise.", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonFilters.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonFilters.scala index d6adbe83584e3..0d5974af19ac3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonFilters.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonFilters.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.json import org.apache.spark.sql.catalyst.{InternalRow, StructFilters} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources import org.apache.spark.sql.types.StructType diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala index 7a21ce254a235..0ff11ca49f3d1 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.types.StructType /** * Simplify redundant [[CreateNamedStruct]], [[CreateArray]] and [[CreateMap]] expressions. @@ -71,7 +70,7 @@ object SimplifyExtractValueOps extends Rule[LogicalPlan] { // out of bounds, mimic the runtime behavior and return null Literal(null, ga.dataType) } - case GetMapValue(CreateMap(elems, _), key) => CaseKeyWhen(key, elems) + case GetMapValue(CreateMap(elems, _), key, _) => CaseKeyWhen(key, elems) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala index bfc36ec477a73..4434c29cbb3c4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala @@ -17,10 +17,10 @@ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.expressions.{Alias, And, ArrayTransform, CaseWhen, Coalesce, CreateArray, CreateMap, CreateNamedStruct, CreateStruct, EqualTo, ExpectsInputTypes, Expression, GetStructField, If, IsNull, KnownFloatingPointNormalized, LambdaFunction, Literal, NamedLambdaVariable, UnaryExpression} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, ArrayTransform, CaseWhen, Coalesce, CreateArray, CreateMap, CreateNamedStruct, EqualTo, ExpectsInputTypes, Expression, GetStructField, If, IsNull, KnownFloatingPointNormalized, LambdaFunction, Literal, NamedLambdaVariable, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery, Window} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.types._ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index e492d01650097..c4b9936fa4c4f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -85,7 +85,7 @@ abstract class Optimizer(catalogManager: CatalogManager) OptimizeWindowFunctions, CollapseWindow, CombineFilters, - CombineLimits, + EliminateLimits, CombineUnions, // Constant folding and strength reduction TransposeWindow, @@ -377,9 +377,8 @@ object SimpleTestOptimizer extends SimpleTestOptimizer class SimpleTestOptimizer extends Optimizer( new CatalogManager( - new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true), FakeV2SessionCatalog, - new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, new SQLConf()))) + new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry))) /** * Remove redundant aliases from a query plan. 
A redundant alias is an alias that does not change @@ -1452,11 +1451,20 @@ object PushPredicateThroughJoin extends Rule[LogicalPlan] with PredicateHelper { } /** - * Combines two adjacent [[Limit]] operators into one, merging the - * expressions into one single expression. + * This rule optimizes Limit operators by: + * 1. Eliminate [[Limit]] operators if it's child max row <= limit. + * 2. Combines two adjacent [[Limit]] operators into one, merging the + * expressions into one single expression. */ -object CombineLimits extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transform { +object EliminateLimits extends Rule[LogicalPlan] { + private def canEliminate(limitExpr: Expression, child: LogicalPlan): Boolean = { + limitExpr.foldable && child.maxRows.exists { _ <= limitExpr.eval().asInstanceOf[Int] } + } + + def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { + case Limit(l, child) if canEliminate(l, child) => + child + case GlobalLimit(le, GlobalLimit(ne, grandChild)) => GlobalLimit(Least(Seq(ne, le)), grandChild) case LocalLimit(le, LocalLimit(ne, grandChild)) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala index 2627202c09c45..15d4561b47a23 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.internal.SQLConf /** * Collapse plans consisting empty local relations generated by [[PruneFilters]]. 
@@ -47,8 +46,6 @@ object PropagateEmptyRelation extends Rule[LogicalPlan] with PredicateHelper wit private def nullValueProjectList(plan: LogicalPlan): Seq[NamedExpression] = plan.output.map{ a => Alias(cast(Literal(null), a.dataType), a.name)(a.exprId) } - override def conf: SQLConf = SQLConf.get - def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case p: Union if p.children.exists(isEmptyLocalRelation) => val newChildren = p.children.filterNot(isEmptyLocalRelation) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala index 33b398e11cde9..ef3de4738c75c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceNullWithFalseInPredicate.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions.{LambdaFunction, Literal, MapFi import org.apache.spark.sql.catalyst.expressions.Literal.FalseLiteral import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.BooleanType import org.apache.spark.util.Utils diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala index 2aa762e2595ad..b65fc7f7e2bde 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/StarSchemaDetection.scala @@ -19,18 +19,16 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.internal.SQLConf /** * Encapsulates star-schema detection logic. 
*/ -object StarSchemaDetection extends PredicateHelper { - - private def conf = SQLConf.get +object StarSchemaDetection extends PredicateHelper with SQLConfHelper { /** * Star schema consists of one or more fact tables referencing a number of dimension diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala index 76b9bd03f216c..9aa7e3201ab1b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/finishAnalysis.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.catalyst.optimizer -import java.time.LocalDate - import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions._ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index cb076f6e35184..11532d22204a4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.analysis.CleanupAliases import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ import org.apache.spark.sql.catalyst.expressions.aggregate._ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index a5b8c118d6c54..23de8ab09dd0a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -28,7 +28,7 @@ import org.antlr.v4.runtime.tree.{ParseTree, RuleNode, TerminalNode} import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableIdentifier} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, FunctionResource, FunctionResourceType} import org.apache.spark.sql.catalyst.expressions._ @@ -51,11 +51,9 @@ import org.apache.spark.util.random.RandomSampler * The AstBuilder converts an ANTLR4 ParseTree into a catalyst Expression, LogicalPlan or * TableIdentifier. 
*/ -class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { +class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logging { import ParserUtils._ - protected def conf: SQLConf = SQLConf.get - protected def typedVisit[T](ctx: ParseTree): T = { ctx.accept(this).asInstanceOf[T] } @@ -1408,7 +1406,20 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging { case Some(SqlBaseParser.ANY) | Some(SqlBaseParser.SOME) => getLikeQuantifierExprs(ctx.expression).reduceLeft(Or) case Some(SqlBaseParser.ALL) => - getLikeQuantifierExprs(ctx.expression).reduceLeft(And) + validate(!ctx.expression.isEmpty, "Expected something between '(' and ')'.", ctx) + val expressions = ctx.expression.asScala.map(expression) + if (expressions.size > SQLConf.get.optimizerLikeAllConversionThreshold && + expressions.forall(_.foldable) && expressions.forall(_.dataType == StringType)) { + // If there are many pattern expressions, will throw StackOverflowError. + // So we use LikeAll or NotLikeAll instead. + val patterns = expressions.map(_.eval(EmptyRow).asInstanceOf[UTF8String]) + ctx.NOT match { + case null => LikeAll(e, patterns.toSeq) + case _ => NotLikeAll(e, patterns.toSeq) + } + } else { + getLikeQuantifierExprs(ctx.expression).reduceLeft(And) + } case _ => val escapeChar = Option(ctx.escapeChar).map(string).map { str => if (str.length != 1) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala index 73a58f79ff132..ac3fbbf6b0512 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParseDriver.scala @@ -23,19 +23,16 @@ import org.antlr.v4.runtime.tree.TerminalNodeImpl import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.Origin -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} /** * Base SQL parsing infrastructure. */ -abstract class AbstractSqlParser extends ParserInterface with Logging { - - protected def conf: SQLConf = SQLConf.get +abstract class AbstractSqlParser extends ParserInterface with SQLConfHelper with Logging { /** Creates/Resolves DataType for a given SQL string. 
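At the SQL surface this parser change is semantically transparent; it only changes the plan shape once the number of foldable string patterns exceeds the configured conversion threshold. A hedged example, where `spark` is a SparkSession and `logs`/`message` are hypothetical names:

    // With enough literal patterns, the parser emits a single LikeAll(message, patterns)
    // instead of And(Like(message, p1), And(Like(message, p2), ...)), avoiding very deep
    // expression trees and the StackOverflowError mentioned above.
    spark.sql("""
      SELECT *
      FROM logs
      WHERE message LIKE ALL ('%error%', '%timeout%', '%retry%')
    """)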
*/ override def parseDataType(sqlText: String): DataType = parse(sqlText) { parser => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala index b1884eac27f73..864ca4f57483d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.plans import scala.collection.mutable import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, TreeNode, TreeNodeTag} import org.apache.spark.sql.internal.SQLConf @@ -35,15 +36,10 @@ import org.apache.spark.sql.types.{DataType, StructType} * The tree traverse APIs like `transform`, `foreach`, `collect`, etc. that are * inherited from `TreeNode`, do not traverse into query plans inside subqueries. */ -abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanType] { +abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] + extends TreeNode[PlanType] with SQLConfHelper { self: PlanType => - /** - * The active config object within the current scope. - * See [[SQLConf.get]] for more information. - */ - def conf: SQLConf = SQLConf.get - def output: Seq[Attribute] /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala index d8d18b46bcc74..2c6a716a2ed48 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/AnalysisHelper.scala @@ -17,10 +17,9 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.analysis.CheckAnalysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, TreeNode} +import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.util.Utils @@ -33,7 +32,7 @@ import org.apache.spark.util.Utils * analyzed flag set to true. * * The analyzer rules should use the various resolve methods, in lieu of the various transform - * methods defined in [[TreeNode]] and [[QueryPlan]]. + * methods defined in [[org.apache.spark.sql.catalyst.trees.TreeNode]] and [[QueryPlan]]. * * To prevent accidental use of the transform methods, this trait also overrides the transform * methods to throw exceptions in test mode, if they are used in the analyzer. @@ -44,7 +43,8 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => /** * Recursively marks all nodes in this plan tree as analyzed. - * This should only be called by [[CheckAnalysis]]. + * This should only be called by + * [[org.apache.spark.sql.catalyst.analysis.CheckAnalysis]]. */ private[catalyst] def setAnalyzed(): Unit = { if (!_analyzed) { @@ -155,7 +155,7 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => * In analyzer, use [[resolveOperatorsDown()]] instead. If this is used in the analyzer, * an exception will be thrown in test mode. It is however OK to call this function within * the scope of a [[resolveOperatorsDown()]] call. 
- * @see [[TreeNode.transformDown()]]. + * @see [[org.apache.spark.sql.catalyst.trees.TreeNode.transformDown()]]. */ override def transformDown(rule: PartialFunction[LogicalPlan, LogicalPlan]): LogicalPlan = { assertNotAnalysisRule() @@ -164,7 +164,7 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan => /** * Use [[resolveOperators()]] in the analyzer. - * @see [[TreeNode.transformUp()]] + * @see [[org.apache.spark.sql.catalyst.trees.TreeNode.transformUp()]] */ override def transformUp(rule: PartialFunction[LogicalPlan, LogicalPlan]): LogicalPlan = { assertNotAnalysisRule() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 48dfc5fd57e63..ad5c3fd74e9b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -33,6 +33,9 @@ abstract class LogicalPlan with QueryPlanConstraints with Logging { + /** Metadata fields that can be projected from this node */ + def metadataOutput: Seq[Attribute] = children.flatMap(_.metadataOutput) + /** Returns true if this subtree has data from a streaming data source. */ def isStreaming: Boolean = children.exists(_.isStreaming) @@ -86,7 +89,8 @@ abstract class LogicalPlan } } - private[this] lazy val childAttributes = AttributeSeq(children.flatMap(_.output)) + private[this] lazy val childAttributes = + AttributeSeq(children.flatMap(c => c.output ++ c.metadataOutput)) private[this] lazy val outputAttributes = AttributeSeq(output) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala index 49f89bed154bb..1346f80247a1f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/Statistics.scala @@ -20,18 +20,10 @@ package org.apache.spark.sql.catalyst.plans.logical import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import java.math.{MathContext, RoundingMode} -import scala.util.control.NonFatal - import net.jpountz.lz4.{LZ4BlockInputStream, LZ4BlockOutputStream} -import org.apache.spark.internal.Logging -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.CatalogColumnStat import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 17bf704c6d67a..f96e07863fa69 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -17,17 +17,14 @@ package org.apache.spark.sql.catalyst.plans.logical -import scala.collection.mutable - import 
org.apache.spark.sql.catalyst.AliasIdentifier -import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation} +import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning} import org.apache.spark.sql.catalyst.util.truncatedString -import org.apache.spark.sql.connector.catalog.Identifier import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.random.RandomSampler @@ -886,6 +883,12 @@ case class SubqueryAlias( val qualifierList = identifier.qualifier :+ alias child.output.map(_.withQualifier(qualifierList)) } + + override def metadataOutput: Seq[Attribute] = { + val qualifierList = identifier.qualifier :+ alias + child.metadataOutput.map(_.withQualifier(qualifierList)) + } + override def doCanonicalize(): LogicalPlan = child.canonicalized } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala index a325b61fcc5a9..4b5e278fccdfb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.util.Utils /** * A general hint for the child that is not yet resolved. 
This node is generated by the parser and diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala index 6925423f003ba..8e58c4f314df0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/ProjectEstimation.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeMap} +import org.apache.spark.sql.catalyst.expressions.AttributeMap import org.apache.spark.sql.catalyst.plans.logical.{Project, Statistics} object ProjectEstimation { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala index a774217ecc832..4ef71bbc7c098 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala @@ -18,10 +18,10 @@ package org.apache.spark.sql.catalyst.rules import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.trees.TreeNode -import org.apache.spark.sql.internal.SQLConf -abstract class Rule[TreeType <: TreeNode[_]] extends Logging { +abstract class Rule[TreeType <: TreeNode[_]] extends SQLConfHelper with Logging { /** Name for this rule, automatically inferred based on class name. */ val ruleName: String = { @@ -30,6 +30,4 @@ abstract class Rule[TreeType <: TreeNode[_]] extends Logging { } def apply(plan: TreeType): TreeType - - def conf: SQLConf = SQLConf.get } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 1ab7bbdcff697..ff2b366a9bc75 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning} import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat import org.apache.spark.sql.catalyst.util.truncatedString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayData.scala index ebbf241088f80..44203316edd94 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayData.scala @@ -22,7 +22,6 @@ import scala.reflect.ClassTag import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, UnsafeArrayData} import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.array.ByteArrayMethods object ArrayData { diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala index 1a78422e57a4c..46860ae1771de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.util -import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneId} +import java.time.{LocalDate, LocalDateTime, LocalTime} import java.time.temporal.ChronoField import java.util.{Calendar, TimeZone} import java.util.Calendar.{DAY_OF_MONTH, DST_OFFSET, ERA, HOUR_OF_DAY, MINUTE, MONTH, SECOND, YEAR, ZONE_OFFSET} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala index fc2ab99a3da8c..0779bf53fe446 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/catalog/CatalogManager.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.connector.catalog import scala.collection.mutable import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.internal.SQLConf @@ -37,9 +38,8 @@ import org.apache.spark.sql.internal.SQLConf // need to track current database at all. private[sql] class CatalogManager( - conf: SQLConf, defaultSessionCatalog: CatalogPlugin, - val v1SessionCatalog: SessionCatalog) extends Logging { + val v1SessionCatalog: SessionCatalog) extends SQLConfHelper with Logging { import CatalogManager.SESSION_CATALOG_NAME import CatalogV2Util._ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala index dfacf6e83ef57..8d91ea7c50cde 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Implicits.scala @@ -21,7 +21,9 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{PartitionSpec, ResolvedPartitionSpec, UnresolvedPartitionSpec} -import org.apache.spark.sql.connector.catalog.{SupportsAtomicPartitionManagement, SupportsDelete, SupportsPartitionManagement, SupportsRead, SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.catalyst.expressions.AttributeReference +import org.apache.spark.sql.connector.catalog.{MetadataColumn, SupportsAtomicPartitionManagement, SupportsDelete, SupportsPartitionManagement, SupportsRead, SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap object DataSourceV2Implicits { @@ -78,6 +80,18 @@ object DataSourceV2Implicits { def supportsAny(capabilities: TableCapability*): Boolean = capabilities.exists(supports) } + implicit class MetadataColumnsHelper(metadata: Array[MetadataColumn]) { + def asStruct: StructType = { + val fields = metadata.map { metaCol => + val field = 
StructField(metaCol.name, metaCol.dataType, metaCol.isNullable) + Option(metaCol.comment).map(field.withComment).getOrElse(field) + } + StructType(fields) + } + + def toAttributes: Seq[AttributeReference] = asStruct.toAttributes + } + implicit class OptionsHelper(options: Map[String, String]) { def asOptions: CaseInsensitiveStringMap = { new CaseInsensitiveStringMap(options.asJava) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 45d89498f5ae9..f541411daeff4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -21,10 +21,10 @@ import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelat import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.catalyst.util.truncatedString -import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, Table, TableCapability} -import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, Statistics => V2Statistics, SupportsReportStatistics} +import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, MetadataColumn, SupportsMetadataColumns, Table, TableCapability} +import org.apache.spark.sql.connector.read.{Scan, Statistics => V2Statistics, SupportsReportStatistics} import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream} -import org.apache.spark.sql.connector.write.WriteBuilder +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.Utils @@ -35,8 +35,9 @@ import org.apache.spark.util.Utils * @param output the output attributes of this relation. * @param catalog catalogPlugin for the table. None if no catalog is specified. * @param identifier the identifier for the table. None if no identifier is defined. - * @param options The options for this table operation. It's used to create fresh [[ScanBuilder]] - * and [[WriteBuilder]]. + * @param options The options for this table operation. It's used to create fresh + * [[org.apache.spark.sql.connector.read.ScanBuilder]] and + * [[org.apache.spark.sql.connector.write.WriteBuilder]]. */ case class DataSourceV2Relation( table: Table, @@ -48,6 +49,21 @@ case class DataSourceV2Relation( import DataSourceV2Implicits._ + override lazy val metadataOutput: Seq[AttributeReference] = table match { + case hasMeta: SupportsMetadataColumns => + val resolve = SQLConf.get.resolver + val outputNames = outputSet.map(_.name) + def isOutputColumn(col: MetadataColumn): Boolean = { + outputNames.exists(name => resolve(col.name, name)) + } + // filter out metadata columns that have names conflicting with output columns. if the table + // has a column "line" and the table can produce a metadata column called "line", then the + // data column should be returned, not the metadata column. 
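Editorial aside: a small, hedged sketch of what the `asStruct` helper introduced above builds from a table's metadata columns. `MetaCol` is a local stand-in for the connector's `MetadataColumn` interface (only the accessors used above), and the column names are purely illustrative.

```scala
import org.apache.spark.sql.types._

// Local stand-in for MetadataColumn, exposing just the accessors used above.
case class MetaCol(name: String, dataType: DataType, isNullable: Boolean, comment: String)

val metadata = Array(
  MetaCol("_partition", StringType, isNullable = false, comment = "source partition"),
  MetaCol("_row_index", LongType, isNullable = false, comment = null))

// Mirrors MetadataColumnsHelper.asStruct: one StructField per metadata column,
// attaching the comment when one is present.
val asStruct: StructType = StructType(metadata.map { metaCol =>
  val field = StructField(metaCol.name, metaCol.dataType, metaCol.isNullable)
  Option(metaCol.comment).map(field.withComment).getOrElse(field)
})

// asStruct.fieldNames == Array("_partition", "_row_index"). As the comment in
// DataSourceV2Relation.metadataOutput notes, any of these whose name collides
// with a regular output column is filtered out, so the data column wins.
```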
+ hasMeta.metadataColumns.filterNot(isOutputColumn).toAttributes + case _ => + Nil + } + override def name: String = table.name() override def skipSchemaResolution: Boolean = table.supports(TableCapability.ACCEPT_ANY_SCHEMA) @@ -78,6 +94,14 @@ case class DataSourceV2Relation( override def newInstance(): DataSourceV2Relation = { copy(output = output.map(_.newInstance())) } + + def withMetadataColumns(): DataSourceV2Relation = { + if (metadataOutput.nonEmpty) { + DataSourceV2Relation(table, output ++ metadataOutput, catalog, identifier, options) + } else { + this + } + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index f2e309013a5b6..fcf222c8fdab0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -216,6 +216,18 @@ object SQLConf { "for using switch statements in InSet must be non-negative and less than or equal to 600") .createWithDefault(400) + val OPTIMIZER_LIKE_ALL_CONVERSION_THRESHOLD = + buildConf("spark.sql.optimizer.likeAllConversionThreshold") + .internal() + .doc("Configure the maximum size of the pattern sequence in like all. Spark will convert " + + "the logical combination of like to avoid StackOverflowError. 200 is an empirical value " + + "that will not cause StackOverflowError.") + .version("3.1.0") + .intConf + .checkValue(threshold => threshold >= 0, "The maximum size of pattern sequence " + + "in like all must be non-negative") + .createWithDefault(200) + val PLAN_CHANGE_LOG_LEVEL = buildConf("spark.sql.planChangeLog.level") .internal() .doc("Configures the log level for logging the change from the original plan to the new " + @@ -539,6 +551,15 @@ object SQLConf { .booleanConf .createWithDefault(true) + val SUBEXPRESSION_ELIMINATION_CACHE_MAX_ENTRIES = + buildConf("spark.sql.subexpressionElimination.cache.maxEntries") + .internal() + .doc("The maximum entries of the cache used for interpreted subexpression elimination.") + .version("3.1.0") + .intConf + .checkValue(_ >= 0, "The maximum must not be negative") + .createWithDefault(100) + val CASE_SENSITIVE = buildConf("spark.sql.caseSensitive") .internal() .doc("Whether the query analyzer should be case sensitive or not. " + @@ -815,6 +836,18 @@ object SQLConf { .booleanConf .createWithDefault(true) + val HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD = + buildConf("spark.sql.hive.metastorePartitionPruningInSetThreshold") + .doc("The threshold of set size for InSet predicate when pruning partitions through Hive " + + "Metastore. When the set size exceeds the threshold, we rewrite the InSet predicate " + + "to be greater than or equal to the minimum value in set and less than or equal to the " + + "maximum value in set. Larger values may cause Hive Metastore stack overflow.") + .version("3.1.0") + .internal() + .intConf + .checkValue(_ > 0, "The value of metastorePartitionPruningInSetThreshold must be positive") + .createWithDefault(1000) + val HIVE_MANAGE_FILESOURCE_PARTITIONS = buildConf("spark.sql.hive.manageFilesourcePartitions") .doc("When true, enable metastore partition management for file source tables as well. 
" + @@ -1256,7 +1289,7 @@ object SQLConf { val REMOVE_REDUNDANT_SORTS_ENABLED = buildConf("spark.sql.execution.removeRedundantSorts") .internal() .doc("Whether to remove redundant physical sort node") - .version("3.1.0") + .version("2.4.8") .booleanConf .createWithDefault(true) @@ -1882,7 +1915,7 @@ object SQLConf { "1. pyspark.sql.DataFrame.toPandas " + "2. pyspark.sql.SparkSession.createDataFrame when its input is a Pandas DataFrame " + "The following data types are unsupported: " + - "MapType, ArrayType of TimestampType, and nested StructType.") + "ArrayType of TimestampType, and nested StructType.") .version("3.0.0") .fallbackConf(ARROW_EXECUTION_ENABLED) @@ -1942,6 +1975,16 @@ object SQLConf { .version("3.0.0") .fallbackConf(BUFFER_SIZE) + val PYSPARK_SIMPLIFIEID_TRACEBACK = + buildConf("spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled") + .doc( + "When true, the traceback from Python UDFs is simplified. It hides " + + "the Python worker, (de)serialization, etc from PySpark in tracebacks, and only " + + "shows the exception messages from UDFs. Note that this works only with CPython 3.7+.") + .version("3.1.0") + .booleanConf + .createWithDefault(false) + val PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME = buildConf("spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName") .internal() @@ -3006,6 +3049,8 @@ class SQLConf extends Serializable with Logging { def optimizerInSetSwitchThreshold: Int = getConf(OPTIMIZER_INSET_SWITCH_THRESHOLD) + def optimizerLikeAllConversionThreshold: Int = getConf(OPTIMIZER_LIKE_ALL_CONVERSION_THRESHOLD) + def planChangeLogLevel: String = getConf(PLAN_CHANGE_LOG_LEVEL) def planChangeRules: Option[String] = getConf(PLAN_CHANGE_LOG_RULES) @@ -3142,6 +3187,9 @@ class SQLConf extends Serializable with Logging { def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) + def metastorePartitionPruningInSetThreshold: Int = + getConf(HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD) + def manageFilesourcePartitions: Boolean = getConf(HIVE_MANAGE_FILESOURCE_PARTITIONS) def filesourcePartitionFileCacheSize: Long = getConf(HIVE_FILESOURCE_PARTITION_FILE_CACHE_SIZE) @@ -3233,6 +3281,9 @@ class SQLConf extends Serializable with Logging { def subexpressionEliminationEnabled: Boolean = getConf(SUBEXPRESSION_ELIMINATION_ENABLED) + def subexpressionEliminationCacheMaxEntries: Int = + getConf(SUBEXPRESSION_ELIMINATION_CACHE_MAX_ENTRIES) + def autoBroadcastJoinThreshold: Long = getConf(AUTO_BROADCASTJOIN_THRESHOLD) def limitScaleUpFactor: Int = getConf(LIMIT_SCALE_UP_FACTOR) @@ -3405,6 +3456,8 @@ class SQLConf extends Serializable with Logging { def pandasUDFBufferSize: Int = getConf(PANDAS_UDF_BUFFER_SIZE) + def pysparkSimplifiedTraceback: Boolean = getConf(PYSPARK_SIMPLIFIEID_TRACEBACK) + def pandasGroupedMapAssignColumnsByName: Boolean = getConf(SQLConf.PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala index 043c88f88843c..7556a19f0d316 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -31,7 +31,7 @@ import org.apache.spark.annotation.Stable import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.{Cast, Expression} -import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, 
ParseException} +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.DataTypeJsonUtils.{DataTypeJsonDeserializer, DataTypeJsonSerializer} import org.apache.spark.sql.catalyst.util.StringUtils.StringConcat import org.apache.spark.sql.internal.SQLConf diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 6be6d81ec3bb7..960e174f9c368 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.types -import java.lang.{Long => JLong} import java.math.{BigDecimal => JavaBigDecimal, BigInteger, MathContext, RoundingMode} import scala.util.Try diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowJsonSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowJsonSuite.scala index ac18b0f79b5f3..1962fca66c059 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RowJsonSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RowJsonSuite.scala @@ -17,14 +17,13 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} -import java.time.{Instant, LocalDate} +import java.time.LocalDate import org.json4s.JsonAST.{JArray, JBool, JDecimal, JDouble, JLong, JNull, JObject, JString, JValue} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.encoders.{ExamplePoint, ExamplePointUDT} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index e8c7aed6d72ce..164bbd7f34d04 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -134,7 +134,6 @@ object ScroogeLikeExample { } trait ScroogeLikeExample extends Product1[Int] with Serializable { - import ScroogeLikeExample._ def x: Int diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExternalCatalogSuite.scala index 3dd38091051d8..df99cd851cc3e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisExternalCatalogSuite.scala @@ -27,13 +27,11 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable, CatalogTableType, ExternalCatalog, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class AnalysisExternalCatalogSuite extends AnalysisTest with Matchers { private def getAnalyzer(externCatalog: ExternalCatalog, databasePath: File): Analyzer = { - val conf = new SQLConf() - val catalog = new SessionCatalog(externCatalog, FunctionRegistry.builtin, conf) + val catalog = new 
SessionCatalog(externCatalog, FunctionRegistry.builtin) catalog.createDatabase( CatalogDatabase("default", "", databasePath.toURI, Map.empty), ignoreIfExists = false) @@ -44,7 +42,7 @@ class AnalysisExternalCatalogSuite extends AnalysisTest with Matchers { CatalogStorageFormat.empty, StructType(Seq(StructField("a", IntegerType, nullable = true)))), ignoreIfExists = false) - new Analyzer(catalog, conf) + new Analyzer(catalog) } test("query builtin functions don't call the external catalog") { @@ -66,7 +64,7 @@ class AnalysisExternalCatalogSuite extends AnalysisTest with Matchers { withTempDir { tempDir => val inMemoryCatalog = new InMemoryCatalog val externCatalog = spy(inMemoryCatalog) - val catalog = new SessionCatalog(externCatalog, FunctionRegistry.builtin, conf) + val catalog = new SessionCatalog(externCatalog, FunctionRegistry.builtin) catalog.createDatabase( CatalogDatabase("default", "", new URI(tempDir.toString), Map.empty), ignoreIfExists = false) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 37dcee1e59ee8..f0a24d4a56048 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.analysis -import java.util.{Locale, TimeZone} +import java.util.TimeZone import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag @@ -771,22 +771,23 @@ class AnalysisSuite extends AnalysisTest with Matchers { // RuleExecutor only throw exception or log warning when the rule is supposed to run // more than once. val maxIterations = 2 - val conf = new SQLConf().copy(SQLConf.ANALYZER_MAX_ITERATIONS -> maxIterations) - val testAnalyzer = new Analyzer( - new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf), conf) + withSQLConf(SQLConf.ANALYZER_MAX_ITERATIONS.key -> maxIterations.toString) { + val testAnalyzer = new Analyzer( + new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin)) - val plan = testRelation2.select( - $"a" / Literal(2) as "div1", - $"a" / $"b" as "div2", - $"a" / $"c" as "div3", - $"a" / $"d" as "div4", - $"e" / $"e" as "div5") + val plan = testRelation2.select( + $"a" / Literal(2) as "div1", + $"a" / $"b" as "div2", + $"a" / $"c" as "div3", + $"a" / $"d" as "div4", + $"e" / $"e" as "div5") - val message = intercept[TreeNodeException[LogicalPlan]] { - testAnalyzer.execute(plan) - }.getMessage - assert(message.startsWith(s"Max iterations ($maxIterations) reached for batch Resolution, " + - s"please set '${SQLConf.ANALYZER_MAX_ITERATIONS.key}' to a larger value.")) + val message = intercept[TreeNodeException[LogicalPlan]] { + testAnalyzer.execute(plan) + }.getMessage + assert(message.startsWith(s"Max iterations ($maxIterations) reached for batch Resolution, " + + s"please set '${SQLConf.ANALYZER_MAX_ITERATIONS.key}' to a larger value.")) + } } test("SPARK-30886 Deprecate two-parameter TRIM/LTRIM/RTRIM") { @@ -802,7 +803,7 @@ class AnalysisSuite extends AnalysisTest with Matchers { withLogAppender(logAppender) { val testAnalyzer1 = new Analyzer( - new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf), conf) + new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin)) val plan1 = testRelation2.select( UnresolvedFunction(f, $"a" :: Nil, isDistinct = false)) @@ -824,7 +825,7 @@ class 
AnalysisSuite extends AnalysisTest with Matchers { // New analyzer from new SessionState val testAnalyzer2 = new Analyzer( - new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf), conf) + new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin)) val plan4 = testRelation2.select( UnresolvedFunction(f, $"c" :: $"d" :: Nil, isDistinct = false)) testAnalyzer2.execute(plan4) @@ -933,9 +934,8 @@ class AnalysisSuite extends AnalysisTest with Matchers { val maxIterations = 2 val maxIterationsEnough = 5 withSQLConf(SQLConf.ANALYZER_MAX_ITERATIONS.key -> maxIterations.toString) { - val conf = SQLConf.get val testAnalyzer = new Analyzer( - new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf), conf) + new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin)) val plan = testRelation2.select( $"a" / Literal(2) as "div1", diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala index 8c14ffffa17a5..37db4be502a83 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala @@ -34,7 +34,7 @@ trait AnalysisTest extends PlanTest { protected def extendedAnalysisRules: Seq[Rule[LogicalPlan]] = Nil protected def getAnalyzer: Analyzer = { - val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf) + val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin) catalog.createDatabase( CatalogDatabase("default", "", new URI("loc"), Map.empty), ignoreIfExists = false) @@ -43,7 +43,7 @@ trait AnalysisTest extends PlanTest { catalog.createTempView("TaBlE3", TestRelations.testRelation3, overrideIfExists = true) catalog.createGlobalTempView("TaBlE4", TestRelations.testRelation4, overrideIfExists = true) catalog.createGlobalTempView("TaBlE5", TestRelations.testRelation5, overrideIfExists = true) - new Analyzer(catalog, conf) { + new Analyzer(catalog) { override val extendedResolutionRules = EliminateSubqueryAliases +: extendedAnalysisRules } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala index f433229595e9e..1c849fa21e4ea 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/CreateTablePartitioningValidationSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, LeafNode} import org.apache.spark.sql.connector.InMemoryTableCatalog import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} -import org.apache.spark.sql.connector.expressions.{Expressions, LogicalExpressions} +import org.apache.spark.sql.connector.expressions.Expressions import org.apache.spark.sql.types.{DoubleType, LongType, StringType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DataSourceV2AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DataSourceV2AnalysisSuite.scala index 
349237c2aa893..67bafbd4a8122 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DataSourceV2AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DataSourceV2AnalysisSuite.scala @@ -223,11 +223,11 @@ abstract class DataSourceV2StrictAnalysisSuite extends DataSourceV2AnalysisBaseS abstract class DataSourceV2AnalysisBaseSuite extends AnalysisTest { override def getAnalyzer: Analyzer = { - val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf) + val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin) catalog.createDatabase( CatalogDatabase("default", "", new URI("loc"), Map.empty), ignoreIfExists = false) - new Analyzer(catalog, conf) { + new Analyzer(catalog) { override val extendedResolutionRules = EliminateSubqueryAliases :: Nil } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index d5991ff10ce6c..9892e62a9ce19 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -24,15 +24,14 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project, Union} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class DecimalPrecisionSuite extends AnalysisTest with BeforeAndAfter { - private val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf) - private val analyzer = new Analyzer(catalog, conf) + private val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry) + private val analyzer = new Analyzer(catalog) private val relation = LocalRelation( AttributeReference("i", IntegerType)(), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/LookupFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/LookupFunctionsSuite.scala index cea0f2a9cbc97..e0f3c9a835b6e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/LookupFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/LookupFunctionsSuite.scala @@ -24,19 +24,17 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, import org.apache.spark.sql.catalyst.expressions.Alias import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.internal.SQLConf class LookupFunctionsSuite extends PlanTest { test("SPARK-23486: the functionExists for the Persistent function check") { val externalCatalog = new CustomInMemoryCatalog - val conf = new SQLConf() - val catalog = new SessionCatalog(externalCatalog, FunctionRegistry.builtin, conf) + val catalog = new SessionCatalog(externalCatalog, FunctionRegistry.builtin) val analyzer = { catalog.createDatabase( CatalogDatabase("default", "", new URI("loc"), Map.empty), ignoreIfExists = false) - new Analyzer(catalog, conf) + new Analyzer(catalog) } def table(ref: String): 
LogicalPlan = UnresolvedRelation(TableIdentifier(ref)) @@ -56,14 +54,13 @@ class LookupFunctionsSuite extends PlanTest { test("SPARK-23486: the functionExists for the Registered function check") { val externalCatalog = new InMemoryCatalog - val conf = new SQLConf() val customerFunctionReg = new CustomerFunctionRegistry - val catalog = new SessionCatalog(externalCatalog, customerFunctionReg, conf) + val catalog = new SessionCatalog(externalCatalog, customerFunctionReg) val analyzer = { catalog.createDatabase( CatalogDatabase("default", "", new URI("loc"), Map.empty), ignoreIfExists = false) - new Analyzer(catalog, conf) + new Analyzer(catalog) } def table(ref: String): LogicalPlan = UnresolvedRelation(TableIdentifier(ref)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveNaturalJoinSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveNaturalJoinSuite.scala index e449b9669cc72..ea2284e5420bd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveNaturalJoinSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveNaturalJoinSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala index 8cf41a02320d2..7566545f98355 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelperSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet} import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter, LeafNode, LocalRelation} +import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter, LeafNode} import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, TimestampType} class StreamingJoinHelperSuite extends AnalysisTest { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala index 06ea531833a43..3e9a8b71a8fb6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala @@ -29,13 +29,11 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFor import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.connector.InMemoryTableCatalog import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, V1Table} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class TableLookupCacheSuite extends AnalysisTest with Matchers { private def getAnalyzer(externalCatalog: ExternalCatalog, databasePath: File): Analyzer = { - 
val conf = new SQLConf() - val v1Catalog = new SessionCatalog(externalCatalog, FunctionRegistry.builtin, conf) + val v1Catalog = new SessionCatalog(externalCatalog, FunctionRegistry.builtin) v1Catalog.createDatabase( CatalogDatabase("default", "", databasePath.toURI, Map.empty), ignoreIfExists = false) @@ -64,7 +62,7 @@ class TableLookupCacheSuite extends AnalysisTest with Matchers { when(catalogManager.currentCatalog).thenReturn(v2Catalog) when(catalogManager.currentNamespace).thenReturn(Array("default")) - new Analyzer(catalogManager, conf) + new Analyzer(catalogManager) } test("table lookups to external catalog are cached") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala index 21dde3ca8ca51..3be417de472c6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationsSuite.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, LongType, MetadataBuilder} -import org.apache.spark.unsafe.types.CalendarInterval /** A dummy command for testing unsupported operations. */ case class DummyCommand() extends Command @@ -393,16 +392,14 @@ class UnsupportedOperationsSuite extends SparkFunSuite with SQLHelper { testBinaryOperationInStreamingPlan( "single inner join in append mode", _.join(_, joinType = Inner), - outputMode = Append, - streamStreamSupported = true) + outputMode = Append) testBinaryOperationInStreamingPlan( "multiple inner joins in append mode", (x: LogicalPlan, y: LogicalPlan) => { x.join(y, joinType = Inner).join(streamRelation, joinType = Inner) }, - outputMode = Append, - streamStreamSupported = true) + outputMode = Append) testBinaryOperationInStreamingPlan( "inner join in update mode", @@ -419,209 +416,135 @@ class UnsupportedOperationsSuite extends SparkFunSuite with SQLHelper { batchStreamSupported = false, streamBatchSupported = false) - // Left outer joins: *-stream not allowed + // Left outer, left semi, left anti join: *-stream not allowed + Seq((LeftOuter, "LeftOuter join"), (LeftSemi, "LeftSemi join"), (LeftAnti, "LeftAnti join")) + .foreach { case (joinType, name) => + testBinaryOperationInStreamingPlan( + name, + _.join(_, joinType = joinType), + batchStreamSupported = false, + streamStreamSupported = false, + expectedMsg = name) + } + + // Right outer joins: stream-* not allowed testBinaryOperationInStreamingPlan( - "left outer join", - _.join(_, joinType = LeftOuter), - batchStreamSupported = false, + "right outer join", + _.join(_, joinType = RightOuter), + streamBatchSupported = false, streamStreamSupported = false, expectedMsg = "outer join") - // Left outer joins: update and complete mode not allowed - assertNotSupportedInStreamingPlan( - s"left outer join with stream-stream relations and update mode", - streamRelation.join(streamRelation, joinType = LeftOuter, - condition = Some(attribute === attribute)), - OutputMode.Update(), - Seq("is not supported in Update output mode")) - assertNotSupportedInStreamingPlan( - s"left outer join with stream-stream relations and complete mode", - Aggregate(Nil, aggExprs("d"), streamRelation.join(streamRelation, joinType = LeftOuter, - 
condition = Some(attribute === attribute))), - OutputMode.Complete(), - Seq("is not supported in Complete output mode")) - - // Left outer joins: stream-stream allowed with join on watermark attribute - // Note that the attribute need not be watermarked on both sides. - assertSupportedInStreamingPlan( - s"left outer join with stream-stream relations and join on attribute with left watermark", - streamRelation.join(streamRelation, joinType = LeftOuter, - condition = Some(attributeWithWatermark === attribute)), - OutputMode.Append()) - assertSupportedInStreamingPlan( - s"left outer join with stream-stream relations and join on attribute with right watermark", - streamRelation.join(streamRelation, joinType = LeftOuter, - condition = Some(attribute === attributeWithWatermark)), - OutputMode.Append()) - assertNotSupportedInStreamingPlan( - s"left outer join with stream-stream relations and join on non-watermarked attribute", - streamRelation.join(streamRelation, joinType = LeftOuter, - condition = Some(attribute === attribute)), - OutputMode.Append(), - Seq("watermark in the join keys")) - - // Left outer joins: stream-stream allowed with range condition yielding state value watermark - assertSupportedInStreamingPlan( - s"left outer join with stream-stream relations and state value watermark", { - val leftRelation = streamRelation - val rightTimeWithWatermark = - AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) - val rightRelation = new TestStreamingRelation(rightTimeWithWatermark) - leftRelation.join( - rightRelation, - joinType = LeftOuter, - condition = Some(attribute > rightTimeWithWatermark + 10)) - }, - OutputMode.Append()) - - // Left outer joins: stream-stream not allowed with insufficient range condition - assertNotSupportedInStreamingPlan( - s"left outer join with stream-stream relations and state value watermark", { - val leftRelation = streamRelation - val rightTimeWithWatermark = - AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) - val rightRelation = new TestStreamingRelation(rightTimeWithWatermark) - leftRelation.join( - rightRelation, - joinType = LeftOuter, - condition = Some(attribute < rightTimeWithWatermark + 10)) - }, - OutputMode.Append(), - Seq("appropriate range condition")) - - // Left semi joins: stream-* not allowed - testBinaryOperationInStreamingPlan( - "left semi join", - _.join(_, joinType = LeftSemi), - streamStreamSupported = false, - batchStreamSupported = false, - expectedMsg = "LeftSemi join") + // Left outer, right outer, left semi joins + Seq(LeftOuter, RightOuter, LeftSemi).foreach { joinType => + // Update mode not allowed + assertNotSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and update mode", + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attribute === attribute)), + OutputMode.Update(), + Seq("is not supported in Update output mode")) - // Left semi joins: update and complete mode not allowed - assertNotSupportedInStreamingPlan( - "left semi join with stream-stream relations and update mode", - streamRelation.join(streamRelation, joinType = LeftSemi, - condition = Some(attribute === attribute)), - OutputMode.Update(), - Seq("is not supported in Update output mode")) - assertNotSupportedInStreamingPlan( - "left semi join with stream-stream relations and complete mode", - Aggregate(Nil, aggExprs("d"), streamRelation.join(streamRelation, joinType = LeftSemi, - condition = Some(attribute === attribute))), - OutputMode.Complete(), - Seq("is not supported 
in Complete output mode")) - - // Left semi joins: stream-stream allowed with join on watermark attribute - // Note that the attribute need not be watermarked on both sides. - assertSupportedInStreamingPlan( - "left semi join with stream-stream relations and join on attribute with left watermark", - streamRelation.join(streamRelation, joinType = LeftSemi, - condition = Some(attributeWithWatermark === attribute)), - OutputMode.Append()) - assertSupportedInStreamingPlan( - "left semi join with stream-stream relations and join on attribute with right watermark", - streamRelation.join(streamRelation, joinType = LeftSemi, - condition = Some(attribute === attributeWithWatermark)), - OutputMode.Append()) - assertNotSupportedInStreamingPlan( - "left semi join with stream-stream relations and join on non-watermarked attribute", - streamRelation.join(streamRelation, joinType = LeftSemi, - condition = Some(attribute === attribute)), - OutputMode.Append(), - Seq("without a watermark in the join keys")) + // Complete mode not allowed + assertNotSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and complete mode", + Aggregate(Nil, aggExprs("d"), streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attribute === attribute))), + OutputMode.Complete(), + Seq("is not supported in Complete output mode")) + + // Stream-stream allowed with join on watermark attribute + // Note that the attribute need not be watermarked on both sides. + assertSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and join on attribute with left watermark", + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attributeWithWatermark === attribute)), + OutputMode.Append()) + assertSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and join on attribute with right watermark", + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attribute === attributeWithWatermark)), + OutputMode.Append()) + assertNotSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and join on non-watermarked attribute", + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attribute === attribute)), + OutputMode.Append(), + Seq("without a watermark in the join keys")) + + val timeWithWatermark = + AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) + val relationWithWatermark = new TestStreamingRelation(timeWithWatermark) + val (leftRelation, rightRelation) = + if (joinType == RightOuter) { + (relationWithWatermark, streamRelation) + } else { + (streamRelation, relationWithWatermark) + } - // Left semi joins: stream-stream allowed with range condition yielding state value watermark - assertSupportedInStreamingPlan( - "left semi join with stream-stream relations and state value watermark", { - val leftRelation = streamRelation - val rightTimeWithWatermark = - AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) - val rightRelation = new TestStreamingRelation(rightTimeWithWatermark) - leftRelation.join( - rightRelation, - joinType = LeftSemi, - condition = Some(attribute > rightTimeWithWatermark + 10)) - }, - OutputMode.Append()) + // stream-stream allowed with range condition yielding state value watermark + assertSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and state value watermark", + leftRelation.join(rightRelation, joinType = joinType, + condition = Some(attribute > timeWithWatermark + 10)), + 
OutputMode.Append()) - // Left semi joins: stream-stream not allowed with insufficient range condition - assertNotSupportedInStreamingPlan( - "left semi join with stream-stream relations and state value watermark", { - val leftRelation = streamRelation - val rightTimeWithWatermark = - AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) - val rightRelation = new TestStreamingRelation(rightTimeWithWatermark) - leftRelation.join( - rightRelation, - joinType = LeftSemi, - condition = Some(attribute < rightTimeWithWatermark + 10)) - }, - OutputMode.Append(), - Seq("appropriate range condition")) + // stream-stream not allowed with insufficient range condition + assertNotSupportedInStreamingPlan( + s"$joinType join with stream-stream relations and state value watermark", + leftRelation.join(rightRelation, joinType = joinType, + condition = Some(attribute < timeWithWatermark + 10)), + OutputMode.Append(), + Seq("is not supported without a watermark in the join keys, or a watermark on " + + "the nullable side and an appropriate range condition")) + } - // Left anti joins: stream-* not allowed - testBinaryOperationInStreamingPlan( - "left anti join", - _.join(_, joinType = LeftAnti), - streamStreamSupported = false, - batchStreamSupported = false, - expectedMsg = "Left anti join") + // stream-stream inner join doesn't emit late rows, whereas outer joins could + Seq((Inner, false), (LeftOuter, true), (RightOuter, true)).map { + case (joinType, expectFailure) => + assertPassOnGlobalWatermarkLimit( + s"single $joinType join in Append mode", + streamRelation.join(streamRelation, joinType = RightOuter, + condition = Some(attributeWithWatermark === attribute)), + OutputMode.Append()) - // Right outer joins: stream-* not allowed - testBinaryOperationInStreamingPlan( - "right outer join", - _.join(_, joinType = RightOuter), - streamBatchSupported = false, - streamStreamSupported = false, - expectedMsg = "outer join") + testGlobalWatermarkLimit( + s"streaming aggregation after stream-stream $joinType join in Append mode", + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attributeWithWatermark === attribute)) + .groupBy("a")(count("*")), + OutputMode.Append(), + expectFailure = expectFailure) - // Right outer joins: stream-stream allowed with join on watermark attribute - // Note that the attribute need not be watermarked on both sides. 
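Editorial aside: a hedged, user-level illustration of one rule the consolidated loop above asserts, namely that stream-stream LeftOuter/RightOuter/LeftSemi joins are rejected in Update output mode. The rate source, console sink, and column names are illustrative; only the error-message fragment is taken from the assertions in this diff.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("join-mode-sketch").getOrCreate()

val left = spark.readStream.format("rate").load().withColumnRenamed("value", "id")
val right = spark.readStream.format("rate").load().withColumnRenamed("value", "id")

// Stream-stream left semi join, no watermark, Update output mode.
val joined = left.join(right, Seq("id"), "left_semi")

// Starting this query is expected to fail analysis with a message containing
// "is not supported in Update output mode", matching the assertion above.
joined.writeStream
  .format("console")
  .outputMode("update")
  .start()
```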
- assertSupportedInStreamingPlan( - s"right outer join with stream-stream relations and join on attribute with left watermark", - streamRelation.join(streamRelation, joinType = RightOuter, - condition = Some(attributeWithWatermark === attribute)), - OutputMode.Append()) - assertSupportedInStreamingPlan( - s"right outer join with stream-stream relations and join on attribute with right watermark", - streamRelation.join(streamRelation, joinType = RightOuter, - condition = Some(attribute === attributeWithWatermark)), - OutputMode.Append()) - assertNotSupportedInStreamingPlan( - s"right outer join with stream-stream relations and join on non-watermarked attribute", - streamRelation.join(streamRelation, joinType = RightOuter, - condition = Some(attribute === attribute)), - OutputMode.Append(), - Seq("watermark in the join keys")) + Seq(Inner, LeftOuter, RightOuter).foreach { joinType2 => + testGlobalWatermarkLimit( + s"streaming-stream $joinType2 after stream-stream $joinType join in Append mode", + streamRelation.join( + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attributeWithWatermark === attribute)), + joinType = joinType2, + condition = Some(attributeWithWatermark === attribute)), + OutputMode.Append(), + expectFailure = expectFailure) + } - // Right outer joins: stream-stream allowed with range condition yielding state value watermark - assertSupportedInStreamingPlan( - s"right outer join with stream-stream relations and state value watermark", { - val leftTimeWithWatermark = - AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) - val leftRelation = new TestStreamingRelation(leftTimeWithWatermark) - val rightRelation = streamRelation - leftRelation.join( - rightRelation, - joinType = RightOuter, - condition = Some(leftTimeWithWatermark + 10 < attribute)) - }, - OutputMode.Append()) + testGlobalWatermarkLimit( + s"FlatMapGroupsWithState after stream-stream $joinType join in Append mode", + FlatMapGroupsWithState( + null, att, att, Seq(att), Seq(att), att, null, Append, + isMapGroupsWithState = false, null, + streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attributeWithWatermark === attribute))), + OutputMode.Append(), + expectFailure = expectFailure) - // Right outer joins: stream-stream not allowed with insufficient range condition - assertNotSupportedInStreamingPlan( - s"right outer join with stream-stream relations and state value watermark", { - val leftTimeWithWatermark = - AttributeReference("b", IntegerType)().withMetadata(watermarkMetadata) - val leftRelation = new TestStreamingRelation(leftTimeWithWatermark) - val rightRelation = streamRelation - leftRelation.join( - rightRelation, - joinType = RightOuter, - condition = Some(leftTimeWithWatermark + 10 > attribute)) - }, - OutputMode.Append(), - Seq("appropriate range condition")) + testGlobalWatermarkLimit( + s"deduplicate after stream-stream $joinType join in Append mode", + Deduplicate(Seq(attribute), streamRelation.join(streamRelation, joinType = joinType, + condition = Some(attributeWithWatermark === attribute))), + OutputMode.Append(), + expectFailure = expectFailure) + } // Cogroup: only batch-batch is allowed testBinaryOperationInStreamingPlan( @@ -744,53 +667,6 @@ class UnsupportedOperationsSuite extends SparkFunSuite with SQLHelper { OutputMode.Append()) } - // stream-stream join - // stream-stream inner join doesn't emit late rows, whereas outer joins could - Seq((Inner, false), (LeftOuter, true), (RightOuter, true)).map { case (joinType, 
expectFailure) => - assertPassOnGlobalWatermarkLimit( - s"single $joinType join in Append mode", - streamRelation.join(streamRelation, joinType = RightOuter, - condition = Some(attributeWithWatermark === attribute)), - OutputMode.Append()) - - testGlobalWatermarkLimit( - s"streaming aggregation after stream-stream $joinType join in Append mode", - streamRelation.join(streamRelation, joinType = joinType, - condition = Some(attributeWithWatermark === attribute)) - .groupBy("a")(count("*")), - OutputMode.Append(), - expectFailure = expectFailure) - - Seq(Inner, LeftOuter, RightOuter).map { joinType2 => - testGlobalWatermarkLimit( - s"streaming-stream $joinType2 after stream-stream $joinType join in Append mode", - streamRelation.join( - streamRelation.join(streamRelation, joinType = joinType, - condition = Some(attributeWithWatermark === attribute)), - joinType = joinType2, - condition = Some(attributeWithWatermark === attribute)), - OutputMode.Append(), - expectFailure = expectFailure) - } - - testGlobalWatermarkLimit( - s"FlatMapGroupsWithState after stream-stream $joinType join in Append mode", - FlatMapGroupsWithState( - null, att, att, Seq(att), Seq(att), att, null, Append, - isMapGroupsWithState = false, null, - streamRelation.join(streamRelation, joinType = joinType, - condition = Some(attributeWithWatermark === attribute))), - OutputMode.Append(), - expectFailure = expectFailure) - - testGlobalWatermarkLimit( - s"deduplicate after stream-stream $joinType join in Append mode", - Deduplicate(Seq(attribute), streamRelation.join(streamRelation, joinType = joinType, - condition = Some(attributeWithWatermark === attribute))), - OutputMode.Append(), - expectFailure = expectFailure) - } - // FlatMapGroupsWithState { assertPassOnGlobalWatermarkLimit( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index ad40cc010361c..f30ae70dceffa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -1618,26 +1618,28 @@ abstract class SessionCatalogSuite extends AnalysisTest with Eventually { import org.apache.spark.sql.catalyst.dsl.plans._ Seq(true, false) foreach { caseSensitive => - val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive) - val catalog = new SessionCatalog(newBasicCatalog(), new SimpleFunctionRegistry, conf) - catalog.setCurrentDatabase("db1") - try { - val analyzer = new Analyzer(catalog, conf) - - // The analyzer should report the undefined function rather than the undefined table first. - val cause = intercept[AnalysisException] { - analyzer.execute( - UnresolvedRelation(TableIdentifier("undefined_table")).select( - UnresolvedFunction("undefined_fn", Nil, isDistinct = false) + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val catalog = new SessionCatalog(newBasicCatalog(), new SimpleFunctionRegistry) + catalog.setCurrentDatabase("db1") + try { + val analyzer = new Analyzer(catalog) + + // The analyzer should report the undefined function + // rather than the undefined table first. 
+        val cause = intercept[AnalysisException] {
+          analyzer.execute(
+            UnresolvedRelation(TableIdentifier("undefined_table")).select(
+              UnresolvedFunction("undefined_fn", Nil, isDistinct = false)
+            )
           )
-        )
-      }
+        }
-      assert(cause.getMessage.contains("Undefined function: 'undefined_fn'"))
-      // SPARK-21318: the error message should contains the current database name
-      assert(cause.getMessage.contains("db1"))
-    } finally {
-      catalog.reset()
+        assert(cause.getMessage.contains("Undefined function: 'undefined_fn'"))
+        // SPARK-21318: the error message should contain the current database name
+        assert(cause.getMessage.contains("db1"))
+      } finally {
+        catalog.reset()
+      }
     }
   }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
index 61133e2db5cbd..afb76d8a5a68c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala
@@ -38,9 +38,6 @@ import org.apache.spark.unsafe.types.UTF8String
 abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
-  // Whether it is required to set SQLConf.ANSI_ENABLED as true for testing numeric overflow.
-  protected def requiredAnsiEnabledForOverflowTestCases: Boolean
-
   protected def cast(v: Any, targetType: DataType, timeZoneId: Option[String] = None): CastBase
   // expected cannot be null
@@ -55,8 +52,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
   test("null cast") {
     import DataTypeTestUtils._
-    // follow [[org.apache.spark.sql.catalyst.expressions.Cast.canCast]] logic
-    // to ensure we test every possible cast situation here
     atomicTypes.zip(atomicTypes).foreach { case (from, to) =>
       checkNullCast(from, to)
     }
@@ -65,14 +60,10 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
     atomicTypes.foreach(dt => checkNullCast(dt, StringType))
     checkNullCast(StringType, BinaryType)
     checkNullCast(StringType, BooleanType)
-    checkNullCast(DateType, BooleanType)
-    checkNullCast(TimestampType, BooleanType)
     numericTypes.foreach(dt => checkNullCast(dt, BooleanType))
     checkNullCast(StringType, TimestampType)
-    checkNullCast(BooleanType, TimestampType)
     checkNullCast(DateType, TimestampType)
-    numericTypes.foreach(dt => checkNullCast(dt, TimestampType))
     checkNullCast(StringType, DateType)
     checkNullCast(TimestampType, DateType)
@@ -80,8 +71,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
     checkNullCast(StringType, CalendarIntervalType)
     numericTypes.foreach(dt => checkNullCast(StringType, dt))
     numericTypes.foreach(dt => checkNullCast(BooleanType, dt))
-    numericTypes.foreach(dt => checkNullCast(DateType, dt))
-    numericTypes.foreach(dt => checkNullCast(TimestampType, dt))
     for (from <- numericTypes; to <- numericTypes) checkNullCast(from, to)
   }
@@ -215,6 +204,39 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(cast(cast(0, BooleanType), IntegerType), 0)
   }
+  test("cast from int") {
+    checkCast(0, false)
+    checkCast(1, true)
+    checkCast(-5, true)
+    checkCast(1, 1.toByte)
+    checkCast(1, 1.toShort)
+    checkCast(1, 1)
+    checkCast(1, 1.toLong)
+    checkCast(1, 1.0f)
+    checkCast(1, 1.0)
+    checkCast(123, "123")
+
+    checkEvaluation(cast(123, DecimalType.USER_DEFAULT), Decimal(123))
+    checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123))
+    checkEvaluation(cast(1, LongType),
1.toLong) + } + + test("cast from long") { + checkCast(0L, false) + checkCast(1L, true) + checkCast(-5L, true) + checkCast(1L, 1.toByte) + checkCast(1L, 1.toShort) + checkCast(1L, 1) + checkCast(1L, 1.toLong) + checkCast(1L, 1.0f) + checkCast(1L, 1.0) + checkCast(123L, "123") + + checkEvaluation(cast(123L, DecimalType.USER_DEFAULT), Decimal(123)) + checkEvaluation(cast(123L, DecimalType(3, 0)), Decimal(123)) + } + test("cast from float") { checkCast(0.0f, false) checkCast(0.5f, true) @@ -237,8 +259,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkCast(1.5, 1.toLong) checkCast(1.5, 1.5f) checkCast(1.5, "1.5") - - checkEvaluation(cast(cast(1.toDouble, TimestampType), DoubleType), 1.toDouble) } test("cast from string") { @@ -305,18 +325,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { cast(cast("5", ByteType), ShortType), IntegerType), FloatType), DoubleType), LongType), 5.toLong) - checkEvaluation( - cast(cast(cast(cast(cast(cast("5", ByteType), TimestampType), - DecimalType.SYSTEM_DEFAULT), LongType), StringType), ShortType), - 5.toShort) - checkEvaluation( - cast(cast(cast(cast(cast(cast("5", TimestampType, UTC_OPT), ByteType), - DecimalType.SYSTEM_DEFAULT), LongType), StringType), ShortType), - null) - checkEvaluation(cast(cast(cast(cast(cast(cast("5", DecimalType.SYSTEM_DEFAULT), - ByteType), TimestampType), LongType), StringType), ShortType), - 5.toShort) - checkEvaluation(cast("23", DoubleType), 23d) checkEvaluation(cast("23", IntegerType), 23) checkEvaluation(cast("23", FloatType), 23f) @@ -350,58 +358,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkCast(Decimal(1.5), "1.5") } - test("cast from date") { - val d = Date.valueOf("1970-01-01") - checkEvaluation(cast(d, ShortType), null) - checkEvaluation(cast(d, IntegerType), null) - checkEvaluation(cast(d, LongType), null) - checkEvaluation(cast(d, FloatType), null) - checkEvaluation(cast(d, DoubleType), null) - checkEvaluation(cast(d, DecimalType.SYSTEM_DEFAULT), null) - checkEvaluation(cast(d, DecimalType(10, 2)), null) - checkEvaluation(cast(d, StringType), "1970-01-01") - - checkEvaluation( - cast(cast(d, TimestampType, UTC_OPT), StringType, UTC_OPT), - "1970-01-01 00:00:00") - } - - test("cast from timestamp") { - val millis = 15 * 1000 + 3 - val seconds = millis * 1000 + 3 - val ts = new Timestamp(millis) - val tss = new Timestamp(seconds) - checkEvaluation(cast(ts, ShortType), 15.toShort) - checkEvaluation(cast(ts, IntegerType), 15) - checkEvaluation(cast(ts, LongType), 15.toLong) - checkEvaluation(cast(ts, FloatType), 15.003f) - checkEvaluation(cast(ts, DoubleType), 15.003) - - checkEvaluation(cast(cast(tss, ShortType), TimestampType), - fromJavaTimestamp(ts) * MILLIS_PER_SECOND) - checkEvaluation(cast(cast(tss, IntegerType), TimestampType), - fromJavaTimestamp(ts) * MILLIS_PER_SECOND) - checkEvaluation(cast(cast(tss, LongType), TimestampType), - fromJavaTimestamp(ts) * MILLIS_PER_SECOND) - checkEvaluation( - cast(cast(millis.toFloat / MILLIS_PER_SECOND, TimestampType), FloatType), - millis.toFloat / MILLIS_PER_SECOND) - checkEvaluation( - cast(cast(millis.toDouble / MILLIS_PER_SECOND, TimestampType), DoubleType), - millis.toDouble / MILLIS_PER_SECOND) - checkEvaluation( - cast(cast(Decimal(1), TimestampType), DecimalType.SYSTEM_DEFAULT), - Decimal(1)) - - // A test for higher precision than millis - checkEvaluation(cast(cast(0.000001, TimestampType), DoubleType), 0.000001) - - checkEvaluation(cast(Double.NaN, 
TimestampType), null) - checkEvaluation(cast(1.0 / 0.0, TimestampType), null) - checkEvaluation(cast(Float.NaN, TimestampType), null) - checkEvaluation(cast(1.0f / 0.0f, TimestampType), null) - } - test("cast from array") { val array = Literal.create(Seq("123", "true", "f", null), ArrayType(StringType, containsNull = true)) @@ -635,16 +591,20 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(cast("", BooleanType), null) } + protected def checkInvalidCastFromNumericType(to: DataType): Unit = { + assert(cast(1.toByte, to).checkInputDataTypes().isFailure) + assert(cast(1.toShort, to).checkInputDataTypes().isFailure) + assert(cast(1, to).checkInputDataTypes().isFailure) + assert(cast(1L, to).checkInputDataTypes().isFailure) + assert(cast(1.0.toFloat, to).checkInputDataTypes().isFailure) + assert(cast(1.0, to).checkInputDataTypes().isFailure) + } + test("SPARK-16729 type checking for casting to date type") { assert(cast("1234", DateType).checkInputDataTypes().isSuccess) assert(cast(new Timestamp(1), DateType).checkInputDataTypes().isSuccess) assert(cast(false, DateType).checkInputDataTypes().isFailure) - assert(cast(1.toByte, DateType).checkInputDataTypes().isFailure) - assert(cast(1.toShort, DateType).checkInputDataTypes().isFailure) - assert(cast(1, DateType).checkInputDataTypes().isFailure) - assert(cast(1L, DateType).checkInputDataTypes().isFailure) - assert(cast(1.0.toFloat, DateType).checkInputDataTypes().isFailure) - assert(cast(1.0, DateType).checkInputDataTypes().isFailure) + checkInvalidCastFromNumericType(DateType) } test("SPARK-20302 cast with same structure") { @@ -686,117 +646,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { assert(ctx.inlinedMutableStates.length == 0) } - test("SPARK-22825 Cast array to string") { - val ret1 = cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType) - checkEvaluation(ret1, "[1, 2, 3, 4, 5]") - val ret2 = cast(Literal.create(Array("ab", "cde", "f")), StringType) - checkEvaluation(ret2, "[ab, cde, f]") - Seq(false, true).foreach { omitNull => - withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> omitNull.toString) { - val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType) - checkEvaluation(ret3, s"[ab,${if (omitNull) "" else " null"}, c]") - } - } - val ret4 = - cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType) - checkEvaluation(ret4, "[ab, cde, f]") - val ret5 = cast( - Literal.create(Array("2014-12-03", "2014-12-04", "2014-12-06").map(Date.valueOf)), - StringType) - checkEvaluation(ret5, "[2014-12-03, 2014-12-04, 2014-12-06]") - val ret6 = cast( - Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00") - .map(Timestamp.valueOf)), - StringType) - checkEvaluation(ret6, "[2014-12-03 13:01:00, 2014-12-04 15:05:00]") - val ret7 = cast(Literal.create(Array(Array(1, 2, 3), Array(4, 5))), StringType) - checkEvaluation(ret7, "[[1, 2, 3], [4, 5]]") - val ret8 = cast( - Literal.create(Array(Array(Array("a"), Array("b", "c")), Array(Array("d")))), - StringType) - checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") - } - - test("SPARK-33291: Cast array with null elements to string") { - Seq(false, true).foreach { omitNull => - withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> omitNull.toString) { - val ret1 = cast(Literal.create(Array(null, null)), StringType) - checkEvaluation( - ret1, - s"[${if (omitNull) "" else "null"},${if (omitNull) "" else " null"}]") - } - } - } - - test("SPARK-22973 Cast map to string") 
{ - Seq( - false -> ("{", "}"), - true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) => - withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) { - val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) - checkEvaluation(ret1, s"${lb}1 -> a, 2 -> b, 3 -> c$rb") - val ret2 = cast( - Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), - StringType) - checkEvaluation(ret2, s"${lb}1 -> a, 2 ->${if (legacyCast) "" else " null"}, 3 -> c$rb") - val ret3 = cast( - Literal.create(Map( - 1 -> Date.valueOf("2014-12-03"), - 2 -> Date.valueOf("2014-12-04"), - 3 -> Date.valueOf("2014-12-05"))), - StringType) - checkEvaluation(ret3, s"${lb}1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05$rb") - val ret4 = cast( - Literal.create(Map( - 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), - 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), - StringType) - checkEvaluation(ret4, s"${lb}1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00$rb") - val ret5 = cast( - Literal.create(Map( - 1 -> Array(1, 2, 3), - 2 -> Array(4, 5, 6))), - StringType) - checkEvaluation(ret5, s"${lb}1 -> [1, 2, 3], 2 -> [4, 5, 6]$rb") - } - } - } - - test("SPARK-22981 Cast struct to string") { - Seq( - false -> ("{", "}"), - true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) => - withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) { - val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) - checkEvaluation(ret1, s"${lb}1, a, 0.1$rb") - val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) - checkEvaluation(ret2, s"${lb}1,${if (legacyCast) "" else " null"}, a$rb") - val ret3 = cast(Literal.create( - (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) - checkEvaluation(ret3, s"${lb}2014-12-03, 2014-12-03 15:05:00$rb") - val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) - checkEvaluation(ret4, s"$lb${lb}1, a$rb, 5, 0.1$rb") - val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) - checkEvaluation(ret5, s"$lb[1, 2, 3], a, 0.1$rb") - val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) - checkEvaluation(ret6, s"${lb}1, ${lb}1 -> a, 2 -> b, 3 -> c$rb$rb") - } - } - } - - test("SPARK-33291: Cast struct with null elements to string") { - Seq( - false -> ("{", "}"), - true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) => - withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) { - val ret1 = cast(Literal.create(Tuple2[String, String](null, null)), StringType) - checkEvaluation( - ret1, - s"$lb${if (legacyCast) "" else "null"},${if (legacyCast) "" else " null"}$rb") - } - } - } - test("up-cast") { def isCastSafe(from: NumericType, to: NumericType): Boolean = (from, to) match { case (_, dt: DecimalType) => dt.isWiderThan(from) @@ -869,20 +718,6 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { } } - test("Throw exception on casting out-of-range value to decimal type") { - withSQLConf(SQLConf.ANSI_ENABLED.key -> requiredAnsiEnabledForOverflowTestCases.toString) { - checkExceptionInExpression[ArithmeticException]( - cast(Literal("134.12"), DecimalType(3, 2)), "cannot be represented") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(Timestamp.valueOf("2019-07-25 22:04:36")), DecimalType(3, 2)), - "cannot be represented") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(BigDecimal(134.12)), DecimalType(3, 2)), "cannot be 
represented") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(134.12), DecimalType(3, 2)), "cannot be represented") - } - } - test("Process Infinity, -Infinity, NaN in case insensitive manner") { Seq("inf", "+inf", "infinity", "+infiNity", " infinity ").foreach { value => checkEvaluation(cast(value, FloatType), Float.PositiveInfinity) @@ -903,14 +738,15 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(cast(value, DoubleType), Double.NaN) } } +} + +abstract class AnsiCastSuiteBase extends CastSuiteBase { private def testIntMaxAndMin(dt: DataType): Unit = { assert(Seq(IntegerType, ShortType, ByteType).contains(dt)) Seq(Int.MaxValue + 1L, Int.MinValue - 1L).foreach { value => checkExceptionInExpression[ArithmeticException](cast(value, dt), "overflow") checkExceptionInExpression[ArithmeticException](cast(Decimal(value.toString), dt), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value * MICROS_PER_SECOND, TimestampType), dt), "overflow") checkExceptionInExpression[ArithmeticException]( cast(Literal(value * 1.5f, FloatType), dt), "overflow") checkExceptionInExpression[ArithmeticException]( @@ -930,151 +766,219 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper { } } - test("Throw exception on casting out-of-range value to byte type") { - withSQLConf(SQLConf.ANSI_ENABLED.key -> requiredAnsiEnabledForOverflowTestCases.toString) { - testIntMaxAndMin(ByteType) - Seq(Byte.MaxValue + 1, Byte.MinValue - 1).foreach { value => - checkExceptionInExpression[ArithmeticException](cast(value, ByteType), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value * MICROS_PER_SECOND, TimestampType), ByteType), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value.toFloat, FloatType), ByteType), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value.toDouble, DoubleType), ByteType), "overflow") - } + test("ANSI mode: Throw exception on casting out-of-range value to byte type") { + testIntMaxAndMin(ByteType) + Seq(Byte.MaxValue + 1, Byte.MinValue - 1).foreach { value => + checkExceptionInExpression[ArithmeticException](cast(value, ByteType), "overflow") + checkExceptionInExpression[ArithmeticException]( + cast(Literal(value.toFloat, FloatType), ByteType), "overflow") + checkExceptionInExpression[ArithmeticException]( + cast(Literal(value.toDouble, DoubleType), ByteType), "overflow") + } - Seq(Byte.MaxValue, 0.toByte, Byte.MinValue).foreach { value => - checkEvaluation(cast(value, ByteType), value) - checkEvaluation(cast(value.toString, ByteType), value) - checkEvaluation(cast(Decimal(value.toString), ByteType), value) - checkEvaluation(cast(Literal(value * MICROS_PER_SECOND, TimestampType), ByteType), value) - checkEvaluation(cast(Literal(value.toInt, DateType), ByteType), null) - checkEvaluation(cast(Literal(value.toFloat, FloatType), ByteType), value) - checkEvaluation(cast(Literal(value.toDouble, DoubleType), ByteType), value) - } + Seq(Byte.MaxValue, 0.toByte, Byte.MinValue).foreach { value => + checkEvaluation(cast(value, ByteType), value) + checkEvaluation(cast(value.toString, ByteType), value) + checkEvaluation(cast(Decimal(value.toString), ByteType), value) + checkEvaluation(cast(Literal(value.toFloat, FloatType), ByteType), value) + checkEvaluation(cast(Literal(value.toDouble, DoubleType), ByteType), value) } } - test("Throw exception on casting out-of-range value to short type") { - 
withSQLConf(SQLConf.ANSI_ENABLED.key -> requiredAnsiEnabledForOverflowTestCases.toString) { - testIntMaxAndMin(ShortType) - Seq(Short.MaxValue + 1, Short.MinValue - 1).foreach { value => - checkExceptionInExpression[ArithmeticException](cast(value, ShortType), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value * MICROS_PER_SECOND, TimestampType), ShortType), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value.toFloat, FloatType), ShortType), "overflow") - checkExceptionInExpression[ArithmeticException]( - cast(Literal(value.toDouble, DoubleType), ShortType), "overflow") - } + test("ANSI mode: Throw exception on casting out-of-range value to short type") { + testIntMaxAndMin(ShortType) + Seq(Short.MaxValue + 1, Short.MinValue - 1).foreach { value => + checkExceptionInExpression[ArithmeticException](cast(value, ShortType), "overflow") + checkExceptionInExpression[ArithmeticException]( + cast(Literal(value.toFloat, FloatType), ShortType), "overflow") + checkExceptionInExpression[ArithmeticException]( + cast(Literal(value.toDouble, DoubleType), ShortType), "overflow") + } - Seq(Short.MaxValue, 0.toShort, Short.MinValue).foreach { value => - checkEvaluation(cast(value, ShortType), value) - checkEvaluation(cast(value.toString, ShortType), value) - checkEvaluation(cast(Decimal(value.toString), ShortType), value) - checkEvaluation(cast(Literal(value * MICROS_PER_SECOND, TimestampType), ShortType), value) - checkEvaluation(cast(Literal(value.toInt, DateType), ShortType), null) - checkEvaluation(cast(Literal(value.toFloat, FloatType), ShortType), value) - checkEvaluation(cast(Literal(value.toDouble, DoubleType), ShortType), value) - } + Seq(Short.MaxValue, 0.toShort, Short.MinValue).foreach { value => + checkEvaluation(cast(value, ShortType), value) + checkEvaluation(cast(value.toString, ShortType), value) + checkEvaluation(cast(Decimal(value.toString), ShortType), value) + checkEvaluation(cast(Literal(value.toFloat, FloatType), ShortType), value) + checkEvaluation(cast(Literal(value.toDouble, DoubleType), ShortType), value) } } - test("Throw exception on casting out-of-range value to int type") { - withSQLConf(SQLConf.ANSI_ENABLED.key -> requiredAnsiEnabledForOverflowTestCases.toString) { - testIntMaxAndMin(IntegerType) - testLongMaxAndMin(IntegerType) + test("ANSI mode: Throw exception on casting out-of-range value to int type") { + testIntMaxAndMin(IntegerType) + testLongMaxAndMin(IntegerType) - Seq(Int.MaxValue, 0, Int.MinValue).foreach { value => - checkEvaluation(cast(value, IntegerType), value) - checkEvaluation(cast(value.toString, IntegerType), value) - checkEvaluation(cast(Decimal(value.toString), IntegerType), value) - checkEvaluation(cast(Literal(value * MICROS_PER_SECOND, TimestampType), IntegerType), value) - checkEvaluation(cast(Literal(value * 1.0, DoubleType), IntegerType), value) - } - checkEvaluation(cast(Int.MaxValue + 0.9D, IntegerType), Int.MaxValue) - checkEvaluation(cast(Int.MinValue - 0.9D, IntegerType), Int.MinValue) + Seq(Int.MaxValue, 0, Int.MinValue).foreach { value => + checkEvaluation(cast(value, IntegerType), value) + checkEvaluation(cast(value.toString, IntegerType), value) + checkEvaluation(cast(Decimal(value.toString), IntegerType), value) + checkEvaluation(cast(Literal(value * 1.0, DoubleType), IntegerType), value) } + checkEvaluation(cast(Int.MaxValue + 0.9D, IntegerType), Int.MaxValue) + checkEvaluation(cast(Int.MinValue - 0.9D, IntegerType), Int.MinValue) } - test("Throw exception on casting 
out-of-range value to long type") { - withSQLConf(SQLConf.ANSI_ENABLED.key -> requiredAnsiEnabledForOverflowTestCases.toString) { - testLongMaxAndMin(LongType) + test("ANSI mode: Throw exception on casting out-of-range value to long type") { + testLongMaxAndMin(LongType) - Seq(Long.MaxValue, 0, Long.MinValue).foreach { value => - checkEvaluation(cast(value, LongType), value) - checkEvaluation(cast(value.toString, LongType), value) - checkEvaluation(cast(Decimal(value.toString), LongType), value) - checkEvaluation(cast(Literal(value, TimestampType), LongType), - Math.floorDiv(value, MICROS_PER_SECOND)) - } - checkEvaluation(cast(Long.MaxValue + 0.9F, LongType), Long.MaxValue) - checkEvaluation(cast(Long.MinValue - 0.9F, LongType), Long.MinValue) - checkEvaluation(cast(Long.MaxValue + 0.9D, LongType), Long.MaxValue) - checkEvaluation(cast(Long.MinValue - 0.9D, LongType), Long.MinValue) + Seq(Long.MaxValue, 0, Long.MinValue).foreach { value => + checkEvaluation(cast(value, LongType), value) + checkEvaluation(cast(value.toString, LongType), value) + checkEvaluation(cast(Decimal(value.toString), LongType), value) } + checkEvaluation(cast(Long.MaxValue + 0.9F, LongType), Long.MaxValue) + checkEvaluation(cast(Long.MinValue - 0.9F, LongType), Long.MinValue) + checkEvaluation(cast(Long.MaxValue + 0.9D, LongType), Long.MaxValue) + checkEvaluation(cast(Long.MinValue - 0.9D, LongType), Long.MinValue) } -} -/** - * Test suite for data type casting expression [[Cast]]. - */ -class CastSuite extends CastSuiteBase { - // It is required to set SQLConf.ANSI_ENABLED as true for testing numeric overflow. - override protected def requiredAnsiEnabledForOverflowTestCases: Boolean = true + test("ANSI mode: Throw exception on casting out-of-range value to decimal type") { + checkExceptionInExpression[ArithmeticException]( + cast(Literal("134.12"), DecimalType(3, 2)), "cannot be represented") + checkExceptionInExpression[ArithmeticException]( + cast(Literal(BigDecimal(134.12)), DecimalType(3, 2)), "cannot be represented") + checkExceptionInExpression[ArithmeticException]( + cast(Literal(134.12), DecimalType(3, 2)), "cannot be represented") + } - override def cast(v: Any, targetType: DataType, timeZoneId: Option[String] = None): CastBase = { - v match { - case lit: Expression => Cast(lit, targetType, timeZoneId) - case _ => Cast(Literal(v), targetType, timeZoneId) + test("ANSI mode: disallow type conversions between Numeric types and Timestamp type") { + import DataTypeTestUtils.numericTypes + checkInvalidCastFromNumericType(TimestampType) + val timestampLiteral = Literal(1L, TimestampType) + numericTypes.foreach { numericType => + assert(cast(timestampLiteral, numericType).checkInputDataTypes().isFailure) } } - test("cast from int") { - checkCast(0, false) - checkCast(1, true) - checkCast(-5, true) - checkCast(1, 1.toByte) - checkCast(1, 1.toShort) - checkCast(1, 1) - checkCast(1, 1.toLong) - checkCast(1, 1.0f) - checkCast(1, 1.0) - checkCast(123, "123") + test("ANSI mode: disallow type conversions between Numeric types and Date type") { + import DataTypeTestUtils.numericTypes + checkInvalidCastFromNumericType(DateType) + val dateLiteral = Literal(1, DateType) + numericTypes.foreach { numericType => + assert(cast(dateLiteral, numericType).checkInputDataTypes().isFailure) + } + } - checkEvaluation(cast(123, DecimalType.USER_DEFAULT), Decimal(123)) - checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123)) - checkEvaluation(cast(123, DecimalType(3, 1)), null) - checkEvaluation(cast(123, DecimalType(2, 0)), null) + 
test("ANSI mode: disallow type conversions between Numeric types and Binary type") { + import DataTypeTestUtils.numericTypes + checkInvalidCastFromNumericType(BinaryType) + val binaryLiteral = Literal(new Array[Byte](1.toByte), BinaryType) + numericTypes.foreach { numericType => + assert(cast(binaryLiteral, numericType).checkInputDataTypes().isFailure) + } } - test("cast from long") { - checkCast(0L, false) - checkCast(1L, true) - checkCast(-5L, true) - checkCast(1L, 1.toByte) - checkCast(1L, 1.toShort) - checkCast(1L, 1) - checkCast(1L, 1.toLong) - checkCast(1L, 1.0f) - checkCast(1L, 1.0) - checkCast(123L, "123") + test("ANSI mode: disallow type conversions between Datatime types and Boolean types") { + val timestampLiteral = Literal(1L, TimestampType) + assert(cast(timestampLiteral, BooleanType).checkInputDataTypes().isFailure) + val dateLiteral = Literal(1, DateType) + assert(cast(dateLiteral, BooleanType).checkInputDataTypes().isFailure) - checkEvaluation(cast(123L, DecimalType.USER_DEFAULT), Decimal(123)) - checkEvaluation(cast(123L, DecimalType(3, 0)), Decimal(123)) - checkEvaluation(cast(123L, DecimalType(3, 1)), null) + val booleanLiteral = Literal(true, BooleanType) + assert(cast(booleanLiteral, TimestampType).checkInputDataTypes().isFailure) + assert(cast(booleanLiteral, DateType).checkInputDataTypes().isFailure) + } - checkEvaluation(cast(123L, DecimalType(2, 0)), null) + test("ANSI mode: disallow casting complex types as String type") { + assert(cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType).checkInputDataTypes().isFailure) + assert(cast(Literal.create(Map(1 -> "a")), StringType).checkInputDataTypes().isFailure) + assert(cast(Literal.create((1, "a", 0.1)), StringType).checkInputDataTypes().isFailure) } - test("cast from int 2") { - checkEvaluation(cast(1, LongType), 1.toLong) + test("cast from invalid string to numeric should throw NumberFormatException") { + // cast to IntegerType + Seq(IntegerType, ShortType, ByteType, LongType).foreach { dataType => + val array = Literal.create(Seq("123", "true", "f", null), + ArrayType(StringType, containsNull = true)) + checkExceptionInExpression[NumberFormatException]( + cast(array, ArrayType(dataType, containsNull = true)), + "invalid input syntax for type numeric: true") + checkExceptionInExpression[NumberFormatException]( + cast("string", dataType), "invalid input syntax for type numeric: string") + checkExceptionInExpression[NumberFormatException]( + cast("123-string", dataType), "invalid input syntax for type numeric: 123-string") + checkExceptionInExpression[NumberFormatException]( + cast("2020-07-19", dataType), "invalid input syntax for type numeric: 2020-07-19") + checkExceptionInExpression[NumberFormatException]( + cast("1.23", dataType), "invalid input syntax for type numeric: 1.23") + } + + Seq(DoubleType, FloatType, DecimalType.USER_DEFAULT).foreach { dataType => + checkExceptionInExpression[NumberFormatException]( + cast("string", dataType), "invalid input syntax for type numeric: string") + checkExceptionInExpression[NumberFormatException]( + cast("123.000.00", dataType), "invalid input syntax for type numeric: 123.000.00") + checkExceptionInExpression[NumberFormatException]( + cast("abc.com", dataType), "invalid input syntax for type numeric: abc.com") + } + } + + test("Fast fail for cast string type to decimal type in ansi mode") { + checkEvaluation(cast("12345678901234567890123456789012345678", DecimalType(38, 0)), + Decimal("12345678901234567890123456789012345678")) + 
checkExceptionInExpression[ArithmeticException]( + cast("123456789012345678901234567890123456789", DecimalType(38, 0)), + "out of decimal type range") + checkExceptionInExpression[ArithmeticException]( + cast("12345678901234567890123456789012345678", DecimalType(38, 1)), + "cannot be represented as Decimal(38, 1)") + checkEvaluation(cast("0.00000000000000000000000000000000000001", DecimalType(38, 0)), + Decimal("0")) + checkEvaluation(cast("0.00000000000000000000000000000000000000000001", DecimalType(38, 0)), + Decimal("0")) + checkEvaluation(cast("0.00000000000000000000000000000000000001", DecimalType(38, 18)), + Decimal("0E-18")) + checkEvaluation(cast("6E-120", DecimalType(38, 0)), + Decimal("0")) + + checkEvaluation(cast("6E+37", DecimalType(38, 0)), + Decimal("60000000000000000000000000000000000000")) + checkExceptionInExpression[ArithmeticException]( + cast("6E+38", DecimalType(38, 0)), + "out of decimal type range") + checkExceptionInExpression[ArithmeticException]( + cast("6E+37", DecimalType(38, 1)), + "cannot be represented as Decimal(38, 1)") + + checkExceptionInExpression[NumberFormatException]( + cast("abcd", DecimalType(38, 1)), + "invalid input syntax for type numeric") + } +} + +/** + * Test suite for data type casting expression [[Cast]]. + */ +class CastSuite extends CastSuiteBase { + + override def cast(v: Any, targetType: DataType, timeZoneId: Option[String] = None): CastBase = { + v match { + case lit: Expression => Cast(lit, targetType, timeZoneId) + case _ => Cast(Literal(v), targetType, timeZoneId) + } + } + + test("null cast #2") { + import DataTypeTestUtils._ + + checkNullCast(DateType, BooleanType) + checkNullCast(TimestampType, BooleanType) + checkNullCast(BooleanType, TimestampType) + numericTypes.foreach(dt => checkNullCast(dt, TimestampType)) + numericTypes.foreach(dt => checkNullCast(TimestampType, dt)) + numericTypes.foreach(dt => checkNullCast(DateType, dt)) + } + + test("cast from long #2") { + checkEvaluation(cast(123L, DecimalType(3, 1)), null) + checkEvaluation(cast(123L, DecimalType(2, 0)), null) + } + + test("cast from int #2") { checkEvaluation(cast(cast(1000, TimestampType), LongType), 1000.toLong) checkEvaluation(cast(cast(-1200, TimestampType), LongType), -1200.toLong) - checkEvaluation(cast(123, DecimalType.USER_DEFAULT), Decimal(123)) - checkEvaluation(cast(123, DecimalType(3, 0)), Decimal(123)) checkEvaluation(cast(123, DecimalType(3, 1)), null) checkEvaluation(cast(123, DecimalType(2, 0)), null) } @@ -1343,6 +1247,58 @@ class CastSuite extends CastSuiteBase { } } + test("cast from date") { + val d = Date.valueOf("1970-01-01") + checkEvaluation(cast(d, ShortType), null) + checkEvaluation(cast(d, IntegerType), null) + checkEvaluation(cast(d, LongType), null) + checkEvaluation(cast(d, FloatType), null) + checkEvaluation(cast(d, DoubleType), null) + checkEvaluation(cast(d, DecimalType.SYSTEM_DEFAULT), null) + checkEvaluation(cast(d, DecimalType(10, 2)), null) + checkEvaluation(cast(d, StringType), "1970-01-01") + + checkEvaluation( + cast(cast(d, TimestampType, UTC_OPT), StringType, UTC_OPT), + "1970-01-01 00:00:00") + } + + test("cast from timestamp") { + val millis = 15 * 1000 + 3 + val seconds = millis * 1000 + 3 + val ts = new Timestamp(millis) + val tss = new Timestamp(seconds) + checkEvaluation(cast(ts, ShortType), 15.toShort) + checkEvaluation(cast(ts, IntegerType), 15) + checkEvaluation(cast(ts, LongType), 15.toLong) + checkEvaluation(cast(ts, FloatType), 15.003f) + checkEvaluation(cast(ts, DoubleType), 15.003) + + 
checkEvaluation(cast(cast(tss, ShortType), TimestampType), + fromJavaTimestamp(ts) * MILLIS_PER_SECOND) + checkEvaluation(cast(cast(tss, IntegerType), TimestampType), + fromJavaTimestamp(ts) * MILLIS_PER_SECOND) + checkEvaluation(cast(cast(tss, LongType), TimestampType), + fromJavaTimestamp(ts) * MILLIS_PER_SECOND) + checkEvaluation( + cast(cast(millis.toFloat / MILLIS_PER_SECOND, TimestampType), FloatType), + millis.toFloat / MILLIS_PER_SECOND) + checkEvaluation( + cast(cast(millis.toDouble / MILLIS_PER_SECOND, TimestampType), DoubleType), + millis.toDouble / MILLIS_PER_SECOND) + checkEvaluation( + cast(cast(Decimal(1), TimestampType), DecimalType.SYSTEM_DEFAULT), + Decimal(1)) + + // A test for higher precision than millis + checkEvaluation(cast(cast(0.000001, TimestampType), DoubleType), 0.000001) + + checkEvaluation(cast(Double.NaN, TimestampType), null) + checkEvaluation(cast(1.0 / 0.0, TimestampType), null) + checkEvaluation(cast(Float.NaN, TimestampType), null) + checkEvaluation(cast(1.0f / 0.0f, TimestampType), null) + } + test("cast a timestamp before the epoch 1970-01-01 00:00:00Z") { withDefaultTimeZone(UTC) { val negativeTs = Timestamp.valueOf("1900-05-05 18:34:56.1") @@ -1396,93 +1352,199 @@ class CastSuite extends CastSuiteBase { checkEvaluation(cast("abcd", DecimalType(38, 1)), null) } + + test("SPARK-22825 Cast array to string") { + val ret1 = cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType) + checkEvaluation(ret1, "[1, 2, 3, 4, 5]") + val ret2 = cast(Literal.create(Array("ab", "cde", "f")), StringType) + checkEvaluation(ret2, "[ab, cde, f]") + Seq(false, true).foreach { omitNull => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> omitNull.toString) { + val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType) + checkEvaluation(ret3, s"[ab,${if (omitNull) "" else " null"}, c]") + } + } + val ret4 = + cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType) + checkEvaluation(ret4, "[ab, cde, f]") + val ret5 = cast( + Literal.create(Array("2014-12-03", "2014-12-04", "2014-12-06").map(Date.valueOf)), + StringType) + checkEvaluation(ret5, "[2014-12-03, 2014-12-04, 2014-12-06]") + val ret6 = cast( + Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00") + .map(Timestamp.valueOf)), + StringType) + checkEvaluation(ret6, "[2014-12-03 13:01:00, 2014-12-04 15:05:00]") + val ret7 = cast(Literal.create(Array(Array(1, 2, 3), Array(4, 5))), StringType) + checkEvaluation(ret7, "[[1, 2, 3], [4, 5]]") + val ret8 = cast( + Literal.create(Array(Array(Array("a"), Array("b", "c")), Array(Array("d")))), + StringType) + checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") + } + + test("SPARK-33291: Cast array with null elements to string") { + Seq(false, true).foreach { omitNull => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> omitNull.toString) { + val ret1 = cast(Literal.create(Array(null, null)), StringType) + checkEvaluation( + ret1, + s"[${if (omitNull) "" else "null"},${if (omitNull) "" else " null"}]") + } + } + } + + test("SPARK-22973 Cast map to string") { + Seq( + false -> ("{", "}"), + true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) { + val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) + checkEvaluation(ret1, s"${lb}1 -> a, 2 -> b, 3 -> c$rb") + val ret2 = cast( + Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), + StringType) + checkEvaluation(ret2, s"${lb}1 -> a, 
2 ->${if (legacyCast) "" else " null"}, 3 -> c$rb") + val ret3 = cast( + Literal.create(Map( + 1 -> Date.valueOf("2014-12-03"), + 2 -> Date.valueOf("2014-12-04"), + 3 -> Date.valueOf("2014-12-05"))), + StringType) + checkEvaluation(ret3, s"${lb}1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05$rb") + val ret4 = cast( + Literal.create(Map( + 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), + 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), + StringType) + checkEvaluation(ret4, s"${lb}1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00$rb") + val ret5 = cast( + Literal.create(Map( + 1 -> Array(1, 2, 3), + 2 -> Array(4, 5, 6))), + StringType) + checkEvaluation(ret5, s"${lb}1 -> [1, 2, 3], 2 -> [4, 5, 6]$rb") + } + } + } + + test("SPARK-22981 Cast struct to string") { + Seq( + false -> ("{", "}"), + true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) { + val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) + checkEvaluation(ret1, s"${lb}1, a, 0.1$rb") + val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) + checkEvaluation(ret2, s"${lb}1,${if (legacyCast) "" else " null"}, a$rb") + val ret3 = cast(Literal.create( + (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) + checkEvaluation(ret3, s"${lb}2014-12-03, 2014-12-03 15:05:00$rb") + val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) + checkEvaluation(ret4, s"$lb${lb}1, a$rb, 5, 0.1$rb") + val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) + checkEvaluation(ret5, s"$lb[1, 2, 3], a, 0.1$rb") + val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) + checkEvaluation(ret6, s"${lb}1, ${lb}1 -> a, 2 -> b, 3 -> c$rb$rb") + } + } + } + + test("SPARK-33291: Cast struct with null elements to string") { + Seq( + false -> ("{", "}"), + true -> ("[", "]")).foreach { case (legacyCast, (lb, rb)) => + withSQLConf(SQLConf.LEGACY_COMPLEX_TYPES_TO_STRING.key -> legacyCast.toString) { + val ret1 = cast(Literal.create(Tuple2[String, String](null, null)), StringType) + checkEvaluation( + ret1, + s"$lb${if (legacyCast) "" else "null"},${if (legacyCast) "" else " null"}$rb") + } + } + } + + test("data type casting II") { + checkEvaluation( + cast(cast(cast(cast(cast(cast("5", ByteType), TimestampType), + DecimalType.SYSTEM_DEFAULT), LongType), StringType), ShortType), + 5.toShort) + checkEvaluation( + cast(cast(cast(cast(cast(cast("5", TimestampType, UTC_OPT), ByteType), + DecimalType.SYSTEM_DEFAULT), LongType), StringType), ShortType), + null) + checkEvaluation(cast(cast(cast(cast(cast(cast("5", DecimalType.SYSTEM_DEFAULT), + ByteType), TimestampType), LongType), StringType), ShortType), + 5.toShort) + } + + test("Cast from double II") { + checkEvaluation(cast(cast(1.toDouble, TimestampType), DoubleType), 1.toDouble) + } } /** - * Test suite for data type casting expression [[AnsiCast]]. + * Test suite for data type casting expression [[Cast]] with ANSI mode disabled. */ -class AnsiCastSuite extends CastSuiteBase { - // It is not required to set SQLConf.ANSI_ENABLED as true for testing numeric overflow. 
- override protected def requiredAnsiEnabledForOverflowTestCases: Boolean = false +class CastSuiteWithAnsiModeOn extends AnsiCastSuiteBase { + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConf(SQLConf.ANSI_ENABLED, true) + } + + override def afterAll(): Unit = { + super.afterAll() + SQLConf.get.unsetConf(SQLConf.ANSI_ENABLED) + } override def cast(v: Any, targetType: DataType, timeZoneId: Option[String] = None): CastBase = { v match { - case lit: Expression => AnsiCast(lit, targetType, timeZoneId) - case _ => AnsiCast(Literal(v), targetType, timeZoneId) + case lit: Expression => Cast(lit, targetType, timeZoneId) + case _ => Cast(Literal(v), targetType, timeZoneId) } } +} - test("cast from invalid string to numeric should throw NumberFormatException") { - // cast to IntegerType - Seq(IntegerType, ShortType, ByteType, LongType).foreach { dataType => - val array = Literal.create(Seq("123", "true", "f", null), - ArrayType(StringType, containsNull = true)) - checkExceptionInExpression[NumberFormatException]( - cast(array, ArrayType(dataType, containsNull = true)), - "invalid input syntax for type numeric: true") - checkExceptionInExpression[NumberFormatException]( - cast("string", dataType), "invalid input syntax for type numeric: string") - checkExceptionInExpression[NumberFormatException]( - cast("123-string", dataType), "invalid input syntax for type numeric: 123-string") - checkExceptionInExpression[NumberFormatException]( - cast("2020-07-19", dataType), "invalid input syntax for type numeric: 2020-07-19") - checkExceptionInExpression[NumberFormatException]( - cast("1.23", dataType), "invalid input syntax for type numeric: 1.23") - } +/** + * Test suite for data type casting expression [[AnsiCast]] with ANSI mode enabled. + */ +class AnsiCastSuiteWithAnsiModeOn extends AnsiCastSuiteBase { + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConf(SQLConf.ANSI_ENABLED, true) + } - Seq(DoubleType, FloatType, DecimalType.USER_DEFAULT).foreach { dataType => - checkExceptionInExpression[NumberFormatException]( - cast("string", dataType), "invalid input syntax for type numeric: string") - checkExceptionInExpression[NumberFormatException]( - cast("123.000.00", dataType), "invalid input syntax for type numeric: 123.000.00") - checkExceptionInExpression[NumberFormatException]( - cast("abc.com", dataType), "invalid input syntax for type numeric: abc.com") - } + override def afterAll(): Unit = { + super.afterAll() + SQLConf.get.unsetConf(SQLConf.ANSI_ENABLED) } - test("cast a timestamp before the epoch 1970-01-01 00:00:00Z") { - def errMsg(t: String): String = s"Casting -2198208303900000 to $t causes overflow" - withDefaultTimeZone(UTC) { - val negativeTs = Timestamp.valueOf("1900-05-05 18:34:56.1") - assert(negativeTs.getTime < 0) - val expectedSecs = Math.floorDiv(negativeTs.getTime, MILLIS_PER_SECOND) - checkExceptionInExpression[ArithmeticException](cast(negativeTs, ByteType), errMsg("byte")) - checkExceptionInExpression[ArithmeticException](cast(negativeTs, ShortType), errMsg("short")) - checkExceptionInExpression[ArithmeticException](cast(negativeTs, IntegerType), errMsg("int")) - checkEvaluation(cast(negativeTs, LongType), expectedSecs) + override def cast(v: Any, targetType: DataType, timeZoneId: Option[String] = None): CastBase = { + v match { + case lit: Expression => AnsiCast(lit, targetType, timeZoneId) + case _ => AnsiCast(Literal(v), targetType, timeZoneId) } } +} - test("Fast fail for cast string type to decimal type in ansi mode") { - 
checkEvaluation(cast("12345678901234567890123456789012345678", DecimalType(38, 0)), - Decimal("12345678901234567890123456789012345678")) - checkExceptionInExpression[ArithmeticException]( - cast("123456789012345678901234567890123456789", DecimalType(38, 0)), - "out of decimal type range") - checkExceptionInExpression[ArithmeticException]( - cast("12345678901234567890123456789012345678", DecimalType(38, 1)), - "cannot be represented as Decimal(38, 1)") - - checkEvaluation(cast("0.00000000000000000000000000000000000001", DecimalType(38, 0)), - Decimal("0")) - checkEvaluation(cast("0.00000000000000000000000000000000000000000001", DecimalType(38, 0)), - Decimal("0")) - checkEvaluation(cast("0.00000000000000000000000000000000000001", DecimalType(38, 18)), - Decimal("0E-18")) - checkEvaluation(cast("6E-120", DecimalType(38, 0)), - Decimal("0")) +/** + * Test suite for data type casting expression [[AnsiCast]] with ANSI mode disabled. + */ +class AnsiCastSuiteWithAnsiModeOff extends AnsiCastSuiteBase { + override def beforeAll(): Unit = { + super.beforeAll() + SQLConf.get.setConf(SQLConf.ANSI_ENABLED, false) + } - checkEvaluation(cast("6E+37", DecimalType(38, 0)), - Decimal("60000000000000000000000000000000000000")) - checkExceptionInExpression[ArithmeticException]( - cast("6E+38", DecimalType(38, 0)), - "out of decimal type range") - checkExceptionInExpression[ArithmeticException]( - cast("6E+37", DecimalType(38, 1)), - "cannot be represented as Decimal(38, 1)") + override def afterAll(): Unit = { + super.afterAll() + SQLConf.get.unsetConf(SQLConf.ANSI_ENABLED) + } - checkExceptionInExpression[NumberFormatException]( - cast("abcd", DecimalType(38, 1)), - "invalid input syntax for type numeric") + override def cast(v: Any, targetType: DataType, timeZoneId: Option[String] = None): CastBase = { + v match { + case lit: Expression => AnsiCast(lit, targetType, timeZoneId) + case _ => AnsiCast(Literal(v), targetType, timeZoneId) + } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index 6ee88c9eaef86..095894b9fffac 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -1915,4 +1915,19 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper } } } + + test("SPARK-33460: element_at NoSuchElementException") { + Seq(true, false).foreach { ansiEnabled => + withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) { + val map = Literal.create(Map(1 -> "a", 2 -> "b"), MapType(IntegerType, StringType)) + val expr: Expression = ElementAt(map, Literal(5)) + if (ansiEnabled) { + val errMsg = "Key 5 does not exist." 
+ checkExceptionInExpression[Exception](expr, errMsg) + } else { + checkEvaluation(expr, null) + } + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index 67ab2071de037..3d6f6937e780b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -85,6 +85,23 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("SPARK-33460: GetMapValue NoSuchElementException") { + Seq(true, false).foreach { ansiEnabled => + withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) { + val map = Literal.create(Map(1 -> "a", 2 -> "b"), MapType(IntegerType, StringType)) + + if (ansiEnabled) { + checkExceptionInExpression[Exception]( + GetMapValue(map, Literal(5)), + "Key 5 does not exist." + ) + } else { + checkEvaluation(GetMapValue(map, Literal(5)), null) + } + } + } + } + test("SPARK-26637 handles GetArrayItem nullability correctly when input array size is constant") { // CreateArray case val a = AttributeReference("a", IntegerType, nullable = false)() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala index c31310bc54023..8f030b45e5d3e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MutableProjectionSuite.scala @@ -80,4 +80,50 @@ class MutableProjectionSuite extends SparkFunSuite with ExpressionEvalHelper { assert(errMsg.contains("MutableProjection cannot use UnsafeRow for output data types:")) } } + + test("SPARK-33473: subexpression elimination for interpreted MutableProjection") { + Seq("true", "false").foreach { enabled => + withSQLConf( + SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> enabled, + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString) { + val one = BoundReference(0, DoubleType, true) + val two = BoundReference(1, DoubleType, true) + + val mul = Multiply(one, two) + val mul2 = Multiply(mul, mul) + val sqrt = Sqrt(mul2) + val sum = Add(mul2, sqrt) + + val proj = MutableProjection.create(Seq(sum)) + val result = (d1: Double, d2: Double) => + ((d1 * d2) * (d1 * d2)) + Math.sqrt((d1 * d2) * (d1 * d2)) + + val inputRows = Seq( + InternalRow.fromSeq(Seq(1.0, 2.0)), + InternalRow.fromSeq(Seq(2.0, 3.0)), + InternalRow.fromSeq(Seq(1.0, null)), + InternalRow.fromSeq(Seq(null, 2.0)), + InternalRow.fromSeq(Seq(3.0, 4.0)), + InternalRow.fromSeq(Seq(null, null)) + ) + val expectedResults = Seq( + result(1.0, 2.0), + result(2.0, 3.0), + null, + null, + result(3.0, 4.0), + null + ) + + inputRows.zip(expectedResults).foreach { case (inputRow, expected) => + val projRow = proj.apply(inputRow) + if (expected != null) { + assert(projRow.getDouble(0) == expected) + } else { + assert(projRow.isNullAt(0)) + } + } + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala index ff33324c3bb18..bc2b93e5390da 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ObjectExpressionsSuite.scala @@ -28,7 +28,7 @@ import scala.util.Random import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.sql.{RandomDataGenerator, Row} -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, JavaTypeInference, ScalaReflection} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.ScroogeLikeExample import org.apache.spark.sql.catalyst.analysis.{ResolveTimeZone, SimpleAnalyzer, UnresolvedDeserializer} import org.apache.spark.sql.catalyst.dsl.expressions._ @@ -37,9 +37,8 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjectio import org.apache.spark.sql.catalyst.expressions.objects._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData, IntervalUtils} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.unsafe.types.UTF8String class InvokeTargetClass extends Serializable { def filterInt(e: Any): Any = e.asInstanceOf[Int] > 0 diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 77a32a735f76d..cc5ab5dc7b4e0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -48,6 +48,30 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(mkExpr(regex), expected, create_row(input)) // check row input } + test("LIKE ALL") { + checkEvaluation(Literal.create(null, StringType).likeAll("%foo%", "%oo"), null) + checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", StringType).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", StringType) + .likeAll("%foo%", Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType) + .likeAll(Literal.create(null, StringType), "%foo%"), null) + checkEvaluation(Literal.create("foo", StringType) + .likeAll("%feo%", Literal.create(null, StringType)), false) + checkEvaluation(Literal.create("foo", StringType) + .likeAll(Literal.create(null, StringType), "%feo%"), false) + checkEvaluation(Literal.create("foo", StringType).notLikeAll("tee", "%yoo%"), true) + checkEvaluation(Literal.create("foo", StringType).notLikeAll("%oo%", "%yoo%"), false) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll("%foo%", Literal.create(null, StringType)), false) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll(Literal.create(null, StringType), "%foo%"), false) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll("%yoo%", Literal.create(null, StringType)), null) + checkEvaluation(Literal.create("foo", StringType) + .notLikeAll(Literal.create(null, StringType), "%yoo%"), null) + } + test("LIKE Pattern") { // null handling diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/SubExprEvaluationRuntimeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/SubExprEvaluationRuntimeSuite.scala new file mode 100644 index 0000000000000..64b619ca7766b --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/SubExprEvaluationRuntimeSuite.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite + +class SubExprEvaluationRuntimeSuite extends SparkFunSuite { + + test("Evaluate ExpressionProxy should create cached result") { + val runtime = new SubExprEvaluationRuntime(1) + val proxy = ExpressionProxy(Literal(1), 0, runtime) + assert(runtime.cache.size() == 0) + proxy.eval() + assert(runtime.cache.size() == 1) + assert(runtime.cache.get(proxy) == ResultProxy(1)) + } + + test("SubExprEvaluationRuntime cannot exceed configured max entries") { + val runtime = new SubExprEvaluationRuntime(2) + assert(runtime.cache.size() == 0) + + val proxy1 = ExpressionProxy(Literal(1), 0, runtime) + proxy1.eval() + assert(runtime.cache.size() == 1) + assert(runtime.cache.get(proxy1) == ResultProxy(1)) + + val proxy2 = ExpressionProxy(Literal(2), 1, runtime) + proxy2.eval() + assert(runtime.cache.size() == 2) + assert(runtime.cache.get(proxy2) == ResultProxy(2)) + + val proxy3 = ExpressionProxy(Literal(3), 2, runtime) + proxy3.eval() + assert(runtime.cache.size() == 2) + assert(runtime.cache.get(proxy3) == ResultProxy(3)) + } + + test("setInput should empty cached result") { + val runtime = new SubExprEvaluationRuntime(2) + val proxy1 = ExpressionProxy(Literal(1), 0, runtime) + assert(runtime.cache.size() == 0) + proxy1.eval() + assert(runtime.cache.size() == 1) + assert(runtime.cache.get(proxy1) == ResultProxy(1)) + + val proxy2 = ExpressionProxy(Literal(2), 1, runtime) + proxy2.eval() + assert(runtime.cache.size() == 2) + assert(runtime.cache.get(proxy2) == ResultProxy(2)) + + runtime.setInput() + assert(runtime.cache.size() == 0) + } + + test("Wrap ExpressionProxy on subexpressions") { + val runtime = new SubExprEvaluationRuntime(1) + + val one = Literal(1) + val two = Literal(2) + val mul = Multiply(one, two) + val mul2 = Multiply(mul, mul) + val sqrt = Sqrt(mul2) + val sum = Add(mul2, sqrt) + + // ( (one * two) * (one * two) ) + sqrt( (one * two) * (one * two) ) + val proxyExpressions = runtime.proxyExpressions(Seq(sum)) + val proxys = proxyExpressions.flatMap(_.collect { + case p: ExpressionProxy => p + }) + // ( (one * two) * (one * two) ) + assert(proxys.size == 2) + assert(proxys.forall(_.child == mul2)) + } + + test("ExpressionProxy won't be on non deterministic") { + val runtime = new 
SubExprEvaluationRuntime(1) + + val sum = Add(Rand(0), Rand(0)) + val proxys = runtime.proxyExpressions(Seq(sum, sum)).flatMap(_.collect { + case p: ExpressionProxy => p + }) + assert(proxys.isEmpty) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala index 303fa137d8925..53e8ee9fbe715 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentileSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.catalyst.util.{ArrayData, QuantileSummaries} import org.apache.spark.sql.catalyst.util.QuantileSummaries.Stats -import org.apache.spark.sql.types.{ArrayType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, IntegralType, LongType} +import org.apache.spark.sql.types.{ArrayType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, IntegralType} import org.apache.spark.util.SizeEstimator class ApproximatePercentileSuite extends SparkFunSuite { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala index 67e3bc69543e8..d660afb7f8a05 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeBlockSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.types.{BooleanType, IntegerType} +import org.apache.spark.sql.types.IntegerType class CodeBlockSuite extends SparkFunSuite { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index 4c9bcfe8f93a6..180665e653727 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -23,13 +23,14 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String /** * A test suite for generated projections */ -class GeneratedProjectionSuite extends SparkFunSuite { +class GeneratedProjectionSuite extends SparkFunSuite with ExpressionEvalHelper { test("generated projections on wider table") { val N = 1000 @@ -246,4 +247,50 @@ class GeneratedProjectionSuite extends SparkFunSuite { val row2 = mutableProj(result) assert(result === row2) } + + test("SPARK-33473: subexpression elimination for interpreted SafeProjection") { + Seq("true", "false").foreach { enabled => + withSQLConf( + 
SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> enabled, + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString) { + val one = BoundReference(0, DoubleType, true) + val two = BoundReference(1, DoubleType, true) + + val mul = Multiply(one, two) + val mul2 = Multiply(mul, mul) + val sqrt = Sqrt(mul2) + val sum = Add(mul2, sqrt) + + val proj = SafeProjection.create(Seq(sum)) + val result = (d1: Double, d2: Double) => + ((d1 * d2) * (d1 * d2)) + Math.sqrt((d1 * d2) * (d1 * d2)) + + val inputRows = Seq( + InternalRow.fromSeq(Seq(1.0, 2.0)), + InternalRow.fromSeq(Seq(2.0, 3.0)), + InternalRow.fromSeq(Seq(1.0, null)), + InternalRow.fromSeq(Seq(null, 2.0)), + InternalRow.fromSeq(Seq(3.0, 4.0)), + InternalRow.fromSeq(Seq(null, null)) + ) + val expectedResults = Seq( + result(1.0, 2.0), + result(2.0, 3.0), + null, + null, + result(3.0, 4.0), + null + ) + + inputRows.zip(expectedResults).foreach { case (inputRow, expected) => + val projRow = proj.apply(inputRow) + if (expected != null) { + assert(projRow.getDouble(0) == expected) + } else { + assert(projRow.isNullAt(0)) + } + } + } + } + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala index 03d75340e31e9..04dcf50e0c3c5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/BooleanSimplificationSuite.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.BooleanType class BooleanSimplificationSuite extends PlanTest with ExpressionEvalHelper with PredicateHelper { @@ -188,25 +187,23 @@ class BooleanSimplificationSuite extends PlanTest with ExpressionEvalHelper with checkCondition(!(('e || 'f) && ('g || 'h)), (!'e && !'f) || (!'g && !'h)) } - private val caseInsensitiveConf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> false) - private val caseInsensitiveAnalyzer = new Analyzer( - new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, caseInsensitiveConf), - caseInsensitiveConf) + private val analyzer = new Analyzer( + new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry)) test("(a && b) || (a && c) => a && (b || c) when case insensitive") { - val plan = caseInsensitiveAnalyzer.execute( + val plan = analyzer.execute( testRelation.where(('a > 2 && 'b > 3) || ('A > 2 && 'b < 5))) val actual = Optimize.execute(plan) - val expected = caseInsensitiveAnalyzer.execute( + val expected = analyzer.execute( testRelation.where('a > 2 && ('b > 3 || 'b < 5))) comparePlans(actual, expected) } test("(a || b) && (a || c) => a || (b && c) when case insensitive") { - val plan = caseInsensitiveAnalyzer.execute( + val plan = analyzer.execute( testRelation.where(('a > 2 || 'b > 3) && ('A > 2 || 'b < 5))) val actual = Optimize.execute(plan) - val expected = caseInsensitiveAnalyzer.execute( + val expected = analyzer.execute( testRelation.where('a > 2 || ('b > 3 && 'b < 5))) comparePlans(actual, expected) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala index b190dd5a7c220..70f130f834c68 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/CombiningLimitsSuite.scala @@ -30,8 +30,8 @@ class CombiningLimitsSuite extends PlanTest { Batch("Column Pruning", FixedPoint(100), ColumnPruning, RemoveNoopOperators) :: - Batch("Combine Limit", FixedPoint(10), - CombineLimits) :: + Batch("Eliminate Limit", FixedPoint(10), + EliminateLimits) :: Batch("Constant Folding", FixedPoint(10), NullPropagation, ConstantFolding, @@ -90,4 +90,31 @@ class CombiningLimitsSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + + test("SPARK-33442: Change Combine Limit to Eliminate limit using max row") { + // test child max row <= limit. + val query1 = testRelation.select().groupBy()(count(1)).limit(1).analyze + val optimized1 = Optimize.execute(query1) + val expected1 = testRelation.select().groupBy()(count(1)).analyze + comparePlans(optimized1, expected1) + + // test child max row > limit. + val query2 = testRelation.select().groupBy()(count(1)).limit(0).analyze + val optimized2 = Optimize.execute(query2) + comparePlans(optimized2, query2) + + // test child max row is none + val query3 = testRelation.select(Symbol("a")).limit(1).analyze + val optimized3 = Optimize.execute(query3) + comparePlans(optimized3, query3) + + // test sort after limit + val query4 = testRelation.select().groupBy()(count(1)) + .orderBy(count(1).asc).limit(1).analyze + val optimized4 = Optimize.execute(query4) + // the top project has been removed, so we need optimize expected too + val expected4 = Optimize.execute( + testRelation.select().groupBy()(count(1)).orderBy(count(1).asc).analyze) + comparePlans(optimized4, expected4) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala index f40691bd1a038..51c751923e414 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateDistinctSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor class EliminateDistinctSuite extends PlanTest { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala index 9f031358611b1..82db174ad41b0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateSortsBeforeRepartitionSuite.scala @@ -27,8 +27,8 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor class EliminateSortsBeforeRepartitionSuite extends PlanTest { - val catalog = new SessionCatalog(new 
InMemoryCatalog, EmptyFunctionRegistry, conf) - val analyzer = new Analyzer(catalog, conf) + val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry) + val analyzer = new Analyzer(catalog) val testRelation = LocalRelation('a.int, 'b.int, 'c.int) val anotherTestRelation = LocalRelation('d.int, 'e.int) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 11ec037c94f73..c518fdded2112 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -25,8 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{BooleanType, IntegerType, StringType, TimestampType} +import org.apache.spark.sql.types.{IntegerType, StringType} import org.apache.spark.unsafe.types.CalendarInterval class FilterPushdownSuite extends PlanTest { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala index d993aee3d7518..e365e3300096e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LimitPushdownSuite.scala @@ -33,7 +33,7 @@ class LimitPushdownSuite extends PlanTest { EliminateSubqueryAliases) :: Batch("Limit pushdown", FixedPoint(100), LimitPushDown, - CombineLimits, + EliminateLimits, ConstantFolding, BooleanSimplification) :: Nil } @@ -74,7 +74,7 @@ class LimitPushdownSuite extends PlanTest { Union(testRelation.limit(1), testRelation2.select('d, 'e, 'f).limit(1)).limit(2) val unionOptimized = Optimize.execute(unionQuery.analyze) val unionCorrectAnswer = - Limit(2, Union(testRelation.limit(1), testRelation2.select('d, 'e, 'f).limit(1))).analyze + Union(testRelation.limit(1), testRelation2.select('d, 'e, 'f).limit(1)).analyze comparePlans(unionOptimized, unionCorrectAnswer) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerStructuralIntegrityCheckerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerStructuralIntegrityCheckerSuite.scala index 5998437f11f4d..42ab43242a16b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerStructuralIntegrityCheckerSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerStructuralIntegrityCheckerSuite.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LocalRelation, LogicalPlan, OneRowRelation, Project} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.connector.catalog.CatalogManager -import org.apache.spark.sql.internal.SQLConf class OptimizerStructuralIntegrityCheckerSuite extends PlanTest { @@ -45,9 +44,8 @@ class OptimizerStructuralIntegrityCheckerSuite extends PlanTest { object Optimize extends Optimizer( new CatalogManager( - new SQLConf(), FakeV2SessionCatalog, - new SessionCatalog(new InMemoryCatalog, 
EmptyFunctionRegistry, new SQLConf()))) { + new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry))) { val newBatch = Batch("OptimizeRuleBreakSI", Once, OptimizeRuleBreakSI) override def defaultBatches: Seq[Batch] = Seq(newBatch) ++ super.defaultBatches } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala index 8785bc7cd36cb..17dfc7f3f18f7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PullupCorrelatedPredicatesSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor class PullupCorrelatedPredicatesSuite extends PlanTest { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregatesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregatesSuite.scala index 8cb939e010c68..5d6abf516f288 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregatesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregatesSuite.scala @@ -16,23 +16,15 @@ */ package org.apache.spark.sql.catalyst.optimizer -import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry} -import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan} -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.{CASE_SENSITIVE, GROUP_BY_ORDINAL} import org.apache.spark.sql.types.{IntegerType, StringType} class RewriteDistinctAggregatesSuite extends PlanTest { - override val conf = new SQLConf().copy(CASE_SENSITIVE -> false, GROUP_BY_ORDINAL -> false) - val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf) - val analyzer = new Analyzer(catalog, conf) - val nullInt = Literal(null, IntegerType) val nullString = Literal(null, StringType) val testRelation = LocalRelation('a.string, 'b.string, 'c.string, 'd.string, 'e.int) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyCastsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyCastsSuite.scala index 0ccf8aea660b2..c981cee55d0fa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyCastsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/SimplifyCastsSuite.scala @@ -17,10 +17,8 @@ package org.apache.spark.sql.catalyst.optimizer -import 
org.apache.spark.sql.catalyst.dsl._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala index 9878969959bfd..dcd2fbbf00529 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, OneRowRelation, Project, Range} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types._ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index cddc392cfa2d7..f93c0dcf59f4c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -21,12 +21,11 @@ import java.util.Locale import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, GlobalTempView, LocalTempView, PersistedView, UnresolvedAttribute, UnresolvedFunc, UnresolvedNamespace, UnresolvedPartitionSpec, UnresolvedRelation, UnresolvedStar, UnresolvedTable, UnresolvedTableOrView} -import org.apache.spark.sql.catalyst.catalog.{ArchiveResource, BucketSpec, FileResource, FunctionResource, FunctionResourceType, JarResource} +import org.apache.spark.sql.catalyst.catalog.{ArchiveResource, BucketSpec, FileResource, FunctionResource, JarResource} import org.apache.spark.sql.catalyst.expressions.{EqualTo, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition.{after, first} import org.apache.spark.sql.connector.expressions.{ApplyTransform, BucketTransform, DaysTransform, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, Transform, YearsTransform} -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -1629,32 +1628,6 @@ class DDLParserSuite extends AnalysisTest { TruncateTableStatement(Seq("a", "b", "c"), Some(Map("ds" -> "2017-06-10")))) } - test("SHOW PARTITIONS") { - val sql1 = "SHOW PARTITIONS t1" - val sql2 = "SHOW PARTITIONS db1.t1" - val sql3 = "SHOW PARTITIONS t1 PARTITION(partcol1='partvalue', partcol2='partvalue')" - val sql4 = "SHOW PARTITIONS a.b.c" - val sql5 = "SHOW PARTITIONS a.b.c PARTITION(ds='2017-06-10')" - - val parsed1 = parsePlan(sql1) - val expected1 = ShowPartitionsStatement(Seq("t1"), None) - val 
parsed2 = parsePlan(sql2) - val expected2 = ShowPartitionsStatement(Seq("db1", "t1"), None) - val parsed3 = parsePlan(sql3) - val expected3 = ShowPartitionsStatement(Seq("t1"), - Some(Map("partcol1" -> "partvalue", "partcol2" -> "partvalue"))) - val parsed4 = parsePlan(sql4) - val expected4 = ShowPartitionsStatement(Seq("a", "b", "c"), None) - val parsed5 = parsePlan(sql5) - val expected5 = ShowPartitionsStatement(Seq("a", "b", "c"), Some(Map("ds" -> "2017-06-10"))) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - comparePlans(parsed3, expected3) - comparePlans(parsed4, expected4) - comparePlans(parsed5, expected5) - } - test("REFRESH TABLE") { comparePlans( parsePlan("REFRESH TABLE a.b.c"), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala index 6ad132cdfe449..7c70ab98e4183 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala @@ -22,6 +22,7 @@ import org.scalatest.Suite import org.scalatest.Tag import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode @@ -56,10 +57,7 @@ trait CodegenInterpretedPlanTest extends PlanTest { * Provides helper methods for comparing plans, but without the overhead of * mandating a FunSuite. */ -trait PlanTestBase extends PredicateHelper with SQLHelper { self: Suite => - - // TODO(gatorsmile): remove this from PlanTest and all the analyzer rules - protected def conf = SQLConf.get +trait PlanTestBase extends PredicateHelper with SQLHelper with SQLConfHelper { self: Suite => /** * Since attribute references are given globally unique ids during analysis, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala index 1cf888519077a..878fae4c547b3 100755 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.plans.LeftOuter import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.{ColumnStatsMap, FilterEstimation} +import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.ColumnStatsMap import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala index b0325600e7530..3b47271a114e2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/InMemoryTable.scala @@ -27,6 +27,7 @@ import scala.collection.mutable import 
org.scalatest.Assertions._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, JoinedRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.expressions.{BucketTransform, DaysTransform, HoursTransform, IdentityTransform, MonthsTransform, Transform, YearsTransform} @@ -34,8 +35,9 @@ import org.apache.spark.sql.connector.read._ import org.apache.spark.sql.connector.write._ import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.sources.{And, EqualTo, Filter, IsNotNull} -import org.apache.spark.sql.types.{DataType, DateType, StructType, TimestampType} +import org.apache.spark.sql.types.{DataType, DateType, IntegerType, StringType, StructField, StructType, TimestampType} import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.unsafe.types.UTF8String /** * A simple in-memory table. Rows are stored as a buffered group produced by each output task. @@ -45,7 +47,24 @@ class InMemoryTable( val schema: StructType, override val partitioning: Array[Transform], override val properties: util.Map[String, String]) - extends Table with SupportsRead with SupportsWrite with SupportsDelete { + extends Table with SupportsRead with SupportsWrite with SupportsDelete + with SupportsMetadataColumns { + + private object PartitionKeyColumn extends MetadataColumn { + override def name: String = "_partition" + override def dataType: DataType = StringType + override def comment: String = "Partition key used to store the row" + } + + private object IndexColumn extends MetadataColumn { + override def name: String = "index" + override def dataType: DataType = IntegerType + override def comment: String = "Metadata column used to conflict with a data column" + } + + // purposely exposes a metadata column that conflicts with a data column in some tests + override val metadataColumns: Array[MetadataColumn] = Array(IndexColumn, PartitionKeyColumn) + private val metadataColumnNames = metadataColumns.map(_.name).toSet -- schema.map(_.name) private val allowUnsupportedTransforms = properties.getOrDefault("allow-unsupported-transforms", "false").toBoolean @@ -146,7 +165,7 @@ class InMemoryTable( val key = getKey(row) dataMap += dataMap.get(key) .map(key -> _.withRow(row)) - .getOrElse(key -> new BufferedRows().withRow(row)) + .getOrElse(key -> new BufferedRows(key.toArray.mkString("/")).withRow(row)) }) this } @@ -160,17 +179,38 @@ class InMemoryTable( TableCapability.TRUNCATE).asJava override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { - () => new InMemoryBatchScan(data.map(_.asInstanceOf[InputPartition])) + new InMemoryScanBuilder(schema) + } + + class InMemoryScanBuilder(tableSchema: StructType) extends ScanBuilder + with SupportsPushDownRequiredColumns { + private var schema: StructType = tableSchema + + override def build: Scan = + new InMemoryBatchScan(data.map(_.asInstanceOf[InputPartition]), schema) + + override def pruneColumns(requiredSchema: StructType): Unit = { + // if metadata columns are projected, return the table schema and metadata columns + val hasMetadataColumns = requiredSchema.map(_.name).exists(metadataColumnNames.contains) + if (hasMetadataColumns) { + schema = StructType(tableSchema ++ metadataColumnNames + .flatMap(name => metadataColumns.find(_.name == name)) + .map(col => StructField(col.name, col.dataType, col.isNullable))) + } + } } - 
class InMemoryBatchScan(data: Array[InputPartition]) extends Scan with Batch { + class InMemoryBatchScan(data: Array[InputPartition], schema: StructType) extends Scan with Batch { override def readSchema(): StructType = schema override def toBatch: Batch = this override def planInputPartitions(): Array[InputPartition] = data - override def createReaderFactory(): PartitionReaderFactory = BufferedRowsReaderFactory + override def createReaderFactory(): PartitionReaderFactory = { + val metadataColumns = schema.map(_.name).filter(metadataColumnNames.contains) + new BufferedRowsReaderFactory(metadataColumns) + } } override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { @@ -340,7 +380,8 @@ object InMemoryTable { } } -class BufferedRows extends WriterCommitMessage with InputPartition with Serializable { +class BufferedRows( + val key: String = "") extends WriterCommitMessage with InputPartition with Serializable { val rows = new mutable.ArrayBuffer[InternalRow]() def withRow(row: InternalRow): BufferedRows = { @@ -349,13 +390,24 @@ class BufferedRows extends WriterCommitMessage with InputPartition with Serializ } } -private object BufferedRowsReaderFactory extends PartitionReaderFactory { +private class BufferedRowsReaderFactory( + metadataColumns: Seq[String]) extends PartitionReaderFactory { override def createReader(partition: InputPartition): PartitionReader[InternalRow] = { - new BufferedRowsReader(partition.asInstanceOf[BufferedRows]) + new BufferedRowsReader(partition.asInstanceOf[BufferedRows], metadataColumns) } } -private class BufferedRowsReader(partition: BufferedRows) extends PartitionReader[InternalRow] { +private class BufferedRowsReader( + partition: BufferedRows, + metadataColumns: Seq[String]) extends PartitionReader[InternalRow] { + private def addMetadata(row: InternalRow): InternalRow = { + val metadataRow = new GenericInternalRow(metadataColumns.map { + case "index" => index + case "_partition" => UTF8String.fromString(partition.key) + }.toArray) + new JoinedRow(row, metadataRow) + } + private var index: Int = -1 override def next(): Boolean = { @@ -363,7 +415,7 @@ private class BufferedRowsReader(partition: BufferedRows) extends PartitionReade index < partition.rows.length } - override def get(): InternalRow = partition.rows(index) + override def get(): InternalRow = addMetadata(partition.rows(index)) override def close(): Unit = {} } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogManagerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogManagerSuite.scala index 7dd0753fcf777..aec361b9799cc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogManagerSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/CatalogManagerSuite.scala @@ -24,76 +24,77 @@ import scala.collection.JavaConverters._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.{EmptyFunctionRegistry, FakeV2SessionCatalog, NoSuchNamespaceException} import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog} +import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.connector.InMemoryTableCatalog import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.util.CaseInsensitiveStringMap -class CatalogManagerSuite extends SparkFunSuite { +class CatalogManagerSuite extends SparkFunSuite with SQLHelper { - private def createSessionCatalog(conf: SQLConf): 
SessionCatalog = { + private def createSessionCatalog(): SessionCatalog = { val catalog = new InMemoryCatalog() catalog.createDatabase( CatalogDatabase(SessionCatalog.DEFAULT_DATABASE, "", new URI("fake"), Map.empty), ignoreIfExists = true) - new SessionCatalog(catalog, EmptyFunctionRegistry, conf) + new SessionCatalog(catalog, EmptyFunctionRegistry) } test("CatalogManager should reflect the changes of default catalog") { - val conf = new SQLConf - val catalogManager = new CatalogManager(conf, FakeV2SessionCatalog, createSessionCatalog(conf)) + val catalogManager = new CatalogManager(FakeV2SessionCatalog, createSessionCatalog()) assert(catalogManager.currentCatalog.name() == CatalogManager.SESSION_CATALOG_NAME) assert(catalogManager.currentNamespace.sameElements(Array("default"))) - conf.setConfString("spark.sql.catalog.dummy", classOf[DummyCatalog].getName) - conf.setConfString(SQLConf.DEFAULT_CATALOG.key, "dummy") - - // The current catalog should be changed if the default catalog is set. - assert(catalogManager.currentCatalog.name() == "dummy") - assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) + withSQLConf("spark.sql.catalog.dummy" -> classOf[DummyCatalog].getName, + SQLConf.DEFAULT_CATALOG.key -> "dummy") { + // The current catalog should be changed if the default catalog is set. + assert(catalogManager.currentCatalog.name() == "dummy") + assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) + } } test("CatalogManager should keep the current catalog once set") { - val conf = new SQLConf - val catalogManager = new CatalogManager(conf, FakeV2SessionCatalog, createSessionCatalog(conf)) + val catalogManager = new CatalogManager(FakeV2SessionCatalog, createSessionCatalog()) assert(catalogManager.currentCatalog.name() == CatalogManager.SESSION_CATALOG_NAME) - conf.setConfString("spark.sql.catalog.dummy", classOf[DummyCatalog].getName) - catalogManager.setCurrentCatalog("dummy") - assert(catalogManager.currentCatalog.name() == "dummy") - assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) - - conf.setConfString("spark.sql.catalog.dummy2", classOf[DummyCatalog].getName) - conf.setConfString(SQLConf.DEFAULT_CATALOG.key, "dummy2") - // The current catalog shouldn't be changed if it's set before. - assert(catalogManager.currentCatalog.name() == "dummy") + withSQLConf("spark.sql.catalog.dummy" -> classOf[DummyCatalog].getName) { + catalogManager.setCurrentCatalog("dummy") + assert(catalogManager.currentCatalog.name() == "dummy") + assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) + + withSQLConf("spark.sql.catalog.dummy2" -> classOf[DummyCatalog].getName, + SQLConf.DEFAULT_CATALOG.key -> "dummy2") { + // The current catalog shouldn't be changed if it's set before. + assert(catalogManager.currentCatalog.name() == "dummy") + } + } } test("current namespace should be updated when switching current catalog") { - val conf = new SQLConf - val catalogManager = new CatalogManager(conf, FakeV2SessionCatalog, createSessionCatalog(conf)) - conf.setConfString("spark.sql.catalog.dummy", classOf[DummyCatalog].getName) - catalogManager.setCurrentCatalog("dummy") - assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) - catalogManager.setCurrentNamespace(Array("a")) - assert(catalogManager.currentNamespace.sameElements(Array("a"))) - - // If we set current catalog to the same catalog, current namespace should stay the same. 
- catalogManager.setCurrentCatalog("dummy") - assert(catalogManager.currentNamespace.sameElements(Array("a"))) - - // If we switch to a different catalog, current namespace should be reset. - conf.setConfString("spark.sql.catalog.dummy2", classOf[DummyCatalog].getName) - catalogManager.setCurrentCatalog("dummy2") - assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) + val catalogManager = new CatalogManager(FakeV2SessionCatalog, createSessionCatalog()) + withSQLConf("spark.sql.catalog.dummy" -> classOf[DummyCatalog].getName) { + catalogManager.setCurrentCatalog("dummy") + assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) + catalogManager.setCurrentNamespace(Array("a")) + assert(catalogManager.currentNamespace.sameElements(Array("a"))) + + // If we set current catalog to the same catalog, current namespace should stay the same. + catalogManager.setCurrentCatalog("dummy") + assert(catalogManager.currentNamespace.sameElements(Array("a"))) + + // If we switch to a different catalog, current namespace should be reset. + withSQLConf("spark.sql.catalog.dummy2" -> classOf[DummyCatalog].getName) { + catalogManager.setCurrentCatalog("dummy2") + assert(catalogManager.currentNamespace.sameElements(Array("a", "b"))) + } + } } test("set current namespace") { - val conf = new SQLConf - val v1SessionCatalog = createSessionCatalog(conf) + val v1SessionCatalog = createSessionCatalog() v1SessionCatalog.createDatabase( CatalogDatabase( "test", "", v1SessionCatalog.getDefaultDBPath("test"), Map.empty), ignoreIfExists = false) - val catalogManager = new CatalogManager(conf, FakeV2SessionCatalog, v1SessionCatalog) + val catalogManager = new CatalogManager(FakeV2SessionCatalog, v1SessionCatalog) // If the current catalog is session catalog, setting current namespace actually sets // `SessionCatalog.currentDb`. @@ -106,23 +107,25 @@ class CatalogManagerSuite extends SparkFunSuite { } // when switching current catalog, `SessionCatalog.currentDb` should be reset. - conf.setConfString("spark.sql.catalog.dummy", classOf[DummyCatalog].getName) - catalogManager.setCurrentCatalog("dummy") - assert(v1SessionCatalog.getCurrentDatabase == "default") - catalogManager.setCurrentNamespace(Array("test2")) - assert(v1SessionCatalog.getCurrentDatabase == "default") - - // Check namespace existence if currentCatalog implements SupportsNamespaces. - conf.setConfString("spark.sql.catalog.testCatalog", classOf[InMemoryTableCatalog].getName) - catalogManager.setCurrentCatalog("testCatalog") - catalogManager.currentCatalog.asInstanceOf[InMemoryTableCatalog] - .createNamespace(Array("test3"), Map.empty[String, String].asJava) - assert(v1SessionCatalog.getCurrentDatabase == "default") - catalogManager.setCurrentNamespace(Array("test3")) - assert(v1SessionCatalog.getCurrentDatabase == "default") - - intercept[NoSuchNamespaceException] { - catalogManager.setCurrentNamespace(Array("ns1", "ns2")) + withSQLConf("spark.sql.catalog.dummy" -> classOf[DummyCatalog].getName) { + catalogManager.setCurrentCatalog("dummy") + assert(v1SessionCatalog.getCurrentDatabase == "default") + catalogManager.setCurrentNamespace(Array("test2")) + assert(v1SessionCatalog.getCurrentDatabase == "default") + + // Check namespace existence if currentCatalog implements SupportsNamespaces. 
+ withSQLConf("spark.sql.catalog.testCatalog" -> classOf[InMemoryTableCatalog].getName) { + catalogManager.setCurrentCatalog("testCatalog") + catalogManager.currentCatalog.asInstanceOf[InMemoryTableCatalog] + .createNamespace(Array("test3"), Map.empty[String, String].asJava) + assert(v1SessionCatalog.getCurrentDatabase == "default") + catalogManager.setCurrentNamespace(Array("test3")) + assert(v1SessionCatalog.getCurrentDatabase == "default") + + intercept[NoSuchNamespaceException] { + catalogManager.setCurrentNamespace(Array("ns1", "ns2")) + } + } } } } diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt index 49dc7adccbf3c..3d2b2e5c8edba 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt @@ -7,9 +7,9 @@ OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -subexpressionElimination off, codegen on 26809 27731 898 0.0 268094225.4 1.0X -subexpressionElimination off, codegen off 25117 26612 1357 0.0 251166638.4 1.1X -subexpressionElimination on, codegen on 2582 2906 282 0.0 25819408.7 10.4X -subexpressionElimination on, codegen off 25635 26131 804 0.0 256346873.1 1.0X +subexpressionElimination off, codegen on 25932 26908 916 0.0 259320042.3 1.0X +subexpressionElimination off, codegen off 26085 26159 65 0.0 260848905.0 1.0X +subexpressionElimination on, codegen on 2860 2939 72 0.0 28603312.9 9.1X +subexpressionElimination on, codegen off 2517 2617 93 0.0 25165157.7 10.3X diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt index 3f131726bc53d..ca2a9c6497500 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt @@ -7,9 +7,9 @@ OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz from_json as subExpr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -subexpressionElimination off, codegen on 24841 25365 803 0.0 248412787.5 1.0X -subexpressionElimination off, codegen off 25344 26205 941 0.0 253442656.5 1.0X -subexpressionElimination on, codegen on 2883 3019 119 0.0 28833086.8 8.6X -subexpressionElimination on, codegen off 24707 25688 903 0.0 247068775.9 1.0X +subexpressionElimination off, codegen on 26503 27622 1937 0.0 265033362.4 1.0X +subexpressionElimination off, codegen off 24920 25376 430 0.0 249196978.2 1.1X +subexpressionElimination on, codegen on 2421 2466 39 0.0 24213606.1 10.9X +subexpressionElimination on, codegen off 2360 2435 87 0.0 23604320.7 11.2X diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 30792c9bacd53..c164835c753e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql import scala.collection.JavaConverters._ -import scala.language.implicitConversions import 
org.apache.spark.annotation.Stable import org.apache.spark.internal.Logging diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 991f02d43bc47..31b4c158aa67b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -470,7 +470,6 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { def insertInto(tableName: String): Unit = { import df.sparkSession.sessionState.analyzer.{AsTableIdentifier, NonSessionCatalogAndIdentifier, SessionCatalogAndIdentifier} import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ - import org.apache.spark.sql.connector.catalog.CatalogV2Util._ assertNotBucketed("insertInto") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 3d431d6ff13a9..2c38a65ac2106 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -21,7 +21,6 @@ import java.io.{ByteArrayOutputStream, CharArrayWriter, DataOutputStream} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer -import scala.language.implicitConversions import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal @@ -63,7 +62,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.unsafe.array.ByteArrayMethods -import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils private[sql] object Dataset { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 7e430b682faf4..c40ce0f4777c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql import java.util.Locale import scala.collection.JavaConverters._ -import scala.language.implicitConversions import org.apache.spark.annotation.Stable import org.apache.spark.api.python.PythonEvalType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index e9bc25d489718..2f46fa8073bbc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -18,10 +18,8 @@ package org.apache.spark.sql import org.apache.spark.annotation.Stable -import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{ConfigEntry, OptionalConfigEntry} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.{DeprecatedConfig, RemovedConfig} /** * Runtime configuration interface for Spark. To access this, use `SparkSession.conf`. 
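The `RuntimeConfig` scaladoc above directs callers to `SparkSession.conf`. As a brief usage sketch (not part of this patch; the `RuntimeConfSketch` object and the local session are illustrative assumptions only), a runtime-settable SQL option can be read and overridden like this:

import org.apache.spark.sql.SparkSession

object RuntimeConfSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical local session, used only to demonstrate the API.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("runtime-conf-sketch")
      .getOrCreate()

    // SparkSession.conf exposes the RuntimeConfig interface described above.
    val before = spark.conf.get("spark.sql.shuffle.partitions")
    spark.conf.set("spark.sql.shuffle.partitions", "4")
    val after = spark.conf.get("spark.sql.shuffle.partitions")
    println(s"shuffle partitions: $before -> $after")

    spark.stop()
  }
}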
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 0f6ae9c5d44e1..cceb38558946e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -30,9 +30,9 @@ import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} -import org.apache.spark.sql.execution.aggregate.{ScalaAggregator, ScalaUDAF} +import org.apache.spark.sql.execution.aggregate.ScalaUDAF import org.apache.spark.sql.execution.python.UserDefinedPythonFunction -import org.apache.spark.sql.expressions.{Aggregator, SparkUserDefinedFunction, UserDefinedAggregateFunction, UserDefinedAggregator, UserDefinedFunction} +import org.apache.spark.sql.expressions.{SparkUserDefinedFunction, UserDefinedAggregateFunction, UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index c6a644f9f2e29..1436574c0d90a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalog import scala.collection.JavaConverters._ -import org.apache.spark.annotation.{Evolving, Experimental, Stable} +import org.apache.spark.annotation.Stable import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset} import org.apache.spark.sql.types.StructType import org.apache.spark.storage.StorageLevel diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index bd9120a1fbe78..303ae47f06b84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, CatalogUtils} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, CatalogV2Util, Identifier, LookupCatalog, SupportsNamespaces, SupportsPartitionManagement, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, CatalogV2Util, Identifier, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala index fa41e865444da..3ba8745be995f 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputExpression.scala @@ -16,8 +16,8 @@ */ package org.apache.spark.sql.execution -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, NamedExpression, SortOrder} -import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeMap, AttributeReference, Expression, NamedExpression, SortOrder} +import org.apache.spark.sql.catalyst.plans.physical.Partitioning /** * A trait that provides functionality to handle aliases in the `outputExpressions`. @@ -25,19 +25,15 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partition trait AliasAwareOutputExpression extends UnaryExecNode { protected def outputExpressions: Seq[NamedExpression] - protected def hasAlias: Boolean = outputExpressions.collectFirst { case _: Alias => }.isDefined + private lazy val aliasMap = AttributeMap(outputExpressions.collect { + case a @ Alias(child: AttributeReference, _) => (child, a.toAttribute) + }) - protected def replaceAliases(exprs: Seq[Expression]): Seq[Expression] = { - exprs.map { - case a: AttributeReference => replaceAlias(a).getOrElse(a) - case other => other - } - } + protected def hasAlias: Boolean = aliasMap.nonEmpty - protected def replaceAlias(attr: AttributeReference): Option[Attribute] = { - outputExpressions.collectFirst { - case a @ Alias(child: AttributeReference, _) if child.semanticEquals(attr) => - a.toAttribute + protected def normalizeExpression(exp: Expression): Expression = { + exp.transform { + case attr: AttributeReference => aliasMap.getOrElse(attr, attr) } } } @@ -50,7 +46,8 @@ trait AliasAwareOutputPartitioning extends AliasAwareOutputExpression { final override def outputPartitioning: Partitioning = { if (hasAlias) { child.outputPartitioning match { - case h: HashPartitioning => h.copy(expressions = replaceAliases(h.expressions)) + case e: Expression => + normalizeExpression(e).asInstanceOf[Partitioning] case other => other } } else { @@ -68,11 +65,10 @@ trait AliasAwareOutputOrdering extends AliasAwareOutputExpression { final override def outputOrdering: Seq[SortOrder] = { if (hasAlias) { - orderingExpressions.map { s => - s.child match { - case a: AttributeReference => s.copy(child = replaceAlias(a).getOrElse(a)) - case _ => s - } + orderingExpressions.map { sortOrder => + val newSortOrder = normalizeExpression(sortOrder).asInstanceOf[SortOrder] + val newSameOrderExpressions = newSortOrder.sameOrderExpressions.map(normalizeExpression) + newSortOrder.copy(sameOrderExpressions = newSameOrderExpressions) } } else { orderingExpressions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 5f72d6005a8dd..f163d85914bc9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, SubqueryExpression} import org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan, ResolvedHint} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper -import 
org.apache.spark.sql.execution.columnar.{DefaultCachedBatchSerializer, InMemoryRelation} +import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.command.CommandUtils import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileTable} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala index e1b9c8f430c56..b0bbb52bc4990 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.sql.execution -import scala.collection.mutable - import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 45d28ddb42fc3..44636beeec7fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -25,7 +25,6 @@ import org.apache.commons.lang3.StringUtils import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala index dcec0b019da28..08950c827f5aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala @@ -22,7 +22,7 @@ import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate, ZoneOffset} import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, LegacyDateFormats, TimestampFormatter} +import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand, ShowViewsCommand} import org.apache.spark.sql.execution.datasources.v2.{DescribeTableExec, ShowTablesExec} import org.apache.spark.sql.internal.SQLConf diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 77f7a4e553f06..040d1f36ed8a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -343,8 +343,10 @@ object QueryExecution { PlanDynamicPruningFilters(sparkSession), PlanSubqueries(sparkSession), RemoveRedundantProjects, - RemoveRedundantSorts, EnsureRequirements, + // `RemoveRedundantSorts` needs to be added after `EnsureRequirements` to guarantee the same + // number of partitions when instantiating PartitioningCollection. 
+ RemoveRedundantSorts, DisableUnnecessaryBucketedScan, ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.columnarRules), CollapseCodegenStages(), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantProjects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantProjects.scala index 8746cc6f650d7..bbe3f50492d9f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantProjects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/RemoveRedundantProjects.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, PartialMerge} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.aggregate.BaseAggregateExec import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanExecBase -import org.apache.spark.sql.execution.window.WindowExec import org.apache.spark.sql.internal.SQLConf /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index ead8c00031112..062aa69b3adb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -135,7 +135,12 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ def longMetric(name: String): SQLMetric = metrics(name) // TODO: Move to `DistributedPlan` - /** Specifies how data is partitioned across different nodes in the cluster. */ + /** + * Specifies how data is partitioned across different nodes in the cluster. + * Note this method may fail if it is invoked before `EnsureRequirements` is applied + * since `PartitioningCollection` requires all its partitionings to have + * the same number of partitions. + */ def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH! 
/** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala index 895eeedd86b8b..c88fcecc9983b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala @@ -18,18 +18,15 @@ package org.apache.spark.sql.execution import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.adaptive.LogicalQueryStageStrategy import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, FileSourceStrategy} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy -import org.apache.spark.sql.internal.SQLConf -class SparkPlanner( - val session: SparkSession, - val conf: SQLConf, - val experimentalMethods: ExperimentalMethods) - extends SparkStrategies { +class SparkPlanner(val session: SparkSession, val experimentalMethods: ExperimentalMethods) + extends SparkStrategies with SQLConfHelper { def numPartitions: Int = conf.numShufflePartitions @@ -40,7 +37,7 @@ class SparkPlanner( PythonEvals :: new DataSourceV2Strategy(session) :: FileSourceStrategy :: - DataSourceStrategy(conf) :: + DataSourceStrategy :: SpecialLimits :: Aggregation :: Window :: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 6c42c051fbba6..85476bcd21e19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -27,7 +27,7 @@ import org.antlr.v4.runtime.{ParserRuleContext, Token} import org.antlr.v4.runtime.tree.TerminalNode import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index ba3d83714c302..e9b1aa81895f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.python._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.MemoryPlan import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} +import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType /** @@ -312,8 +312,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { /** * Used to plan streaming aggregation queries that are computed incrementally as part of a - * [[StreamingQuery]]. Currently this rule is injected into the planner - * on-demand, only when planning in a [[org.apache.spark.sql.execution.streaming.StreamExecution]] + * [[org.apache.spark.sql.streaming.StreamingQuery]]. 
Currently this rule is injected into the + * planner on-demand, only when planning in a + * [[org.apache.spark.sql.execution.streaming.StreamExecution]] */ object StatefulAggregationStrategy extends Strategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index a8905ca530005..b2963457e22db 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ -import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.aggregate.HashAggregateExec diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 0865e42b440db..570edbf5f78a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -88,8 +88,8 @@ case class AdaptiveSparkPlanExec( // Exchange nodes) after running these rules. private def queryStagePreparationRules: Seq[Rule[SparkPlan]] = Seq( RemoveRedundantProjects, - RemoveRedundantSorts, EnsureRequirements, + RemoveRedundantSorts, DisableUnnecessaryBucketedScan ) ++ context.session.sessionState.queryStagePrepRules diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala index 011acbf1b22a4..3760782515e97 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Join, LogicalPlan, NO_BROADCAST_HASH} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.internal.SQLConf /** * This optimization rule detects a join child that has a high ratio of empty partitions and diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala index 9914eddd53a3d..bff142315f8ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStage.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} -import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.SparkPlan /** diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReuseAdaptiveSubquery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReuseAdaptiveSubquery.scala index c3c7358641fcb..71540dbd39f95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReuseAdaptiveSubquery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReuseAdaptiveSubquery.scala @@ -21,7 +21,6 @@ import scala.collection.concurrent.TrieMap import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{BaseSubqueryExec, ExecSubqueryExpression, ReusedSubqueryExec, SparkPlan} -import org.apache.spark.sql.internal.SQLConf case class ReuseAdaptiveSubquery( reuseMap: TrieMap[SparkPlan, BaseSubqueryExec]) extends Rule[SparkPlan] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala index cdc57dbc7dcc2..aae3d922b28a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/simpleCosting.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.exchange.{ShuffleExchangeExec, ShuffleExchangeLike} +import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike /** * A simple implementation of [[Cost]], which takes a number of [[Long]] as the cost value. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala index 75651500954cf..1c140d7b6955f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala @@ -28,7 +28,6 @@ import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.KVIterator -import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class ObjectAggregationIterator( partIndex: Int, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationMap.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationMap.scala index b5372bcca89dd..9f2cf84a6d7e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationMap.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationMap.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, U import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate} import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.types.StructType -import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter /** * An aggregation map that supports using safe `SpecificInternalRow`s aggregation buffers, so that diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala index 492b0f2da77cb..deb9e76c51760 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala @@ -19,12 +19,13 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.execution.metric.SQLMetric /** - * An iterator used to evaluate [[AggregateFunction]]. It assumes the input rows have been - * sorted by values of [[groupingExpressions]]. + * An iterator used to evaluate + * [[org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction]]. + * It assumes the input rows have been sorted by values of [[groupingExpressions]]. */ class SortBasedAggregationIterator( partIndex: Int, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index 44bc9c2e3a9d0..41e247a02759b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -17,16 +17,12 @@ package org.apache.spark.sql.execution.aggregate -import scala.reflect.runtime.universe.TypeTag - import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, _} -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} -import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateMutableProjection, GenerateSafeProjection} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala index 501e1c460f9c9..f62aa5db0872f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala @@ -63,10 +63,10 @@ object ArrowWriter { val elementVector = createFieldWriter(vector.getDataVector()) new ArrayWriter(vector, elementVector) case (MapType(_, _, _), vector: MapVector) => - val entryWriter = createFieldWriter(vector.getDataVector).asInstanceOf[StructWriter] - val keyWriter = createFieldWriter(entryWriter.valueVector.getChild(MapVector.KEY_NAME)) - val valueWriter = createFieldWriter(entryWriter.valueVector.getChild(MapVector.VALUE_NAME)) - new MapWriter(vector, keyWriter, valueWriter) + val structVector = vector.getDataVector.asInstanceOf[StructVector] + val keyWriter = createFieldWriter(structVector.getChild(MapVector.KEY_NAME)) + val valueWriter = createFieldWriter(structVector.getChild(MapVector.VALUE_NAME)) + new MapWriter(vector, structVector, keyWriter, 
valueWriter) case (StructType(_), vector: StructVector) => val children = (0 until vector.size()).map { ordinal => createFieldWriter(vector.getChildByOrdinal(ordinal)) @@ -331,11 +331,11 @@ private[arrow] class StructWriter( override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { val struct = input.getStruct(ordinal, children.length) var i = 0 + valueVector.setIndexDefined(count) while (i < struct.numFields) { children(i).write(struct, i) i += 1 } - valueVector.setIndexDefined(count) } override def finish(): Unit = { @@ -351,6 +351,7 @@ private[arrow] class StructWriter( private[arrow] class MapWriter( val valueVector: MapVector, + val structVector: StructVector, val keyWriter: ArrowFieldWriter, val valueWriter: ArrowFieldWriter) extends ArrowFieldWriter { @@ -363,6 +364,7 @@ private[arrow] class MapWriter( val values = map.valueArray() var i = 0 while (i < map.numElements()) { + structVector.setIndexDefined(keyWriter.count) keyWriter.write(keys, i) valueWriter.write(values, i) i += 1 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 7334ea1e27284..006fa0fba4138 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -21,7 +21,7 @@ import java.util.concurrent.{Future => JFuture} import java.util.concurrent.TimeUnit._ import scala.collection.mutable -import scala.concurrent.{ExecutionContext} +import scala.concurrent.ExecutionContext import scala.concurrent.duration.Duration import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.{LongType, StructType} -import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.ThreadUtils import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} /** Physical plan for Project. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/CoalesceBucketsInJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/CoalesceBucketsInJoin.scala index 40a2a7a2359e0..a4e5be01b45a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/CoalesceBucketsInJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/CoalesceBucketsInJoin.scala @@ -26,7 +26,6 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partition import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, ProjectExec, SparkPlan} import org.apache.spark.sql.execution.joins.{BaseJoinExec, ShuffledHashJoinExec, SortMergeJoinExec} -import org.apache.spark.sql.internal.SQLConf /** * This rule coalesces one side of the `SortMergeJoin` and `ShuffledHashJoin` diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala index bb59f44abc761..6b195b3b49f09 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, ProjectExec, SortExec, SparkPlan} import org.apache.spark.sql.execution.aggregate.BaseAggregateExec import org.apache.spark.sql.execution.exchange.Exchange -import org.apache.spark.sql.internal.SQLConf /** * Disable unnecessary bucketed table scan based on actual physical query plan. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala index 45557bfbada6c..d2f65b745f35a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.columnar import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} +import org.apache.spark.unsafe.types.UTF8String class ColumnStatisticsSchema(a: Attribute) extends Serializable { val upperBound = AttributeReference(a.name + ".upperBound", a.dataType, nullable = true)() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala index dedace4af4d14..216636c7ea14f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandCheck.scala @@ -17,14 +17,14 @@ package org.apache.spark.sql.execution.command +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.util.SchemaUtils /** * Checks legitimization of various execution commands. 
*/ -case class CommandCheck(conf: SQLConf) extends (LogicalPlan => Unit) { +object CommandCheck extends (LogicalPlan => Unit) with SQLConfHelper { override def apply(plan: LogicalPlan): Unit = { plan.foreach { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index 8bf7504716f79..f86f62bbf853b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatistics, CatalogTable} +import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -163,7 +163,7 @@ object CommandUtils extends Logging { .getConfString("hive.exec.stagingdir", ".hive-staging") val filter = new PathFilterIgnoreNonData(stagingDir) val sizes = InMemoryFileIndex.bulkListLeafFiles(paths.flatten, - sparkSession.sessionState.newHadoopConf(), filter, sparkSession, isRootLevel = true).map { + sparkSession.sessionState.newHadoopConf(), filter, sparkSession).map { case (_, files) => files.map(_.getLen).sum } // the size is 0 where paths(i) is not defined and sizes(i) where it is defined diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala index a1bb5af1ab723..a56007f5d5d95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/DataWritingCommand.scala @@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker -import org.apache.spark.sql.execution.datasources.FileFormatWriter import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.util.SerializableConfiguration @@ -35,7 +34,7 @@ trait DataWritingCommand extends Command { /** * The input query plan that produces the data to be written. * IMPORTANT: the input query plan MUST be analyzed, so that we can carry its output columns - * to [[FileFormatWriter]]. + * to [[org.apache.spark.sql.execution.datasources.FileFormatWriter]]. 
*/ def query: LogicalPlan diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala index 61ee6d7f4a299..00accedf21556 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/SetCommand.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.command import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.IgnoreCachedData import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.types.{StringType, StructField, StructType} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala index ef6b0bba1628e..f99dc8d9f1a8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/cache.scala @@ -21,7 +21,6 @@ import java.util.Locale import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 68c47d6a6dfaa..6ed40aacd1125 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -21,7 +21,6 @@ import java.net.URI import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index d76b4b8894783..330a503e5f8e2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, NoSuchFunctionException} import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, FunctionResource} -import org.apache.spark.sql.catalyst.expressions.{Attribute, ExpressionInfo} +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.types.{StringType, StructField, StructType} diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index b55bed9cd7fc0..34ded5d456d09 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.datasources import java.util.{Locale, ServiceConfigurationError, ServiceLoader} import scala.collection.JavaConverters._ -import scala.language.implicitConversions import scala.util.{Failure, Success, Try} import org.apache.hadoop.conf.Configuration diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 822bdbdad8f00..361d1fab03421 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources import java.util.Locale -import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.hadoop.fs.Path @@ -27,7 +26,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, QualifiedTableName} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, QualifiedTableName, SQLConfHelper} import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ @@ -42,9 +41,7 @@ import org.apache.spark.sql.connector.catalog.SupportsRead import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} import org.apache.spark.sql.execution.command._ -import org.apache.spark.sql.execution.datasources.FileSourceStrategy.{extractPredicatesWithinOutputSet, logInfo} import org.apache.spark.sql.execution.streaming.StreamingRelation -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ @@ -314,8 +311,8 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] /** * A Strategy for planning scans over data sources defined using the sources API. */ -case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with CastSupport { - import DataSourceStrategy._ +object DataSourceStrategy + extends Strategy with Logging with CastSupport with PredicateHelper with SQLConfHelper { def apply(plan: LogicalPlan): Seq[execution.SparkPlan] = plan match { case ScanOperation(projects, filters, l @ LogicalRelation(t: CatalystScan, _, _, _)) => @@ -466,9 +463,7 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with private[this] def toCatalystRDD(relation: LogicalRelation, rdd: RDD[Row]): RDD[InternalRow] = { toCatalystRDD(relation, relation.output, rdd) } -} -object DataSourceStrategy extends PredicateHelper { /** * The attribute name may differ from the one in the schema if the query analyzer * is case insensitive. 
We should change attribute names to match the ones in the schema, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala index 28a63c26604ec..1149767bdade2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala @@ -22,11 +22,12 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileDataSourceV2, FileTable} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileTable} /** * Replace the File source V2 table in [[InsertIntoStatement]] to V1 [[FileFormat]]. - * E.g, with temporary view `t` using [[FileDataSourceV2]], inserting into view `t` fails + * E.g, with temporary view `t` using + * [[org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2]], inserting into view `t` fails * since there is no corresponding physical plan. * This is a temporary hack for making current data source V2 work. It should be * removed when Catalog support of file data source v2 is finished. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index d278802e6c9f2..a0b191e60f376 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -17,10 +17,6 @@ package org.apache.spark.sql.execution.datasources -import java.util.Locale - -import scala.collection.mutable - import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.FileRelation diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index 130894e9bc025..21275951b5603 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -128,7 +128,7 @@ class InMemoryFileIndex( } val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, this.getClass)) val discovered = InMemoryFileIndex.bulkListLeafFiles( - pathsToFetch.toSeq, hadoopConf, filter, sparkSession, isRootLevel = true) + pathsToFetch.toSeq, hadoopConf, filter, sparkSession) discovered.foreach { case (path, leafFiles) => HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size) fileStatusCache.putLeafFiles(path, leafFiles.toArray) @@ -146,20 +146,17 @@ object InMemoryFileIndex extends Logging { paths: Seq[Path], hadoopConf: Configuration, filter: PathFilter, - sparkSession: SparkSession, - isRootLevel: Boolean): Seq[(Path, Seq[FileStatus])] = { + sparkSession: SparkSession): Seq[(Path, Seq[FileStatus])] = { HadoopFSUtils.parallelListLeafFiles( sc = sparkSession.sparkContext, paths = paths, hadoopConf = hadoopConf, - filter = filter, - isRootLevel = isRootLevel, + filter = new 
PathFilterWrapper(filter), ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles, ignoreLocality = sparkSession.sessionState.conf.ignoreDataLocality, parallelismThreshold = sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold, - parallelismMax = sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism, - filterFun = Some(shouldFilterOut)) - } + parallelismMax = sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism) + } /** Checks if we should filter out this path name. */ def shouldFilterOut(pathName: String): Boolean = { @@ -175,3 +172,9 @@ object InMemoryFileIndex extends Logging { exclude && !include } } + +private class PathFilterWrapper(val filter: PathFilter) extends PathFilter with Serializable { + override def accept(path: Path): Boolean = { + (filter == null || filter.accept(path)) && !InMemoryFileIndex.shouldFilterOut(path.getName) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala index 868e5371426c0..1d7abe5b938c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.execution.datasources import org.apache.hadoop.mapreduce.TaskAttemptContext -import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index 5341e22f5e670..fed9614347f6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} -import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.sql.types.StructType /** * An abstract class that represents [[FileIndex]]s that are aware of partitioned tables. 
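For readers following the InMemoryFileIndex hunk above: the dropped `isRootLevel`/`filterFun` arguments are replaced by wrapping the caller's PathFilter, so HadoopFSUtils sees a single serializable filter that applies both the user filter and `shouldFilterOut`. A small sketch of the combined predicate; the paths are made up, and PathFilterWrapper itself is private to InMemoryFileIndex.scala, so its logic is written out inline here:

  import org.apache.hadoop.fs.{Path, PathFilter}
  import org.apache.spark.sql.execution.datasources.InMemoryFileIndex

  // Same check as PathFilterWrapper.accept.
  def accept(filter: PathFilter, path: Path): Boolean =
    (filter == null || filter.accept(path)) && !InMemoryFileIndex.shouldFilterOut(path.getName)

  val parquetOnly: PathFilter = (p: Path) => p.getName.endsWith(".parquet")

  accept(parquetOnly, new Path("/data/part-00000.parquet"))   // true: passes both checks
  accept(parquetOnly, new Path("/data/.part-00000.parquet"))  // false: hidden file, dropped by shouldFilterOut
  accept(null, new Path("/data/part-00000.orc"))              // true: no user filter, not a hidden/metadata file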
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 4087efc486a4f..796c23c7337d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{Resolver, TypeCoercion} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Cast, Literal} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal} import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateFormatter, DateTimeUtils, TimestampFormatter} import org.apache.spark.sql.types._ import org.apache.spark.sql.util.SchemaUtils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/RecordReaderIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/RecordReaderIterator.scala index c3dd6939ec5bd..0959d8799f5a1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/RecordReaderIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/RecordReaderIterator.scala @@ -21,8 +21,6 @@ import java.io.Closeable import org.apache.hadoop.mapreduce.RecordReader -import org.apache.spark.sql.catalyst.InternalRow - /** * An adaptor from a Hadoop [[RecordReader]] to an [[Iterator]] over the values returned. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala index 61e0154a0ffe8..76a6a48ca0b0c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType} +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} /** * Prunes unnecessary physical columns given a [[PhysicalOperation]] over a data source relation. 
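A pattern that recurs in this patch (the SparkPlanner, CommandCheck, and DataSourceStrategy hunks above, and the V2SessionCatalog hunk below) is dropping the explicit `conf: SQLConf` constructor parameter in favour of the SQLConfHelper mix-in, which resolves `conf` from the active session at call time. A minimal sketch of the pattern; `ExampleCheck` and its body are made up, and only the shape mirrors the CommandCheck change:

  import org.apache.spark.sql.catalyst.SQLConfHelper
  import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

  // No SQLConf is threaded through the constructor, so the check can be a singleton
  // object; `conf` comes from SQLConfHelper and reflects the session invoking it.
  object ExampleCheck extends (LogicalPlan => Unit) with SQLConfHelper {
    override def apply(plan: LogicalPlan): Unit = {
      if (!conf.caseSensitiveAnalysis) {
        // e.g. run a case-insensitive validation over `plan` here (elided in this sketch)
      }
    }
  }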
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala index 637ce68ec05a2..b241243363746 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormat.scala @@ -22,14 +22,14 @@ import java.sql.Timestamp import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileStatus, GlobFilter, Path} +import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce.Job import org.apache.spark.SparkException import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter -import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.internal.SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH import org.apache.spark.sql.sources.{And, DataSourceRegister, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Not, Or} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 10146be44e8bf..d8fa768a604f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -23,8 +23,6 @@ import java.nio.charset.{Charset, StandardCharsets} import com.univocity.parsers.csv.CsvParser import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.hadoop.io.{LongWritable, Text} -import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.FileInputFormat diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index e25ce53941ff6..87ca78db59b29 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.jdbc -import java.sql.{Connection, PreparedStatement, ResultSet, SQLException} +import java.sql.{Connection, PreparedStatement, ResultSet} import scala.util.control.NonFatal @@ -46,8 +46,8 @@ object JDBCRDD extends Logging { * @param options - JDBC options that contains url, table and other information. * * @return A StructType giving the table's Catalyst schema. - * @throws SQLException if the table specification is garbage. - * @throws SQLException if the table contains an unsupported type. + * @throws java.sql.SQLException if the table specification is garbage. + * @throws java.sql.SQLException if the table contains an unsupported type. 
*/ def resolveTable(options: JDBCOptions): StructType = { val url = options.url diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index e0fa4584185e9..f2f6f60cb1dde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -17,13 +17,10 @@ package org.apache.spark.sql.execution.datasources.json -import java.nio.charset.{Charset, StandardCharsets} - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} -import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.ExprUtils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala index 4ab009c6bd014..32ce7185f7381 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala @@ -23,7 +23,6 @@ import org.apache.orc.mapred.{OrcList, OrcMap, OrcStruct, OrcTimestamp} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.catalyst.util.RebaseDateTime.rebaseJulianToGregorianDays import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 95f19f9dcee64..1901f5575470e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -35,7 +35,6 @@ import org.apache.parquet.hadoop._ import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel import org.apache.parquet.hadoop.codec.CodecConfig import org.apache.parquet.hadoop.util.ContextUtil -import org.apache.parquet.schema.MessageType import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging @@ -504,7 +503,8 @@ object ParquetFileFormat extends Logging { /** * Reads Spark SQL schema from a Parquet footer. If a valid serialized Spark SQL schema string * can be found in the file metadata, returns the deserialized [[StructType]], otherwise, returns - * a [[StructType]] converted from the [[MessageType]] stored in this footer. + * a [[StructType]] converted from the [[org.apache.parquet.schema.MessageType]] stored in this + * footer. 
*/ def readSchemaFromFooter( footer: Footer, converter: ParquetToSparkSchemaConverter): StructType = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala index e7753cec681cf..70f6726c581a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOutputWriter.scala @@ -21,7 +21,6 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.parquet.hadoop.ParquetOutputFormat -import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 6ef56af927129..f65aef95b6c38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -26,9 +26,9 @@ import scala.collection.mutable.ArrayBuffer import org.apache.parquet.column.Dictionary import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} -import org.apache.parquet.schema.{GroupType, MessageType, OriginalType, Type} -import org.apache.parquet.schema.OriginalType.{INT_32, LIST, UTF8} -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, DOUBLE, FIXED_LEN_BYTE_ARRAY, INT32, INT64, INT96} +import org.apache.parquet.schema.{GroupType, OriginalType, Type} +import org.apache.parquet.schema.OriginalType.LIST +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, FIXED_LEN_BYTE_ARRAY, INT32, INT64, INT96} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow @@ -107,11 +107,15 @@ private[parquet] class ParquetPrimitiveConverter(val updater: ParentContainerUpd * }}} * 5 converters will be created: * - * - a root [[ParquetRowConverter]] for [[MessageType]] `root`, which contains: - * - a [[ParquetPrimitiveConverter]] for required [[INT_32]] field `f1`, and + * - a root [[ParquetRowConverter]] for [[org.apache.parquet.schema.MessageType]] `root`, + * which contains: + * - a [[ParquetPrimitiveConverter]] for required + * [[org.apache.parquet.schema.OriginalType.INT_32]] field `f1`, and * - a nested [[ParquetRowConverter]] for optional [[GroupType]] `f2`, which contains: - * - a [[ParquetPrimitiveConverter]] for required [[DOUBLE]] field `f21`, and - * - a [[ParquetStringConverter]] for optional [[UTF8]] string field `f22` + * - a [[ParquetPrimitiveConverter]] for required + * [[org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE]] field `f21`, and + * - a [[ParquetStringConverter]] for optional [[org.apache.parquet.schema.OriginalType.UTF8]] + * string field `f22` * * When used as a root converter, [[NoopUpdater]] should be used since root converters don't have * any "parent" container. 
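The rewritten scaladoc above walks through the converter tree for a small Parquet schema (the `{{{ ... }}}` block sits just above this hunk's context). A sketch rebuilding that schema with parquet-mr's Types builder, so the five converters can be mapped to concrete fields; it uses the pre-LogicalTypeAnnotation OriginalType API that the surrounding imports rely on and is illustrative only:

  import org.apache.parquet.schema.{MessageType, OriginalType, Types}
  import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, DOUBLE, INT32}

  val root: MessageType = Types.buildMessage()
    .required(INT32).named("f1")                            // ParquetPrimitiveConverter (the doc's INT_32 field)
    .optionalGroup()                                        // nested ParquetRowConverter for group f2
      .required(DOUBLE).named("f21")                        // ParquetPrimitiveConverter
      .optional(BINARY).as(OriginalType.UTF8).named("f22")  // ParquetStringConverter
      .named("f2")
    .named("root")                                          // root ParquetRowConverter, used with NoopUpdater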
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index e45514385e292..3a2a642b870f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -29,8 +29,6 @@ import org.apache.spark.sql.connector.catalog.CatalogV2Util.assertNoNullTypeInSc import org.apache.spark.sql.connector.expressions.{FieldReference, RewritableTransform} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.sources.InsertableRelation import org.apache.spark.sql.types.{AtomicType, StructType} import org.apache.spark.sql.util.SchemaUtils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala index 81b1c81499c74..0ca442baeea2f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table, TableCatalog} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsMetadataColumns, Table} import org.apache.spark.sql.types.StructType case class DescribeTableExec( @@ -41,6 +41,7 @@ case class DescribeTableExec( addPartitioning(rows) if (isExtended) { + addMetadataColumns(rows) addTableDetails(rows) } rows.toSeq @@ -72,6 +73,19 @@ case class DescribeTableExec( } } + private def addMetadataColumns(rows: ArrayBuffer[InternalRow]): Unit = table match { + case hasMeta: SupportsMetadataColumns if hasMeta.metadataColumns.nonEmpty => + rows += emptyRow() + rows += toCatalystRow("# Metadata Columns", "", "") + rows ++= hasMeta.metadataColumns.map { column => + toCatalystRow( + column.name, + column.dataType.simpleString, + Option(column.comment()).getOrElse("")) + } + case _ => + } + private def addPartitioning(rows: ArrayBuffer[InternalRow]): Unit = { rows += emptyRow() rows += toCatalystRow("# Partitioning", "", "") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala index f7b4317ad65e2..777ee9d385f12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.connector.catalog.{CatalogPlugin, SupportsNamespaces} +import org.apache.spark.sql.connector.catalog.CatalogPlugin /** * Physical plan node for dropping a 
namespace. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala index e4de70d4ee88f..8cf59f3a59323 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala @@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.connector.catalog.{Table, TableProvider} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.datasources._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala index 7f6ae20d5cd0b..ce8edce6f08d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala @@ -96,13 +96,11 @@ object PushDownUtils extends PredicateHelper { val exprs = projects ++ filters val requiredColumns = AttributeSet(exprs.flatMap(_.references)) val neededOutput = relation.output.filter(requiredColumns.contains) - if (neededOutput != relation.output) { - r.pruneColumns(neededOutput.toStructType) - val scan = r.build() - scan -> toOutputAttrs(scan.readSchema(), relation) - } else { - r.build() -> relation.output - } + r.pruneColumns(neededOutput.toStructType) + val scan = r.build() + // always project, in case the relation's output has been updated and doesn't match + // the underlying table schema + scan -> toOutputAttrs(scan.readSchema(), relation) case _ => scanBuilder.build() -> relation.output } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala index 95715fd1af56e..7ceee1edee180 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, GenericRowWithSchema} -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table, TableCatalog} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table} /** * Physical plan node for showing table properties. 
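The DescribeTableExec hunk above makes DESCRIBE TABLE EXTENDED print a `# Metadata Columns` section for DSv2 tables that implement SupportsMetadataColumns. A hypothetical usage sketch; the catalog, table, and `_partition` column are made up, and only the section header plus the name/type/comment layout come from addMetadataColumns:

  // Assumes a registered DSv2 catalog `testcat` whose tables expose metadata columns.
  spark.sql("DESCRIBE TABLE EXTENDED testcat.ns.events").show(truncate = false)

  // Rows emitted by addMetadataColumns for such a table take the form:
  //   # Metadata Columns
  //   <column name>   <dataType.simpleString>   <comment, or empty string when null>
  //   e.g. _partition  string                   partition key of the row (hypothetical)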
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala index 5dfd2e52706d0..cb4a2994de1f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 -import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table} +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.streaming.StreamingRelation import org.apache.spark.sql.types.BooleanType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala index 1ca3fd42c0597..f24fb95acb922 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala @@ -22,8 +22,6 @@ import org.apache.hadoop.fs.Path import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex -import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index 6dda1d4aaf37e..9ee145580ce6d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -23,22 +23,21 @@ import java.util import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.{SQLConfHelper, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogTable, CatalogTableType, CatalogUtils, SessionCatalog} import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogV2Util, Identifier, NamespaceChange, SupportsNamespaces, Table, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.catalog.NamespaceChange.RemoveProperty import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, Transform} import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap /** * A [[TableCatalog]] that translates calls to the v1 SessionCatalog. 
*/ -class V2SessionCatalog(catalog: SessionCatalog, conf: SQLConf) - extends TableCatalog with SupportsNamespaces { +class V2SessionCatalog(catalog: SessionCatalog) + extends TableCatalog with SupportsNamespaces with SQLConfHelper { import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper import V2SessionCatalog._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala index 2f9387532c25c..0dbc74395afb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala @@ -19,10 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2.orc import scala.collection.JavaConverters._ -import org.apache.orc.mapreduce.OrcInputFormat - import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.quoteIfNeeded import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.orc.OrcFilters diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala index 6973f55e8dca0..93d7db44f2285 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PlanDynamicPruningFilters.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeSeq, BindReferences, DynamicPruningExpression, DynamicPruningSubquery, Expression, ListQuery, Literal, PredicateHelper} import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight} -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.catalyst.plans.physical.BroadcastMode import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{InSubqueryExec, QueryExecution, SparkPlan, SubqueryBroadcastExec} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index cf38fee055ca5..ebbc8a4df5643 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoinExec} -import org.apache.spark.sql.internal.SQLConf /** * Ensures that the [[org.apache.spark.sql.catalyst.plans.physical.Partitioning Partitioning]] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala index aeaf59b7f0f4a..e58733b35990a 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expre import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index b44b13c8de0da..7171c7f7f9746 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -46,6 +46,8 @@ class ArrowPythonRunner( extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch](funcs, evalType, argOffsets) with PythonArrowOutput { + override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback + override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize require( bufferSize >= 4, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala index 25ce16db264ac..e3d8a943d8cf2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala @@ -27,6 +27,7 @@ import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions, PythonRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.ArrowWriter +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.ArrowUtils import org.apache.spark.sql.vectorized.ColumnarBatch @@ -49,6 +50,8 @@ class CoGroupedArrowPythonRunner( (Iterator[InternalRow], Iterator[InternalRow]), ColumnarBatch](funcs, evalType, argOffsets) with PythonArrowOutput { + override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback + protected def newWriterThread( env: SparkEnv, worker: Socket, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala index 298d63478b63e..7c476ab03c002 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala @@ -26,7 +26,7 @@ import org.apache.spark.api.python.ChainedPythonFunctions import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.UnaryExecNode import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala index 
d341d7019f0ac..d9fe07214d061 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala @@ -23,6 +23,7 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark._ import org.apache.spark.api.python._ +import org.apache.spark.sql.internal.SQLConf /** * A helper class to run Python UDFs in Spark. @@ -34,6 +35,8 @@ class PythonUDFRunner( extends BasePythonRunner[Array[Byte], Array[Byte]]( funcs, evalType, argOffsets) { + override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback + protected override def newWriterThread( env: SparkEnv, worker: Socket, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala index 59f5a7078a151..ae7b7ef23512c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala @@ -26,7 +26,7 @@ import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel -import org.apache.spark.{SparkException, TaskContext} +import org.apache.spark.TaskContext import org.apache.spark.api.r._ import org.apache.spark.api.r.SpecialLengths import org.apache.spark.broadcast.Broadcast diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index eb8b8af7950b2..747094b7791c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, AttributeReference, Expression, Literal, SortOrder, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder, UnsafeRow} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution} import org.apache.spark.sql.execution._ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index c2278e8659147..893639a86c88c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -19,14 +19,12 @@ package org.apache.spark.sql.execution.streaming import java.io._ import java.nio.charset.StandardCharsets -import java.util.{ConcurrentModificationException, EnumSet, UUID} +import java.util.ConcurrentModificationException import scala.reflect.ClassTag import org.apache.commons.io.IOUtils -import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs._ -import org.apache.hadoop.fs.permission.FsPermission import org.json4s.NoTypeHints import org.json4s.jackson.Serialization diff 
--git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index bfa60cf7dfd78..b871874f52967 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -21,14 +21,13 @@ import java.util.UUID import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, SparkSession, Strategy} +import org.apache.spark.sql.{SparkSession, Strategy} import org.apache.spark.sql.catalyst.QueryPlanningTracker import org.apache.spark.sql.catalyst.expressions.{CurrentBatchTimestamp, ExpressionWithRandomSeed} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, HashPartitioning, SinglePartition} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{LeafExecNode, LocalLimitExec, QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} -import org.apache.spark.sql.execution.exchange.{ShuffleExchangeExec, ShuffleExchangeLike} +import org.apache.spark.sql.execution.{LocalLimitExec, QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} +import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.util.Utils @@ -51,7 +50,6 @@ class IncrementalExecution( // Modified planner with stateful operations. override val planner: SparkPlanner = new SparkPlanner( sparkSession, - sparkSession.sessionState.conf, sparkSession.sessionState.experimentalMethods) { override def strategies: Seq[Strategy] = extraPlanningStrategies ++ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index aba0463f56cd7..d6be33c76e937 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -40,7 +40,6 @@ import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table} import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, ReadLimit, SparkDataStream} import org.apache.spark.sql.connector.write.{LogicalWriteInfoImpl, SupportsTruncate} import org.apache.spark.sql.connector.write.streaming.StreamingWrite -import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.StreamingExplainCommand import org.apache.spark.sql.execution.datasources.v2.StreamWriterCommitProgress import org.apache.spark.sql.internal.SQLConf diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala index 516afbea5d9de..fc0cfc30ff2fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamMetadata.scala @@ -31,14 +31,14 @@ import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream -import 
org.apache.spark.sql.streaming.StreamingQuery /** - * Contains metadata associated with a [[StreamingQuery]]. This information is written - * in the checkpoint location the first time a query is started and recovered every time the query - * is restarted. + * Contains metadata associated with a [[org.apache.spark.sql.streaming.StreamingQuery]]. + * This information is written in the checkpoint location the first time a query is started + * and recovered every time the query is restarted. * - * @param id unique id of the [[StreamingQuery]] that needs to be persisted across restarts + * @param id unique id of the [[org.apache.spark.sql.streaming.StreamingQuery]] + * that needs to be persisted across restarts */ case class StreamMetadata(id: String) { def json: String = Serialization.write(this)(StreamMetadata.format) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala index 71792facf698a..2f62dbd7ec578 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala @@ -21,13 +21,13 @@ import scala.reflect.ClassTag import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.internal.Logging -import org.apache.spark.rdd.{RDD, ZippedPartitionsBaseRDD, ZippedPartitionsPartition, ZippedPartitionsRDD2} +import org.apache.spark.rdd.{RDD, ZippedPartitionsBaseRDD, ZippedPartitionsPartition} import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeSet, BoundReference, Expression, NamedExpression, PredicateHelper} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.WatermarkSupport.watermarkExpression -import org.apache.spark.sql.execution.streaming.state.{StateStoreCoordinatorRef, StateStoreProvider, StateStoreProviderId} +import org.apache.spark.sql.execution.streaming.state.{StateStoreCoordinatorRef, StateStoreProviderId} /** @@ -200,8 +200,8 @@ object StreamingSymmetricHashJoinHelper extends Logging { /** * A custom RDD that allows partitions to be "zipped" together, while ensuring the tasks' * preferred location is based on which executors have the required join state stores already - * loaded. This class is a variant of [[ZippedPartitionsRDD2]] which only changes signature - * of `f`. + * loaded. This class is a variant of [[org.apache.spark.rdd.ZippedPartitionsRDD2]] which only + * changes signature of `f`. 
*/ class StateStoreAwareZipPartitionsRDD[A: ClassTag, B: ClassTag, V: ClassTag]( sc: SparkContext, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala index 6d5e7fd5c5cf3..60c66d863a3c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachBatchSink.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.streaming.sources -import org.apache.spark.api.python.PythonException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.execution.streaming.Sink diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala index 507f860e0452a..fa51dd61a939b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala @@ -21,12 +21,13 @@ import scala.collection.mutable import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} import org.apache.spark.sql.connector.write.streaming.StreamingDataWriterFactory /** - * A simple [[DataWriterFactory]] whose tasks just pack rows into the commit message for delivery - * to a [[BatchWrite]] on the driver. + * A simple [[org.apache.spark.sql.connector.write.DataWriterFactory]] whose tasks just pack rows + * into the commit message for delivery to a + * [[org.apache.spark.sql.connector.write.BatchWrite]] on the driver. * * Note that, because it sends all rows to the driver, this factory will generally be unsuitable * for production-quality sinks. It's intended for use in tests. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala index a6ac6f2da8e41..778cfeda68af0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala @@ -35,13 +35,12 @@ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUti import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} -import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.internal.connector.SupportsStreamingUpdateAsAppend import org.apache.spark.sql.types.StructType /** - * A sink that stores the results in memory. This [[Sink]] is primarily intended for use in unit - * tests and does not provide durability. + * A sink that stores the results in memory. 
This [[org.apache.spark.sql.execution.streaming.Sink]] + * is primarily intended for use in unit tests and does not provide durability. */ class MemorySink extends Table with SupportsWrite with Logging { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala index 7b99ceeb612ee..084ddf8077a15 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStore.scala @@ -27,7 +27,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.{SparkContext, SparkEnv, SparkException} +import org.apache.spark.{SparkContext, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.UnsafeRowUtils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala index b894e771a6fe2..f21e2ffb80a7b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/StateStoreRDD.scala @@ -23,8 +23,6 @@ import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.EpochTracker import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index c1954e1d3858e..8cf3739e11150 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -25,14 +25,14 @@ import org.apache.spark.TaskContext import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, JoinedRow, Literal, SpecificInternalRow, UnsafeProjection, UnsafeRow} -import org.apache.spark.sql.execution.streaming.{StatefulOperatorStateInfo, StreamingSymmetricHashJoinExec} +import org.apache.spark.sql.execution.streaming.StatefulOperatorStateInfo import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper._ -import org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager.KeyToValuePair import org.apache.spark.sql.types.{BooleanType, LongType, StructField, StructType} import org.apache.spark.util.NextIterator /** - * Helper class to manage state required by a single side of [[StreamingSymmetricHashJoinExec]]. + * Helper class to manage state required by a single side of + * [[org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinExec]]. 
* The interface of this class is basically that of a multi-map: * - Get: Returns an iterator of multiple values for given key * - Append: Append a new value to the given key diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index 639e862fea1da..9a5183a22d23d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.streaming.state._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.{OutputMode, StateOperatorProgress} import org.apache.spark.sql.types._ import org.apache.spark.util.{CompletionIterator, NextIterator, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala index b19540253d7eb..e53e0644eb268 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala @@ -22,7 +22,6 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, SortOrder, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} -import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{LimitExec, SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala index b693cae824bf9..6e0e36cbe5901 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala @@ -17,17 +17,11 @@ package org.apache.spark.sql.execution.window -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.{ExternalAppendOnlyUnsafeRowArray, SparkPlan, UnaryExecNode} -import org.apache.spark.sql.types.{CalendarIntervalType, DateType, IntegerType, TimestampType} +import org.apache.spark.sql.execution.{ExternalAppendOnlyUnsafeRowArray, SparkPlan} /** * This class calculates and outputs (windowed) aggregates over the rows in a single (sorted) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala index a6a3f3d7384bf..c6b98d48d7dde 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.UnaryExecNode import org.apache.spark.sql.types.{CalendarIntervalType, DateType, IntegerType, TimestampType} trait WindowExecBase extends UnaryExecNode { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 0cef33509a175..80dd3cf8bc840 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -17,16 +17,13 @@ package org.apache.spark.sql.expressions -import scala.reflect.runtime.universe.TypeTag - -import org.apache.spark.annotation.{Experimental, Stable} +import org.apache.spark.annotation.Stable import org.apache.spark.sql.{Column, Encoder} -import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.ScalaAggregator -import org.apache.spark.sql.types.{AnyDataType, DataType} +import org.apache.spark.sql.types.DataType /** * A user-defined function. To create one, use the `udf` functions in `functions`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala index f7591e4d265e0..4e3c5586209e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala @@ -44,8 +44,6 @@ object typed { override protected def _sqlContext: SQLContext = null } - import implicits._ - /** * Average aggregate function. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 8d6281882f188..9861d21d3a430 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql import scala.collection.JavaConverters._ -import scala.language.implicitConversions -import scala.reflect.runtime.universe.{typeTag, TypeTag} +import scala.reflect.runtime.universe.TypeTag import scala.util.Try import org.apache.spark.annotation.Stable @@ -4151,7 +4150,7 @@ object functions { /** * Parses a JSON string and infers its schema in DDL format. * - * @param json a string literal containing a JSON string. + * @param json a foldable string column containing a JSON string. * * @group collection_funcs * @since 2.4.0 @@ -4161,7 +4160,7 @@ object functions { /** * Parses a JSON string and infers its schema in DDL format using options. * - * @param json a string column containing JSON data. + * @param json a foldable string column containing JSON data. 
* @param options options to control how the json is parsed. accepts the same options and the * json data source. See [[DataFrameReader#json]]. * @return a column with string literal containing schema in DDL format. @@ -4426,7 +4425,7 @@ object functions { /** * Parses a CSV string and infers its schema in DDL format. * - * @param csv a string literal containing a CSV string. + * @param csv a foldable string column containing a CSV string. * * @group collection_funcs * @since 3.0.0 @@ -4436,7 +4435,7 @@ object functions { /** * Parses a CSV string and infers its schema in DDL format using options. * - * @param csv a string literal containing a CSV string. + * @param csv a foldable string column containing a CSV string. * @param options options to control how the CSV is parsed. accepts the same options and the * json data source. See [[DataFrameReader#csv]]. * @return a column with string literal containing schema in DDL format. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index 33c15ec76654d..538a5408723bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -150,7 +150,6 @@ abstract class BaseSessionStateBuilder( () => session.sharedState.externalCatalog, () => session.sharedState.globalTempViewManager, functionRegistry, - conf, SessionState.newHadoopConf(session.sparkContext.hadoopConfiguration, conf), sqlParser, resourceLoader) @@ -158,9 +157,9 @@ abstract class BaseSessionStateBuilder( catalog } - protected lazy val v2SessionCatalog = new V2SessionCatalog(catalog, conf) + protected lazy val v2SessionCatalog = new V2SessionCatalog(catalog) - protected lazy val catalogManager = new CatalogManager(conf, v2SessionCatalog, catalog) + protected lazy val catalogManager = new CatalogManager(v2SessionCatalog, catalog) /** * Interface exposed to the user for registering user-defined functions. @@ -175,7 +174,7 @@ abstract class BaseSessionStateBuilder( * * Note: this depends on the `conf` and `catalog` fields. */ - protected def analyzer: Analyzer = new Analyzer(catalogManager, conf) { + protected def analyzer: Analyzer = new Analyzer(catalogManager) { override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: @@ -197,7 +196,7 @@ abstract class BaseSessionStateBuilder( PreReadCheck +: HiveOnlyCheck +: TableCapabilityCheck +: - CommandCheck(conf) +: + CommandCheck +: customCheckRules } @@ -270,7 +269,7 @@ abstract class BaseSessionStateBuilder( * Note: this depends on the `conf` and `experimentalMethods` fields. 
*/ protected def planner: SparkPlanner = { - new SparkPlanner(session, conf, experimentalMethods) { + new SparkPlanner(session, experimentalMethods) { override def extraPlanningStrategies: Seq[Strategy] = super.extraPlanningStrategies ++ customPlanningStrategies } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index 0f9a89741c192..48d8c3d325347 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution._ import org.apache.spark.sql.streaming.StreamingQueryManager -import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListener} +import org.apache.spark.sql.util.ExecutionListenerManager /** * A class that holds all session-specific state in a given [[SparkSession]]. @@ -52,7 +52,8 @@ import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListen * @param planner Planner that converts optimized logical plans to physical plans. * @param streamingQueryManagerBuilder A function to create a streaming query manager to * start and stop streaming queries. - * @param listenerManager Interface to register custom [[QueryExecutionListener]]s. + * @param listenerManager Interface to register custom + * [[org.apache.spark.sql.util.QueryExecutionListener]]s. * @param resourceLoaderBuilder a function to create a session shared resource loader to load JARs, * files, etc. * @param createQueryExecution Function used to create QueryExecution objects.
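Several hunks in this patch (`V2SessionCatalog` and `BaseSessionStateBuilder` above, `VariableSubstitution` below) follow the same refactoring pattern: the explicit `SQLConf` constructor parameter is dropped and the class mixes in `org.apache.spark.sql.catalyst.SQLConfHelper`, whose `conf` member resolves the currently active `SQLConf`. A minimal sketch of the pattern — `NameNormalizer` is a hypothetical class used only for illustration:

```scala
import java.util.Locale

import org.apache.spark.sql.catalyst.SQLConfHelper

// Hypothetical helper showing the refactoring pattern: no SQLConf constructor
// argument; `conf` is provided by the SQLConfHelper trait at call time.
class NameNormalizer extends SQLConfHelper {
  def normalize(name: String): String =
    if (conf.caseSensitiveAnalysis) name else name.toLowerCase(Locale.ROOT)
}
```

Reading `conf` at call time is what lets classes such as `V2SessionCatalog` and `VariableSubstitution` drop the constructor parameter: they now pick up whichever `SQLConf` is active for the current session rather than one captured at construction time.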
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index 1acdc4bd5f0e3..89aceacac6007 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -31,13 +31,11 @@ import org.apache.hadoop.fs.FsUrlStreamHandlerFactory import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.internal.Logging -import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.CacheManager import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SQLTab} import org.apache.spark.sql.internal.StaticSQLConf._ -import org.apache.spark.sql.streaming.StreamingQueryListener import org.apache.spark.sql.streaming.ui.{StreamingQueryStatusListener, StreamingQueryTab} import org.apache.spark.status.ElementTrackingStore import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala index 2b9c574aaaf0c..248dfa107bc4b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/VariableSubstitution.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.internal import org.apache.spark.internal.config._ +import org.apache.spark.sql.catalyst.SQLConfHelper /** * A helper class that enables substitution using syntax like @@ -25,9 +26,7 @@ import org.apache.spark.internal.config._ * * Variable substitution is controlled by `SQLConf.variableSubstituteEnabled`. 
*/ -class VariableSubstitution { - - private def conf = SQLConf.get +class VariableSubstitution extends SQLConfHelper { private val provider = new ConfigProvider { override def get(key: String): Option[String] = Option(conf.getConfString(key, "")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 0fe2d0be966d0..ffdbe9d4e4915 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -29,7 +29,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.Evolving import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala index cdad5ed9942b5..1f7e65dede170 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.streaming.ui import java.text.SimpleDateFormat -import java.util.Locale import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/map.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/map.sql new file mode 100644 index 0000000000000..23e5b9562973b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/map.sql @@ -0,0 +1 @@ +--IMPORT map.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/like-all.sql b/sql/core/src/test/resources/sql-tests/inputs/like-all.sql index a084dbef61a0c..f83277376e680 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/like-all.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/like-all.sql @@ -1,3 +1,7 @@ +-- test cases for like all +--CONFIG_DIM1 spark.sql.optimizer.likeAllConversionThreshold=0 +--CONFIG_DIM1 spark.sql.optimizer.likeAllConversionThreshold=200 + CREATE OR REPLACE TEMPORARY VIEW like_all_table AS SELECT * FROM (VALUES ('google', '%oo%'), ('facebook', '%oo%'), diff --git a/sql/core/src/test/resources/sql-tests/inputs/map.sql b/sql/core/src/test/resources/sql-tests/inputs/map.sql new file mode 100644 index 0000000000000..e2d855fba154e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/map.sql @@ -0,0 +1,5 @@ +-- test cases for map functions + +-- key does not exist +select element_at(map(1, 'a', 2, 'b'), 5); +select map(1, 'a', 2, 'b')[5]; diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/map.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/map.sql.out new file mode 100644 index 0000000000000..12c599812cdee --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/map.sql.out @@ -0,0 +1,20 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 2 + + +-- !query +select element_at(map(1, 'a', 2, 'b'), 5) +-- !query schema +struct<> +-- !query output +java.util.NoSuchElementException +Key 5 does not exist. 
+ + +-- !query +select map(1, 'a', 2, 'b')[5] +-- !query schema +struct<> +-- !query output +java.util.NoSuchElementException +Key 5 does not exist. diff --git a/sql/core/src/test/resources/sql-tests/results/map.sql.out b/sql/core/src/test/resources/sql-tests/results/map.sql.out new file mode 100644 index 0000000000000..7a0c0d776ca2b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/map.sql.out @@ -0,0 +1,18 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 2 + + +-- !query +select element_at(map(1, 'a', 2, 'b'), 5) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select map(1, 'a', 2, 'b')[5] +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/explain.txt index 509fb0133095b..a446163e3d29d 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (44) +* Sort (44) +- * HashAggregate (43) +- Exchange (42) +- * HashAggregate (41) @@ -244,7 +244,7 @@ Functions [3]: [sum(UnscaledValue(cs_ext_ship_cost#6)), sum(UnscaledValue(cs_net Aggregate Attributes [3]: [sum(UnscaledValue(cs_ext_ship_cost#6))#23, sum(UnscaledValue(cs_net_profit#7))#24, count(cs_order_number#5)#27] Results [3]: [count(cs_order_number#5)#27 AS order count #30, MakeDecimal(sum(UnscaledValue(cs_ext_ship_cost#6))#23,17,2) AS total shipping cost #31, MakeDecimal(sum(UnscaledValue(cs_net_profit#7))#24,17,2) AS total net profit #32] -(44) TakeOrderedAndProject +(44) Sort [codegen id : 12] Input [3]: [order count #30, total shipping cost #31, total net profit #32] -Arguments: 100, [order count #30 ASC NULLS FIRST], [order count #30, total shipping cost #31, total net profit #32] +Arguments: [order count #30 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/simplified.txt index ea9a0b27ff700..73a9b58010f58 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] - WholeStageCodegen (12) +WholeStageCodegen (12) + Sort [order count ] HashAggregate [sum,sum,count] [sum(UnscaledValue(cs_ext_ship_cost)),sum(UnscaledValue(cs_net_profit)),count(cs_order_number),order count ,total shipping cost ,total net profit ,sum,sum,count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/explain.txt index 2ae939cfe41f3..ea7e298393e4c 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (41) +* Sort (41) +- * HashAggregate (40) +- Exchange (39) +- * HashAggregate (38) @@ -229,7 +229,7 @@ Functions [3]: [sum(UnscaledValue(cs_ext_ship_cost#6)), sum(UnscaledValue(cs_net Aggregate 
Attributes [3]: [sum(UnscaledValue(cs_ext_ship_cost#6))#22, sum(UnscaledValue(cs_net_profit#7))#23, count(cs_order_number#5)#27] Results [3]: [count(cs_order_number#5)#27 AS order count #30, MakeDecimal(sum(UnscaledValue(cs_ext_ship_cost#6))#22,17,2) AS total shipping cost #31, MakeDecimal(sum(UnscaledValue(cs_net_profit#7))#23,17,2) AS total net profit #32] -(41) TakeOrderedAndProject +(41) Sort [codegen id : 8] Input [3]: [order count #30, total shipping cost #31, total net profit #32] -Arguments: 100, [order count #30 ASC NULLS FIRST], [order count #30, total shipping cost #31, total net profit #32] +Arguments: [order count #30 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/simplified.txt index a044b05365f8e..169f07c2d85e5 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] - WholeStageCodegen (8) +WholeStageCodegen (8) + Sort [order count ] HashAggregate [sum,sum,count] [sum(UnscaledValue(cs_ext_ship_cost)),sum(UnscaledValue(cs_net_profit)),count(cs_order_number),order count ,total shipping cost ,total net profit ,sum,sum,count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/explain.txt index fe5966bb4dfb3..61e5ae0121819 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/explain.txt @@ -1,46 +1,45 @@ == Physical Plan == -* Sort (42) -+- Exchange (41) - +- * Project (40) - +- * SortMergeJoin Inner (39) - :- * Sort (27) - : +- Exchange (26) - : +- * Project (25) - : +- * BroadcastHashJoin Inner BuildRight (24) - : :- * HashAggregate (18) - : : +- Exchange (17) - : : +- * HashAggregate (16) - : : +- * Project (15) - : : +- * BroadcastHashJoin Inner BuildRight (14) - : : :- Union (9) - : : : :- * Project (4) - : : : : +- * Filter (3) - : : : : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.web_sales (1) - : : : +- * Project (8) - : : : +- * Filter (7) - : : : +- * ColumnarToRow (6) - : : : +- Scan parquet default.catalog_sales (5) - : : +- BroadcastExchange (13) - : : +- * Filter (12) - : : +- * ColumnarToRow (11) - : : +- Scan parquet default.date_dim (10) - : +- BroadcastExchange (23) - : +- * Project (22) - : +- * Filter (21) - : +- * ColumnarToRow (20) - : +- Scan parquet default.date_dim (19) - +- * Sort (38) - +- Exchange (37) - +- * Project (36) - +- * BroadcastHashJoin Inner BuildRight (35) - :- * HashAggregate (29) - : +- ReusedExchange (28) - +- BroadcastExchange (34) - +- * Project (33) - +- * Filter (32) - +- * ColumnarToRow (31) - +- Scan parquet default.date_dim (30) +* Sort (41) ++- Exchange (40) + +- * Project (39) + +- * SortMergeJoin Inner (38) + :- * Sort (26) + : +- * Project (25) + : +- * BroadcastHashJoin Inner BuildRight (24) + : :- * HashAggregate (18) + : : +- Exchange (17) + : : +- * HashAggregate (16) + : : +- * Project (15) + : : +- * BroadcastHashJoin Inner BuildRight (14) + : : :- Union (9) + : : : :- * Project (4) + : : : : +- * Filter (3) + : : : : +- * ColumnarToRow (2) + 
: : : : +- Scan parquet default.web_sales (1) + : : : +- * Project (8) + : : : +- * Filter (7) + : : : +- * ColumnarToRow (6) + : : : +- Scan parquet default.catalog_sales (5) + : : +- BroadcastExchange (13) + : : +- * Filter (12) + : : +- * ColumnarToRow (11) + : : +- Scan parquet default.date_dim (10) + : +- BroadcastExchange (23) + : +- * Project (22) + : +- * Filter (21) + : +- * ColumnarToRow (20) + : +- Scan parquet default.date_dim (19) + +- * Sort (37) + +- Exchange (36) + +- * Project (35) + +- * BroadcastHashJoin Inner BuildRight (34) + :- * HashAggregate (28) + : +- ReusedExchange (27) + +- BroadcastExchange (33) + +- * Project (32) + +- * Filter (31) + +- * ColumnarToRow (30) + +- Scan parquet default.date_dim (29) (1) Scan parquet default.web_sales @@ -157,77 +156,73 @@ Join condition: None Output [8]: [d_week_seq#10 AS d_week_seq1#45, sun_sales#35 AS sun_sales1#46, mon_sales#36 AS mon_sales1#47, tue_sales#37 AS tue_sales1#48, wed_sales#38 AS wed_sales1#49, thu_sales#39 AS thu_sales1#50, fri_sales#40 AS fri_sales1#51, sat_sales#41 AS sat_sales1#52] Input [9]: [d_week_seq#10, sun_sales#35, mon_sales#36, tue_sales#37, wed_sales#38, thu_sales#39, fri_sales#40, sat_sales#41, d_week_seq#42] -(26) Exchange -Input [8]: [d_week_seq1#45, sun_sales1#46, mon_sales1#47, tue_sales1#48, wed_sales1#49, thu_sales1#50, fri_sales1#51, sat_sales1#52] -Arguments: hashpartitioning(d_week_seq1#45, 5), true, [id=#53] - -(27) Sort [codegen id : 7] +(26) Sort [codegen id : 6] Input [8]: [d_week_seq1#45, sun_sales1#46, mon_sales1#47, tue_sales1#48, wed_sales1#49, thu_sales1#50, fri_sales1#51, sat_sales1#52] Arguments: [d_week_seq1#45 ASC NULLS FIRST], false, 0 -(28) ReusedExchange [Reuses operator id: 17] -Output [8]: [d_week_seq#10, sum#54, sum#55, sum#56, sum#57, sum#58, sum#59, sum#60] +(27) ReusedExchange [Reuses operator id: 17] +Output [8]: [d_week_seq#10, sum#53, sum#54, sum#55, sum#56, sum#57, sum#58, sum#59] -(29) HashAggregate [codegen id : 13] -Input [8]: [d_week_seq#10, sum#54, sum#55, sum#56, sum#57, sum#58, sum#59, sum#60] +(28) HashAggregate [codegen id : 12] +Input [8]: [d_week_seq#10, sum#53, sum#54, sum#55, sum#56, sum#57, sum#58, sum#59] Keys [1]: [d_week_seq#10] Functions [7]: [sum(UnscaledValue(CASE WHEN (d_day_name#11 = Sunday) THEN sales_price#4 ELSE null END)), sum(UnscaledValue(CASE WHEN (d_day_name#11 = Monday) THEN sales_price#4 ELSE null END)), sum(UnscaledValue(CASE WHEN (d_day_name#11 = Tuesday) THEN sales_price#4 ELSE null END)), sum(UnscaledValue(CASE WHEN (d_day_name#11 = Wednesday) THEN sales_price#4 ELSE null END)), sum(UnscaledValue(CASE WHEN (d_day_name#11 = Thursday) THEN sales_price#4 ELSE null END)), sum(UnscaledValue(CASE WHEN (d_day_name#11 = Friday) THEN sales_price#4 ELSE null END)), sum(UnscaledValue(CASE WHEN (d_day_name#11 = Saturday) THEN sales_price#4 ELSE null END))] -Aggregate Attributes [7]: [sum(UnscaledValue(CASE WHEN (d_day_name#11 = Sunday) THEN sales_price#4 ELSE null END))#61, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Monday) THEN sales_price#4 ELSE null END))#62, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Tuesday) THEN sales_price#4 ELSE null END))#63, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Wednesday) THEN sales_price#4 ELSE null END))#64, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Thursday) THEN sales_price#4 ELSE null END))#65, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Friday) THEN sales_price#4 ELSE null END))#66, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Saturday) THEN sales_price#4 ELSE null END))#67] -Results [8]: 
[d_week_seq#10, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Sunday) THEN sales_price#4 ELSE null END))#61,17,2) AS sun_sales#35, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Monday) THEN sales_price#4 ELSE null END))#62,17,2) AS mon_sales#36, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Tuesday) THEN sales_price#4 ELSE null END))#63,17,2) AS tue_sales#37, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Wednesday) THEN sales_price#4 ELSE null END))#64,17,2) AS wed_sales#38, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Thursday) THEN sales_price#4 ELSE null END))#65,17,2) AS thu_sales#39, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Friday) THEN sales_price#4 ELSE null END))#66,17,2) AS fri_sales#40, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Saturday) THEN sales_price#4 ELSE null END))#67,17,2) AS sat_sales#41] +Aggregate Attributes [7]: [sum(UnscaledValue(CASE WHEN (d_day_name#11 = Sunday) THEN sales_price#4 ELSE null END))#60, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Monday) THEN sales_price#4 ELSE null END))#61, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Tuesday) THEN sales_price#4 ELSE null END))#62, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Wednesday) THEN sales_price#4 ELSE null END))#63, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Thursday) THEN sales_price#4 ELSE null END))#64, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Friday) THEN sales_price#4 ELSE null END))#65, sum(UnscaledValue(CASE WHEN (d_day_name#11 = Saturday) THEN sales_price#4 ELSE null END))#66] +Results [8]: [d_week_seq#10, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Sunday) THEN sales_price#4 ELSE null END))#60,17,2) AS sun_sales#35, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Monday) THEN sales_price#4 ELSE null END))#61,17,2) AS mon_sales#36, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Tuesday) THEN sales_price#4 ELSE null END))#62,17,2) AS tue_sales#37, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Wednesday) THEN sales_price#4 ELSE null END))#63,17,2) AS wed_sales#38, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Thursday) THEN sales_price#4 ELSE null END))#64,17,2) AS thu_sales#39, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Friday) THEN sales_price#4 ELSE null END))#65,17,2) AS fri_sales#40, MakeDecimal(sum(UnscaledValue(CASE WHEN (d_day_name#11 = Saturday) THEN sales_price#4 ELSE null END))#66,17,2) AS sat_sales#41] -(30) Scan parquet default.date_dim -Output [2]: [d_week_seq#68, d_year#69] +(29) Scan parquet default.date_dim +Output [2]: [d_week_seq#67, d_year#68] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [IsNotNull(d_year), EqualTo(d_year,2002), IsNotNull(d_week_seq)] ReadSchema: struct -(31) ColumnarToRow [codegen id : 12] -Input [2]: [d_week_seq#68, d_year#69] +(30) ColumnarToRow [codegen id : 11] +Input [2]: [d_week_seq#67, d_year#68] -(32) Filter [codegen id : 12] -Input [2]: [d_week_seq#68, d_year#69] -Condition : ((isnotnull(d_year#69) AND (d_year#69 = 2002)) AND isnotnull(d_week_seq#68)) +(31) Filter [codegen id : 11] +Input [2]: [d_week_seq#67, d_year#68] +Condition : ((isnotnull(d_year#68) AND (d_year#68 = 2002)) AND isnotnull(d_week_seq#67)) -(33) Project [codegen id : 12] -Output [1]: [d_week_seq#68] -Input [2]: [d_week_seq#68, d_year#69] +(32) Project [codegen id : 11] +Output [1]: [d_week_seq#67] +Input [2]: [d_week_seq#67, d_year#68] -(34) BroadcastExchange -Input [1]: 
[d_week_seq#68] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#70] +(33) BroadcastExchange +Input [1]: [d_week_seq#67] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#69] -(35) BroadcastHashJoin [codegen id : 13] +(34) BroadcastHashJoin [codegen id : 12] Left keys [1]: [d_week_seq#10] -Right keys [1]: [d_week_seq#68] +Right keys [1]: [d_week_seq#67] Join condition: None -(36) Project [codegen id : 13] -Output [8]: [d_week_seq#10 AS d_week_seq2#71, sun_sales#35 AS sun_sales2#72, mon_sales#36 AS mon_sales2#73, tue_sales#37 AS tue_sales2#74, wed_sales#38 AS wed_sales2#75, thu_sales#39 AS thu_sales2#76, fri_sales#40 AS fri_sales2#77, sat_sales#41 AS sat_sales2#78] -Input [9]: [d_week_seq#10, sun_sales#35, mon_sales#36, tue_sales#37, wed_sales#38, thu_sales#39, fri_sales#40, sat_sales#41, d_week_seq#68] +(35) Project [codegen id : 12] +Output [8]: [d_week_seq#10 AS d_week_seq2#70, sun_sales#35 AS sun_sales2#71, mon_sales#36 AS mon_sales2#72, tue_sales#37 AS tue_sales2#73, wed_sales#38 AS wed_sales2#74, thu_sales#39 AS thu_sales2#75, fri_sales#40 AS fri_sales2#76, sat_sales#41 AS sat_sales2#77] +Input [9]: [d_week_seq#10, sun_sales#35, mon_sales#36, tue_sales#37, wed_sales#38, thu_sales#39, fri_sales#40, sat_sales#41, d_week_seq#67] -(37) Exchange -Input [8]: [d_week_seq2#71, sun_sales2#72, mon_sales2#73, tue_sales2#74, wed_sales2#75, thu_sales2#76, fri_sales2#77, sat_sales2#78] -Arguments: hashpartitioning((d_week_seq2#71 - 53), 5), true, [id=#79] +(36) Exchange +Input [8]: [d_week_seq2#70, sun_sales2#71, mon_sales2#72, tue_sales2#73, wed_sales2#74, thu_sales2#75, fri_sales2#76, sat_sales2#77] +Arguments: hashpartitioning((d_week_seq2#70 - 53), 5), true, [id=#78] -(38) Sort [codegen id : 14] -Input [8]: [d_week_seq2#71, sun_sales2#72, mon_sales2#73, tue_sales2#74, wed_sales2#75, thu_sales2#76, fri_sales2#77, sat_sales2#78] -Arguments: [(d_week_seq2#71 - 53) ASC NULLS FIRST], false, 0 +(37) Sort [codegen id : 13] +Input [8]: [d_week_seq2#70, sun_sales2#71, mon_sales2#72, tue_sales2#73, wed_sales2#74, thu_sales2#75, fri_sales2#76, sat_sales2#77] +Arguments: [(d_week_seq2#70 - 53) ASC NULLS FIRST], false, 0 -(39) SortMergeJoin [codegen id : 15] +(38) SortMergeJoin [codegen id : 14] Left keys [1]: [d_week_seq1#45] -Right keys [1]: [(d_week_seq2#71 - 53)] +Right keys [1]: [(d_week_seq2#70 - 53)] Join condition: None -(40) Project [codegen id : 15] -Output [8]: [d_week_seq1#45, round(CheckOverflow((promote_precision(sun_sales1#46) / promote_precision(sun_sales2#72)), DecimalType(37,20), true), 2) AS round((sun_sales1 / sun_sales2), 2)#80, round(CheckOverflow((promote_precision(mon_sales1#47) / promote_precision(mon_sales2#73)), DecimalType(37,20), true), 2) AS round((mon_sales1 / mon_sales2), 2)#81, round(CheckOverflow((promote_precision(tue_sales1#48) / promote_precision(tue_sales2#74)), DecimalType(37,20), true), 2) AS round((tue_sales1 / tue_sales2), 2)#82, round(CheckOverflow((promote_precision(wed_sales1#49) / promote_precision(wed_sales2#75)), DecimalType(37,20), true), 2) AS round((wed_sales1 / wed_sales2), 2)#83, round(CheckOverflow((promote_precision(thu_sales1#50) / promote_precision(thu_sales2#76)), DecimalType(37,20), true), 2) AS round((thu_sales1 / thu_sales2), 2)#84, round(CheckOverflow((promote_precision(fri_sales1#51) / promote_precision(fri_sales2#77)), DecimalType(37,20), true), 2) AS round((fri_sales1 / fri_sales2), 2)#85, round(CheckOverflow((promote_precision(sat_sales1#52) / 
promote_precision(sat_sales2#78)), DecimalType(37,20), true), 2) AS round((sat_sales1 / sat_sales2), 2)#86] -Input [16]: [d_week_seq1#45, sun_sales1#46, mon_sales1#47, tue_sales1#48, wed_sales1#49, thu_sales1#50, fri_sales1#51, sat_sales1#52, d_week_seq2#71, sun_sales2#72, mon_sales2#73, tue_sales2#74, wed_sales2#75, thu_sales2#76, fri_sales2#77, sat_sales2#78] +(39) Project [codegen id : 14] +Output [8]: [d_week_seq1#45, round(CheckOverflow((promote_precision(sun_sales1#46) / promote_precision(sun_sales2#71)), DecimalType(37,20), true), 2) AS round((sun_sales1 / sun_sales2), 2)#79, round(CheckOverflow((promote_precision(mon_sales1#47) / promote_precision(mon_sales2#72)), DecimalType(37,20), true), 2) AS round((mon_sales1 / mon_sales2), 2)#80, round(CheckOverflow((promote_precision(tue_sales1#48) / promote_precision(tue_sales2#73)), DecimalType(37,20), true), 2) AS round((tue_sales1 / tue_sales2), 2)#81, round(CheckOverflow((promote_precision(wed_sales1#49) / promote_precision(wed_sales2#74)), DecimalType(37,20), true), 2) AS round((wed_sales1 / wed_sales2), 2)#82, round(CheckOverflow((promote_precision(thu_sales1#50) / promote_precision(thu_sales2#75)), DecimalType(37,20), true), 2) AS round((thu_sales1 / thu_sales2), 2)#83, round(CheckOverflow((promote_precision(fri_sales1#51) / promote_precision(fri_sales2#76)), DecimalType(37,20), true), 2) AS round((fri_sales1 / fri_sales2), 2)#84, round(CheckOverflow((promote_precision(sat_sales1#52) / promote_precision(sat_sales2#77)), DecimalType(37,20), true), 2) AS round((sat_sales1 / sat_sales2), 2)#85] +Input [16]: [d_week_seq1#45, sun_sales1#46, mon_sales1#47, tue_sales1#48, wed_sales1#49, thu_sales1#50, fri_sales1#51, sat_sales1#52, d_week_seq2#70, sun_sales2#71, mon_sales2#72, tue_sales2#73, wed_sales2#74, thu_sales2#75, fri_sales2#76, sat_sales2#77] -(41) Exchange -Input [8]: [d_week_seq1#45, round((sun_sales1 / sun_sales2), 2)#80, round((mon_sales1 / mon_sales2), 2)#81, round((tue_sales1 / tue_sales2), 2)#82, round((wed_sales1 / wed_sales2), 2)#83, round((thu_sales1 / thu_sales2), 2)#84, round((fri_sales1 / fri_sales2), 2)#85, round((sat_sales1 / sat_sales2), 2)#86] -Arguments: rangepartitioning(d_week_seq1#45 ASC NULLS FIRST, 5), true, [id=#87] +(40) Exchange +Input [8]: [d_week_seq1#45, round((sun_sales1 / sun_sales2), 2)#79, round((mon_sales1 / mon_sales2), 2)#80, round((tue_sales1 / tue_sales2), 2)#81, round((wed_sales1 / wed_sales2), 2)#82, round((thu_sales1 / thu_sales2), 2)#83, round((fri_sales1 / fri_sales2), 2)#84, round((sat_sales1 / sat_sales2), 2)#85] +Arguments: rangepartitioning(d_week_seq1#45 ASC NULLS FIRST, 5), true, [id=#86] -(42) Sort [codegen id : 16] -Input [8]: [d_week_seq1#45, round((sun_sales1 / sun_sales2), 2)#80, round((mon_sales1 / mon_sales2), 2)#81, round((tue_sales1 / tue_sales2), 2)#82, round((wed_sales1 / wed_sales2), 2)#83, round((thu_sales1 / thu_sales2), 2)#84, round((fri_sales1 / fri_sales2), 2)#85, round((sat_sales1 / sat_sales2), 2)#86] +(41) Sort [codegen id : 15] +Input [8]: [d_week_seq1#45, round((sun_sales1 / sun_sales2), 2)#79, round((mon_sales1 / mon_sales2), 2)#80, round((tue_sales1 / tue_sales2), 2)#81, round((wed_sales1 / wed_sales2), 2)#82, round((thu_sales1 / thu_sales2), 2)#83, round((fri_sales1 / fri_sales2), 2)#84, round((sat_sales1 / sat_sales2), 2)#85] Arguments: [d_week_seq1#45 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/simplified.txt 
b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/simplified.txt index 3df7e4c8e6f3f..3389774c46469 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q2.sf100/simplified.txt @@ -1,68 +1,65 @@ -WholeStageCodegen (16) +WholeStageCodegen (15) Sort [d_week_seq1] InputAdapter Exchange [d_week_seq1] #1 - WholeStageCodegen (15) + WholeStageCodegen (14) Project [d_week_seq1,sun_sales1,sun_sales2,mon_sales1,mon_sales2,tue_sales1,tue_sales2,wed_sales1,wed_sales2,thu_sales1,thu_sales2,fri_sales1,fri_sales2,sat_sales1,sat_sales2] SortMergeJoin [d_week_seq1,d_week_seq2] InputAdapter - WholeStageCodegen (7) + WholeStageCodegen (6) Sort [d_week_seq1] - InputAdapter - Exchange [d_week_seq1] #2 - WholeStageCodegen (6) - Project [d_week_seq,sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales] - BroadcastHashJoin [d_week_seq,d_week_seq] - HashAggregate [d_week_seq,sum,sum,sum,sum,sum,sum,sum] [sum(UnscaledValue(CASE WHEN (d_day_name = Sunday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Monday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Tuesday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Wednesday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Thursday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Friday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Saturday) THEN sales_price ELSE null END)),sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales,sum,sum,sum,sum,sum,sum,sum] - InputAdapter - Exchange [d_week_seq] #3 - WholeStageCodegen (4) - HashAggregate [d_week_seq,d_day_name,sales_price] [sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum] - Project [sales_price,d_week_seq,d_day_name] - BroadcastHashJoin [sold_date_sk,d_date_sk] - InputAdapter - Union - WholeStageCodegen (1) - Project [ws_sold_date_sk,ws_ext_sales_price] - Filter [ws_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.web_sales [ws_sold_date_sk,ws_ext_sales_price] - WholeStageCodegen (2) - Project [cs_sold_date_sk,cs_ext_sales_price] - Filter [cs_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_ext_sales_price] - InputAdapter - BroadcastExchange #4 - WholeStageCodegen (3) - Filter [d_date_sk,d_week_seq] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_week_seq,d_day_name] - InputAdapter - BroadcastExchange #5 - WholeStageCodegen (5) - Project [d_week_seq] - Filter [d_year,d_week_seq] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_week_seq,d_year] + Project [d_week_seq,sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales] + BroadcastHashJoin [d_week_seq,d_week_seq] + HashAggregate [d_week_seq,sum,sum,sum,sum,sum,sum,sum] [sum(UnscaledValue(CASE WHEN (d_day_name = Sunday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Monday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Tuesday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Wednesday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Thursday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Friday) THEN sales_price ELSE null 
END)),sum(UnscaledValue(CASE WHEN (d_day_name = Saturday) THEN sales_price ELSE null END)),sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales,sum,sum,sum,sum,sum,sum,sum] + InputAdapter + Exchange [d_week_seq] #2 + WholeStageCodegen (4) + HashAggregate [d_week_seq,d_day_name,sales_price] [sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum] + Project [sales_price,d_week_seq,d_day_name] + BroadcastHashJoin [sold_date_sk,d_date_sk] + InputAdapter + Union + WholeStageCodegen (1) + Project [ws_sold_date_sk,ws_ext_sales_price] + Filter [ws_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.web_sales [ws_sold_date_sk,ws_ext_sales_price] + WholeStageCodegen (2) + Project [cs_sold_date_sk,cs_ext_sales_price] + Filter [cs_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.catalog_sales [cs_sold_date_sk,cs_ext_sales_price] + InputAdapter + BroadcastExchange #3 + WholeStageCodegen (3) + Filter [d_date_sk,d_week_seq] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_week_seq,d_day_name] + InputAdapter + BroadcastExchange #4 + WholeStageCodegen (5) + Project [d_week_seq] + Filter [d_year,d_week_seq] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_week_seq,d_year] InputAdapter - WholeStageCodegen (14) + WholeStageCodegen (13) Sort [d_week_seq2] InputAdapter - Exchange [d_week_seq2] #6 - WholeStageCodegen (13) + Exchange [d_week_seq2] #5 + WholeStageCodegen (12) Project [d_week_seq,sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales] BroadcastHashJoin [d_week_seq,d_week_seq] HashAggregate [d_week_seq,sum,sum,sum,sum,sum,sum,sum] [sum(UnscaledValue(CASE WHEN (d_day_name = Sunday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Monday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Tuesday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Wednesday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Thursday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Friday) THEN sales_price ELSE null END)),sum(UnscaledValue(CASE WHEN (d_day_name = Saturday) THEN sales_price ELSE null END)),sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales,sum,sum,sum,sum,sum,sum,sum] InputAdapter - ReusedExchange [d_week_seq,sum,sum,sum,sum,sum,sum,sum] #3 + ReusedExchange [d_week_seq,sum,sum,sum,sum,sum,sum,sum] #2 InputAdapter - BroadcastExchange #7 - WholeStageCodegen (12) + BroadcastExchange #6 + WholeStageCodegen (11) Project [d_week_seq] Filter [d_year,d_week_seq] ColumnarToRow diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/explain.txt index c5988072f758d..85f71b6cd9388 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/explain.txt @@ -1,96 +1,103 @@ == Physical Plan == -CollectLimit (92) -+- * HashAggregate (91) - +- Exchange (90) - +- * HashAggregate (89) - +- Union (88) - :- * Project (60) - : +- * BroadcastHashJoin Inner BuildRight (59) - : :- * Project (53) - : : +- SortMergeJoin LeftSemi (52) - : : :- * Sort (34) - : : : +- Exchange (33) - : : : +- * Project (32) - : : : +- SortMergeJoin LeftSemi (31) - : : : :- * Sort (5) - : : : : +- Exchange (4) - : : : : +- * Filter (3) - : : 
: : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.catalog_sales (1) - : : : +- * Sort (30) - : : : +- Exchange (29) - : : : +- * Project (28) - : : : +- * Filter (27) - : : : +- * HashAggregate (26) - : : : +- * HashAggregate (25) - : : : +- * Project (24) - : : : +- * SortMergeJoin Inner (23) - : : : :- * Sort (17) - : : : : +- Exchange (16) - : : : : +- * Project (15) - : : : : +- * BroadcastHashJoin Inner BuildRight (14) - : : : : :- * Filter (8) - : : : : : +- * ColumnarToRow (7) - : : : : : +- Scan parquet default.store_sales (6) - : : : : +- BroadcastExchange (13) - : : : : +- * Project (12) - : : : : +- * Filter (11) - : : : : +- * ColumnarToRow (10) - : : : : +- Scan parquet default.date_dim (9) - : : : +- * Sort (22) - : : : +- Exchange (21) - : : : +- * Filter (20) - : : : +- * ColumnarToRow (19) - : : : +- Scan parquet default.item (18) - : : +- * Sort (51) - : : +- * Project (50) - : : +- * Filter (49) - : : +- * HashAggregate (48) - : : +- * HashAggregate (47) - : : +- * Project (46) - : : +- * SortMergeJoin Inner (45) - : : :- * Sort (39) - : : : +- Exchange (38) - : : : +- * Filter (37) - : : : +- * ColumnarToRow (36) - : : : +- Scan parquet default.store_sales (35) - : : +- * Sort (44) - : : +- Exchange (43) - : : +- * Filter (42) - : : +- * ColumnarToRow (41) - : : +- Scan parquet default.customer (40) - : +- BroadcastExchange (58) - : +- * Project (57) - : +- * Filter (56) - : +- * ColumnarToRow (55) - : +- Scan parquet default.date_dim (54) - +- * Project (87) - +- * BroadcastHashJoin Inner BuildRight (86) - :- * Project (84) - : +- SortMergeJoin LeftSemi (83) - : :- * Sort (71) - : : +- Exchange (70) - : : +- * Project (69) - : : +- SortMergeJoin LeftSemi (68) - : : :- * Sort (65) - : : : +- Exchange (64) - : : : +- * Filter (63) - : : : +- * ColumnarToRow (62) - : : : +- Scan parquet default.web_sales (61) - : : +- * Sort (67) - : : +- ReusedExchange (66) - : +- * Sort (82) - : +- * Project (81) - : +- * Filter (80) - : +- * HashAggregate (79) - : +- * HashAggregate (78) - : +- * Project (77) - : +- * SortMergeJoin Inner (76) - : :- * Sort (73) - : : +- ReusedExchange (72) - : +- * Sort (75) - : +- ReusedExchange (74) - +- ReusedExchange (85) +* HashAggregate (99) ++- Exchange (98) + +- * HashAggregate (97) + +- Union (96) + :- * Project (59) + : +- * BroadcastHashJoin Inner BuildRight (58) + : :- * Project (52) + : : +- SortMergeJoin LeftSemi (51) + : : :- * Sort (33) + : : : +- Exchange (32) + : : : +- * Project (31) + : : : +- SortMergeJoin LeftSemi (30) + : : : :- * Sort (5) + : : : : +- Exchange (4) + : : : : +- * Filter (3) + : : : : +- * ColumnarToRow (2) + : : : : +- Scan parquet default.catalog_sales (1) + : : : +- * Sort (29) + : : : +- * Project (28) + : : : +- * Filter (27) + : : : +- * HashAggregate (26) + : : : +- * HashAggregate (25) + : : : +- * Project (24) + : : : +- * SortMergeJoin Inner (23) + : : : :- * Sort (17) + : : : : +- Exchange (16) + : : : : +- * Project (15) + : : : : +- * BroadcastHashJoin Inner BuildRight (14) + : : : : :- * Filter (8) + : : : : : +- * ColumnarToRow (7) + : : : : : +- Scan parquet default.store_sales (6) + : : : : +- BroadcastExchange (13) + : : : : +- * Project (12) + : : : : +- * Filter (11) + : : : : +- * ColumnarToRow (10) + : : : : +- Scan parquet default.date_dim (9) + : : : +- * Sort (22) + : : : +- Exchange (21) + : : : +- * Filter (20) + : : : +- * ColumnarToRow (19) + : : : +- Scan parquet default.item (18) + : : +- * Sort (50) + : : +- * Project (49) + : : +- * Filter (48) + : : +- * HashAggregate 
(47) + : : +- * HashAggregate (46) + : : +- * Project (45) + : : +- * SortMergeJoin Inner (44) + : : :- * Sort (38) + : : : +- Exchange (37) + : : : +- * Filter (36) + : : : +- * ColumnarToRow (35) + : : : +- Scan parquet default.store_sales (34) + : : +- * Sort (43) + : : +- Exchange (42) + : : +- * Filter (41) + : : +- * ColumnarToRow (40) + : : +- Scan parquet default.customer (39) + : +- BroadcastExchange (57) + : +- * Project (56) + : +- * Filter (55) + : +- * ColumnarToRow (54) + : +- Scan parquet default.date_dim (53) + +- * Project (95) + +- * BroadcastHashJoin Inner BuildRight (94) + :- * Project (92) + : +- SortMergeJoin LeftSemi (91) + : :- * Sort (79) + : : +- Exchange (78) + : : +- * Project (77) + : : +- SortMergeJoin LeftSemi (76) + : : :- * Sort (64) + : : : +- Exchange (63) + : : : +- * Filter (62) + : : : +- * ColumnarToRow (61) + : : : +- Scan parquet default.web_sales (60) + : : +- * Sort (75) + : : +- * Project (74) + : : +- * Filter (73) + : : +- * HashAggregate (72) + : : +- * HashAggregate (71) + : : +- * Project (70) + : : +- * SortMergeJoin Inner (69) + : : :- * Sort (66) + : : : +- ReusedExchange (65) + : : +- * Sort (68) + : : +- ReusedExchange (67) + : +- * Sort (90) + : +- * Project (89) + : +- * Filter (88) + : +- * HashAggregate (87) + : +- * HashAggregate (86) + : +- * Project (85) + : +- * SortMergeJoin Inner (84) + : :- * Sort (81) + : : +- ReusedExchange (80) + : +- * Sort (83) + : +- ReusedExchange (82) + +- ReusedExchange (93) (1) Scan parquet default.catalog_sales @@ -221,435 +228,469 @@ Condition : (count(1)#22 > 4) Output [1]: [item_sk#21] Input [2]: [item_sk#21, count(1)#22] -(29) Exchange -Input [1]: [item_sk#21] -Arguments: hashpartitioning(item_sk#21, 5), true, [id=#23] - -(30) Sort [codegen id : 9] +(29) Sort [codegen id : 8] Input [1]: [item_sk#21] Arguments: [item_sk#21 ASC NULLS FIRST], false, 0 -(31) SortMergeJoin +(30) SortMergeJoin Left keys [1]: [cs_item_sk#3] Right keys [1]: [item_sk#21] Join condition: None -(32) Project [codegen id : 10] +(31) Project [codegen id : 9] Output [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] Input [5]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_item_sk#3, cs_quantity#4, cs_list_price#5] -(33) Exchange +(32) Exchange Input [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] -Arguments: hashpartitioning(cs_bill_customer_sk#2, 5), true, [id=#24] +Arguments: hashpartitioning(cs_bill_customer_sk#2, 5), true, [id=#23] -(34) Sort [codegen id : 11] +(33) Sort [codegen id : 10] Input [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] Arguments: [cs_bill_customer_sk#2 ASC NULLS FIRST], false, 0 -(35) Scan parquet default.store_sales -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(34) Scan parquet default.store_sales +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] Batched: true Location [not included in comparison]/{warehouse_dir}/store_sales] PushedFilters: [IsNotNull(ss_customer_sk)] ReadSchema: struct -(36) ColumnarToRow [codegen id : 12] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(35) ColumnarToRow [codegen id : 11] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(37) Filter [codegen id : 12] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Condition : isnotnull(ss_customer_sk#25) +(36) Filter [codegen id : 11] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Condition : 
isnotnull(ss_customer_sk#24) -(38) Exchange -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: hashpartitioning(ss_customer_sk#25, 5), true, [id=#28] +(37) Exchange +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: hashpartitioning(ss_customer_sk#24, 5), true, [id=#27] -(39) Sort [codegen id : 13] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(38) Sort [codegen id : 12] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(40) Scan parquet default.customer -Output [1]: [c_customer_sk#29] +(39) Scan parquet default.customer +Output [1]: [c_customer_sk#28] Batched: true Location [not included in comparison]/{warehouse_dir}/customer] PushedFilters: [IsNotNull(c_customer_sk)] ReadSchema: struct -(41) ColumnarToRow [codegen id : 14] -Input [1]: [c_customer_sk#29] +(40) ColumnarToRow [codegen id : 13] +Input [1]: [c_customer_sk#28] -(42) Filter [codegen id : 14] -Input [1]: [c_customer_sk#29] -Condition : isnotnull(c_customer_sk#29) +(41) Filter [codegen id : 13] +Input [1]: [c_customer_sk#28] +Condition : isnotnull(c_customer_sk#28) -(43) Exchange -Input [1]: [c_customer_sk#29] -Arguments: hashpartitioning(c_customer_sk#29, 5), true, [id=#30] +(42) Exchange +Input [1]: [c_customer_sk#28] +Arguments: hashpartitioning(c_customer_sk#28, 5), true, [id=#29] -(44) Sort [codegen id : 15] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(43) Sort [codegen id : 14] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(45) SortMergeJoin [codegen id : 16] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(44) SortMergeJoin [codegen id : 15] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(46) Project [codegen id : 16] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(47) HashAggregate [codegen id : 16] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#31, isEmpty#32] -Results [3]: [c_customer_sk#29, sum#33, isEmpty#34] - -(48) HashAggregate [codegen id : 16] -Input [3]: [c_customer_sk#29, sum#33, isEmpty#34] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#35] -Results [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#35 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), 
true))#36] - -(49) Filter [codegen id : 16] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] -Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(Subquery scalar-subquery#37, [id=#38] as decimal(32,6)))), DecimalType(38,8), true))) - -(50) Project [codegen id : 16] -Output [1]: [c_customer_sk#29] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] - -(51) Sort [codegen id : 16] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 - -(52) SortMergeJoin +(45) Project [codegen id : 15] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(46) HashAggregate [codegen id : 15] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#30, isEmpty#31] +Results [3]: [c_customer_sk#28, sum#32, isEmpty#33] + +(47) HashAggregate [codegen id : 15] +Input [3]: [c_customer_sk#28, sum#32, isEmpty#33] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#34] +Results [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#34 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] + +(48) Filter [codegen id : 15] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] +Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35 
as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(Subquery scalar-subquery#36, [id=#37] as decimal(32,6)))), DecimalType(38,8), true))) + +(49) Project [codegen id : 15] +Output [1]: [c_customer_sk#28] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] + +(50) Sort [codegen id : 15] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 + +(51) SortMergeJoin Left keys [1]: [cs_bill_customer_sk#2] -Right keys [1]: [c_customer_sk#29] +Right keys [1]: [c_customer_sk#28] Join condition: None -(53) Project [codegen id : 18] +(52) Project [codegen id : 17] Output [3]: [cs_sold_date_sk#1, cs_quantity#4, cs_list_price#5] Input [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] -(54) Scan parquet default.date_dim -Output [3]: [d_date_sk#9, d_year#11, d_moy#39] +(53) Scan parquet default.date_dim +Output [3]: [d_date_sk#9, d_year#11, d_moy#38] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [IsNotNull(d_year), IsNotNull(d_moy), EqualTo(d_year,2000), EqualTo(d_moy,2), IsNotNull(d_date_sk)] ReadSchema: struct -(55) ColumnarToRow [codegen id : 17] -Input [3]: [d_date_sk#9, d_year#11, d_moy#39] +(54) ColumnarToRow [codegen id : 16] +Input [3]: [d_date_sk#9, d_year#11, d_moy#38] -(56) Filter [codegen id : 17] -Input [3]: [d_date_sk#9, d_year#11, d_moy#39] -Condition : ((((isnotnull(d_year#11) AND isnotnull(d_moy#39)) AND (d_year#11 = 2000)) AND (d_moy#39 = 2)) AND isnotnull(d_date_sk#9)) +(55) Filter [codegen id : 16] +Input [3]: [d_date_sk#9, d_year#11, d_moy#38] +Condition : ((((isnotnull(d_year#11) AND isnotnull(d_moy#38)) AND (d_year#11 = 2000)) AND (d_moy#38 = 2)) AND isnotnull(d_date_sk#9)) -(57) Project [codegen id : 17] +(56) Project [codegen id : 16] Output [1]: [d_date_sk#9] -Input [3]: [d_date_sk#9, d_year#11, d_moy#39] +Input [3]: [d_date_sk#9, d_year#11, d_moy#38] -(58) BroadcastExchange +(57) BroadcastExchange Input [1]: [d_date_sk#9] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#40] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#39] -(59) BroadcastHashJoin [codegen id : 18] +(58) BroadcastHashJoin [codegen id : 17] Left keys [1]: [cs_sold_date_sk#1] Right keys [1]: [d_date_sk#9] Join condition: None -(60) Project [codegen id : 18] -Output [1]: [CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true) AS sales#41] +(59) Project [codegen id : 17] +Output [1]: [CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true) AS sales#40] Input [4]: [cs_sold_date_sk#1, cs_quantity#4, cs_list_price#5, d_date_sk#9] -(61) Scan parquet default.web_sales -Output [5]: [ws_sold_date_sk#42, ws_item_sk#43, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] +(60) Scan parquet default.web_sales +Output [5]: [ws_sold_date_sk#41, ws_item_sk#42, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] Batched: true Location [not included in comparison]/{warehouse_dir}/web_sales] PushedFilters: [IsNotNull(ws_sold_date_sk)] ReadSchema: struct -(62) ColumnarToRow [codegen id : 
19] -Input [5]: [ws_sold_date_sk#42, ws_item_sk#43, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] +(61) ColumnarToRow [codegen id : 18] +Input [5]: [ws_sold_date_sk#41, ws_item_sk#42, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] + +(62) Filter [codegen id : 18] +Input [5]: [ws_sold_date_sk#41, ws_item_sk#42, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] +Condition : isnotnull(ws_sold_date_sk#41) + +(63) Exchange +Input [5]: [ws_sold_date_sk#41, ws_item_sk#42, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] +Arguments: hashpartitioning(ws_item_sk#42, 5), true, [id=#46] + +(64) Sort [codegen id : 19] +Input [5]: [ws_sold_date_sk#41, ws_item_sk#42, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] +Arguments: [ws_item_sk#42 ASC NULLS FIRST], false, 0 + +(65) ReusedExchange [Reuses operator id: 16] +Output [2]: [ss_item_sk#8, d_date#10] + +(66) Sort [codegen id : 22] +Input [2]: [ss_item_sk#8, d_date#10] +Arguments: [ss_item_sk#8 ASC NULLS FIRST], false, 0 + +(67) ReusedExchange [Reuses operator id: 21] +Output [2]: [i_item_sk#14, i_item_desc#15] + +(68) Sort [codegen id : 24] +Input [2]: [i_item_sk#14, i_item_desc#15] +Arguments: [i_item_sk#14 ASC NULLS FIRST], false, 0 + +(69) SortMergeJoin [codegen id : 25] +Left keys [1]: [ss_item_sk#8] +Right keys [1]: [i_item_sk#14] +Join condition: None + +(70) Project [codegen id : 25] +Output [3]: [d_date#10, i_item_sk#14, i_item_desc#15] +Input [4]: [ss_item_sk#8, d_date#10, i_item_sk#14, i_item_desc#15] -(63) Filter [codegen id : 19] -Input [5]: [ws_sold_date_sk#42, ws_item_sk#43, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] -Condition : isnotnull(ws_sold_date_sk#42) +(71) HashAggregate [codegen id : 25] +Input [3]: [d_date#10, i_item_sk#14, i_item_desc#15] +Keys [3]: [substr(i_item_desc#15, 1, 30) AS substr(i_item_desc#15, 1, 30)#47, i_item_sk#14, d_date#10] +Functions [1]: [partial_count(1)] +Aggregate Attributes [1]: [count#48] +Results [4]: [substr(i_item_desc#15, 1, 30)#47, i_item_sk#14, d_date#10, count#49] -(64) Exchange -Input [5]: [ws_sold_date_sk#42, ws_item_sk#43, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] -Arguments: hashpartitioning(ws_item_sk#43, 5), true, [id=#47] +(72) HashAggregate [codegen id : 25] +Input [4]: [substr(i_item_desc#15, 1, 30)#47, i_item_sk#14, d_date#10, count#49] +Keys [3]: [substr(i_item_desc#15, 1, 30)#47, i_item_sk#14, d_date#10] +Functions [1]: [count(1)] +Aggregate Attributes [1]: [count(1)#50] +Results [2]: [i_item_sk#14 AS item_sk#21, count(1)#50 AS count(1)#51] -(65) Sort [codegen id : 20] -Input [5]: [ws_sold_date_sk#42, ws_item_sk#43, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] -Arguments: [ws_item_sk#43 ASC NULLS FIRST], false, 0 +(73) Filter [codegen id : 25] +Input [2]: [item_sk#21, count(1)#51] +Condition : (count(1)#51 > 4) -(66) ReusedExchange [Reuses operator id: 29] +(74) Project [codegen id : 25] Output [1]: [item_sk#21] +Input [2]: [item_sk#21, count(1)#51] -(67) Sort [codegen id : 27] +(75) Sort [codegen id : 25] Input [1]: [item_sk#21] Arguments: [item_sk#21 ASC NULLS FIRST], false, 0 -(68) SortMergeJoin -Left keys [1]: [ws_item_sk#43] +(76) SortMergeJoin +Left keys [1]: [ws_item_sk#42] Right keys [1]: [item_sk#21] Join condition: None -(69) Project [codegen id : 28] -Output [4]: [ws_sold_date_sk#42, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] -Input [5]: [ws_sold_date_sk#42, ws_item_sk#43, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] +(77) Project [codegen id : 26] 
+Output [4]: [ws_sold_date_sk#41, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] +Input [5]: [ws_sold_date_sk#41, ws_item_sk#42, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] -(70) Exchange -Input [4]: [ws_sold_date_sk#42, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] -Arguments: hashpartitioning(ws_bill_customer_sk#44, 5), true, [id=#48] +(78) Exchange +Input [4]: [ws_sold_date_sk#41, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] +Arguments: hashpartitioning(ws_bill_customer_sk#43, 5), true, [id=#52] -(71) Sort [codegen id : 29] -Input [4]: [ws_sold_date_sk#42, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] -Arguments: [ws_bill_customer_sk#44 ASC NULLS FIRST], false, 0 +(79) Sort [codegen id : 27] +Input [4]: [ws_sold_date_sk#41, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] +Arguments: [ws_bill_customer_sk#43 ASC NULLS FIRST], false, 0 -(72) ReusedExchange [Reuses operator id: 38] -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(80) ReusedExchange [Reuses operator id: 37] +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(73) Sort [codegen id : 31] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(81) Sort [codegen id : 29] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(74) ReusedExchange [Reuses operator id: 43] -Output [1]: [c_customer_sk#29] +(82) ReusedExchange [Reuses operator id: 42] +Output [1]: [c_customer_sk#28] -(75) Sort [codegen id : 33] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(83) Sort [codegen id : 31] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(76) SortMergeJoin [codegen id : 34] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(84) SortMergeJoin [codegen id : 32] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(77) Project [codegen id : 34] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(78) HashAggregate [codegen id : 34] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#49, isEmpty#50] -Results [3]: [c_customer_sk#29, sum#51, isEmpty#52] - -(79) HashAggregate [codegen id : 34] -Input [3]: [c_customer_sk#29, sum#51, isEmpty#52] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#53] -Results [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#53 AS 
sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#54] - -(80) Filter [codegen id : 34] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#54] -Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#54) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#54 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#37, [id=#38] as decimal(32,6)))), DecimalType(38,8), true))) - -(81) Project [codegen id : 34] -Output [1]: [c_customer_sk#29] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#54] - -(82) Sort [codegen id : 34] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 - -(83) SortMergeJoin -Left keys [1]: [ws_bill_customer_sk#44] -Right keys [1]: [c_customer_sk#29] +(85) Project [codegen id : 32] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(86) HashAggregate [codegen id : 32] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#53, isEmpty#54] +Results [3]: [c_customer_sk#28, sum#55, isEmpty#56] + +(87) HashAggregate [codegen id : 32] +Input [3]: [c_customer_sk#28, sum#55, isEmpty#56] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#57] +Results [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#57 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#58] + +(88) Filter [codegen id : 32] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#58] +Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * 
promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#58) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#58 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#36, [id=#37] as decimal(32,6)))), DecimalType(38,8), true))) + +(89) Project [codegen id : 32] +Output [1]: [c_customer_sk#28] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#58] + +(90) Sort [codegen id : 32] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 + +(91) SortMergeJoin +Left keys [1]: [ws_bill_customer_sk#43] +Right keys [1]: [c_customer_sk#28] Join condition: None -(84) Project [codegen id : 36] -Output [3]: [ws_sold_date_sk#42, ws_quantity#45, ws_list_price#46] -Input [4]: [ws_sold_date_sk#42, ws_bill_customer_sk#44, ws_quantity#45, ws_list_price#46] +(92) Project [codegen id : 34] +Output [3]: [ws_sold_date_sk#41, ws_quantity#44, ws_list_price#45] +Input [4]: [ws_sold_date_sk#41, ws_bill_customer_sk#43, ws_quantity#44, ws_list_price#45] -(85) ReusedExchange [Reuses operator id: 58] +(93) ReusedExchange [Reuses operator id: 57] Output [1]: [d_date_sk#9] -(86) BroadcastHashJoin [codegen id : 36] -Left keys [1]: [ws_sold_date_sk#42] +(94) BroadcastHashJoin [codegen id : 34] +Left keys [1]: [ws_sold_date_sk#41] Right keys [1]: [d_date_sk#9] Join condition: None -(87) Project [codegen id : 36] -Output [1]: [CheckOverflow((promote_precision(cast(cast(ws_quantity#45 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#46 as decimal(12,2)))), DecimalType(18,2), true) AS sales#55] -Input [4]: [ws_sold_date_sk#42, ws_quantity#45, ws_list_price#46, d_date_sk#9] +(95) Project [codegen id : 34] +Output [1]: [CheckOverflow((promote_precision(cast(cast(ws_quantity#44 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#45 as decimal(12,2)))), DecimalType(18,2), true) AS sales#59] +Input [4]: [ws_sold_date_sk#41, ws_quantity#44, ws_list_price#45, d_date_sk#9] -(88) Union +(96) Union -(89) HashAggregate [codegen id : 37] -Input [1]: [sales#41] +(97) HashAggregate [codegen id : 35] +Input [1]: [sales#40] Keys: [] -Functions [1]: [partial_sum(sales#41)] -Aggregate Attributes [2]: [sum#56, isEmpty#57] -Results [2]: [sum#58, isEmpty#59] +Functions [1]: [partial_sum(sales#40)] +Aggregate Attributes [2]: [sum#60, isEmpty#61] +Results [2]: [sum#62, isEmpty#63] -(90) Exchange -Input [2]: [sum#58, isEmpty#59] -Arguments: SinglePartition, true, [id=#60] +(98) Exchange +Input [2]: [sum#62, isEmpty#63] +Arguments: SinglePartition, true, [id=#64] -(91) HashAggregate [codegen id : 38] -Input [2]: [sum#58, isEmpty#59] +(99) HashAggregate [codegen id : 36] +Input [2]: [sum#62, isEmpty#63] Keys: [] -Functions [1]: [sum(sales#41)] -Aggregate Attributes [1]: [sum(sales#41)#61] -Results [1]: [sum(sales#41)#61 AS sum(sales)#62] - -(92) CollectLimit -Input [1]: [sum(sales)#62] -Arguments: 100 +Functions [1]: [sum(sales#40)] +Aggregate Attributes [1]: [sum(sales#40)#65] +Results [1]: [sum(sales#40)#65 AS sum(sales)#66] ===== Subqueries ===== -Subquery:1 Hosting operator id = 49 Hosting Expression = Subquery scalar-subquery#37, [id=#38] -* HashAggregate (116) -+- 
Exchange (115) - +- * HashAggregate (114) - +- * HashAggregate (113) - +- * HashAggregate (112) - +- * Project (111) - +- * SortMergeJoin Inner (110) - :- * Sort (104) - : +- Exchange (103) - : +- * Project (102) - : +- * BroadcastHashJoin Inner BuildRight (101) - : :- * Filter (95) - : : +- * ColumnarToRow (94) - : : +- Scan parquet default.store_sales (93) - : +- BroadcastExchange (100) - : +- * Project (99) - : +- * Filter (98) - : +- * ColumnarToRow (97) - : +- Scan parquet default.date_dim (96) - +- * Sort (109) - +- Exchange (108) - +- * Filter (107) - +- * ColumnarToRow (106) - +- Scan parquet default.customer (105) - - -(93) Scan parquet default.store_sales -Output [4]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +Subquery:1 Hosting operator id = 48 Hosting Expression = Subquery scalar-subquery#36, [id=#37] +* HashAggregate (123) ++- Exchange (122) + +- * HashAggregate (121) + +- * HashAggregate (120) + +- * HashAggregate (119) + +- * Project (118) + +- * SortMergeJoin Inner (117) + :- * Sort (111) + : +- Exchange (110) + : +- * Project (109) + : +- * BroadcastHashJoin Inner BuildRight (108) + : :- * Filter (102) + : : +- * ColumnarToRow (101) + : : +- Scan parquet default.store_sales (100) + : +- BroadcastExchange (107) + : +- * Project (106) + : +- * Filter (105) + : +- * ColumnarToRow (104) + : +- Scan parquet default.date_dim (103) + +- * Sort (116) + +- Exchange (115) + +- * Filter (114) + +- * ColumnarToRow (113) + +- Scan parquet default.customer (112) + + +(100) Scan parquet default.store_sales +Output [4]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] Batched: true Location [not included in comparison]/{warehouse_dir}/store_sales] PushedFilters: [IsNotNull(ss_customer_sk), IsNotNull(ss_sold_date_sk)] ReadSchema: struct -(94) ColumnarToRow [codegen id : 2] -Input [4]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(101) ColumnarToRow [codegen id : 2] +Input [4]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(95) Filter [codegen id : 2] -Input [4]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Condition : (isnotnull(ss_customer_sk#25) AND isnotnull(ss_sold_date_sk#7)) +(102) Filter [codegen id : 2] +Input [4]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Condition : (isnotnull(ss_customer_sk#24) AND isnotnull(ss_sold_date_sk#7)) -(96) Scan parquet default.date_dim +(103) Scan parquet default.date_dim Output [2]: [d_date_sk#9, d_year#11] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [In(d_year, [2000,2001,2002,2003]), IsNotNull(d_date_sk)] ReadSchema: struct -(97) ColumnarToRow [codegen id : 1] +(104) ColumnarToRow [codegen id : 1] Input [2]: [d_date_sk#9, d_year#11] -(98) Filter [codegen id : 1] +(105) Filter [codegen id : 1] Input [2]: [d_date_sk#9, d_year#11] Condition : (d_year#11 IN (2000,2001,2002,2003) AND isnotnull(d_date_sk#9)) -(99) Project [codegen id : 1] +(106) Project [codegen id : 1] Output [1]: [d_date_sk#9] Input [2]: [d_date_sk#9, d_year#11] -(100) BroadcastExchange +(107) BroadcastExchange Input [1]: [d_date_sk#9] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#63] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#67] -(101) BroadcastHashJoin [codegen id : 2] +(108) BroadcastHashJoin [codegen id : 2] Left keys [1]: [ss_sold_date_sk#7] Right keys [1]: 
[d_date_sk#9] Join condition: None -(102) Project [codegen id : 2] -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Input [5]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, d_date_sk#9] +(109) Project [codegen id : 2] +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Input [5]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, d_date_sk#9] -(103) Exchange -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: hashpartitioning(ss_customer_sk#25, 5), true, [id=#64] +(110) Exchange +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: hashpartitioning(ss_customer_sk#24, 5), true, [id=#68] -(104) Sort [codegen id : 3] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(111) Sort [codegen id : 3] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(105) Scan parquet default.customer -Output [1]: [c_customer_sk#29] +(112) Scan parquet default.customer +Output [1]: [c_customer_sk#28] Batched: true Location [not included in comparison]/{warehouse_dir}/customer] PushedFilters: [IsNotNull(c_customer_sk)] ReadSchema: struct -(106) ColumnarToRow [codegen id : 4] -Input [1]: [c_customer_sk#29] +(113) ColumnarToRow [codegen id : 4] +Input [1]: [c_customer_sk#28] -(107) Filter [codegen id : 4] -Input [1]: [c_customer_sk#29] -Condition : isnotnull(c_customer_sk#29) +(114) Filter [codegen id : 4] +Input [1]: [c_customer_sk#28] +Condition : isnotnull(c_customer_sk#28) -(108) Exchange -Input [1]: [c_customer_sk#29] -Arguments: hashpartitioning(c_customer_sk#29, 5), true, [id=#65] +(115) Exchange +Input [1]: [c_customer_sk#28] +Arguments: hashpartitioning(c_customer_sk#28, 5), true, [id=#69] -(109) Sort [codegen id : 5] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(116) Sort [codegen id : 5] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(110) SortMergeJoin [codegen id : 6] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(117) SortMergeJoin [codegen id : 6] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(111) Project [codegen id : 6] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(112) HashAggregate [codegen id : 6] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#66, isEmpty#67] -Results [3]: [c_customer_sk#29, sum#68, isEmpty#69] - -(113) HashAggregate [codegen id : 6] -Input [3]: [c_customer_sk#29, sum#68, isEmpty#69] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#70] 
-Results [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#70 AS csales#71] - -(114) HashAggregate [codegen id : 6] -Input [1]: [csales#71] +(118) Project [codegen id : 6] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(119) HashAggregate [codegen id : 6] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#70, isEmpty#71] +Results [3]: [c_customer_sk#28, sum#72, isEmpty#73] + +(120) HashAggregate [codegen id : 6] +Input [3]: [c_customer_sk#28, sum#72, isEmpty#73] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#74] +Results [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#74 AS csales#75] + +(121) HashAggregate [codegen id : 6] +Input [1]: [csales#75] Keys: [] -Functions [1]: [partial_max(csales#71)] -Aggregate Attributes [1]: [max#72] -Results [1]: [max#73] +Functions [1]: [partial_max(csales#75)] +Aggregate Attributes [1]: [max#76] +Results [1]: [max#77] -(115) Exchange -Input [1]: [max#73] -Arguments: SinglePartition, true, [id=#74] +(122) Exchange +Input [1]: [max#77] +Arguments: SinglePartition, true, [id=#78] -(116) HashAggregate [codegen id : 7] -Input [1]: [max#73] +(123) HashAggregate [codegen id : 7] +Input [1]: [max#77] Keys: [] -Functions [1]: [max(csales#71)] -Aggregate Attributes [1]: [max(csales#71)#75] -Results [1]: [max(csales#71)#75 AS tpcds_cmax#76] +Functions [1]: [max(csales#75)] +Aggregate Attributes [1]: [max(csales#75)#79] +Results [1]: [max(csales#75)#79 AS tpcds_cmax#80] -Subquery:2 Hosting operator id = 80 Hosting Expression = ReusedSubquery Subquery scalar-subquery#37, [id=#38] +Subquery:2 Hosting operator id = 88 Hosting Expression = ReusedSubquery Subquery scalar-subquery#36, [id=#37] diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/simplified.txt index 9ee444cdd988c..5bb8bc5b99d0c 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/simplified.txt @@ -1,198 +1,208 @@ -CollectLimit - WholeStageCodegen (38) - HashAggregate [sum,isEmpty] [sum(sales),sum(sales),sum,isEmpty] - InputAdapter - Exchange #1 - WholeStageCodegen (37) - HashAggregate [sales] [sum,isEmpty,sum,isEmpty] - InputAdapter - Union - WholeStageCodegen (18) - Project [cs_quantity,cs_list_price] - BroadcastHashJoin [cs_sold_date_sk,d_date_sk] - 
Project [cs_sold_date_sk,cs_quantity,cs_list_price] - InputAdapter - SortMergeJoin [cs_bill_customer_sk,c_customer_sk] - WholeStageCodegen (11) - Sort [cs_bill_customer_sk] - InputAdapter - Exchange [cs_bill_customer_sk] #2 - WholeStageCodegen (10) - Project [cs_sold_date_sk,cs_bill_customer_sk,cs_quantity,cs_list_price] - InputAdapter - SortMergeJoin [cs_item_sk,item_sk] - WholeStageCodegen (2) - Sort [cs_item_sk] - InputAdapter - Exchange [cs_item_sk] #3 - WholeStageCodegen (1) - Filter [cs_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk,cs_quantity,cs_list_price] - WholeStageCodegen (9) - Sort [item_sk] - InputAdapter - Exchange [item_sk] #4 - WholeStageCodegen (8) - Project [item_sk] - Filter [count(1)] - HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] - HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] - Project [d_date,i_item_sk,i_item_desc] - SortMergeJoin [ss_item_sk,i_item_sk] - InputAdapter - WholeStageCodegen (5) - Sort [ss_item_sk] - InputAdapter - Exchange [ss_item_sk] #5 - WholeStageCodegen (4) - Project [ss_item_sk,d_date] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk,ss_item_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk] - InputAdapter - BroadcastExchange #6 - WholeStageCodegen (3) - Project [d_date_sk,d_date] - Filter [d_year,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_date,d_year] - InputAdapter - WholeStageCodegen (7) - Sort [i_item_sk] - InputAdapter - Exchange [i_item_sk] #7 - WholeStageCodegen (6) - Filter [i_item_sk] - ColumnarToRow - InputAdapter - Scan parquet default.item [i_item_sk,i_item_desc] - WholeStageCodegen (16) - Sort [c_customer_sk] - Project [c_customer_sk] - Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] - Subquery #1 - WholeStageCodegen (7) - HashAggregate [max] [max(csales),tpcds_cmax,max] - InputAdapter - Exchange #10 - WholeStageCodegen (6) - HashAggregate [csales] [max,max] - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),csales,sum,isEmpty] - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - Project [ss_quantity,ss_sales_price,c_customer_sk] - SortMergeJoin [ss_customer_sk,c_customer_sk] +WholeStageCodegen (36) + HashAggregate [sum,isEmpty] [sum(sales),sum(sales),sum,isEmpty] + InputAdapter + Exchange #1 + WholeStageCodegen (35) + HashAggregate [sales] [sum,isEmpty,sum,isEmpty] + InputAdapter + Union + WholeStageCodegen (17) + Project [cs_quantity,cs_list_price] + BroadcastHashJoin [cs_sold_date_sk,d_date_sk] + Project [cs_sold_date_sk,cs_quantity,cs_list_price] + InputAdapter + SortMergeJoin [cs_bill_customer_sk,c_customer_sk] + WholeStageCodegen (10) + Sort [cs_bill_customer_sk] + InputAdapter + Exchange [cs_bill_customer_sk] #2 + WholeStageCodegen (9) + Project [cs_sold_date_sk,cs_bill_customer_sk,cs_quantity,cs_list_price] + InputAdapter + SortMergeJoin [cs_item_sk,item_sk] + WholeStageCodegen (2) + Sort [cs_item_sk] + InputAdapter + Exchange [cs_item_sk] #3 + WholeStageCodegen (1) + Filter 
[cs_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk,cs_quantity,cs_list_price] + WholeStageCodegen (8) + Sort [item_sk] + Project [item_sk] + Filter [count(1)] + HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] + HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] + Project [d_date,i_item_sk,i_item_desc] + SortMergeJoin [ss_item_sk,i_item_sk] InputAdapter - WholeStageCodegen (3) - Sort [ss_customer_sk] + WholeStageCodegen (5) + Sort [ss_item_sk] InputAdapter - Exchange [ss_customer_sk] #11 - WholeStageCodegen (2) - Project [ss_customer_sk,ss_quantity,ss_sales_price] + Exchange [ss_item_sk] #4 + WholeStageCodegen (4) + Project [ss_item_sk,d_date] BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_customer_sk,ss_sold_date_sk] + Filter [ss_sold_date_sk,ss_item_sk] ColumnarToRow InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk,ss_quantity,ss_sales_price] + Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk] InputAdapter - BroadcastExchange #12 - WholeStageCodegen (1) - Project [d_date_sk] + BroadcastExchange #5 + WholeStageCodegen (3) + Project [d_date_sk,d_date] Filter [d_year,d_date_sk] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date_sk,d_year] + Scan parquet default.date_dim [d_date_sk,d_date,d_year] InputAdapter - WholeStageCodegen (5) - Sort [c_customer_sk] + WholeStageCodegen (7) + Sort [i_item_sk] InputAdapter - Exchange [c_customer_sk] #13 - WholeStageCodegen (4) - Filter [c_customer_sk] + Exchange [i_item_sk] #6 + WholeStageCodegen (6) + Filter [i_item_sk] ColumnarToRow InputAdapter - Scan parquet default.customer [c_customer_sk] - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - Project [ss_quantity,ss_sales_price,c_customer_sk] - SortMergeJoin [ss_customer_sk,c_customer_sk] - InputAdapter - WholeStageCodegen (13) - Sort [ss_customer_sk] - InputAdapter - Exchange [ss_customer_sk] #8 - WholeStageCodegen (12) - Filter [ss_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_customer_sk,ss_quantity,ss_sales_price] - InputAdapter - WholeStageCodegen (15) - Sort [c_customer_sk] - InputAdapter - Exchange [c_customer_sk] #9 - WholeStageCodegen (14) - Filter [c_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.customer [c_customer_sk] - InputAdapter - BroadcastExchange #14 - WholeStageCodegen (17) - Project [d_date_sk] - Filter [d_year,d_moy,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_year,d_moy] - WholeStageCodegen (36) - Project [ws_quantity,ws_list_price] - BroadcastHashJoin [ws_sold_date_sk,d_date_sk] - Project [ws_sold_date_sk,ws_quantity,ws_list_price] - InputAdapter - SortMergeJoin [ws_bill_customer_sk,c_customer_sk] - WholeStageCodegen (29) - Sort [ws_bill_customer_sk] - InputAdapter - Exchange [ws_bill_customer_sk] #15 - WholeStageCodegen (28) - Project 
[ws_sold_date_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] + Scan parquet default.item [i_item_sk,i_item_desc] + WholeStageCodegen (15) + Sort [c_customer_sk] + Project [c_customer_sk] + Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] + Subquery #1 + WholeStageCodegen (7) + HashAggregate [max] [max(csales),tpcds_cmax,max] InputAdapter - SortMergeJoin [ws_item_sk,item_sk] - WholeStageCodegen (20) - Sort [ws_item_sk] + Exchange #9 + WholeStageCodegen (6) + HashAggregate [csales] [max,max] + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),csales,sum,isEmpty] + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + SortMergeJoin [ss_customer_sk,c_customer_sk] + InputAdapter + WholeStageCodegen (3) + Sort [ss_customer_sk] + InputAdapter + Exchange [ss_customer_sk] #10 + WholeStageCodegen (2) + Project [ss_customer_sk,ss_quantity,ss_sales_price] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_customer_sk,ss_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk,ss_quantity,ss_sales_price] + InputAdapter + BroadcastExchange #11 + WholeStageCodegen (1) + Project [d_date_sk] + Filter [d_year,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_year] + InputAdapter + WholeStageCodegen (5) + Sort [c_customer_sk] + InputAdapter + Exchange [c_customer_sk] #12 + WholeStageCodegen (4) + Filter [c_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.customer [c_customer_sk] + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + SortMergeJoin [ss_customer_sk,c_customer_sk] + InputAdapter + WholeStageCodegen (12) + Sort [ss_customer_sk] InputAdapter - Exchange [ws_item_sk] #16 - WholeStageCodegen (19) - Filter [ws_sold_date_sk] + Exchange [ss_customer_sk] #7 + WholeStageCodegen (11) + Filter [ss_customer_sk] ColumnarToRow InputAdapter - Scan parquet default.web_sales [ws_sold_date_sk,ws_item_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] - WholeStageCodegen (27) - Sort [item_sk] + Scan parquet default.store_sales [ss_customer_sk,ss_quantity,ss_sales_price] + InputAdapter + WholeStageCodegen (14) + Sort [c_customer_sk] InputAdapter - ReusedExchange [item_sk] #4 - WholeStageCodegen (34) - Sort [c_customer_sk] - Project [c_customer_sk] - Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] - ReusedSubquery [tpcds_cmax] #1 - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as 
decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - Project [ss_quantity,ss_sales_price,c_customer_sk] - SortMergeJoin [ss_customer_sk,c_customer_sk] - InputAdapter - WholeStageCodegen (31) - Sort [ss_customer_sk] - InputAdapter - ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #8 - InputAdapter - WholeStageCodegen (33) - Sort [c_customer_sk] - InputAdapter - ReusedExchange [c_customer_sk] #9 + Exchange [c_customer_sk] #8 + WholeStageCodegen (13) + Filter [c_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.customer [c_customer_sk] + InputAdapter + BroadcastExchange #13 + WholeStageCodegen (16) + Project [d_date_sk] + Filter [d_year,d_moy,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_year,d_moy] + WholeStageCodegen (34) + Project [ws_quantity,ws_list_price] + BroadcastHashJoin [ws_sold_date_sk,d_date_sk] + Project [ws_sold_date_sk,ws_quantity,ws_list_price] InputAdapter - ReusedExchange [d_date_sk] #14 + SortMergeJoin [ws_bill_customer_sk,c_customer_sk] + WholeStageCodegen (27) + Sort [ws_bill_customer_sk] + InputAdapter + Exchange [ws_bill_customer_sk] #14 + WholeStageCodegen (26) + Project [ws_sold_date_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] + InputAdapter + SortMergeJoin [ws_item_sk,item_sk] + WholeStageCodegen (19) + Sort [ws_item_sk] + InputAdapter + Exchange [ws_item_sk] #15 + WholeStageCodegen (18) + Filter [ws_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.web_sales [ws_sold_date_sk,ws_item_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] + WholeStageCodegen (25) + Sort [item_sk] + Project [item_sk] + Filter [count(1)] + HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] + HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] + Project [d_date,i_item_sk,i_item_desc] + SortMergeJoin [ss_item_sk,i_item_sk] + InputAdapter + WholeStageCodegen (22) + Sort [ss_item_sk] + InputAdapter + ReusedExchange [ss_item_sk,d_date] #4 + InputAdapter + WholeStageCodegen (24) + Sort [i_item_sk] + InputAdapter + ReusedExchange [i_item_sk,i_item_desc] #6 + WholeStageCodegen (32) + Sort [c_customer_sk] + Project [c_customer_sk] + Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] + ReusedSubquery [tpcds_cmax] #1 + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + SortMergeJoin [ss_customer_sk,c_customer_sk] + InputAdapter + WholeStageCodegen (29) + Sort [ss_customer_sk] + InputAdapter + ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #7 + 
InputAdapter + WholeStageCodegen (31) + Sort [c_customer_sk] + InputAdapter + ReusedExchange [c_customer_sk] #8 + InputAdapter + ReusedExchange [d_date_sk] #13 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/explain.txt index 6d2b5b0013d8f..15ae5bfe24303 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/explain.txt @@ -1,76 +1,75 @@ == Physical Plan == -CollectLimit (72) -+- * HashAggregate (71) - +- Exchange (70) - +- * HashAggregate (69) - +- Union (68) - :- * Project (51) - : +- * BroadcastHashJoin Inner BuildRight (50) - : :- * Project (44) - : : +- * BroadcastHashJoin LeftSemi BuildRight (43) - : : :- * Project (27) - : : : +- * BroadcastHashJoin LeftSemi BuildRight (26) - : : : :- * Filter (3) - : : : : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.catalog_sales (1) - : : : +- BroadcastExchange (25) - : : : +- * Project (24) - : : : +- * Filter (23) - : : : +- * HashAggregate (22) - : : : +- Exchange (21) - : : : +- * HashAggregate (20) - : : : +- * Project (19) - : : : +- * BroadcastHashJoin Inner BuildRight (18) - : : : :- * Project (13) - : : : : +- * BroadcastHashJoin Inner BuildRight (12) - : : : : :- * Filter (6) - : : : : : +- * ColumnarToRow (5) - : : : : : +- Scan parquet default.store_sales (4) - : : : : +- BroadcastExchange (11) - : : : : +- * Project (10) - : : : : +- * Filter (9) - : : : : +- * ColumnarToRow (8) - : : : : +- Scan parquet default.date_dim (7) - : : : +- BroadcastExchange (17) - : : : +- * Filter (16) - : : : +- * ColumnarToRow (15) - : : : +- Scan parquet default.item (14) - : : +- BroadcastExchange (42) - : : +- * Project (41) - : : +- * Filter (40) - : : +- * HashAggregate (39) - : : +- Exchange (38) - : : +- * HashAggregate (37) - : : +- * Project (36) - : : +- * BroadcastHashJoin Inner BuildRight (35) - : : :- * Filter (30) - : : : +- * ColumnarToRow (29) - : : : +- Scan parquet default.store_sales (28) - : : +- BroadcastExchange (34) - : : +- * Filter (33) - : : +- * ColumnarToRow (32) - : : +- Scan parquet default.customer (31) - : +- BroadcastExchange (49) - : +- * Project (48) - : +- * Filter (47) - : +- * ColumnarToRow (46) - : +- Scan parquet default.date_dim (45) - +- * Project (67) - +- * BroadcastHashJoin Inner BuildRight (66) - :- * Project (64) - : +- * BroadcastHashJoin LeftSemi BuildRight (63) - : :- * Project (57) - : : +- * BroadcastHashJoin LeftSemi BuildRight (56) - : : :- * Filter (54) - : : : +- * ColumnarToRow (53) - : : : +- Scan parquet default.web_sales (52) - : : +- ReusedExchange (55) - : +- BroadcastExchange (62) - : +- * Project (61) - : +- * Filter (60) - : +- * HashAggregate (59) - : +- ReusedExchange (58) - +- ReusedExchange (65) +* HashAggregate (71) ++- Exchange (70) + +- * HashAggregate (69) + +- Union (68) + :- * Project (51) + : +- * BroadcastHashJoin Inner BuildRight (50) + : :- * Project (44) + : : +- * BroadcastHashJoin LeftSemi BuildRight (43) + : : :- * Project (27) + : : : +- * BroadcastHashJoin LeftSemi BuildRight (26) + : : : :- * Filter (3) + : : : : +- * ColumnarToRow (2) + : : : : +- Scan parquet default.catalog_sales (1) + : : : +- BroadcastExchange (25) + : : : +- * Project (24) + : : : +- * Filter (23) + : : : +- * HashAggregate (22) + : : : +- Exchange (21) + : : : +- * HashAggregate (20) + : : : +- * Project (19) + : : : +- * 
BroadcastHashJoin Inner BuildRight (18) + : : : :- * Project (13) + : : : : +- * BroadcastHashJoin Inner BuildRight (12) + : : : : :- * Filter (6) + : : : : : +- * ColumnarToRow (5) + : : : : : +- Scan parquet default.store_sales (4) + : : : : +- BroadcastExchange (11) + : : : : +- * Project (10) + : : : : +- * Filter (9) + : : : : +- * ColumnarToRow (8) + : : : : +- Scan parquet default.date_dim (7) + : : : +- BroadcastExchange (17) + : : : +- * Filter (16) + : : : +- * ColumnarToRow (15) + : : : +- Scan parquet default.item (14) + : : +- BroadcastExchange (42) + : : +- * Project (41) + : : +- * Filter (40) + : : +- * HashAggregate (39) + : : +- Exchange (38) + : : +- * HashAggregate (37) + : : +- * Project (36) + : : +- * BroadcastHashJoin Inner BuildRight (35) + : : :- * Filter (30) + : : : +- * ColumnarToRow (29) + : : : +- Scan parquet default.store_sales (28) + : : +- BroadcastExchange (34) + : : +- * Filter (33) + : : +- * ColumnarToRow (32) + : : +- Scan parquet default.customer (31) + : +- BroadcastExchange (49) + : +- * Project (48) + : +- * Filter (47) + : +- * ColumnarToRow (46) + : +- Scan parquet default.date_dim (45) + +- * Project (67) + +- * BroadcastHashJoin Inner BuildRight (66) + :- * Project (64) + : +- * BroadcastHashJoin LeftSemi BuildRight (63) + : :- * Project (57) + : : +- * BroadcastHashJoin LeftSemi BuildRight (56) + : : :- * Filter (54) + : : : +- * ColumnarToRow (53) + : : : +- Scan parquet default.web_sales (52) + : : +- ReusedExchange (55) + : +- BroadcastExchange (62) + : +- * Project (61) + : +- * Filter (60) + : +- * HashAggregate (59) + : +- ReusedExchange (58) + +- ReusedExchange (65) (1) Scan parquet default.catalog_sales @@ -398,139 +397,135 @@ Functions [1]: [sum(sales#40)] Aggregate Attributes [1]: [sum(sales#40)#57] Results [1]: [sum(sales#40)#57 AS sum(sales)#58] -(72) CollectLimit -Input [1]: [sum(sales)#58] -Arguments: 100 - ===== Subqueries ===== Subquery:1 Hosting operator id = 40 Hosting Expression = Subquery scalar-subquery#35, [id=#36] -* HashAggregate (94) -+- Exchange (93) - +- * HashAggregate (92) - +- * HashAggregate (91) - +- Exchange (90) - +- * HashAggregate (89) - +- * Project (88) - +- * BroadcastHashJoin Inner BuildRight (87) - :- * Project (81) - : +- * BroadcastHashJoin Inner BuildRight (80) - : :- * Filter (75) - : : +- * ColumnarToRow (74) - : : +- Scan parquet default.store_sales (73) - : +- BroadcastExchange (79) - : +- * Filter (78) - : +- * ColumnarToRow (77) - : +- Scan parquet default.customer (76) - +- BroadcastExchange (86) - +- * Project (85) - +- * Filter (84) - +- * ColumnarToRow (83) - +- Scan parquet default.date_dim (82) - - -(73) Scan parquet default.store_sales +* HashAggregate (93) ++- Exchange (92) + +- * HashAggregate (91) + +- * HashAggregate (90) + +- Exchange (89) + +- * HashAggregate (88) + +- * Project (87) + +- * BroadcastHashJoin Inner BuildRight (86) + :- * Project (80) + : +- * BroadcastHashJoin Inner BuildRight (79) + : :- * Filter (74) + : : +- * ColumnarToRow (73) + : : +- Scan parquet default.store_sales (72) + : +- BroadcastExchange (78) + : +- * Filter (77) + : +- * ColumnarToRow (76) + : +- Scan parquet default.customer (75) + +- BroadcastExchange (85) + +- * Project (84) + +- * Filter (83) + +- * ColumnarToRow (82) + +- Scan parquet default.date_dim (81) + + +(72) Scan parquet default.store_sales Output [4]: [ss_sold_date_sk#6, ss_customer_sk#23, ss_quantity#24, ss_sales_price#25] Batched: true Location [not included in comparison]/{warehouse_dir}/store_sales] PushedFilters: 
[IsNotNull(ss_customer_sk), IsNotNull(ss_sold_date_sk)] ReadSchema: struct -(74) ColumnarToRow [codegen id : 3] +(73) ColumnarToRow [codegen id : 3] Input [4]: [ss_sold_date_sk#6, ss_customer_sk#23, ss_quantity#24, ss_sales_price#25] -(75) Filter [codegen id : 3] +(74) Filter [codegen id : 3] Input [4]: [ss_sold_date_sk#6, ss_customer_sk#23, ss_quantity#24, ss_sales_price#25] Condition : (isnotnull(ss_customer_sk#23) AND isnotnull(ss_sold_date_sk#6)) -(76) Scan parquet default.customer +(75) Scan parquet default.customer Output [1]: [c_customer_sk#26] Batched: true Location [not included in comparison]/{warehouse_dir}/customer] PushedFilters: [IsNotNull(c_customer_sk)] ReadSchema: struct -(77) ColumnarToRow [codegen id : 1] +(76) ColumnarToRow [codegen id : 1] Input [1]: [c_customer_sk#26] -(78) Filter [codegen id : 1] +(77) Filter [codegen id : 1] Input [1]: [c_customer_sk#26] Condition : isnotnull(c_customer_sk#26) -(79) BroadcastExchange +(78) BroadcastExchange Input [1]: [c_customer_sk#26] Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)),false), [id=#59] -(80) BroadcastHashJoin [codegen id : 3] +(79) BroadcastHashJoin [codegen id : 3] Left keys [1]: [ss_customer_sk#23] Right keys [1]: [c_customer_sk#26] Join condition: None -(81) Project [codegen id : 3] +(80) Project [codegen id : 3] Output [4]: [ss_sold_date_sk#6, ss_quantity#24, ss_sales_price#25, c_customer_sk#26] Input [5]: [ss_sold_date_sk#6, ss_customer_sk#23, ss_quantity#24, ss_sales_price#25, c_customer_sk#26] -(82) Scan parquet default.date_dim +(81) Scan parquet default.date_dim Output [2]: [d_date_sk#8, d_year#10] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [In(d_year, [2000,2001,2002,2003]), IsNotNull(d_date_sk)] ReadSchema: struct -(83) ColumnarToRow [codegen id : 2] +(82) ColumnarToRow [codegen id : 2] Input [2]: [d_date_sk#8, d_year#10] -(84) Filter [codegen id : 2] +(83) Filter [codegen id : 2] Input [2]: [d_date_sk#8, d_year#10] Condition : (d_year#10 IN (2000,2001,2002,2003) AND isnotnull(d_date_sk#8)) -(85) Project [codegen id : 2] +(84) Project [codegen id : 2] Output [1]: [d_date_sk#8] Input [2]: [d_date_sk#8, d_year#10] -(86) BroadcastExchange +(85) BroadcastExchange Input [1]: [d_date_sk#8] Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#60] -(87) BroadcastHashJoin [codegen id : 3] +(86) BroadcastHashJoin [codegen id : 3] Left keys [1]: [ss_sold_date_sk#6] Right keys [1]: [d_date_sk#8] Join condition: None -(88) Project [codegen id : 3] +(87) Project [codegen id : 3] Output [3]: [ss_quantity#24, ss_sales_price#25, c_customer_sk#26] Input [5]: [ss_sold_date_sk#6, ss_quantity#24, ss_sales_price#25, c_customer_sk#26, d_date_sk#8] -(89) HashAggregate [codegen id : 3] +(88) HashAggregate [codegen id : 3] Input [3]: [ss_quantity#24, ss_sales_price#25, c_customer_sk#26] Keys [1]: [c_customer_sk#26] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#24 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#25 as decimal(12,2)))), DecimalType(18,2), true))] Aggregate Attributes [2]: [sum#61, isEmpty#62] Results [3]: [c_customer_sk#26, sum#63, isEmpty#64] -(90) Exchange +(89) Exchange Input [3]: [c_customer_sk#26, sum#63, isEmpty#64] Arguments: hashpartitioning(c_customer_sk#26, 5), true, [id=#65] -(91) HashAggregate [codegen id : 4] +(90) HashAggregate [codegen id : 4] Input [3]: [c_customer_sk#26, sum#63, isEmpty#64] Keys [1]: 
[c_customer_sk#26] Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#24 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#25 as decimal(12,2)))), DecimalType(18,2), true))] Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#24 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#25 as decimal(12,2)))), DecimalType(18,2), true))#66] Results [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#24 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#25 as decimal(12,2)))), DecimalType(18,2), true))#66 AS csales#67] -(92) HashAggregate [codegen id : 4] +(91) HashAggregate [codegen id : 4] Input [1]: [csales#67] Keys: [] Functions [1]: [partial_max(csales#67)] Aggregate Attributes [1]: [max#68] Results [1]: [max#69] -(93) Exchange +(92) Exchange Input [1]: [max#69] Arguments: SinglePartition, true, [id=#70] -(94) HashAggregate [codegen id : 5] +(93) HashAggregate [codegen id : 5] Input [1]: [max#69] Keys: [] Functions [1]: [max(csales#67)] diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/simplified.txt index d860e18574f2a..aebe2bd3e1a6c 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/simplified.txt @@ -1,143 +1,142 @@ -CollectLimit - WholeStageCodegen (20) - HashAggregate [sum,isEmpty] [sum(sales),sum(sales),sum,isEmpty] - InputAdapter - Exchange #1 - WholeStageCodegen (19) - HashAggregate [sales] [sum,isEmpty,sum,isEmpty] - InputAdapter - Union - WholeStageCodegen (9) - Project [cs_quantity,cs_list_price] - BroadcastHashJoin [cs_sold_date_sk,d_date_sk] - Project [cs_sold_date_sk,cs_quantity,cs_list_price] - BroadcastHashJoin [cs_bill_customer_sk,c_customer_sk] - Project [cs_sold_date_sk,cs_bill_customer_sk,cs_quantity,cs_list_price] - BroadcastHashJoin [cs_item_sk,item_sk] - Filter [cs_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk,cs_quantity,cs_list_price] - InputAdapter - BroadcastExchange #2 - WholeStageCodegen (4) - Project [item_sk] - Filter [count(1)] - HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] - InputAdapter - Exchange [substr(i_item_desc, 1, 30),i_item_sk,d_date] #3 - WholeStageCodegen (3) - HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] - Project [d_date,i_item_sk,i_item_desc] - BroadcastHashJoin [ss_item_sk,i_item_sk] - Project [ss_item_sk,d_date] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk,ss_item_sk] +WholeStageCodegen (20) + HashAggregate [sum,isEmpty] [sum(sales),sum(sales),sum,isEmpty] + InputAdapter + Exchange #1 + WholeStageCodegen (19) + HashAggregate [sales] [sum,isEmpty,sum,isEmpty] + InputAdapter + Union + WholeStageCodegen (9) + Project [cs_quantity,cs_list_price] + BroadcastHashJoin [cs_sold_date_sk,d_date_sk] + Project [cs_sold_date_sk,cs_quantity,cs_list_price] + BroadcastHashJoin [cs_bill_customer_sk,c_customer_sk] + Project [cs_sold_date_sk,cs_bill_customer_sk,cs_quantity,cs_list_price] + BroadcastHashJoin [cs_item_sk,item_sk] + Filter [cs_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.catalog_sales 
[cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk,cs_quantity,cs_list_price] + InputAdapter + BroadcastExchange #2 + WholeStageCodegen (4) + Project [item_sk] + Filter [count(1)] + HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] + InputAdapter + Exchange [substr(i_item_desc, 1, 30),i_item_sk,d_date] #3 + WholeStageCodegen (3) + HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] + Project [d_date,i_item_sk,i_item_desc] + BroadcastHashJoin [ss_item_sk,i_item_sk] + Project [ss_item_sk,d_date] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_sold_date_sk,ss_item_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk] + InputAdapter + BroadcastExchange #4 + WholeStageCodegen (1) + Project [d_date_sk,d_date] + Filter [d_year,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_date,d_year] + InputAdapter + BroadcastExchange #5 + WholeStageCodegen (2) + Filter [i_item_sk] ColumnarToRow InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk] - InputAdapter - BroadcastExchange #4 - WholeStageCodegen (1) - Project [d_date_sk,d_date] - Filter [d_year,d_date_sk] + Scan parquet default.item [i_item_sk,i_item_desc] + InputAdapter + BroadcastExchange #6 + WholeStageCodegen (7) + Project [c_customer_sk] + Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] + Subquery #1 + WholeStageCodegen (5) + HashAggregate [max] [max(csales),tpcds_cmax,max] + InputAdapter + Exchange #9 + WholeStageCodegen (4) + HashAggregate [csales] [max,max] + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),csales,sum,isEmpty] + InputAdapter + Exchange [c_customer_sk] #10 + WholeStageCodegen (3) + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Project [ss_sold_date_sk,ss_quantity,ss_sales_price,c_customer_sk] + BroadcastHashJoin [ss_customer_sk,c_customer_sk] + Filter [ss_customer_sk,ss_sold_date_sk] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date_sk,d_date,d_year] - InputAdapter - BroadcastExchange #5 - WholeStageCodegen (2) - Filter [i_item_sk] - ColumnarToRow - InputAdapter - Scan parquet default.item [i_item_sk,i_item_desc] - InputAdapter - BroadcastExchange #6 - WholeStageCodegen (7) - Project [c_customer_sk] - Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] - Subquery #1 - WholeStageCodegen (5) - HashAggregate [max] [max(csales),tpcds_cmax,max] - InputAdapter - Exchange #9 - WholeStageCodegen (4) - HashAggregate [csales] [max,max] - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),csales,sum,isEmpty] - InputAdapter - Exchange [c_customer_sk] #10 - WholeStageCodegen (3) - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - 
Project [ss_quantity,ss_sales_price,c_customer_sk] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Project [ss_sold_date_sk,ss_quantity,ss_sales_price,c_customer_sk] - BroadcastHashJoin [ss_customer_sk,c_customer_sk] - Filter [ss_customer_sk,ss_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk,ss_quantity,ss_sales_price] - InputAdapter - BroadcastExchange #11 - WholeStageCodegen (1) - Filter [c_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.customer [c_customer_sk] - InputAdapter - BroadcastExchange #12 - WholeStageCodegen (2) - Project [d_date_sk] - Filter [d_year,d_date_sk] + Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk,ss_quantity,ss_sales_price] + InputAdapter + BroadcastExchange #11 + WholeStageCodegen (1) + Filter [c_customer_sk] ColumnarToRow InputAdapter - Scan parquet default.date_dim [d_date_sk,d_year] - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] - InputAdapter - Exchange [c_customer_sk] #7 - WholeStageCodegen (6) - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - Project [ss_quantity,ss_sales_price,c_customer_sk] - BroadcastHashJoin [ss_customer_sk,c_customer_sk] - Filter [ss_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_customer_sk,ss_quantity,ss_sales_price] - InputAdapter - BroadcastExchange #8 - WholeStageCodegen (5) - Filter [c_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.customer [c_customer_sk] - InputAdapter - BroadcastExchange #13 - WholeStageCodegen (8) - Project [d_date_sk] - Filter [d_year,d_moy,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_year,d_moy] - WholeStageCodegen (18) - Project [ws_quantity,ws_list_price] - BroadcastHashJoin [ws_sold_date_sk,d_date_sk] - Project [ws_sold_date_sk,ws_quantity,ws_list_price] - BroadcastHashJoin [ws_bill_customer_sk,c_customer_sk] - Project [ws_sold_date_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] - BroadcastHashJoin [ws_item_sk,item_sk] - Filter [ws_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.web_sales [ws_sold_date_sk,ws_item_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] - InputAdapter - ReusedExchange [item_sk] #2 - InputAdapter - BroadcastExchange #14 - WholeStageCodegen (16) - Project [c_customer_sk] - Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] - ReusedSubquery [tpcds_cmax] #1 - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] - InputAdapter - ReusedExchange [c_customer_sk,sum,isEmpty] #7 - InputAdapter - ReusedExchange [d_date_sk] #13 + Scan parquet default.customer 
[c_customer_sk] + InputAdapter + BroadcastExchange #12 + WholeStageCodegen (2) + Project [d_date_sk] + Filter [d_year,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_year] + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] + InputAdapter + Exchange [c_customer_sk] #7 + WholeStageCodegen (6) + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + BroadcastHashJoin [ss_customer_sk,c_customer_sk] + Filter [ss_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_customer_sk,ss_quantity,ss_sales_price] + InputAdapter + BroadcastExchange #8 + WholeStageCodegen (5) + Filter [c_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.customer [c_customer_sk] + InputAdapter + BroadcastExchange #13 + WholeStageCodegen (8) + Project [d_date_sk] + Filter [d_year,d_moy,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_year,d_moy] + WholeStageCodegen (18) + Project [ws_quantity,ws_list_price] + BroadcastHashJoin [ws_sold_date_sk,d_date_sk] + Project [ws_sold_date_sk,ws_quantity,ws_list_price] + BroadcastHashJoin [ws_bill_customer_sk,c_customer_sk] + Project [ws_sold_date_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] + BroadcastHashJoin [ws_item_sk,item_sk] + Filter [ws_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.web_sales [ws_sold_date_sk,ws_item_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] + InputAdapter + ReusedExchange [item_sk] #2 + InputAdapter + BroadcastExchange #14 + WholeStageCodegen (16) + Project [c_customer_sk] + Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] + ReusedSubquery [tpcds_cmax] #1 + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] + InputAdapter + ReusedExchange [c_customer_sk,sum,isEmpty] #7 + InputAdapter + ReusedExchange [d_date_sk] #13 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/explain.txt index 51b85142f37ff..9a4c2b064d091 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/explain.txt @@ -1,134 +1,140 @@ == Physical Plan == -TakeOrderedAndProject (130) -+- Union (129) - :- * HashAggregate (82) - : +- Exchange (81) - : +- * HashAggregate (80) - : +- * Project (79) - : +- * SortMergeJoin Inner (78) - : :- * Project (59) - : : +- * BroadcastHashJoin Inner BuildRight (58) - : : :- SortMergeJoin 
LeftSemi (52) - : : : :- * Sort (34) - : : : : +- Exchange (33) - : : : : +- * Project (32) - : : : : +- SortMergeJoin LeftSemi (31) +TakeOrderedAndProject (136) ++- Union (135) + :- * HashAggregate (80) + : +- Exchange (79) + : +- * HashAggregate (78) + : +- * Project (77) + : +- * SortMergeJoin Inner (76) + : :- * Project (58) + : : +- * BroadcastHashJoin Inner BuildRight (57) + : : :- SortMergeJoin LeftSemi (51) + : : : :- * Sort (33) + : : : : +- Exchange (32) + : : : : +- * Project (31) + : : : : +- SortMergeJoin LeftSemi (30) : : : : :- * Sort (5) : : : : : +- Exchange (4) : : : : : +- * Filter (3) : : : : : +- * ColumnarToRow (2) : : : : : +- Scan parquet default.catalog_sales (1) - : : : : +- * Sort (30) - : : : : +- Exchange (29) - : : : : +- * Project (28) - : : : : +- * Filter (27) - : : : : +- * HashAggregate (26) - : : : : +- * HashAggregate (25) - : : : : +- * Project (24) - : : : : +- * SortMergeJoin Inner (23) - : : : : :- * Sort (17) - : : : : : +- Exchange (16) - : : : : : +- * Project (15) - : : : : : +- * BroadcastHashJoin Inner BuildRight (14) - : : : : : :- * Filter (8) - : : : : : : +- * ColumnarToRow (7) - : : : : : : +- Scan parquet default.store_sales (6) - : : : : : +- BroadcastExchange (13) - : : : : : +- * Project (12) - : : : : : +- * Filter (11) - : : : : : +- * ColumnarToRow (10) - : : : : : +- Scan parquet default.date_dim (9) - : : : : +- * Sort (22) - : : : : +- Exchange (21) - : : : : +- * Filter (20) - : : : : +- * ColumnarToRow (19) - : : : : +- Scan parquet default.item (18) - : : : +- * Sort (51) - : : : +- * Project (50) - : : : +- * Filter (49) - : : : +- * HashAggregate (48) - : : : +- * HashAggregate (47) - : : : +- * Project (46) - : : : +- * SortMergeJoin Inner (45) - : : : :- * Sort (39) - : : : : +- Exchange (38) - : : : : +- * Filter (37) - : : : : +- * ColumnarToRow (36) - : : : : +- Scan parquet default.store_sales (35) - : : : +- * Sort (44) - : : : +- Exchange (43) - : : : +- * Filter (42) - : : : +- * ColumnarToRow (41) - : : : +- Scan parquet default.customer (40) - : : +- BroadcastExchange (57) - : : +- * Project (56) - : : +- * Filter (55) - : : +- * ColumnarToRow (54) - : : +- Scan parquet default.date_dim (53) - : +- SortMergeJoin LeftSemi (77) - : :- * Sort (64) - : : +- Exchange (63) - : : +- * Filter (62) - : : +- * ColumnarToRow (61) - : : +- Scan parquet default.customer (60) - : +- * Sort (76) - : +- Exchange (75) - : +- * Project (74) - : +- * Filter (73) - : +- * HashAggregate (72) - : +- * HashAggregate (71) - : +- * Project (70) - : +- * SortMergeJoin Inner (69) - : :- * Sort (66) - : : +- ReusedExchange (65) - : +- * Sort (68) - : +- ReusedExchange (67) - +- * HashAggregate (128) - +- Exchange (127) - +- * HashAggregate (126) - +- * Project (125) - +- * SortMergeJoin Inner (124) - :- * Project (108) - : +- * BroadcastHashJoin Inner BuildRight (107) - : :- SortMergeJoin LeftSemi (105) - : : :- * Sort (93) - : : : +- Exchange (92) - : : : +- * Project (91) - : : : +- SortMergeJoin LeftSemi (90) - : : : :- * Sort (87) - : : : : +- Exchange (86) - : : : : +- * Filter (85) - : : : : +- * ColumnarToRow (84) - : : : : +- Scan parquet default.web_sales (83) - : : : +- * Sort (89) - : : : +- ReusedExchange (88) - : : +- * Sort (104) - : : +- * Project (103) - : : +- * Filter (102) - : : +- * HashAggregate (101) - : : +- * HashAggregate (100) - : : +- * Project (99) - : : +- * SortMergeJoin Inner (98) - : : :- * Sort (95) - : : : +- ReusedExchange (94) - : : +- * Sort (97) - : : +- ReusedExchange (96) - : +- ReusedExchange (106) - 
+- SortMergeJoin LeftSemi (123) - :- * Sort (110) - : +- ReusedExchange (109) - +- * Sort (122) - +- Exchange (121) - +- * Project (120) - +- * Filter (119) - +- * HashAggregate (118) - +- * HashAggregate (117) - +- * Project (116) - +- * SortMergeJoin Inner (115) - :- * Sort (112) - : +- ReusedExchange (111) - +- * Sort (114) - +- ReusedExchange (113) + : : : : +- * Sort (29) + : : : : +- * Project (28) + : : : : +- * Filter (27) + : : : : +- * HashAggregate (26) + : : : : +- * HashAggregate (25) + : : : : +- * Project (24) + : : : : +- * SortMergeJoin Inner (23) + : : : : :- * Sort (17) + : : : : : +- Exchange (16) + : : : : : +- * Project (15) + : : : : : +- * BroadcastHashJoin Inner BuildRight (14) + : : : : : :- * Filter (8) + : : : : : : +- * ColumnarToRow (7) + : : : : : : +- Scan parquet default.store_sales (6) + : : : : : +- BroadcastExchange (13) + : : : : : +- * Project (12) + : : : : : +- * Filter (11) + : : : : : +- * ColumnarToRow (10) + : : : : : +- Scan parquet default.date_dim (9) + : : : : +- * Sort (22) + : : : : +- Exchange (21) + : : : : +- * Filter (20) + : : : : +- * ColumnarToRow (19) + : : : : +- Scan parquet default.item (18) + : : : +- * Sort (50) + : : : +- * Project (49) + : : : +- * Filter (48) + : : : +- * HashAggregate (47) + : : : +- * HashAggregate (46) + : : : +- * Project (45) + : : : +- * SortMergeJoin Inner (44) + : : : :- * Sort (38) + : : : : +- Exchange (37) + : : : : +- * Filter (36) + : : : : +- * ColumnarToRow (35) + : : : : +- Scan parquet default.store_sales (34) + : : : +- * Sort (43) + : : : +- Exchange (42) + : : : +- * Filter (41) + : : : +- * ColumnarToRow (40) + : : : +- Scan parquet default.customer (39) + : : +- BroadcastExchange (56) + : : +- * Project (55) + : : +- * Filter (54) + : : +- * ColumnarToRow (53) + : : +- Scan parquet default.date_dim (52) + : +- SortMergeJoin LeftSemi (75) + : :- * Sort (63) + : : +- Exchange (62) + : : +- * Filter (61) + : : +- * ColumnarToRow (60) + : : +- Scan parquet default.customer (59) + : +- * Sort (74) + : +- * Project (73) + : +- * Filter (72) + : +- * HashAggregate (71) + : +- * HashAggregate (70) + : +- * Project (69) + : +- * SortMergeJoin Inner (68) + : :- * Sort (65) + : : +- ReusedExchange (64) + : +- * Sort (67) + : +- ReusedExchange (66) + +- * HashAggregate (134) + +- Exchange (133) + +- * HashAggregate (132) + +- * Project (131) + +- * SortMergeJoin Inner (130) + :- * Project (115) + : +- * BroadcastHashJoin Inner BuildRight (114) + : :- SortMergeJoin LeftSemi (112) + : : :- * Sort (100) + : : : +- Exchange (99) + : : : +- * Project (98) + : : : +- SortMergeJoin LeftSemi (97) + : : : :- * Sort (85) + : : : : +- Exchange (84) + : : : : +- * Filter (83) + : : : : +- * ColumnarToRow (82) + : : : : +- Scan parquet default.web_sales (81) + : : : +- * Sort (96) + : : : +- * Project (95) + : : : +- * Filter (94) + : : : +- * HashAggregate (93) + : : : +- * HashAggregate (92) + : : : +- * Project (91) + : : : +- * SortMergeJoin Inner (90) + : : : :- * Sort (87) + : : : : +- ReusedExchange (86) + : : : +- * Sort (89) + : : : +- ReusedExchange (88) + : : +- * Sort (111) + : : +- * Project (110) + : : +- * Filter (109) + : : +- * HashAggregate (108) + : : +- * HashAggregate (107) + : : +- * Project (106) + : : +- * SortMergeJoin Inner (105) + : : :- * Sort (102) + : : : +- ReusedExchange (101) + : : +- * Sort (104) + : : +- ReusedExchange (103) + : +- ReusedExchange (113) + +- SortMergeJoin LeftSemi (129) + :- * Sort (117) + : +- ReusedExchange (116) + +- * Sort (128) + +- * Project (127) + +- * 
Filter (126) + +- * HashAggregate (125) + +- * HashAggregate (124) + +- * Project (123) + +- * SortMergeJoin Inner (122) + :- * Sort (119) + : +- ReusedExchange (118) + +- * Sort (121) + +- ReusedExchange (120) (1) Scan parquet default.catalog_sales @@ -259,612 +265,642 @@ Condition : (count(1)#22 > 4) Output [1]: [item_sk#21] Input [2]: [item_sk#21, count(1)#22] -(29) Exchange -Input [1]: [item_sk#21] -Arguments: hashpartitioning(item_sk#21, 5), true, [id=#23] - -(30) Sort [codegen id : 9] +(29) Sort [codegen id : 8] Input [1]: [item_sk#21] Arguments: [item_sk#21 ASC NULLS FIRST], false, 0 -(31) SortMergeJoin +(30) SortMergeJoin Left keys [1]: [cs_item_sk#3] Right keys [1]: [item_sk#21] Join condition: None -(32) Project [codegen id : 10] +(31) Project [codegen id : 9] Output [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] Input [5]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_item_sk#3, cs_quantity#4, cs_list_price#5] -(33) Exchange +(32) Exchange Input [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] -Arguments: hashpartitioning(cs_bill_customer_sk#2, 5), true, [id=#24] +Arguments: hashpartitioning(cs_bill_customer_sk#2, 5), true, [id=#23] -(34) Sort [codegen id : 11] +(33) Sort [codegen id : 10] Input [4]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] Arguments: [cs_bill_customer_sk#2 ASC NULLS FIRST], false, 0 -(35) Scan parquet default.store_sales -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(34) Scan parquet default.store_sales +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] Batched: true Location [not included in comparison]/{warehouse_dir}/store_sales] PushedFilters: [IsNotNull(ss_customer_sk)] ReadSchema: struct -(36) ColumnarToRow [codegen id : 12] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(35) ColumnarToRow [codegen id : 11] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(37) Filter [codegen id : 12] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Condition : isnotnull(ss_customer_sk#25) +(36) Filter [codegen id : 11] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Condition : isnotnull(ss_customer_sk#24) -(38) Exchange -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: hashpartitioning(ss_customer_sk#25, 5), true, [id=#28] +(37) Exchange +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: hashpartitioning(ss_customer_sk#24, 5), true, [id=#27] -(39) Sort [codegen id : 13] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(38) Sort [codegen id : 12] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(40) Scan parquet default.customer -Output [1]: [c_customer_sk#29] +(39) Scan parquet default.customer +Output [1]: [c_customer_sk#28] Batched: true Location [not included in comparison]/{warehouse_dir}/customer] PushedFilters: [IsNotNull(c_customer_sk)] ReadSchema: struct -(41) ColumnarToRow [codegen id : 14] -Input [1]: [c_customer_sk#29] +(40) ColumnarToRow [codegen id : 13] +Input [1]: [c_customer_sk#28] -(42) Filter [codegen id : 14] -Input [1]: [c_customer_sk#29] -Condition : isnotnull(c_customer_sk#29) +(41) Filter [codegen id : 13] +Input [1]: [c_customer_sk#28] +Condition : isnotnull(c_customer_sk#28) -(43) Exchange -Input [1]: [c_customer_sk#29] 
-Arguments: hashpartitioning(c_customer_sk#29, 5), true, [id=#30] +(42) Exchange +Input [1]: [c_customer_sk#28] +Arguments: hashpartitioning(c_customer_sk#28, 5), true, [id=#29] -(44) Sort [codegen id : 15] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(43) Sort [codegen id : 14] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(45) SortMergeJoin [codegen id : 16] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(44) SortMergeJoin [codegen id : 15] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(46) Project [codegen id : 16] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(47) HashAggregate [codegen id : 16] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#31, isEmpty#32] -Results [3]: [c_customer_sk#29, sum#33, isEmpty#34] - -(48) HashAggregate [codegen id : 16] -Input [3]: [c_customer_sk#29, sum#33, isEmpty#34] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#35] -Results [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#35 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] - -(49) Filter [codegen id : 16] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] -Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(Subquery scalar-subquery#37, [id=#38] as decimal(32,6)))), DecimalType(38,8), true))) - -(50) Project [codegen id : 16] -Output [1]: [c_customer_sk#29] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] - -(51) Sort [codegen id : 16] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 - -(52) SortMergeJoin +(45) Project [codegen id : 15] 
+Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(46) HashAggregate [codegen id : 15] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#30, isEmpty#31] +Results [3]: [c_customer_sk#28, sum#32, isEmpty#33] + +(47) HashAggregate [codegen id : 15] +Input [3]: [c_customer_sk#28, sum#32, isEmpty#33] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#34] +Results [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#34 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] + +(48) Filter [codegen id : 15] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] +Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(Subquery scalar-subquery#36, [id=#37] as decimal(32,6)))), DecimalType(38,8), true))) + +(49) Project [codegen id : 15] +Output [1]: [c_customer_sk#28] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] + +(50) Sort [codegen id : 15] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 + +(51) SortMergeJoin Left keys [1]: [cs_bill_customer_sk#2] -Right keys [1]: [c_customer_sk#29] +Right keys [1]: [c_customer_sk#28] Join condition: None -(53) Scan parquet default.date_dim -Output [3]: [d_date_sk#9, d_year#11, d_moy#39] +(52) Scan parquet default.date_dim +Output [3]: [d_date_sk#9, d_year#11, d_moy#38] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [IsNotNull(d_year), IsNotNull(d_moy), EqualTo(d_year,2000), EqualTo(d_moy,2), IsNotNull(d_date_sk)] ReadSchema: struct -(54) ColumnarToRow [codegen id : 17] -Input [3]: [d_date_sk#9, d_year#11, d_moy#39] +(53) ColumnarToRow [codegen id : 16] +Input [3]: [d_date_sk#9, d_year#11, d_moy#38] -(55) Filter [codegen id : 17] 
-Input [3]: [d_date_sk#9, d_year#11, d_moy#39] -Condition : ((((isnotnull(d_year#11) AND isnotnull(d_moy#39)) AND (d_year#11 = 2000)) AND (d_moy#39 = 2)) AND isnotnull(d_date_sk#9)) +(54) Filter [codegen id : 16] +Input [3]: [d_date_sk#9, d_year#11, d_moy#38] +Condition : ((((isnotnull(d_year#11) AND isnotnull(d_moy#38)) AND (d_year#11 = 2000)) AND (d_moy#38 = 2)) AND isnotnull(d_date_sk#9)) -(56) Project [codegen id : 17] +(55) Project [codegen id : 16] Output [1]: [d_date_sk#9] -Input [3]: [d_date_sk#9, d_year#11, d_moy#39] +Input [3]: [d_date_sk#9, d_year#11, d_moy#38] -(57) BroadcastExchange +(56) BroadcastExchange Input [1]: [d_date_sk#9] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#40] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#39] -(58) BroadcastHashJoin [codegen id : 18] +(57) BroadcastHashJoin [codegen id : 17] Left keys [1]: [cs_sold_date_sk#1] Right keys [1]: [d_date_sk#9] Join condition: None -(59) Project [codegen id : 18] +(58) Project [codegen id : 17] Output [3]: [cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5] Input [5]: [cs_sold_date_sk#1, cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5, d_date_sk#9] -(60) Scan parquet default.customer -Output [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] +(59) Scan parquet default.customer +Output [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] Batched: true Location [not included in comparison]/{warehouse_dir}/customer] PushedFilters: [IsNotNull(c_customer_sk)] ReadSchema: struct -(61) ColumnarToRow [codegen id : 19] -Input [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] +(60) ColumnarToRow [codegen id : 18] +Input [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] -(62) Filter [codegen id : 19] -Input [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] -Condition : isnotnull(c_customer_sk#29) +(61) Filter [codegen id : 18] +Input [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] +Condition : isnotnull(c_customer_sk#28) -(63) Exchange -Input [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] -Arguments: hashpartitioning(c_customer_sk#29, 5), true, [id=#43] +(62) Exchange +Input [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] +Arguments: hashpartitioning(c_customer_sk#28, 5), true, [id=#42] -(64) Sort [codegen id : 20] -Input [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(63) Sort [codegen id : 19] +Input [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(65) ReusedExchange [Reuses operator id: 38] -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(64) ReusedExchange [Reuses operator id: 37] +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(66) Sort [codegen id : 22] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(65) Sort [codegen id : 21] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(67) ReusedExchange [Reuses operator id: 43] -Output [1]: [c_customer_sk#29] +(66) ReusedExchange [Reuses operator id: 42] +Output [1]: [c_customer_sk#28] -(68) Sort [codegen id : 24] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(67) Sort [codegen id : 23] +Input [1]: [c_customer_sk#28] +Arguments: 
[c_customer_sk#28 ASC NULLS FIRST], false, 0 -(69) SortMergeJoin [codegen id : 25] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(68) SortMergeJoin [codegen id : 24] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(70) Project [codegen id : 25] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(71) HashAggregate [codegen id : 25] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#31, isEmpty#32] -Results [3]: [c_customer_sk#29, sum#33, isEmpty#34] - -(72) HashAggregate [codegen id : 25] -Input [3]: [c_customer_sk#29, sum#33, isEmpty#34] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#35] -Results [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#35 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] - -(73) Filter [codegen id : 25] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] -Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#37, [id=#38] as decimal(32,6)))), DecimalType(38,8), true))) - -(74) Project [codegen id : 25] -Output [1]: [c_customer_sk#29 AS c_customer_sk#29#44] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#36] - -(75) Exchange -Input [1]: [c_customer_sk#29#44] -Arguments: hashpartitioning(c_customer_sk#29#44, 5), true, [id=#45] - -(76) Sort [codegen id : 26] -Input [1]: [c_customer_sk#29#44] -Arguments: [c_customer_sk#29#44 ASC NULLS FIRST], false, 0 - -(77) SortMergeJoin -Left keys [1]: [c_customer_sk#29] -Right keys [1]: [c_customer_sk#29#44] +(69) Project [codegen id : 24] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, 
ss_sales_price#26, c_customer_sk#28] + +(70) HashAggregate [codegen id : 24] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#30, isEmpty#31] +Results [3]: [c_customer_sk#28, sum#32, isEmpty#33] + +(71) HashAggregate [codegen id : 24] +Input [3]: [c_customer_sk#28, sum#32, isEmpty#33] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#34] +Results [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#34 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] + +(72) Filter [codegen id : 24] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] +Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#36, [id=#37] as decimal(32,6)))), DecimalType(38,8), true))) + +(73) Project [codegen id : 24] +Output [1]: [c_customer_sk#28 AS c_customer_sk#28#43] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#35] + +(74) Sort [codegen id : 24] +Input [1]: [c_customer_sk#28#43] +Arguments: [c_customer_sk#28#43 ASC NULLS FIRST], false, 0 + +(75) SortMergeJoin +Left keys [1]: [c_customer_sk#28] +Right keys [1]: [c_customer_sk#28#43] Join condition: None -(78) SortMergeJoin [codegen id : 27] +(76) SortMergeJoin [codegen id : 25] Left keys [1]: [cs_bill_customer_sk#2] -Right keys [1]: [c_customer_sk#29] +Right keys [1]: [c_customer_sk#28] Join condition: None -(79) Project [codegen id : 27] -Output [4]: [cs_quantity#4, cs_list_price#5, c_first_name#41, c_last_name#42] -Input [6]: [cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5, c_customer_sk#29, c_first_name#41, c_last_name#42] +(77) Project [codegen id : 25] +Output [4]: [cs_quantity#4, cs_list_price#5, c_first_name#40, c_last_name#41] +Input [6]: [cs_bill_customer_sk#2, cs_quantity#4, cs_list_price#5, c_customer_sk#28, c_first_name#40, c_last_name#41] -(80) HashAggregate 
[codegen id : 27] -Input [4]: [cs_quantity#4, cs_list_price#5, c_first_name#41, c_last_name#42] -Keys [2]: [c_last_name#42, c_first_name#41] +(78) HashAggregate [codegen id : 25] +Input [4]: [cs_quantity#4, cs_list_price#5, c_first_name#40, c_last_name#41] +Keys [2]: [c_last_name#41, c_first_name#40] Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#46, isEmpty#47] -Results [4]: [c_last_name#42, c_first_name#41, sum#48, isEmpty#49] +Aggregate Attributes [2]: [sum#44, isEmpty#45] +Results [4]: [c_last_name#41, c_first_name#40, sum#46, isEmpty#47] -(81) Exchange -Input [4]: [c_last_name#42, c_first_name#41, sum#48, isEmpty#49] -Arguments: hashpartitioning(c_last_name#42, c_first_name#41, 5), true, [id=#50] +(79) Exchange +Input [4]: [c_last_name#41, c_first_name#40, sum#46, isEmpty#47] +Arguments: hashpartitioning(c_last_name#41, c_first_name#40, 5), true, [id=#48] -(82) HashAggregate [codegen id : 28] -Input [4]: [c_last_name#42, c_first_name#41, sum#48, isEmpty#49] -Keys [2]: [c_last_name#42, c_first_name#41] +(80) HashAggregate [codegen id : 26] +Input [4]: [c_last_name#41, c_first_name#40, sum#46, isEmpty#47] +Keys [2]: [c_last_name#41, c_first_name#40] Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true))#51] -Results [3]: [c_last_name#42, c_first_name#41, sum(CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true))#51 AS sales#52] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true))#49] +Results [3]: [c_last_name#41, c_first_name#40, sum(CheckOverflow((promote_precision(cast(cast(cs_quantity#4 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price#5 as decimal(12,2)))), DecimalType(18,2), true))#49 AS sales#50] -(83) Scan parquet default.web_sales -Output [5]: [ws_sold_date_sk#53, ws_item_sk#54, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] +(81) Scan parquet default.web_sales +Output [5]: [ws_sold_date_sk#51, ws_item_sk#52, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] Batched: true Location [not included in comparison]/{warehouse_dir}/web_sales] PushedFilters: [IsNotNull(ws_bill_customer_sk), IsNotNull(ws_sold_date_sk)] ReadSchema: struct -(84) ColumnarToRow [codegen id : 29] -Input [5]: [ws_sold_date_sk#53, ws_item_sk#54, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] +(82) ColumnarToRow [codegen id : 27] +Input [5]: [ws_sold_date_sk#51, ws_item_sk#52, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] + +(83) Filter [codegen id : 27] +Input [5]: [ws_sold_date_sk#51, ws_item_sk#52, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] +Condition : (isnotnull(ws_bill_customer_sk#53) AND isnotnull(ws_sold_date_sk#51)) + +(84) Exchange +Input [5]: [ws_sold_date_sk#51, ws_item_sk#52, 
ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] +Arguments: hashpartitioning(ws_item_sk#52, 5), true, [id=#56] -(85) Filter [codegen id : 29] -Input [5]: [ws_sold_date_sk#53, ws_item_sk#54, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Condition : (isnotnull(ws_bill_customer_sk#55) AND isnotnull(ws_sold_date_sk#53)) +(85) Sort [codegen id : 28] +Input [5]: [ws_sold_date_sk#51, ws_item_sk#52, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] +Arguments: [ws_item_sk#52 ASC NULLS FIRST], false, 0 -(86) Exchange -Input [5]: [ws_sold_date_sk#53, ws_item_sk#54, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Arguments: hashpartitioning(ws_item_sk#54, 5), true, [id=#58] +(86) ReusedExchange [Reuses operator id: 16] +Output [2]: [ss_item_sk#8, d_date#10] + +(87) Sort [codegen id : 31] +Input [2]: [ss_item_sk#8, d_date#10] +Arguments: [ss_item_sk#8 ASC NULLS FIRST], false, 0 + +(88) ReusedExchange [Reuses operator id: 21] +Output [2]: [i_item_sk#14, i_item_desc#15] + +(89) Sort [codegen id : 33] +Input [2]: [i_item_sk#14, i_item_desc#15] +Arguments: [i_item_sk#14 ASC NULLS FIRST], false, 0 + +(90) SortMergeJoin [codegen id : 34] +Left keys [1]: [ss_item_sk#8] +Right keys [1]: [i_item_sk#14] +Join condition: None + +(91) Project [codegen id : 34] +Output [3]: [d_date#10, i_item_sk#14, i_item_desc#15] +Input [4]: [ss_item_sk#8, d_date#10, i_item_sk#14, i_item_desc#15] + +(92) HashAggregate [codegen id : 34] +Input [3]: [d_date#10, i_item_sk#14, i_item_desc#15] +Keys [3]: [substr(i_item_desc#15, 1, 30) AS substr(i_item_desc#15, 1, 30)#57, i_item_sk#14, d_date#10] +Functions [1]: [partial_count(1)] +Aggregate Attributes [1]: [count#58] +Results [4]: [substr(i_item_desc#15, 1, 30)#57, i_item_sk#14, d_date#10, count#59] + +(93) HashAggregate [codegen id : 34] +Input [4]: [substr(i_item_desc#15, 1, 30)#57, i_item_sk#14, d_date#10, count#59] +Keys [3]: [substr(i_item_desc#15, 1, 30)#57, i_item_sk#14, d_date#10] +Functions [1]: [count(1)] +Aggregate Attributes [1]: [count(1)#60] +Results [2]: [i_item_sk#14 AS item_sk#21, count(1)#60 AS count(1)#61] -(87) Sort [codegen id : 30] -Input [5]: [ws_sold_date_sk#53, ws_item_sk#54, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Arguments: [ws_item_sk#54 ASC NULLS FIRST], false, 0 +(94) Filter [codegen id : 34] +Input [2]: [item_sk#21, count(1)#61] +Condition : (count(1)#61 > 4) -(88) ReusedExchange [Reuses operator id: 29] +(95) Project [codegen id : 34] Output [1]: [item_sk#21] +Input [2]: [item_sk#21, count(1)#61] -(89) Sort [codegen id : 37] +(96) Sort [codegen id : 34] Input [1]: [item_sk#21] Arguments: [item_sk#21 ASC NULLS FIRST], false, 0 -(90) SortMergeJoin -Left keys [1]: [ws_item_sk#54] +(97) SortMergeJoin +Left keys [1]: [ws_item_sk#52] Right keys [1]: [item_sk#21] Join condition: None -(91) Project [codegen id : 38] -Output [4]: [ws_sold_date_sk#53, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Input [5]: [ws_sold_date_sk#53, ws_item_sk#54, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] +(98) Project [codegen id : 35] +Output [4]: [ws_sold_date_sk#51, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] +Input [5]: [ws_sold_date_sk#51, ws_item_sk#52, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] -(92) Exchange -Input [4]: [ws_sold_date_sk#53, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Arguments: hashpartitioning(ws_bill_customer_sk#55, 5), true, [id=#59] +(99) Exchange +Input [4]: [ws_sold_date_sk#51, ws_bill_customer_sk#53, ws_quantity#54, 
ws_list_price#55] +Arguments: hashpartitioning(ws_bill_customer_sk#53, 5), true, [id=#62] -(93) Sort [codegen id : 39] -Input [4]: [ws_sold_date_sk#53, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Arguments: [ws_bill_customer_sk#55 ASC NULLS FIRST], false, 0 +(100) Sort [codegen id : 36] +Input [4]: [ws_sold_date_sk#51, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] +Arguments: [ws_bill_customer_sk#53 ASC NULLS FIRST], false, 0 -(94) ReusedExchange [Reuses operator id: 38] -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(101) ReusedExchange [Reuses operator id: 37] +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(95) Sort [codegen id : 41] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(102) Sort [codegen id : 38] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(96) ReusedExchange [Reuses operator id: 43] -Output [1]: [c_customer_sk#29] +(103) ReusedExchange [Reuses operator id: 42] +Output [1]: [c_customer_sk#28] -(97) Sort [codegen id : 43] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(104) Sort [codegen id : 40] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(98) SortMergeJoin [codegen id : 44] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(105) SortMergeJoin [codegen id : 41] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(99) Project [codegen id : 44] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(100) HashAggregate [codegen id : 44] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#60, isEmpty#61] -Results [3]: [c_customer_sk#29, sum#62, isEmpty#63] - -(101) HashAggregate [codegen id : 44] -Input [3]: [c_customer_sk#29, sum#62, isEmpty#63] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#64] -Results [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#64 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65] - -(102) Filter [codegen id : 44] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65] -Condition : 
(isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#37, [id=#38] as decimal(32,6)))), DecimalType(38,8), true))) - -(103) Project [codegen id : 44] -Output [1]: [c_customer_sk#29] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65] - -(104) Sort [codegen id : 44] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 - -(105) SortMergeJoin -Left keys [1]: [ws_bill_customer_sk#55] -Right keys [1]: [c_customer_sk#29] +(106) Project [codegen id : 41] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(107) HashAggregate [codegen id : 41] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#63, isEmpty#64] +Results [3]: [c_customer_sk#28, sum#65, isEmpty#66] + +(108) HashAggregate [codegen id : 41] +Input [3]: [c_customer_sk#28, sum#65, isEmpty#66] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#67] +Results [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#67 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68] + +(109) Filter [codegen id : 41] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68] +Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#36, [id=#37] as decimal(32,6)))), DecimalType(38,8), true))) + +(110) 
Project [codegen id : 41] +Output [1]: [c_customer_sk#28] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68] + +(111) Sort [codegen id : 41] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 + +(112) SortMergeJoin +Left keys [1]: [ws_bill_customer_sk#53] +Right keys [1]: [c_customer_sk#28] Join condition: None -(106) ReusedExchange [Reuses operator id: 57] +(113) ReusedExchange [Reuses operator id: 56] Output [1]: [d_date_sk#9] -(107) BroadcastHashJoin [codegen id : 46] -Left keys [1]: [ws_sold_date_sk#53] +(114) BroadcastHashJoin [codegen id : 43] +Left keys [1]: [ws_sold_date_sk#51] Right keys [1]: [d_date_sk#9] Join condition: None -(108) Project [codegen id : 46] -Output [3]: [ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57] -Input [5]: [ws_sold_date_sk#53, ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57, d_date_sk#9] +(115) Project [codegen id : 43] +Output [3]: [ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55] +Input [5]: [ws_sold_date_sk#51, ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55, d_date_sk#9] -(109) ReusedExchange [Reuses operator id: 63] -Output [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] +(116) ReusedExchange [Reuses operator id: 62] +Output [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] -(110) Sort [codegen id : 48] -Input [3]: [c_customer_sk#29, c_first_name#41, c_last_name#42] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(117) Sort [codegen id : 45] +Input [3]: [c_customer_sk#28, c_first_name#40, c_last_name#41] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(111) ReusedExchange [Reuses operator id: 38] -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(118) ReusedExchange [Reuses operator id: 37] +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(112) Sort [codegen id : 50] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(119) Sort [codegen id : 47] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(113) ReusedExchange [Reuses operator id: 43] -Output [1]: [c_customer_sk#29] +(120) ReusedExchange [Reuses operator id: 42] +Output [1]: [c_customer_sk#28] -(114) Sort [codegen id : 52] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(121) Sort [codegen id : 49] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(115) SortMergeJoin [codegen id : 53] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(122) SortMergeJoin [codegen id : 50] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(116) Project [codegen id : 53] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(117) HashAggregate [codegen id : 53] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#60, 
isEmpty#61] -Results [3]: [c_customer_sk#29, sum#62, isEmpty#63] - -(118) HashAggregate [codegen id : 53] -Input [3]: [c_customer_sk#29, sum#62, isEmpty#63] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#64] -Results [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#64 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65] - -(119) Filter [codegen id : 53] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65] -Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#37, [id=#38] as decimal(32,6)))), DecimalType(38,8), true))) - -(120) Project [codegen id : 53] -Output [1]: [c_customer_sk#29 AS c_customer_sk#29#66] -Input [2]: [c_customer_sk#29, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#65] - -(121) Exchange -Input [1]: [c_customer_sk#29#66] -Arguments: hashpartitioning(c_customer_sk#29#66, 5), true, [id=#67] - -(122) Sort [codegen id : 54] -Input [1]: [c_customer_sk#29#66] -Arguments: [c_customer_sk#29#66 ASC NULLS FIRST], false, 0 - -(123) SortMergeJoin -Left keys [1]: [c_customer_sk#29] -Right keys [1]: [c_customer_sk#29#66] +(123) Project [codegen id : 50] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(124) HashAggregate [codegen id : 50] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#63, isEmpty#64] +Results [3]: [c_customer_sk#28, sum#65, isEmpty#66] + +(125) HashAggregate [codegen id : 50] +Input [3]: [c_customer_sk#28, sum#65, isEmpty#66] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: 
[sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#67] +Results [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#67 AS sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68] + +(126) Filter [codegen id : 50] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68] +Condition : (isnotnull(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68) AND (cast(sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68 as decimal(38,8)) > CheckOverflow((0.500000 * promote_precision(cast(ReusedSubquery Subquery scalar-subquery#36, [id=#37] as decimal(32,6)))), DecimalType(38,8), true))) + +(127) Project [codegen id : 50] +Output [1]: [c_customer_sk#28 AS c_customer_sk#28#69] +Input [2]: [c_customer_sk#28, sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#68] + +(128) Sort [codegen id : 50] +Input [1]: [c_customer_sk#28#69] +Arguments: [c_customer_sk#28#69 ASC NULLS FIRST], false, 0 + +(129) SortMergeJoin +Left keys [1]: [c_customer_sk#28] +Right keys [1]: [c_customer_sk#28#69] Join condition: None -(124) SortMergeJoin [codegen id : 55] -Left keys [1]: [ws_bill_customer_sk#55] -Right keys [1]: [c_customer_sk#29] +(130) SortMergeJoin [codegen id : 51] +Left keys [1]: [ws_bill_customer_sk#53] +Right keys [1]: [c_customer_sk#28] Join condition: None -(125) Project [codegen id : 55] -Output [4]: [ws_quantity#56, ws_list_price#57, c_first_name#41, c_last_name#42] -Input [6]: [ws_bill_customer_sk#55, ws_quantity#56, ws_list_price#57, c_customer_sk#29, c_first_name#41, c_last_name#42] +(131) Project [codegen id : 51] +Output [4]: [ws_quantity#54, ws_list_price#55, c_first_name#40, c_last_name#41] +Input [6]: [ws_bill_customer_sk#53, ws_quantity#54, ws_list_price#55, c_customer_sk#28, c_first_name#40, c_last_name#41] -(126) HashAggregate [codegen id : 55] -Input [4]: [ws_quantity#56, ws_list_price#57, c_first_name#41, c_last_name#42] -Keys [2]: [c_last_name#42, c_first_name#41] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#56 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#57 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#68, isEmpty#69] -Results [4]: [c_last_name#42, c_first_name#41, sum#70, isEmpty#71] +(132) HashAggregate [codegen id : 51] +Input [4]: [ws_quantity#54, ws_list_price#55, c_first_name#40, c_last_name#41] +Keys [2]: [c_last_name#41, c_first_name#40] +Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#54 as decimal(10,0)) as decimal(12,2))) * 
promote_precision(cast(ws_list_price#55 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#70, isEmpty#71] +Results [4]: [c_last_name#41, c_first_name#40, sum#72, isEmpty#73] -(127) Exchange -Input [4]: [c_last_name#42, c_first_name#41, sum#70, isEmpty#71] -Arguments: hashpartitioning(c_last_name#42, c_first_name#41, 5), true, [id=#72] +(133) Exchange +Input [4]: [c_last_name#41, c_first_name#40, sum#72, isEmpty#73] +Arguments: hashpartitioning(c_last_name#41, c_first_name#40, 5), true, [id=#74] -(128) HashAggregate [codegen id : 56] -Input [4]: [c_last_name#42, c_first_name#41, sum#70, isEmpty#71] -Keys [2]: [c_last_name#42, c_first_name#41] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#56 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#57 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#56 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#57 as decimal(12,2)))), DecimalType(18,2), true))#73] -Results [3]: [c_last_name#42, c_first_name#41, sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#56 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#57 as decimal(12,2)))), DecimalType(18,2), true))#73 AS sales#74] +(134) HashAggregate [codegen id : 52] +Input [4]: [c_last_name#41, c_first_name#40, sum#72, isEmpty#73] +Keys [2]: [c_last_name#41, c_first_name#40] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#54 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#55 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#54 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#55 as decimal(12,2)))), DecimalType(18,2), true))#75] +Results [3]: [c_last_name#41, c_first_name#40, sum(CheckOverflow((promote_precision(cast(cast(ws_quantity#54 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price#55 as decimal(12,2)))), DecimalType(18,2), true))#75 AS sales#76] -(129) Union +(135) Union -(130) TakeOrderedAndProject -Input [3]: [c_last_name#42, c_first_name#41, sales#52] -Arguments: 100, [c_last_name#42 ASC NULLS FIRST, c_first_name#41 ASC NULLS FIRST, sales#52 ASC NULLS FIRST], [c_last_name#42, c_first_name#41, sales#52] +(136) TakeOrderedAndProject +Input [3]: [c_last_name#41, c_first_name#40, sales#50] +Arguments: 100, [c_last_name#41 ASC NULLS FIRST, c_first_name#40 ASC NULLS FIRST, sales#50 ASC NULLS FIRST], [c_last_name#41, c_first_name#40, sales#50] ===== Subqueries ===== -Subquery:1 Hosting operator id = 49 Hosting Expression = Subquery scalar-subquery#37, [id=#38] -* HashAggregate (154) -+- Exchange (153) - +- * HashAggregate (152) - +- * HashAggregate (151) - +- * HashAggregate (150) - +- * Project (149) - +- * SortMergeJoin Inner (148) - :- * Sort (142) - : +- Exchange (141) - : +- * Project (140) - : +- * BroadcastHashJoin Inner BuildRight (139) - : :- * Filter (133) - : : +- * ColumnarToRow (132) - : : +- Scan parquet default.store_sales (131) - : +- BroadcastExchange (138) - : +- * Project (137) - : +- * Filter (136) - : +- * ColumnarToRow (135) - : +- Scan parquet default.date_dim (134) - +- * Sort (147) - +- Exchange (146) - +- * Filter (145) - +- * ColumnarToRow (144) - +- Scan parquet default.customer (143) - - -(131) Scan parquet default.store_sales -Output [4]: [ss_sold_date_sk#7, 
ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +Subquery:1 Hosting operator id = 48 Hosting Expression = Subquery scalar-subquery#36, [id=#37] +* HashAggregate (160) ++- Exchange (159) + +- * HashAggregate (158) + +- * HashAggregate (157) + +- * HashAggregate (156) + +- * Project (155) + +- * SortMergeJoin Inner (154) + :- * Sort (148) + : +- Exchange (147) + : +- * Project (146) + : +- * BroadcastHashJoin Inner BuildRight (145) + : :- * Filter (139) + : : +- * ColumnarToRow (138) + : : +- Scan parquet default.store_sales (137) + : +- BroadcastExchange (144) + : +- * Project (143) + : +- * Filter (142) + : +- * ColumnarToRow (141) + : +- Scan parquet default.date_dim (140) + +- * Sort (153) + +- Exchange (152) + +- * Filter (151) + +- * ColumnarToRow (150) + +- Scan parquet default.customer (149) + + +(137) Scan parquet default.store_sales +Output [4]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] Batched: true Location [not included in comparison]/{warehouse_dir}/store_sales] PushedFilters: [IsNotNull(ss_customer_sk), IsNotNull(ss_sold_date_sk)] ReadSchema: struct -(132) ColumnarToRow [codegen id : 2] -Input [4]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] +(138) ColumnarToRow [codegen id : 2] +Input [4]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] -(133) Filter [codegen id : 2] -Input [4]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Condition : (isnotnull(ss_customer_sk#25) AND isnotnull(ss_sold_date_sk#7)) +(139) Filter [codegen id : 2] +Input [4]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Condition : (isnotnull(ss_customer_sk#24) AND isnotnull(ss_sold_date_sk#7)) -(134) Scan parquet default.date_dim +(140) Scan parquet default.date_dim Output [2]: [d_date_sk#9, d_year#11] Batched: true Location [not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [In(d_year, [2000,2001,2002,2003]), IsNotNull(d_date_sk)] ReadSchema: struct -(135) ColumnarToRow [codegen id : 1] +(141) ColumnarToRow [codegen id : 1] Input [2]: [d_date_sk#9, d_year#11] -(136) Filter [codegen id : 1] +(142) Filter [codegen id : 1] Input [2]: [d_date_sk#9, d_year#11] Condition : (d_year#11 IN (2000,2001,2002,2003) AND isnotnull(d_date_sk#9)) -(137) Project [codegen id : 1] +(143) Project [codegen id : 1] Output [1]: [d_date_sk#9] Input [2]: [d_date_sk#9, d_year#11] -(138) BroadcastExchange +(144) BroadcastExchange Input [1]: [d_date_sk#9] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#75] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#77] -(139) BroadcastHashJoin [codegen id : 2] +(145) BroadcastHashJoin [codegen id : 2] Left keys [1]: [ss_sold_date_sk#7] Right keys [1]: [d_date_sk#9] Join condition: None -(140) Project [codegen id : 2] -Output [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Input [5]: [ss_sold_date_sk#7, ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, d_date_sk#9] +(146) Project [codegen id : 2] +Output [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Input [5]: [ss_sold_date_sk#7, ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, d_date_sk#9] -(141) Exchange -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: hashpartitioning(ss_customer_sk#25, 5), true, [id=#76] +(147) Exchange +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: 
hashpartitioning(ss_customer_sk#24, 5), true, [id=#78] -(142) Sort [codegen id : 3] -Input [3]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27] -Arguments: [ss_customer_sk#25 ASC NULLS FIRST], false, 0 +(148) Sort [codegen id : 3] +Input [3]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26] +Arguments: [ss_customer_sk#24 ASC NULLS FIRST], false, 0 -(143) Scan parquet default.customer -Output [1]: [c_customer_sk#29] +(149) Scan parquet default.customer +Output [1]: [c_customer_sk#28] Batched: true Location [not included in comparison]/{warehouse_dir}/customer] PushedFilters: [IsNotNull(c_customer_sk)] ReadSchema: struct -(144) ColumnarToRow [codegen id : 4] -Input [1]: [c_customer_sk#29] +(150) ColumnarToRow [codegen id : 4] +Input [1]: [c_customer_sk#28] -(145) Filter [codegen id : 4] -Input [1]: [c_customer_sk#29] -Condition : isnotnull(c_customer_sk#29) +(151) Filter [codegen id : 4] +Input [1]: [c_customer_sk#28] +Condition : isnotnull(c_customer_sk#28) -(146) Exchange -Input [1]: [c_customer_sk#29] -Arguments: hashpartitioning(c_customer_sk#29, 5), true, [id=#77] +(152) Exchange +Input [1]: [c_customer_sk#28] +Arguments: hashpartitioning(c_customer_sk#28, 5), true, [id=#79] -(147) Sort [codegen id : 5] -Input [1]: [c_customer_sk#29] -Arguments: [c_customer_sk#29 ASC NULLS FIRST], false, 0 +(153) Sort [codegen id : 5] +Input [1]: [c_customer_sk#28] +Arguments: [c_customer_sk#28 ASC NULLS FIRST], false, 0 -(148) SortMergeJoin [codegen id : 6] -Left keys [1]: [ss_customer_sk#25] -Right keys [1]: [c_customer_sk#29] +(154) SortMergeJoin [codegen id : 6] +Left keys [1]: [ss_customer_sk#24] +Right keys [1]: [c_customer_sk#28] Join condition: None -(149) Project [codegen id : 6] -Output [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Input [4]: [ss_customer_sk#25, ss_quantity#26, ss_sales_price#27, c_customer_sk#29] - -(150) HashAggregate [codegen id : 6] -Input [3]: [ss_quantity#26, ss_sales_price#27, c_customer_sk#29] -Keys [1]: [c_customer_sk#29] -Functions [1]: [partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [2]: [sum#78, isEmpty#79] -Results [3]: [c_customer_sk#29, sum#80, isEmpty#81] - -(151) HashAggregate [codegen id : 6] -Input [3]: [c_customer_sk#29, sum#80, isEmpty#81] -Keys [1]: [c_customer_sk#29] -Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))] -Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#82] -Results [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#26 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#27 as decimal(12,2)))), DecimalType(18,2), true))#82 AS csales#83] - -(152) HashAggregate [codegen id : 6] -Input [1]: [csales#83] +(155) Project [codegen id : 6] +Output [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Input [4]: [ss_customer_sk#24, ss_quantity#25, ss_sales_price#26, c_customer_sk#28] + +(156) HashAggregate [codegen id : 6] +Input [3]: [ss_quantity#25, ss_sales_price#26, c_customer_sk#28] +Keys [1]: [c_customer_sk#28] +Functions [1]: 
[partial_sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [2]: [sum#80, isEmpty#81] +Results [3]: [c_customer_sk#28, sum#82, isEmpty#83] + +(157) HashAggregate [codegen id : 6] +Input [3]: [c_customer_sk#28, sum#82, isEmpty#83] +Keys [1]: [c_customer_sk#28] +Functions [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))] +Aggregate Attributes [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#84] +Results [1]: [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity#25 as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price#26 as decimal(12,2)))), DecimalType(18,2), true))#84 AS csales#85] + +(158) HashAggregate [codegen id : 6] +Input [1]: [csales#85] Keys: [] -Functions [1]: [partial_max(csales#83)] -Aggregate Attributes [1]: [max#84] -Results [1]: [max#85] +Functions [1]: [partial_max(csales#85)] +Aggregate Attributes [1]: [max#86] +Results [1]: [max#87] -(153) Exchange -Input [1]: [max#85] -Arguments: SinglePartition, true, [id=#86] +(159) Exchange +Input [1]: [max#87] +Arguments: SinglePartition, true, [id=#88] -(154) HashAggregate [codegen id : 7] -Input [1]: [max#85] +(160) HashAggregate [codegen id : 7] +Input [1]: [max#87] Keys: [] -Functions [1]: [max(csales#83)] -Aggregate Attributes [1]: [max(csales#83)#87] -Results [1]: [max(csales#83)#87 AS tpcds_cmax#88] +Functions [1]: [max(csales#85)] +Aggregate Attributes [1]: [max(csales#85)#89] +Results [1]: [max(csales#85)#89 AS tpcds_cmax#90] -Subquery:2 Hosting operator id = 73 Hosting Expression = ReusedSubquery Subquery scalar-subquery#37, [id=#38] +Subquery:2 Hosting operator id = 72 Hosting Expression = ReusedSubquery Subquery scalar-subquery#36, [id=#37] -Subquery:3 Hosting operator id = 102 Hosting Expression = ReusedSubquery Subquery scalar-subquery#37, [id=#38] +Subquery:3 Hosting operator id = 109 Hosting Expression = ReusedSubquery Subquery scalar-subquery#36, [id=#37] -Subquery:4 Hosting operator id = 119 Hosting Expression = ReusedSubquery Subquery scalar-subquery#37, [id=#38] +Subquery:4 Hosting operator id = 126 Hosting Expression = ReusedSubquery Subquery scalar-subquery#36, [id=#37] diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/simplified.txt index e8891f032a091..4279bf3e16a82 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/simplified.txt @@ -1,24 +1,24 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] Union - WholeStageCodegen (28) + WholeStageCodegen (26) HashAggregate [c_last_name,c_first_name,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(cs_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(cs_list_price as decimal(12,2)))), DecimalType(18,2), true)),sales,sum,isEmpty] InputAdapter Exchange [c_last_name,c_first_name] #1 - WholeStageCodegen (27) + WholeStageCodegen (25) HashAggregate 
[c_last_name,c_first_name,cs_quantity,cs_list_price] [sum,isEmpty,sum,isEmpty] Project [cs_quantity,cs_list_price,c_first_name,c_last_name] SortMergeJoin [cs_bill_customer_sk,c_customer_sk] InputAdapter - WholeStageCodegen (18) + WholeStageCodegen (17) Project [cs_bill_customer_sk,cs_quantity,cs_list_price] BroadcastHashJoin [cs_sold_date_sk,d_date_sk] InputAdapter SortMergeJoin [cs_bill_customer_sk,c_customer_sk] - WholeStageCodegen (11) + WholeStageCodegen (10) Sort [cs_bill_customer_sk] InputAdapter Exchange [cs_bill_customer_sk] #2 - WholeStageCodegen (10) + WholeStageCodegen (9) Project [cs_sold_date_sk,cs_bill_customer_sk,cs_quantity,cs_list_price] InputAdapter SortMergeJoin [cs_item_sk,item_sk] @@ -31,48 +31,45 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] ColumnarToRow InputAdapter Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk,cs_quantity,cs_list_price] - WholeStageCodegen (9) + WholeStageCodegen (8) Sort [item_sk] - InputAdapter - Exchange [item_sk] #4 - WholeStageCodegen (8) - Project [item_sk] - Filter [count(1)] - HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] - HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] - Project [d_date,i_item_sk,i_item_desc] - SortMergeJoin [ss_item_sk,i_item_sk] + Project [item_sk] + Filter [count(1)] + HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] + HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] + Project [d_date,i_item_sk,i_item_desc] + SortMergeJoin [ss_item_sk,i_item_sk] + InputAdapter + WholeStageCodegen (5) + Sort [ss_item_sk] InputAdapter - WholeStageCodegen (5) - Sort [ss_item_sk] - InputAdapter - Exchange [ss_item_sk] #5 - WholeStageCodegen (4) - Project [ss_item_sk,d_date] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk,ss_item_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk] - InputAdapter - BroadcastExchange #6 - WholeStageCodegen (3) - Project [d_date_sk,d_date] - Filter [d_year,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_date,d_year] + Exchange [ss_item_sk] #4 + WholeStageCodegen (4) + Project [ss_item_sk,d_date] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_sold_date_sk,ss_item_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk] + InputAdapter + BroadcastExchange #5 + WholeStageCodegen (3) + Project [d_date_sk,d_date] + Filter [d_year,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_date,d_year] + InputAdapter + WholeStageCodegen (7) + Sort [i_item_sk] InputAdapter - WholeStageCodegen (7) - Sort [i_item_sk] - InputAdapter - Exchange [i_item_sk] #7 - WholeStageCodegen (6) - Filter [i_item_sk] - ColumnarToRow - InputAdapter - Scan parquet default.item [i_item_sk,i_item_desc] - WholeStageCodegen (16) + Exchange [i_item_sk] #6 + WholeStageCodegen (6) + Filter [i_item_sk] + ColumnarToRow + InputAdapter + Scan parquet default.item [i_item_sk,i_item_desc] + WholeStageCodegen (15) Sort [c_customer_sk] Project [c_customer_sk] Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] @@ -80,7 +77,7 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] WholeStageCodegen 
(7) HashAggregate [max] [max(csales),tpcds_cmax,max] InputAdapter - Exchange #10 + Exchange #9 WholeStageCodegen (6) HashAggregate [csales] [max,max] HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),csales,sum,isEmpty] @@ -91,7 +88,7 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] WholeStageCodegen (3) Sort [ss_customer_sk] InputAdapter - Exchange [ss_customer_sk] #11 + Exchange [ss_customer_sk] #10 WholeStageCodegen (2) Project [ss_customer_sk,ss_quantity,ss_sales_price] BroadcastHashJoin [ss_sold_date_sk,d_date_sk] @@ -100,7 +97,7 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] InputAdapter Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk,ss_quantity,ss_sales_price] InputAdapter - BroadcastExchange #12 + BroadcastExchange #11 WholeStageCodegen (1) Project [d_date_sk] Filter [d_year,d_date_sk] @@ -111,7 +108,7 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] WholeStageCodegen (5) Sort [c_customer_sk] InputAdapter - Exchange [c_customer_sk] #13 + Exchange [c_customer_sk] #12 WholeStageCodegen (4) Filter [c_customer_sk] ColumnarToRow @@ -122,28 +119,28 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] Project [ss_quantity,ss_sales_price,c_customer_sk] SortMergeJoin [ss_customer_sk,c_customer_sk] InputAdapter - WholeStageCodegen (13) + WholeStageCodegen (12) Sort [ss_customer_sk] InputAdapter - Exchange [ss_customer_sk] #8 - WholeStageCodegen (12) + Exchange [ss_customer_sk] #7 + WholeStageCodegen (11) Filter [ss_customer_sk] ColumnarToRow InputAdapter Scan parquet default.store_sales [ss_customer_sk,ss_quantity,ss_sales_price] InputAdapter - WholeStageCodegen (15) + WholeStageCodegen (14) Sort [c_customer_sk] InputAdapter - Exchange [c_customer_sk] #9 - WholeStageCodegen (14) + Exchange [c_customer_sk] #8 + WholeStageCodegen (13) Filter [c_customer_sk] ColumnarToRow InputAdapter Scan parquet default.customer [c_customer_sk] InputAdapter - BroadcastExchange #14 - WholeStageCodegen (17) + BroadcastExchange #13 + WholeStageCodegen (16) Project [d_date_sk] Filter [d_year,d_moy,d_date_sk] ColumnarToRow @@ -151,73 +148,84 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] Scan parquet default.date_dim [d_date_sk,d_year,d_moy] InputAdapter SortMergeJoin [c_customer_sk,c_customer_sk] - WholeStageCodegen (20) + WholeStageCodegen (19) Sort [c_customer_sk] InputAdapter - Exchange [c_customer_sk] #15 - WholeStageCodegen (19) + Exchange [c_customer_sk] #14 + WholeStageCodegen (18) Filter [c_customer_sk] ColumnarToRow InputAdapter Scan parquet default.customer [c_customer_sk,c_first_name,c_last_name] - WholeStageCodegen (26) + WholeStageCodegen (24) Sort [c_customer_sk] - InputAdapter - Exchange [c_customer_sk] #16 - WholeStageCodegen (25) - Project [c_customer_sk] - Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] - ReusedSubquery [tpcds_cmax] #1 - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), 
DecimalType(18,2), true)),sum,isEmpty] - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - Project [ss_quantity,ss_sales_price,c_customer_sk] - SortMergeJoin [ss_customer_sk,c_customer_sk] + Project [c_customer_sk] + Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] + ReusedSubquery [tpcds_cmax] #1 + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + SortMergeJoin [ss_customer_sk,c_customer_sk] + InputAdapter + WholeStageCodegen (21) + Sort [ss_customer_sk] InputAdapter - WholeStageCodegen (22) - Sort [ss_customer_sk] - InputAdapter - ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #8 + ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #7 + InputAdapter + WholeStageCodegen (23) + Sort [c_customer_sk] InputAdapter - WholeStageCodegen (24) - Sort [c_customer_sk] - InputAdapter - ReusedExchange [c_customer_sk] #9 - WholeStageCodegen (56) + ReusedExchange [c_customer_sk] #8 + WholeStageCodegen (52) HashAggregate [c_last_name,c_first_name,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ws_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ws_list_price as decimal(12,2)))), DecimalType(18,2), true)),sales,sum,isEmpty] InputAdapter - Exchange [c_last_name,c_first_name] #17 - WholeStageCodegen (55) + Exchange [c_last_name,c_first_name] #15 + WholeStageCodegen (51) HashAggregate [c_last_name,c_first_name,ws_quantity,ws_list_price] [sum,isEmpty,sum,isEmpty] Project [ws_quantity,ws_list_price,c_first_name,c_last_name] SortMergeJoin [ws_bill_customer_sk,c_customer_sk] InputAdapter - WholeStageCodegen (46) + WholeStageCodegen (43) Project [ws_bill_customer_sk,ws_quantity,ws_list_price] BroadcastHashJoin [ws_sold_date_sk,d_date_sk] InputAdapter SortMergeJoin [ws_bill_customer_sk,c_customer_sk] - WholeStageCodegen (39) + WholeStageCodegen (36) Sort [ws_bill_customer_sk] InputAdapter - Exchange [ws_bill_customer_sk] #18 - WholeStageCodegen (38) + Exchange [ws_bill_customer_sk] #16 + WholeStageCodegen (35) Project [ws_sold_date_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] InputAdapter SortMergeJoin [ws_item_sk,item_sk] - WholeStageCodegen (30) + WholeStageCodegen (28) Sort [ws_item_sk] InputAdapter - Exchange [ws_item_sk] #19 - WholeStageCodegen (29) + Exchange [ws_item_sk] #17 + WholeStageCodegen (27) Filter [ws_bill_customer_sk,ws_sold_date_sk] ColumnarToRow InputAdapter Scan parquet default.web_sales [ws_sold_date_sk,ws_item_sk,ws_bill_customer_sk,ws_quantity,ws_list_price] - WholeStageCodegen (37) + WholeStageCodegen (34) Sort [item_sk] - InputAdapter - ReusedExchange [item_sk] #4 - WholeStageCodegen (44) + Project [item_sk] + Filter [count(1)] + HashAggregate [substr(i_item_desc, 1, 30),i_item_sk,d_date,count] [count(1),item_sk,count(1),count] + HashAggregate [i_item_desc,i_item_sk,d_date] [count,substr(i_item_desc, 1, 30),count] + Project [d_date,i_item_sk,i_item_desc] 
+ SortMergeJoin [ss_item_sk,i_item_sk] + InputAdapter + WholeStageCodegen (31) + Sort [ss_item_sk] + InputAdapter + ReusedExchange [ss_item_sk,d_date] #4 + InputAdapter + WholeStageCodegen (33) + Sort [i_item_sk] + InputAdapter + ReusedExchange [i_item_sk,i_item_desc] #6 + WholeStageCodegen (41) Sort [c_customer_sk] Project [c_customer_sk] Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] @@ -227,42 +235,39 @@ TakeOrderedAndProject [c_last_name,c_first_name,sales] Project [ss_quantity,ss_sales_price,c_customer_sk] SortMergeJoin [ss_customer_sk,c_customer_sk] InputAdapter - WholeStageCodegen (41) + WholeStageCodegen (38) Sort [ss_customer_sk] InputAdapter - ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #8 + ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #7 InputAdapter - WholeStageCodegen (43) + WholeStageCodegen (40) Sort [c_customer_sk] InputAdapter - ReusedExchange [c_customer_sk] #9 + ReusedExchange [c_customer_sk] #8 InputAdapter - ReusedExchange [d_date_sk] #14 + ReusedExchange [d_date_sk] #13 InputAdapter SortMergeJoin [c_customer_sk,c_customer_sk] - WholeStageCodegen (48) + WholeStageCodegen (45) Sort [c_customer_sk] InputAdapter - ReusedExchange [c_customer_sk,c_first_name,c_last_name] #15 - WholeStageCodegen (54) + ReusedExchange [c_customer_sk,c_first_name,c_last_name] #14 + WholeStageCodegen (50) Sort [c_customer_sk] - InputAdapter - Exchange [c_customer_sk] #20 - WholeStageCodegen (53) - Project [c_customer_sk] - Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] - ReusedSubquery [tpcds_cmax] #1 - HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] - HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] - Project [ss_quantity,ss_sales_price,c_customer_sk] - SortMergeJoin [ss_customer_sk,c_customer_sk] + Project [c_customer_sk] + Filter [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true))] + ReusedSubquery [tpcds_cmax] #1 + HashAggregate [c_customer_sk,sum,isEmpty] [sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum(CheckOverflow((promote_precision(cast(cast(ss_quantity as decimal(10,0)) as decimal(12,2))) * promote_precision(cast(ss_sales_price as decimal(12,2)))), DecimalType(18,2), true)),sum,isEmpty] + HashAggregate [c_customer_sk,ss_quantity,ss_sales_price] [sum,isEmpty,sum,isEmpty] + Project [ss_quantity,ss_sales_price,c_customer_sk] + SortMergeJoin [ss_customer_sk,c_customer_sk] + InputAdapter + WholeStageCodegen (47) + Sort [ss_customer_sk] InputAdapter - WholeStageCodegen (50) - Sort [ss_customer_sk] - InputAdapter - ReusedExchange [ss_customer_sk,ss_quantity,ss_sales_price] #8 + ReusedExchange 
[ss_customer_sk,ss_quantity,ss_sales_price] #7 + InputAdapter + WholeStageCodegen (49) + Sort [c_customer_sk] InputAdapter - WholeStageCodegen (52) - Sort [c_customer_sk] - InputAdapter - ReusedExchange [c_customer_sk] #9 + ReusedExchange [c_customer_sk] #8 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/explain.txt index 92b9c26825e51..7465ddae84e8a 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/explain.txt @@ -1,72 +1,71 @@ == Physical Plan == -CollectLimit (68) -+- * HashAggregate (67) - +- Exchange (66) - +- * HashAggregate (65) - +- * HashAggregate (64) - +- * HashAggregate (63) - +- * HashAggregate (62) - +- * HashAggregate (61) - +- * HashAggregate (60) - +- Exchange (59) - +- * HashAggregate (58) - +- SortMergeJoin LeftSemi (57) - :- SortMergeJoin LeftSemi (39) - : :- * Sort (21) - : : +- Exchange (20) - : : +- * Project (19) - : : +- * SortMergeJoin Inner (18) - : : :- * Sort (12) - : : : +- Exchange (11) - : : : +- * Project (10) - : : : +- * BroadcastHashJoin Inner BuildRight (9) - : : : :- * Filter (3) - : : : : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.store_sales (1) - : : : +- BroadcastExchange (8) - : : : +- * Project (7) - : : : +- * Filter (6) - : : : +- * ColumnarToRow (5) - : : : +- Scan parquet default.date_dim (4) - : : +- * Sort (17) - : : +- Exchange (16) - : : +- * Filter (15) - : : +- * ColumnarToRow (14) - : : +- Scan parquet default.customer (13) - : +- * Sort (38) - : +- Exchange (37) - : +- * HashAggregate (36) - : +- Exchange (35) - : +- * HashAggregate (34) - : +- * Project (33) - : +- * SortMergeJoin Inner (32) - : :- * Sort (29) - : : +- Exchange (28) - : : +- * Project (27) - : : +- * BroadcastHashJoin Inner BuildRight (26) - : : :- * Filter (24) - : : : +- * ColumnarToRow (23) - : : : +- Scan parquet default.catalog_sales (22) - : : +- ReusedExchange (25) - : +- * Sort (31) - : +- ReusedExchange (30) - +- * Sort (56) - +- Exchange (55) - +- * HashAggregate (54) - +- Exchange (53) - +- * HashAggregate (52) - +- * Project (51) - +- * SortMergeJoin Inner (50) - :- * Sort (47) - : +- Exchange (46) - : +- * Project (45) - : +- * BroadcastHashJoin Inner BuildRight (44) - : :- * Filter (42) - : : +- * ColumnarToRow (41) - : : +- Scan parquet default.web_sales (40) - : +- ReusedExchange (43) - +- * Sort (49) - +- ReusedExchange (48) +* HashAggregate (67) ++- Exchange (66) + +- * HashAggregate (65) + +- * HashAggregate (64) + +- * HashAggregate (63) + +- * HashAggregate (62) + +- * HashAggregate (61) + +- * HashAggregate (60) + +- Exchange (59) + +- * HashAggregate (58) + +- SortMergeJoin LeftSemi (57) + :- SortMergeJoin LeftSemi (39) + : :- * Sort (21) + : : +- Exchange (20) + : : +- * Project (19) + : : +- * SortMergeJoin Inner (18) + : : :- * Sort (12) + : : : +- Exchange (11) + : : : +- * Project (10) + : : : +- * BroadcastHashJoin Inner BuildRight (9) + : : : :- * Filter (3) + : : : : +- * ColumnarToRow (2) + : : : : +- Scan parquet default.store_sales (1) + : : : +- BroadcastExchange (8) + : : : +- * Project (7) + : : : +- * Filter (6) + : : : +- * ColumnarToRow (5) + : : : +- Scan parquet default.date_dim (4) + : : +- * Sort (17) + : : +- Exchange (16) + : : +- * Filter (15) + : : +- * ColumnarToRow (14) + : : +- Scan parquet default.customer (13) + : +- * Sort (38) + 
: +- Exchange (37) + : +- * HashAggregate (36) + : +- Exchange (35) + : +- * HashAggregate (34) + : +- * Project (33) + : +- * SortMergeJoin Inner (32) + : :- * Sort (29) + : : +- Exchange (28) + : : +- * Project (27) + : : +- * BroadcastHashJoin Inner BuildRight (26) + : : :- * Filter (24) + : : : +- * ColumnarToRow (23) + : : : +- Scan parquet default.catalog_sales (22) + : : +- ReusedExchange (25) + : +- * Sort (31) + : +- ReusedExchange (30) + +- * Sort (56) + +- Exchange (55) + +- * HashAggregate (54) + +- Exchange (53) + +- * HashAggregate (52) + +- * Project (51) + +- * SortMergeJoin Inner (50) + :- * Sort (47) + : +- Exchange (46) + : +- * Project (45) + : +- * BroadcastHashJoin Inner BuildRight (44) + : :- * Filter (42) + : : +- * ColumnarToRow (41) + : : +- Scan parquet default.web_sales (40) + : +- ReusedExchange (43) + +- * Sort (49) + +- ReusedExchange (48) (1) Scan parquet default.store_sales @@ -387,7 +386,3 @@ Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#37] Results [1]: [count(1)#37 AS count(1)#38] -(68) CollectLimit -Input [1]: [count(1)#38] -Arguments: 100 - diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/simplified.txt index 5bcd7dbb93022..8dd59340cf069 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38.sf100/simplified.txt @@ -1,118 +1,117 @@ -CollectLimit - WholeStageCodegen (26) - HashAggregate [count] [count(1),count(1),count] - InputAdapter - Exchange #1 - WholeStageCodegen (25) - HashAggregate [count,count] +WholeStageCodegen (26) + HashAggregate [count] [count(1),count(1),count] + InputAdapter + Exchange #1 + WholeStageCodegen (25) + HashAggregate [count,count] + HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] - HashAggregate [c_last_name,c_first_name,d_date] - InputAdapter - Exchange [c_last_name,c_first_name,d_date] #2 - WholeStageCodegen (24) - HashAggregate [c_last_name,c_first_name,d_date] - InputAdapter + InputAdapter + Exchange [c_last_name,c_first_name,d_date] #2 + WholeStageCodegen (24) + HashAggregate [c_last_name,c_first_name,d_date] + InputAdapter + SortMergeJoin [c_last_name,c_first_name,d_date,c_last_name,c_first_name,d_date] SortMergeJoin [c_last_name,c_first_name,d_date,c_last_name,c_first_name,d_date] - SortMergeJoin [c_last_name,c_first_name,d_date,c_last_name,c_first_name,d_date] - WholeStageCodegen (7) - Sort [c_last_name,c_first_name,d_date] - InputAdapter - Exchange [c_last_name,c_first_name,d_date] #3 - WholeStageCodegen (6) - Project [d_date,c_first_name,c_last_name] - SortMergeJoin [ss_customer_sk,c_customer_sk] - InputAdapter - WholeStageCodegen (3) - Sort [ss_customer_sk] - InputAdapter - Exchange [ss_customer_sk] #4 - WholeStageCodegen (2) - Project [ss_customer_sk,d_date] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk,ss_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk] - InputAdapter - BroadcastExchange #5 - WholeStageCodegen (1) - Project [d_date_sk,d_date] - Filter [d_month_seq,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim 
[d_date_sk,d_date,d_month_seq] - InputAdapter - WholeStageCodegen (5) - Sort [c_customer_sk] - InputAdapter - Exchange [c_customer_sk] #6 - WholeStageCodegen (4) - Filter [c_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.customer [c_customer_sk,c_first_name,c_last_name] - WholeStageCodegen (15) - Sort [c_last_name,c_first_name,d_date] - InputAdapter - Exchange [c_last_name,c_first_name,d_date] #7 - WholeStageCodegen (14) - HashAggregate [c_last_name,c_first_name,d_date] + WholeStageCodegen (7) + Sort [c_last_name,c_first_name,d_date] + InputAdapter + Exchange [c_last_name,c_first_name,d_date] #3 + WholeStageCodegen (6) + Project [d_date,c_first_name,c_last_name] + SortMergeJoin [ss_customer_sk,c_customer_sk] InputAdapter - Exchange [c_last_name,c_first_name,d_date] #8 - WholeStageCodegen (13) - HashAggregate [c_last_name,c_first_name,d_date] - Project [c_last_name,c_first_name,d_date] - SortMergeJoin [cs_bill_customer_sk,c_customer_sk] - InputAdapter - WholeStageCodegen (10) - Sort [cs_bill_customer_sk] - InputAdapter - Exchange [cs_bill_customer_sk] #9 - WholeStageCodegen (9) - Project [cs_bill_customer_sk,d_date] - BroadcastHashJoin [cs_sold_date_sk,d_date_sk] - Filter [cs_sold_date_sk,cs_bill_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk] + WholeStageCodegen (3) + Sort [ss_customer_sk] + InputAdapter + Exchange [ss_customer_sk] #4 + WholeStageCodegen (2) + Project [ss_customer_sk,d_date] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_sold_date_sk,ss_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk] + InputAdapter + BroadcastExchange #5 + WholeStageCodegen (1) + Project [d_date_sk,d_date] + Filter [d_month_seq,d_date_sk] + ColumnarToRow InputAdapter - ReusedExchange [d_date_sk,d_date] #5 - InputAdapter - WholeStageCodegen (12) - Sort [c_customer_sk] - InputAdapter - ReusedExchange [c_customer_sk,c_first_name,c_last_name] #6 - WholeStageCodegen (23) + Scan parquet default.date_dim [d_date_sk,d_date,d_month_seq] + InputAdapter + WholeStageCodegen (5) + Sort [c_customer_sk] + InputAdapter + Exchange [c_customer_sk] #6 + WholeStageCodegen (4) + Filter [c_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.customer [c_customer_sk,c_first_name,c_last_name] + WholeStageCodegen (15) Sort [c_last_name,c_first_name,d_date] InputAdapter - Exchange [c_last_name,c_first_name,d_date] #10 - WholeStageCodegen (22) + Exchange [c_last_name,c_first_name,d_date] #7 + WholeStageCodegen (14) HashAggregate [c_last_name,c_first_name,d_date] InputAdapter - Exchange [c_last_name,c_first_name,d_date] #11 - WholeStageCodegen (21) + Exchange [c_last_name,c_first_name,d_date] #8 + WholeStageCodegen (13) HashAggregate [c_last_name,c_first_name,d_date] Project [c_last_name,c_first_name,d_date] - SortMergeJoin [ws_bill_customer_sk,c_customer_sk] + SortMergeJoin [cs_bill_customer_sk,c_customer_sk] InputAdapter - WholeStageCodegen (18) - Sort [ws_bill_customer_sk] + WholeStageCodegen (10) + Sort [cs_bill_customer_sk] InputAdapter - Exchange [ws_bill_customer_sk] #12 - WholeStageCodegen (17) - Project [ws_bill_customer_sk,d_date] - BroadcastHashJoin [ws_sold_date_sk,d_date_sk] - Filter [ws_sold_date_sk,ws_bill_customer_sk] + Exchange [cs_bill_customer_sk] #9 + WholeStageCodegen (9) + Project [cs_bill_customer_sk,d_date] + BroadcastHashJoin [cs_sold_date_sk,d_date_sk] + Filter [cs_sold_date_sk,cs_bill_customer_sk] ColumnarToRow InputAdapter - 
Scan parquet default.web_sales [ws_sold_date_sk,ws_bill_customer_sk] + Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk] InputAdapter ReusedExchange [d_date_sk,d_date] #5 InputAdapter - WholeStageCodegen (20) + WholeStageCodegen (12) Sort [c_customer_sk] InputAdapter ReusedExchange [c_customer_sk,c_first_name,c_last_name] #6 + WholeStageCodegen (23) + Sort [c_last_name,c_first_name,d_date] + InputAdapter + Exchange [c_last_name,c_first_name,d_date] #10 + WholeStageCodegen (22) + HashAggregate [c_last_name,c_first_name,d_date] + InputAdapter + Exchange [c_last_name,c_first_name,d_date] #11 + WholeStageCodegen (21) + HashAggregate [c_last_name,c_first_name,d_date] + Project [c_last_name,c_first_name,d_date] + SortMergeJoin [ws_bill_customer_sk,c_customer_sk] + InputAdapter + WholeStageCodegen (18) + Sort [ws_bill_customer_sk] + InputAdapter + Exchange [ws_bill_customer_sk] #12 + WholeStageCodegen (17) + Project [ws_bill_customer_sk,d_date] + BroadcastHashJoin [ws_sold_date_sk,d_date_sk] + Filter [ws_sold_date_sk,ws_bill_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.web_sales [ws_sold_date_sk,ws_bill_customer_sk] + InputAdapter + ReusedExchange [d_date_sk,d_date] #5 + InputAdapter + WholeStageCodegen (20) + Sort [c_customer_sk] + InputAdapter + ReusedExchange [c_customer_sk,c_first_name,c_last_name] #6 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/explain.txt index 09ab60c7cf651..74454cf32afd0 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/explain.txt @@ -1,59 +1,58 @@ == Physical Plan == -CollectLimit (55) -+- * HashAggregate (54) - +- Exchange (53) - +- * HashAggregate (52) - +- * HashAggregate (51) - +- * HashAggregate (50) - +- * HashAggregate (49) - +- * HashAggregate (48) - +- * HashAggregate (47) - +- Exchange (46) - +- * HashAggregate (45) - +- * BroadcastHashJoin LeftSemi BuildRight (44) - :- * BroadcastHashJoin LeftSemi BuildRight (30) - : :- * Project (16) - : : +- * BroadcastHashJoin Inner BuildRight (15) - : : :- * Project (10) - : : : +- * BroadcastHashJoin Inner BuildRight (9) - : : : :- * Filter (3) - : : : : +- * ColumnarToRow (2) - : : : : +- Scan parquet default.store_sales (1) - : : : +- BroadcastExchange (8) - : : : +- * Project (7) - : : : +- * Filter (6) - : : : +- * ColumnarToRow (5) - : : : +- Scan parquet default.date_dim (4) - : : +- BroadcastExchange (14) - : : +- * Filter (13) - : : +- * ColumnarToRow (12) - : : +- Scan parquet default.customer (11) - : +- BroadcastExchange (29) - : +- * HashAggregate (28) - : +- Exchange (27) - : +- * HashAggregate (26) - : +- * Project (25) - : +- * BroadcastHashJoin Inner BuildRight (24) - : :- * Project (22) - : : +- * BroadcastHashJoin Inner BuildRight (21) - : : :- * Filter (19) - : : : +- * ColumnarToRow (18) - : : : +- Scan parquet default.catalog_sales (17) - : : +- ReusedExchange (20) - : +- ReusedExchange (23) - +- BroadcastExchange (43) - +- * HashAggregate (42) - +- Exchange (41) - +- * HashAggregate (40) - +- * Project (39) - +- * BroadcastHashJoin Inner BuildRight (38) - :- * Project (36) - : +- * BroadcastHashJoin Inner BuildRight (35) - : :- * Filter (33) - : : +- * ColumnarToRow (32) - : : +- Scan parquet default.web_sales (31) - : +- ReusedExchange (34) - +- ReusedExchange (37) +* HashAggregate (54) ++- Exchange (53) 
+ +- * HashAggregate (52) + +- * HashAggregate (51) + +- * HashAggregate (50) + +- * HashAggregate (49) + +- * HashAggregate (48) + +- * HashAggregate (47) + +- Exchange (46) + +- * HashAggregate (45) + +- * BroadcastHashJoin LeftSemi BuildRight (44) + :- * BroadcastHashJoin LeftSemi BuildRight (30) + : :- * Project (16) + : : +- * BroadcastHashJoin Inner BuildRight (15) + : : :- * Project (10) + : : : +- * BroadcastHashJoin Inner BuildRight (9) + : : : :- * Filter (3) + : : : : +- * ColumnarToRow (2) + : : : : +- Scan parquet default.store_sales (1) + : : : +- BroadcastExchange (8) + : : : +- * Project (7) + : : : +- * Filter (6) + : : : +- * ColumnarToRow (5) + : : : +- Scan parquet default.date_dim (4) + : : +- BroadcastExchange (14) + : : +- * Filter (13) + : : +- * ColumnarToRow (12) + : : +- Scan parquet default.customer (11) + : +- BroadcastExchange (29) + : +- * HashAggregate (28) + : +- Exchange (27) + : +- * HashAggregate (26) + : +- * Project (25) + : +- * BroadcastHashJoin Inner BuildRight (24) + : :- * Project (22) + : : +- * BroadcastHashJoin Inner BuildRight (21) + : : :- * Filter (19) + : : : +- * ColumnarToRow (18) + : : : +- Scan parquet default.catalog_sales (17) + : : +- ReusedExchange (20) + : +- ReusedExchange (23) + +- BroadcastExchange (43) + +- * HashAggregate (42) + +- Exchange (41) + +- * HashAggregate (40) + +- * Project (39) + +- * BroadcastHashJoin Inner BuildRight (38) + :- * Project (36) + : +- * BroadcastHashJoin Inner BuildRight (35) + : :- * Filter (33) + : : +- * ColumnarToRow (32) + : : +- Scan parquet default.web_sales (31) + : +- ReusedExchange (34) + +- ReusedExchange (37) (1) Scan parquet default.store_sales @@ -322,7 +321,3 @@ Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#33] Results [1]: [count(1)#33 AS count(1)#34] -(55) CollectLimit -Input [1]: [count(1)#34] -Arguments: 100 - diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/simplified.txt index 10a2166ce761d..a5b57a4ac9450 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q38/simplified.txt @@ -1,81 +1,80 @@ -CollectLimit - WholeStageCodegen (13) - HashAggregate [count] [count(1),count(1),count] - InputAdapter - Exchange #1 - WholeStageCodegen (12) - HashAggregate [count,count] +WholeStageCodegen (13) + HashAggregate [count] [count(1),count(1),count] + InputAdapter + Exchange #1 + WholeStageCodegen (12) + HashAggregate [count,count] + HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] HashAggregate [c_last_name,c_first_name,d_date] - HashAggregate [c_last_name,c_first_name,d_date] - InputAdapter - Exchange [c_last_name,c_first_name,d_date] #2 - WholeStageCodegen (11) - HashAggregate [c_last_name,c_first_name,d_date] + InputAdapter + Exchange [c_last_name,c_first_name,d_date] #2 + WholeStageCodegen (11) + HashAggregate [c_last_name,c_first_name,d_date] + BroadcastHashJoin [c_last_name,c_first_name,d_date,c_last_name,c_first_name,d_date] BroadcastHashJoin [c_last_name,c_first_name,d_date,c_last_name,c_first_name,d_date] - BroadcastHashJoin [c_last_name,c_first_name,d_date,c_last_name,c_first_name,d_date] - Project [d_date,c_first_name,c_last_name] - BroadcastHashJoin 
[ss_customer_sk,c_customer_sk] - Project [ss_customer_sk,d_date] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk,ss_customer_sk] + Project [d_date,c_first_name,c_last_name] + BroadcastHashJoin [ss_customer_sk,c_customer_sk] + Project [ss_customer_sk,d_date] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_sold_date_sk,ss_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk] + InputAdapter + BroadcastExchange #3 + WholeStageCodegen (1) + Project [d_date_sk,d_date] + Filter [d_month_seq,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_date,d_month_seq] + InputAdapter + BroadcastExchange #4 + WholeStageCodegen (2) + Filter [c_customer_sk] ColumnarToRow InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_customer_sk] - InputAdapter - BroadcastExchange #3 - WholeStageCodegen (1) - Project [d_date_sk,d_date] - Filter [d_month_seq,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_date,d_month_seq] - InputAdapter - BroadcastExchange #4 - WholeStageCodegen (2) - Filter [c_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.customer [c_customer_sk,c_first_name,c_last_name] - InputAdapter - BroadcastExchange #5 - WholeStageCodegen (6) - HashAggregate [c_last_name,c_first_name,d_date] - InputAdapter - Exchange [c_last_name,c_first_name,d_date] #6 - WholeStageCodegen (5) - HashAggregate [c_last_name,c_first_name,d_date] - Project [c_last_name,c_first_name,d_date] - BroadcastHashJoin [cs_bill_customer_sk,c_customer_sk] - Project [cs_bill_customer_sk,d_date] - BroadcastHashJoin [cs_sold_date_sk,d_date_sk] - Filter [cs_sold_date_sk,cs_bill_customer_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk] - InputAdapter - ReusedExchange [d_date_sk,d_date] #3 - InputAdapter - ReusedExchange [c_customer_sk,c_first_name,c_last_name] #4 + Scan parquet default.customer [c_customer_sk,c_first_name,c_last_name] InputAdapter - BroadcastExchange #7 - WholeStageCodegen (10) + BroadcastExchange #5 + WholeStageCodegen (6) HashAggregate [c_last_name,c_first_name,d_date] InputAdapter - Exchange [c_last_name,c_first_name,d_date] #8 - WholeStageCodegen (9) + Exchange [c_last_name,c_first_name,d_date] #6 + WholeStageCodegen (5) HashAggregate [c_last_name,c_first_name,d_date] Project [c_last_name,c_first_name,d_date] - BroadcastHashJoin [ws_bill_customer_sk,c_customer_sk] - Project [ws_bill_customer_sk,d_date] - BroadcastHashJoin [ws_sold_date_sk,d_date_sk] - Filter [ws_sold_date_sk,ws_bill_customer_sk] + BroadcastHashJoin [cs_bill_customer_sk,c_customer_sk] + Project [cs_bill_customer_sk,d_date] + BroadcastHashJoin [cs_sold_date_sk,d_date_sk] + Filter [cs_sold_date_sk,cs_bill_customer_sk] ColumnarToRow InputAdapter - Scan parquet default.web_sales [ws_sold_date_sk,ws_bill_customer_sk] + Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk] InputAdapter ReusedExchange [d_date_sk,d_date] #3 InputAdapter ReusedExchange [c_customer_sk,c_first_name,c_last_name] #4 + InputAdapter + BroadcastExchange #7 + WholeStageCodegen (10) + HashAggregate [c_last_name,c_first_name,d_date] + InputAdapter + Exchange [c_last_name,c_first_name,d_date] #8 + WholeStageCodegen (9) + HashAggregate [c_last_name,c_first_name,d_date] + Project [c_last_name,c_first_name,d_date] + BroadcastHashJoin [ws_bill_customer_sk,c_customer_sk] + Project [ws_bill_customer_sk,d_date] + 
BroadcastHashJoin [ws_sold_date_sk,d_date_sk] + Filter [ws_sold_date_sk,ws_bill_customer_sk] + ColumnarToRow + InputAdapter + Scan parquet default.web_sales [ws_sold_date_sk,ws_bill_customer_sk] + InputAdapter + ReusedExchange [d_date_sk,d_date] #3 + InputAdapter + ReusedExchange [c_customer_sk,c_first_name,c_last_name] #4 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/explain.txt index dc4665185b014..99459bfe9a049 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (34) +* Sort (34) +- * HashAggregate (33) +- Exchange (32) +- * HashAggregate (31) @@ -190,7 +190,7 @@ Functions [1]: [sum(UnscaledValue(ws_ext_discount_amt#6))] Aggregate Attributes [1]: [sum(UnscaledValue(ws_ext_discount_amt#6))#22] Results [1]: [MakeDecimal(sum(UnscaledValue(ws_ext_discount_amt#6))#22,17,2) AS Excess Discount Amount #23] -(34) TakeOrderedAndProject +(34) Sort [codegen id : 7] Input [1]: [Excess Discount Amount #23] -Arguments: 100, [Excess Discount Amount #23 ASC NULLS FIRST], [Excess Discount Amount #23] +Arguments: [Excess Discount Amount #23 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/simplified.txt index 7fd1cd3637a09..0721155286d17 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92.sf100/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [Excess Discount Amount ] - WholeStageCodegen (7) +WholeStageCodegen (7) + Sort [Excess Discount Amount ] HashAggregate [sum] [sum(UnscaledValue(ws_ext_discount_amt)),Excess Discount Amount ,sum] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/explain.txt index b17a48db8baac..8a441392f4165 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (34) +* Sort (34) +- * HashAggregate (33) +- Exchange (32) +- * HashAggregate (31) @@ -190,7 +190,7 @@ Functions [1]: [sum(UnscaledValue(ws_ext_discount_amt#3))] Aggregate Attributes [1]: [sum(UnscaledValue(ws_ext_discount_amt#3))#22] Results [1]: [MakeDecimal(sum(UnscaledValue(ws_ext_discount_amt#3))#22,17,2) AS Excess Discount Amount #23] -(34) TakeOrderedAndProject +(34) Sort [codegen id : 7] Input [1]: [Excess Discount Amount #23] -Arguments: 100, [Excess Discount Amount #23 ASC NULLS FIRST], [Excess Discount Amount #23] +Arguments: [Excess Discount Amount #23 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/simplified.txt index 652b2e36cf781..1f24a7c964f20 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/simplified.txt +++ 
b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q92/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [Excess Discount Amount ] - WholeStageCodegen (7) +WholeStageCodegen (7) + Sort [Excess Discount Amount ] HashAggregate [sum] [sum(UnscaledValue(ws_ext_discount_amt)),Excess Discount Amount ,sum] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/explain.txt index 7720d9dee4170..43390c5048a6d 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (47) +* Sort (47) +- * HashAggregate (46) +- Exchange (45) +- * HashAggregate (44) @@ -259,7 +259,7 @@ Functions [3]: [sum(UnscaledValue(ws_ext_ship_cost#6)), sum(UnscaledValue(ws_net Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#6))#24, sum(UnscaledValue(ws_net_profit#7))#25, count(ws_order_number#5)#29] Results [3]: [count(ws_order_number#5)#29 AS order count #32, MakeDecimal(sum(UnscaledValue(ws_ext_ship_cost#6))#24,17,2) AS total shipping cost #33, MakeDecimal(sum(UnscaledValue(ws_net_profit#7))#25,17,2) AS total net profit #34] -(47) TakeOrderedAndProject +(47) Sort [codegen id : 14] Input [3]: [order count #32, total shipping cost #33, total net profit #34] -Arguments: 100, [order count #32 ASC NULLS FIRST], [order count #32, total shipping cost #33, total net profit #34] +Arguments: [order count #32 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/simplified.txt index 128a8179ac10b..7b3d461b9e80f 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] - WholeStageCodegen (14) +WholeStageCodegen (14) + Sort [order count ] HashAggregate [sum,sum,count] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),count(ws_order_number),order count ,total shipping cost ,total net profit ,sum,sum,count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/explain.txt index a94e74f66b201..2abbe4f9b8390 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (41) +* Sort (41) +- * HashAggregate (40) +- Exchange (39) +- * HashAggregate (38) @@ -229,7 +229,7 @@ Functions [3]: [sum(UnscaledValue(ws_ext_ship_cost#6)), sum(UnscaledValue(ws_net Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#6))#22, sum(UnscaledValue(ws_net_profit#7))#23, count(ws_order_number#5)#27] Results [3]: [count(ws_order_number#5)#27 AS order count #30, MakeDecimal(sum(UnscaledValue(ws_ext_ship_cost#6))#22,17,2) AS total shipping cost #31, MakeDecimal(sum(UnscaledValue(ws_net_profit#7))#23,17,2) AS total net profit #32] 
-(41) TakeOrderedAndProject +(41) Sort [codegen id : 8] Input [3]: [order count #30, total shipping cost #31, total net profit #32] -Arguments: 100, [order count #30 ASC NULLS FIRST], [order count #30, total shipping cost #31, total net profit #32] +Arguments: [order count #30 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/simplified.txt index 9d30b998fe174..5e7d7db5c0a9e 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] - WholeStageCodegen (8) +WholeStageCodegen (8) + Sort [order count ] HashAggregate [sum,sum,count] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),count(ws_order_number),order count ,total shipping cost ,total net profit ,sum,sum,count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/explain.txt index eae118d46245d..547792f3d7ae4 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/explain.txt @@ -1,67 +1,65 @@ == Physical Plan == -TakeOrderedAndProject (63) -+- * HashAggregate (62) - +- Exchange (61) - +- * HashAggregate (60) - +- * HashAggregate (59) - +- Exchange (58) - +- * HashAggregate (57) - +- * Project (56) - +- * BroadcastHashJoin Inner BuildRight (55) - :- * Project (49) - : +- * BroadcastHashJoin Inner BuildRight (48) - : :- * Project (42) - : : +- * BroadcastHashJoin Inner BuildRight (41) - : : :- SortMergeJoin LeftSemi (35) - : : : :- * Sort (19) - : : : : +- Exchange (18) - : : : : +- SortMergeJoin LeftSemi (17) +* Sort (61) ++- * HashAggregate (60) + +- Exchange (59) + +- * HashAggregate (58) + +- * HashAggregate (57) + +- Exchange (56) + +- * HashAggregate (55) + +- * Project (54) + +- * BroadcastHashJoin Inner BuildRight (53) + :- * Project (47) + : +- * BroadcastHashJoin Inner BuildRight (46) + : :- * Project (40) + : : +- * BroadcastHashJoin Inner BuildRight (39) + : : :- SortMergeJoin LeftSemi (33) + : : : :- * Sort (17) + : : : : +- Exchange (16) + : : : : +- SortMergeJoin LeftSemi (15) : : : : :- * Sort (5) : : : : : +- Exchange (4) : : : : : +- * Filter (3) : : : : : +- * ColumnarToRow (2) : : : : : +- Scan parquet default.web_sales (1) - : : : : +- * Sort (16) - : : : : +- Exchange (15) - : : : : +- * Project (14) - : : : : +- * SortMergeJoin Inner (13) - : : : : :- * Sort (10) - : : : : : +- Exchange (9) - : : : : : +- * Filter (8) - : : : : : +- * ColumnarToRow (7) - : : : : : +- Scan parquet default.web_sales (6) - : : : : +- * Sort (12) - : : : : +- ReusedExchange (11) - : : : +- * Project (34) - : : : +- * SortMergeJoin Inner (33) - : : : :- * Sort (27) - : : : : +- Exchange (26) - : : : : +- * Project (25) - : : : : +- * SortMergeJoin Inner (24) - : : : : :- * Sort (21) - : : : : : +- ReusedExchange (20) - : : : : +- * Sort (23) - : : : : +- ReusedExchange (22) - : : : +- * Sort (32) - : : : +- Exchange (31) - : : : +- * Filter (30) - : : : +- * ColumnarToRow (29) - : : : +- Scan parquet default.web_returns (28) - : : +- BroadcastExchange 
(40) - : : +- * Project (39) - : : +- * Filter (38) - : : +- * ColumnarToRow (37) - : : +- Scan parquet default.customer_address (36) - : +- BroadcastExchange (47) - : +- * Project (46) - : +- * Filter (45) - : +- * ColumnarToRow (44) - : +- Scan parquet default.web_site (43) - +- BroadcastExchange (54) - +- * Project (53) - +- * Filter (52) - +- * ColumnarToRow (51) - +- Scan parquet default.date_dim (50) + : : : : +- * Project (14) + : : : : +- * SortMergeJoin Inner (13) + : : : : :- * Sort (10) + : : : : : +- Exchange (9) + : : : : : +- * Filter (8) + : : : : : +- * ColumnarToRow (7) + : : : : : +- Scan parquet default.web_sales (6) + : : : : +- * Sort (12) + : : : : +- ReusedExchange (11) + : : : +- * Project (32) + : : : +- * SortMergeJoin Inner (31) + : : : :- * Sort (25) + : : : : +- Exchange (24) + : : : : +- * Project (23) + : : : : +- * SortMergeJoin Inner (22) + : : : : :- * Sort (19) + : : : : : +- ReusedExchange (18) + : : : : +- * Sort (21) + : : : : +- ReusedExchange (20) + : : : +- * Sort (30) + : : : +- Exchange (29) + : : : +- * Filter (28) + : : : +- * ColumnarToRow (27) + : : : +- Scan parquet default.web_returns (26) + : : +- BroadcastExchange (38) + : : +- * Project (37) + : : +- * Filter (36) + : : +- * ColumnarToRow (35) + : : +- Scan parquet default.customer_address (34) + : +- BroadcastExchange (45) + : +- * Project (44) + : +- * Filter (43) + : +- * ColumnarToRow (42) + : +- Scan parquet default.web_site (41) + +- BroadcastExchange (52) + +- * Project (51) + +- * Filter (50) + +- * ColumnarToRow (49) + +- Scan parquet default.date_dim (48) (1) Scan parquet default.web_sales @@ -124,224 +122,216 @@ Join condition: NOT (ws_warehouse_sk#8 = ws_warehouse_sk#10) Output [1]: [ws_order_number#4 AS ws_order_number#4#12] Input [4]: [ws_warehouse_sk#8, ws_order_number#4, ws_warehouse_sk#10, ws_order_number#11] -(15) Exchange -Input [1]: [ws_order_number#4#12] -Arguments: hashpartitioning(ws_order_number#4#12, 5), true, [id=#13] - -(16) Sort [codegen id : 8] -Input [1]: [ws_order_number#4#12] -Arguments: [ws_order_number#4#12 ASC NULLS FIRST], false, 0 - -(17) SortMergeJoin +(15) SortMergeJoin Left keys [1]: [ws_order_number#4] Right keys [1]: [ws_order_number#4#12] Join condition: None -(18) Exchange +(16) Exchange Input [6]: [ws_ship_date_sk#1, ws_ship_addr_sk#2, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6] -Arguments: hashpartitioning(cast(ws_order_number#4 as bigint), 5), true, [id=#14] +Arguments: hashpartitioning(cast(ws_order_number#4 as bigint), 5), true, [id=#13] -(19) Sort [codegen id : 9] +(17) Sort [codegen id : 8] Input [6]: [ws_ship_date_sk#1, ws_ship_addr_sk#2, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6] Arguments: [cast(ws_order_number#4 as bigint) ASC NULLS FIRST], false, 0 -(20) ReusedExchange [Reuses operator id: 9] +(18) ReusedExchange [Reuses operator id: 9] Output [2]: [ws_warehouse_sk#8, ws_order_number#4] -(21) Sort [codegen id : 11] +(19) Sort [codegen id : 10] Input [2]: [ws_warehouse_sk#8, ws_order_number#4] Arguments: [ws_order_number#4 ASC NULLS FIRST], false, 0 -(22) ReusedExchange [Reuses operator id: 9] -Output [2]: [ws_warehouse_sk#15, ws_order_number#16] +(20) ReusedExchange [Reuses operator id: 9] +Output [2]: [ws_warehouse_sk#14, ws_order_number#15] -(23) Sort [codegen id : 13] -Input [2]: [ws_warehouse_sk#15, ws_order_number#16] -Arguments: [ws_order_number#16 ASC NULLS FIRST], false, 0 +(21) Sort [codegen id : 12] +Input [2]: [ws_warehouse_sk#14, ws_order_number#15] +Arguments: 
[ws_order_number#15 ASC NULLS FIRST], false, 0 -(24) SortMergeJoin [codegen id : 14] +(22) SortMergeJoin [codegen id : 13] Left keys [1]: [ws_order_number#4] -Right keys [1]: [ws_order_number#16] -Join condition: NOT (ws_warehouse_sk#8 = ws_warehouse_sk#15) +Right keys [1]: [ws_order_number#15] +Join condition: NOT (ws_warehouse_sk#8 = ws_warehouse_sk#14) -(25) Project [codegen id : 14] +(23) Project [codegen id : 13] Output [1]: [ws_order_number#4] -Input [4]: [ws_warehouse_sk#8, ws_order_number#4, ws_warehouse_sk#15, ws_order_number#16] +Input [4]: [ws_warehouse_sk#8, ws_order_number#4, ws_warehouse_sk#14, ws_order_number#15] -(26) Exchange +(24) Exchange Input [1]: [ws_order_number#4] -Arguments: hashpartitioning(cast(ws_order_number#4 as bigint), 5), true, [id=#17] +Arguments: hashpartitioning(cast(ws_order_number#4 as bigint), 5), true, [id=#16] -(27) Sort [codegen id : 15] +(25) Sort [codegen id : 14] Input [1]: [ws_order_number#4] Arguments: [cast(ws_order_number#4 as bigint) ASC NULLS FIRST], false, 0 -(28) Scan parquet default.web_returns -Output [1]: [wr_order_number#18] +(26) Scan parquet default.web_returns +Output [1]: [wr_order_number#17] Batched: true Location [not included in comparison]/{warehouse_dir}/web_returns] PushedFilters: [IsNotNull(wr_order_number)] ReadSchema: struct -(29) ColumnarToRow [codegen id : 16] -Input [1]: [wr_order_number#18] +(27) ColumnarToRow [codegen id : 15] +Input [1]: [wr_order_number#17] -(30) Filter [codegen id : 16] -Input [1]: [wr_order_number#18] -Condition : isnotnull(wr_order_number#18) +(28) Filter [codegen id : 15] +Input [1]: [wr_order_number#17] +Condition : isnotnull(wr_order_number#17) -(31) Exchange -Input [1]: [wr_order_number#18] -Arguments: hashpartitioning(wr_order_number#18, 5), true, [id=#19] +(29) Exchange +Input [1]: [wr_order_number#17] +Arguments: hashpartitioning(wr_order_number#17, 5), true, [id=#18] -(32) Sort [codegen id : 17] -Input [1]: [wr_order_number#18] -Arguments: [wr_order_number#18 ASC NULLS FIRST], false, 0 +(30) Sort [codegen id : 16] +Input [1]: [wr_order_number#17] +Arguments: [wr_order_number#17 ASC NULLS FIRST], false, 0 -(33) SortMergeJoin [codegen id : 18] +(31) SortMergeJoin [codegen id : 17] Left keys [1]: [cast(ws_order_number#4 as bigint)] -Right keys [1]: [wr_order_number#18] +Right keys [1]: [wr_order_number#17] Join condition: None -(34) Project [codegen id : 18] -Output [1]: [wr_order_number#18] -Input [2]: [ws_order_number#4, wr_order_number#18] +(32) Project [codegen id : 17] +Output [1]: [wr_order_number#17] +Input [2]: [ws_order_number#4, wr_order_number#17] -(35) SortMergeJoin +(33) SortMergeJoin Left keys [1]: [cast(ws_order_number#4 as bigint)] -Right keys [1]: [wr_order_number#18] +Right keys [1]: [wr_order_number#17] Join condition: None -(36) Scan parquet default.customer_address -Output [2]: [ca_address_sk#20, ca_state#21] +(34) Scan parquet default.customer_address +Output [2]: [ca_address_sk#19, ca_state#20] Batched: true Location [not included in comparison]/{warehouse_dir}/customer_address] PushedFilters: [IsNotNull(ca_state), EqualTo(ca_state,IL), IsNotNull(ca_address_sk)] ReadSchema: struct -(37) ColumnarToRow [codegen id : 19] -Input [2]: [ca_address_sk#20, ca_state#21] +(35) ColumnarToRow [codegen id : 18] +Input [2]: [ca_address_sk#19, ca_state#20] -(38) Filter [codegen id : 19] -Input [2]: [ca_address_sk#20, ca_state#21] -Condition : ((isnotnull(ca_state#21) AND (ca_state#21 = IL)) AND isnotnull(ca_address_sk#20)) +(36) Filter [codegen id : 18] +Input [2]: 
[ca_address_sk#19, ca_state#20] +Condition : ((isnotnull(ca_state#20) AND (ca_state#20 = IL)) AND isnotnull(ca_address_sk#19)) -(39) Project [codegen id : 19] -Output [1]: [ca_address_sk#20] -Input [2]: [ca_address_sk#20, ca_state#21] +(37) Project [codegen id : 18] +Output [1]: [ca_address_sk#19] +Input [2]: [ca_address_sk#19, ca_state#20] -(40) BroadcastExchange -Input [1]: [ca_address_sk#20] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#22] +(38) BroadcastExchange +Input [1]: [ca_address_sk#19] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#21] -(41) BroadcastHashJoin [codegen id : 22] +(39) BroadcastHashJoin [codegen id : 21] Left keys [1]: [ws_ship_addr_sk#2] -Right keys [1]: [ca_address_sk#20] +Right keys [1]: [ca_address_sk#19] Join condition: None -(42) Project [codegen id : 22] +(40) Project [codegen id : 21] Output [5]: [ws_ship_date_sk#1, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6] -Input [7]: [ws_ship_date_sk#1, ws_ship_addr_sk#2, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6, ca_address_sk#20] +Input [7]: [ws_ship_date_sk#1, ws_ship_addr_sk#2, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6, ca_address_sk#19] -(43) Scan parquet default.web_site -Output [2]: [web_site_sk#23, web_company_name#24] +(41) Scan parquet default.web_site +Output [2]: [web_site_sk#22, web_company_name#23] Batched: true Location [not included in comparison]/{warehouse_dir}/web_site] PushedFilters: [IsNotNull(web_company_name), EqualTo(web_company_name,pri), IsNotNull(web_site_sk)] ReadSchema: struct -(44) ColumnarToRow [codegen id : 20] -Input [2]: [web_site_sk#23, web_company_name#24] +(42) ColumnarToRow [codegen id : 19] +Input [2]: [web_site_sk#22, web_company_name#23] -(45) Filter [codegen id : 20] -Input [2]: [web_site_sk#23, web_company_name#24] -Condition : ((isnotnull(web_company_name#24) AND (web_company_name#24 = pri)) AND isnotnull(web_site_sk#23)) +(43) Filter [codegen id : 19] +Input [2]: [web_site_sk#22, web_company_name#23] +Condition : ((isnotnull(web_company_name#23) AND (web_company_name#23 = pri)) AND isnotnull(web_site_sk#22)) -(46) Project [codegen id : 20] -Output [1]: [web_site_sk#23] -Input [2]: [web_site_sk#23, web_company_name#24] +(44) Project [codegen id : 19] +Output [1]: [web_site_sk#22] +Input [2]: [web_site_sk#22, web_company_name#23] -(47) BroadcastExchange -Input [1]: [web_site_sk#23] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#25] +(45) BroadcastExchange +Input [1]: [web_site_sk#22] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#24] -(48) BroadcastHashJoin [codegen id : 22] +(46) BroadcastHashJoin [codegen id : 21] Left keys [1]: [ws_web_site_sk#3] -Right keys [1]: [web_site_sk#23] +Right keys [1]: [web_site_sk#22] Join condition: None -(49) Project [codegen id : 22] +(47) Project [codegen id : 21] Output [4]: [ws_ship_date_sk#1, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6] -Input [6]: [ws_ship_date_sk#1, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6, web_site_sk#23] +Input [6]: [ws_ship_date_sk#1, ws_web_site_sk#3, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6, web_site_sk#22] -(50) Scan parquet default.date_dim -Output [2]: [d_date_sk#26, d_date#27] +(48) Scan parquet default.date_dim +Output [2]: [d_date_sk#25, d_date#26] Batched: true Location 
[not included in comparison]/{warehouse_dir}/date_dim] PushedFilters: [IsNotNull(d_date), GreaterThanOrEqual(d_date,1999-02-01), LessThanOrEqual(d_date,1999-04-02), IsNotNull(d_date_sk)] ReadSchema: struct -(51) ColumnarToRow [codegen id : 21] -Input [2]: [d_date_sk#26, d_date#27] +(49) ColumnarToRow [codegen id : 20] +Input [2]: [d_date_sk#25, d_date#26] -(52) Filter [codegen id : 21] -Input [2]: [d_date_sk#26, d_date#27] -Condition : (((isnotnull(d_date#27) AND (d_date#27 >= 10623)) AND (d_date#27 <= 10683)) AND isnotnull(d_date_sk#26)) +(50) Filter [codegen id : 20] +Input [2]: [d_date_sk#25, d_date#26] +Condition : (((isnotnull(d_date#26) AND (d_date#26 >= 10623)) AND (d_date#26 <= 10683)) AND isnotnull(d_date_sk#25)) -(53) Project [codegen id : 21] -Output [1]: [d_date_sk#26] -Input [2]: [d_date_sk#26, d_date#27] +(51) Project [codegen id : 20] +Output [1]: [d_date_sk#25] +Input [2]: [d_date_sk#25, d_date#26] -(54) BroadcastExchange -Input [1]: [d_date_sk#26] -Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#28] +(52) BroadcastExchange +Input [1]: [d_date_sk#25] +Arguments: HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint)),false), [id=#27] -(55) BroadcastHashJoin [codegen id : 22] +(53) BroadcastHashJoin [codegen id : 21] Left keys [1]: [ws_ship_date_sk#1] -Right keys [1]: [d_date_sk#26] +Right keys [1]: [d_date_sk#25] Join condition: None -(56) Project [codegen id : 22] +(54) Project [codegen id : 21] Output [3]: [ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6] -Input [5]: [ws_ship_date_sk#1, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6, d_date_sk#26] +Input [5]: [ws_ship_date_sk#1, ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6, d_date_sk#25] -(57) HashAggregate [codegen id : 22] +(55) HashAggregate [codegen id : 21] Input [3]: [ws_order_number#4, ws_ext_ship_cost#5, ws_net_profit#6] Keys [1]: [ws_order_number#4] Functions [2]: [partial_sum(UnscaledValue(ws_ext_ship_cost#5)), partial_sum(UnscaledValue(ws_net_profit#6))] -Aggregate Attributes [2]: [sum(UnscaledValue(ws_ext_ship_cost#5))#29, sum(UnscaledValue(ws_net_profit#6))#30] -Results [3]: [ws_order_number#4, sum#31, sum#32] +Aggregate Attributes [2]: [sum(UnscaledValue(ws_ext_ship_cost#5))#28, sum(UnscaledValue(ws_net_profit#6))#29] +Results [3]: [ws_order_number#4, sum#30, sum#31] -(58) Exchange -Input [3]: [ws_order_number#4, sum#31, sum#32] -Arguments: hashpartitioning(ws_order_number#4, 5), true, [id=#33] +(56) Exchange +Input [3]: [ws_order_number#4, sum#30, sum#31] +Arguments: hashpartitioning(ws_order_number#4, 5), true, [id=#32] -(59) HashAggregate [codegen id : 23] -Input [3]: [ws_order_number#4, sum#31, sum#32] +(57) HashAggregate [codegen id : 22] +Input [3]: [ws_order_number#4, sum#30, sum#31] Keys [1]: [ws_order_number#4] Functions [2]: [merge_sum(UnscaledValue(ws_ext_ship_cost#5)), merge_sum(UnscaledValue(ws_net_profit#6))] -Aggregate Attributes [2]: [sum(UnscaledValue(ws_ext_ship_cost#5))#29, sum(UnscaledValue(ws_net_profit#6))#30] -Results [3]: [ws_order_number#4, sum#31, sum#32] +Aggregate Attributes [2]: [sum(UnscaledValue(ws_ext_ship_cost#5))#28, sum(UnscaledValue(ws_net_profit#6))#29] +Results [3]: [ws_order_number#4, sum#30, sum#31] -(60) HashAggregate [codegen id : 23] -Input [3]: [ws_order_number#4, sum#31, sum#32] +(58) HashAggregate [codegen id : 22] +Input [3]: [ws_order_number#4, sum#30, sum#31] Keys: [] Functions [3]: [merge_sum(UnscaledValue(ws_ext_ship_cost#5)), merge_sum(UnscaledValue(ws_net_profit#6)), 
partial_count(distinct ws_order_number#4)] -Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#5))#29, sum(UnscaledValue(ws_net_profit#6))#30, count(ws_order_number#4)#34] -Results [3]: [sum#31, sum#32, count#35] +Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#5))#28, sum(UnscaledValue(ws_net_profit#6))#29, count(ws_order_number#4)#33] +Results [3]: [sum#30, sum#31, count#34] -(61) Exchange -Input [3]: [sum#31, sum#32, count#35] -Arguments: SinglePartition, true, [id=#36] +(59) Exchange +Input [3]: [sum#30, sum#31, count#34] +Arguments: SinglePartition, true, [id=#35] -(62) HashAggregate [codegen id : 24] -Input [3]: [sum#31, sum#32, count#35] +(60) HashAggregate [codegen id : 23] +Input [3]: [sum#30, sum#31, count#34] Keys: [] Functions [3]: [sum(UnscaledValue(ws_ext_ship_cost#5)), sum(UnscaledValue(ws_net_profit#6)), count(distinct ws_order_number#4)] -Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#5))#29, sum(UnscaledValue(ws_net_profit#6))#30, count(ws_order_number#4)#34] -Results [3]: [count(ws_order_number#4)#34 AS order count #37, MakeDecimal(sum(UnscaledValue(ws_ext_ship_cost#5))#29,17,2) AS total shipping cost #38, MakeDecimal(sum(UnscaledValue(ws_net_profit#6))#30,17,2) AS total net profit #39] +Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#5))#28, sum(UnscaledValue(ws_net_profit#6))#29, count(ws_order_number#4)#33] +Results [3]: [count(ws_order_number#4)#33 AS order count #36, MakeDecimal(sum(UnscaledValue(ws_ext_ship_cost#5))#28,17,2) AS total shipping cost #37, MakeDecimal(sum(UnscaledValue(ws_net_profit#6))#29,17,2) AS total net profit #38] -(63) TakeOrderedAndProject -Input [3]: [order count #37, total shipping cost #38, total net profit #39] -Arguments: 100, [order count #37 ASC NULLS FIRST], [order count #37, total shipping cost #38, total net profit #39] +(61) Sort [codegen id : 23] +Input [3]: [order count #36, total shipping cost #37, total net profit #38] +Arguments: [order count #36 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/simplified.txt index bdcbb87b372dc..7213a9f58d3f8 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/simplified.txt @@ -1,14 +1,14 @@ -TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] - WholeStageCodegen (24) +WholeStageCodegen (23) + Sort [order count ] HashAggregate [sum,sum,count] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),count(ws_order_number),order count ,total shipping cost ,total net profit ,sum,sum,count] InputAdapter Exchange #1 - WholeStageCodegen (23) + WholeStageCodegen (22) HashAggregate [ws_order_number] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),count(ws_order_number),sum,sum,count,sum,sum,count] HashAggregate [ws_order_number] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),sum,sum,sum,sum] InputAdapter Exchange [ws_order_number] #2 - WholeStageCodegen (22) + WholeStageCodegen (21) HashAggregate [ws_order_number,ws_ext_ship_cost,ws_net_profit] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),sum,sum,sum,sum] Project [ws_order_number,ws_ext_ship_cost,ws_net_profit] BroadcastHashJoin [ws_ship_date_sk,d_date_sk] @@ -18,7 +18,7 @@ 
TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] BroadcastHashJoin [ws_ship_addr_sk,ca_address_sk] InputAdapter SortMergeJoin [ws_order_number,wr_order_number] - WholeStageCodegen (9) + WholeStageCodegen (8) Sort [ws_order_number] InputAdapter Exchange [ws_order_number] #3 @@ -32,78 +32,74 @@ TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] ColumnarToRow InputAdapter Scan parquet default.web_sales [ws_ship_date_sk,ws_ship_addr_sk,ws_web_site_sk,ws_order_number,ws_ext_ship_cost,ws_net_profit] - WholeStageCodegen (8) - Sort [ws_order_number] - InputAdapter - Exchange [ws_order_number] #5 - WholeStageCodegen (7) - Project [ws_order_number] - SortMergeJoin [ws_order_number,ws_order_number,ws_warehouse_sk,ws_warehouse_sk] - InputAdapter - WholeStageCodegen (4) - Sort [ws_order_number] - InputAdapter - Exchange [ws_order_number] #6 - WholeStageCodegen (3) - Filter [ws_order_number,ws_warehouse_sk] - ColumnarToRow - InputAdapter - Scan parquet default.web_sales [ws_warehouse_sk,ws_order_number] - InputAdapter - WholeStageCodegen (6) - Sort [ws_order_number] - InputAdapter - ReusedExchange [ws_warehouse_sk,ws_order_number] #6 - WholeStageCodegen (18) + WholeStageCodegen (7) + Project [ws_order_number] + SortMergeJoin [ws_order_number,ws_order_number,ws_warehouse_sk,ws_warehouse_sk] + InputAdapter + WholeStageCodegen (4) + Sort [ws_order_number] + InputAdapter + Exchange [ws_order_number] #5 + WholeStageCodegen (3) + Filter [ws_order_number,ws_warehouse_sk] + ColumnarToRow + InputAdapter + Scan parquet default.web_sales [ws_warehouse_sk,ws_order_number] + InputAdapter + WholeStageCodegen (6) + Sort [ws_order_number] + InputAdapter + ReusedExchange [ws_warehouse_sk,ws_order_number] #5 + WholeStageCodegen (17) Project [wr_order_number] SortMergeJoin [ws_order_number,wr_order_number] InputAdapter - WholeStageCodegen (15) + WholeStageCodegen (14) Sort [ws_order_number] InputAdapter - Exchange [ws_order_number] #7 - WholeStageCodegen (14) + Exchange [ws_order_number] #6 + WholeStageCodegen (13) Project [ws_order_number] SortMergeJoin [ws_order_number,ws_order_number,ws_warehouse_sk,ws_warehouse_sk] InputAdapter - WholeStageCodegen (11) + WholeStageCodegen (10) Sort [ws_order_number] InputAdapter - ReusedExchange [ws_warehouse_sk,ws_order_number] #6 + ReusedExchange [ws_warehouse_sk,ws_order_number] #5 InputAdapter - WholeStageCodegen (13) + WholeStageCodegen (12) Sort [ws_order_number] InputAdapter - ReusedExchange [ws_warehouse_sk,ws_order_number] #6 + ReusedExchange [ws_warehouse_sk,ws_order_number] #5 InputAdapter - WholeStageCodegen (17) + WholeStageCodegen (16) Sort [wr_order_number] InputAdapter - Exchange [wr_order_number] #8 - WholeStageCodegen (16) + Exchange [wr_order_number] #7 + WholeStageCodegen (15) Filter [wr_order_number] ColumnarToRow InputAdapter Scan parquet default.web_returns [wr_order_number] InputAdapter - BroadcastExchange #9 - WholeStageCodegen (19) + BroadcastExchange #8 + WholeStageCodegen (18) Project [ca_address_sk] Filter [ca_state,ca_address_sk] ColumnarToRow InputAdapter Scan parquet default.customer_address [ca_address_sk,ca_state] InputAdapter - BroadcastExchange #10 - WholeStageCodegen (20) + BroadcastExchange #9 + WholeStageCodegen (19) Project [web_site_sk] Filter [web_company_name,web_site_sk] ColumnarToRow InputAdapter Scan parquet default.web_site [web_site_sk,web_company_name] InputAdapter - BroadcastExchange #11 - WholeStageCodegen (21) + BroadcastExchange #10 + WholeStageCodegen (20) Project [d_date_sk] 
Filter [d_date,d_date_sk] ColumnarToRow diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/explain.txt index 3a24e83aff256..1cc99e296383f 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (56) +* Sort (56) +- * HashAggregate (55) +- Exchange (54) +- * HashAggregate (53) @@ -312,7 +312,7 @@ Functions [3]: [sum(UnscaledValue(ws_ext_ship_cost#5)), sum(UnscaledValue(ws_net Aggregate Attributes [3]: [sum(UnscaledValue(ws_ext_ship_cost#5))#27, sum(UnscaledValue(ws_net_profit#6))#28, count(ws_order_number#4)#32] Results [3]: [count(ws_order_number#4)#32 AS order count #35, MakeDecimal(sum(UnscaledValue(ws_ext_ship_cost#5))#27,17,2) AS total shipping cost #36, MakeDecimal(sum(UnscaledValue(ws_net_profit#6))#28,17,2) AS total net profit #37] -(56) TakeOrderedAndProject +(56) Sort [codegen id : 11] Input [3]: [order count #35, total shipping cost #36, total net profit #37] -Arguments: 100, [order count #35 ASC NULLS FIRST], [order count #35, total shipping cost #36, total net profit #37] +Arguments: [order count #35 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/simplified.txt index 6d35311c810f5..191ff22c1961f 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [order count ,total shipping cost ,total net profit ] - WholeStageCodegen (11) +WholeStageCodegen (11) + Sort [order count ] HashAggregate [sum,sum,count] [sum(UnscaledValue(ws_ext_ship_cost)),sum(UnscaledValue(ws_net_profit)),count(ws_order_number),order count ,total shipping cost ,total net profit ,sum,sum,count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/explain.txt index d00029f985471..5ae0e1632f15b 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (28) +* Sort (28) +- * HashAggregate (27) +- Exchange (26) +- * HashAggregate (25) @@ -154,7 +154,7 @@ Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#17] Results [1]: [count(1)#17 AS count(1)#18] -(28) TakeOrderedAndProject +(28) Sort [codegen id : 5] Input [1]: [count(1)#18] -Arguments: 100, [count(1)#18 ASC NULLS FIRST], [count(1)#18] +Arguments: [count(1)#18 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/simplified.txt index 1355caffbbfe8..d9ee3e09481ed 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96.sf100/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject 
[count(1)] - WholeStageCodegen (5) +WholeStageCodegen (5) + Sort [count(1)] HashAggregate [count] [count(1),count(1),count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/explain.txt index 3561eff8f57ef..6729910d9cb4a 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/explain.txt @@ -1,5 +1,5 @@ == Physical Plan == -TakeOrderedAndProject (28) +* Sort (28) +- * HashAggregate (27) +- Exchange (26) +- * HashAggregate (25) @@ -154,7 +154,7 @@ Functions [1]: [count(1)] Aggregate Attributes [1]: [count(1)#17] Results [1]: [count(1)#17 AS count(1)#18] -(28) TakeOrderedAndProject +(28) Sort [codegen id : 5] Input [1]: [count(1)#18] -Arguments: 100, [count(1)#18 ASC NULLS FIRST], [count(1)#18] +Arguments: [count(1)#18 ASC NULLS FIRST], true, 0 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/simplified.txt index b13f28bf69cfd..45400b6c512f4 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q96/simplified.txt @@ -1,5 +1,5 @@ -TakeOrderedAndProject [count(1)] - WholeStageCodegen (5) +WholeStageCodegen (5) + Sort [count(1)] HashAggregate [count] [count(1),count(1),count] InputAdapter Exchange #1 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/explain.txt index 0a2e88b5bc160..e904ad94dd8fa 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/explain.txt @@ -1,34 +1,33 @@ == Physical Plan == -CollectLimit (30) -+- * HashAggregate (29) - +- Exchange (28) - +- * HashAggregate (27) - +- * Project (26) - +- SortMergeJoin FullOuter (25) - :- * Sort (14) - : +- * HashAggregate (13) - : +- Exchange (12) - : +- * HashAggregate (11) - : +- * Project (10) - : +- * BroadcastHashJoin Inner BuildRight (9) - : :- * Filter (3) - : : +- * ColumnarToRow (2) - : : +- Scan parquet default.store_sales (1) - : +- BroadcastExchange (8) - : +- * Project (7) - : +- * Filter (6) - : +- * ColumnarToRow (5) - : +- Scan parquet default.date_dim (4) - +- * Sort (24) - +- * HashAggregate (23) - +- Exchange (22) - +- * HashAggregate (21) - +- * Project (20) - +- * BroadcastHashJoin Inner BuildRight (19) - :- * Filter (17) - : +- * ColumnarToRow (16) - : +- Scan parquet default.catalog_sales (15) - +- ReusedExchange (18) +* HashAggregate (29) ++- Exchange (28) + +- * HashAggregate (27) + +- * Project (26) + +- SortMergeJoin FullOuter (25) + :- * Sort (14) + : +- * HashAggregate (13) + : +- Exchange (12) + : +- * HashAggregate (11) + : +- * Project (10) + : +- * BroadcastHashJoin Inner BuildRight (9) + : :- * Filter (3) + : : +- * ColumnarToRow (2) + : : +- Scan parquet default.store_sales (1) + : +- BroadcastExchange (8) + : +- * Project (7) + : +- * Filter (6) + : +- * ColumnarToRow (5) + : +- Scan parquet default.date_dim (4) + +- * Sort (24) + +- * HashAggregate (23) + +- Exchange (22) + +- * HashAggregate (21) + +- * Project (20) + +- * 
BroadcastHashJoin Inner BuildRight (19) + :- * Filter (17) + : +- * ColumnarToRow (16) + : +- Scan parquet default.catalog_sales (15) + +- ReusedExchange (18) (1) Scan parquet default.store_sales @@ -173,7 +172,3 @@ Functions [3]: [sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnull(customer Aggregate Attributes [3]: [sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#23, sum(cast(CASE WHEN (isnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#24, sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#25] Results [3]: [sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#23 AS store_only#26, sum(cast(CASE WHEN (isnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#24 AS catalog_only#27, sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#25 AS store_and_catalog#28] -(30) CollectLimit -Input [3]: [store_only#26, catalog_only#27, store_and_catalog#28] -Arguments: 100 - diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/simplified.txt index bae48ec244faa..c5921a11cd889 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97.sf100/simplified.txt @@ -1,46 +1,45 @@ -CollectLimit - WholeStageCodegen (8) - HashAggregate [sum,sum,sum] [sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),store_only,catalog_only,store_and_catalog,sum,sum,sum] - InputAdapter - Exchange #1 - WholeStageCodegen (7) - HashAggregate [customer_sk,customer_sk] [sum,sum,sum,sum,sum,sum] - Project [customer_sk,customer_sk] - InputAdapter - SortMergeJoin [customer_sk,item_sk,customer_sk,item_sk] - WholeStageCodegen (3) - Sort [customer_sk,item_sk] - HashAggregate [ss_customer_sk,ss_item_sk] [customer_sk,item_sk] - InputAdapter - Exchange [ss_customer_sk,ss_item_sk] #2 - WholeStageCodegen (2) - HashAggregate [ss_customer_sk,ss_item_sk] - Project [ss_item_sk,ss_customer_sk] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk,ss_customer_sk] - InputAdapter - BroadcastExchange #3 - WholeStageCodegen (1) - Project [d_date_sk] - Filter [d_month_seq,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_month_seq] - WholeStageCodegen (6) - Sort [customer_sk,item_sk] - HashAggregate [cs_bill_customer_sk,cs_item_sk] [customer_sk,item_sk] - InputAdapter - Exchange [cs_bill_customer_sk,cs_item_sk] #4 - WholeStageCodegen (5) - HashAggregate [cs_bill_customer_sk,cs_item_sk] - Project [cs_bill_customer_sk,cs_item_sk] - BroadcastHashJoin [cs_sold_date_sk,d_date_sk] - Filter [cs_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk] - InputAdapter - ReusedExchange [d_date_sk] #3 +WholeStageCodegen (8) + HashAggregate [sum,sum,sum] [sum(cast(CASE WHEN 
(isnotnull(customer_sk) AND isnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),store_only,catalog_only,store_and_catalog,sum,sum,sum] + InputAdapter + Exchange #1 + WholeStageCodegen (7) + HashAggregate [customer_sk,customer_sk] [sum,sum,sum,sum,sum,sum] + Project [customer_sk,customer_sk] + InputAdapter + SortMergeJoin [customer_sk,item_sk,customer_sk,item_sk] + WholeStageCodegen (3) + Sort [customer_sk,item_sk] + HashAggregate [ss_customer_sk,ss_item_sk] [customer_sk,item_sk] + InputAdapter + Exchange [ss_customer_sk,ss_item_sk] #2 + WholeStageCodegen (2) + HashAggregate [ss_customer_sk,ss_item_sk] + Project [ss_item_sk,ss_customer_sk] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk,ss_customer_sk] + InputAdapter + BroadcastExchange #3 + WholeStageCodegen (1) + Project [d_date_sk] + Filter [d_month_seq,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_month_seq] + WholeStageCodegen (6) + Sort [customer_sk,item_sk] + HashAggregate [cs_bill_customer_sk,cs_item_sk] [customer_sk,item_sk] + InputAdapter + Exchange [cs_bill_customer_sk,cs_item_sk] #4 + WholeStageCodegen (5) + HashAggregate [cs_bill_customer_sk,cs_item_sk] + Project [cs_bill_customer_sk,cs_item_sk] + BroadcastHashJoin [cs_sold_date_sk,d_date_sk] + Filter [cs_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk] + InputAdapter + ReusedExchange [d_date_sk] #3 diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/explain.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/explain.txt index 0a2e88b5bc160..e904ad94dd8fa 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/explain.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/explain.txt @@ -1,34 +1,33 @@ == Physical Plan == -CollectLimit (30) -+- * HashAggregate (29) - +- Exchange (28) - +- * HashAggregate (27) - +- * Project (26) - +- SortMergeJoin FullOuter (25) - :- * Sort (14) - : +- * HashAggregate (13) - : +- Exchange (12) - : +- * HashAggregate (11) - : +- * Project (10) - : +- * BroadcastHashJoin Inner BuildRight (9) - : :- * Filter (3) - : : +- * ColumnarToRow (2) - : : +- Scan parquet default.store_sales (1) - : +- BroadcastExchange (8) - : +- * Project (7) - : +- * Filter (6) - : +- * ColumnarToRow (5) - : +- Scan parquet default.date_dim (4) - +- * Sort (24) - +- * HashAggregate (23) - +- Exchange (22) - +- * HashAggregate (21) - +- * Project (20) - +- * BroadcastHashJoin Inner BuildRight (19) - :- * Filter (17) - : +- * ColumnarToRow (16) - : +- Scan parquet default.catalog_sales (15) - +- ReusedExchange (18) +* HashAggregate (29) ++- Exchange (28) + +- * HashAggregate (27) + +- * Project (26) + +- SortMergeJoin FullOuter (25) + :- * Sort (14) + : +- * HashAggregate (13) + : +- Exchange (12) + : +- * HashAggregate (11) + : +- * Project (10) + : +- * BroadcastHashJoin Inner BuildRight (9) + : :- * Filter (3) + : : +- * ColumnarToRow (2) + : : +- Scan parquet default.store_sales (1) + : +- BroadcastExchange (8) + : +- * Project (7) + : +- * Filter (6) + : +- * ColumnarToRow (5) + : +- Scan parquet default.date_dim (4) + 
+- * Sort (24) + +- * HashAggregate (23) + +- Exchange (22) + +- * HashAggregate (21) + +- * Project (20) + +- * BroadcastHashJoin Inner BuildRight (19) + :- * Filter (17) + : +- * ColumnarToRow (16) + : +- Scan parquet default.catalog_sales (15) + +- ReusedExchange (18) (1) Scan parquet default.store_sales @@ -173,7 +172,3 @@ Functions [3]: [sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnull(customer Aggregate Attributes [3]: [sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#23, sum(cast(CASE WHEN (isnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#24, sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#25] Results [3]: [sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#23 AS store_only#26, sum(cast(CASE WHEN (isnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#24 AS catalog_only#27, sum(cast(CASE WHEN (isnotnull(customer_sk#8) AND isnotnull(customer_sk#14)) THEN 1 ELSE 0 END as bigint))#25 AS store_and_catalog#28] -(30) CollectLimit -Input [3]: [store_only#26, catalog_only#27, store_and_catalog#28] -Arguments: 100 - diff --git a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/simplified.txt b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/simplified.txt index bae48ec244faa..c5921a11cd889 100644 --- a/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/simplified.txt +++ b/sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q97/simplified.txt @@ -1,46 +1,45 @@ -CollectLimit - WholeStageCodegen (8) - HashAggregate [sum,sum,sum] [sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),store_only,catalog_only,store_and_catalog,sum,sum,sum] - InputAdapter - Exchange #1 - WholeStageCodegen (7) - HashAggregate [customer_sk,customer_sk] [sum,sum,sum,sum,sum,sum] - Project [customer_sk,customer_sk] - InputAdapter - SortMergeJoin [customer_sk,item_sk,customer_sk,item_sk] - WholeStageCodegen (3) - Sort [customer_sk,item_sk] - HashAggregate [ss_customer_sk,ss_item_sk] [customer_sk,item_sk] - InputAdapter - Exchange [ss_customer_sk,ss_item_sk] #2 - WholeStageCodegen (2) - HashAggregate [ss_customer_sk,ss_item_sk] - Project [ss_item_sk,ss_customer_sk] - BroadcastHashJoin [ss_sold_date_sk,d_date_sk] - Filter [ss_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk,ss_customer_sk] - InputAdapter - BroadcastExchange #3 - WholeStageCodegen (1) - Project [d_date_sk] - Filter [d_month_seq,d_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.date_dim [d_date_sk,d_month_seq] - WholeStageCodegen (6) - Sort [customer_sk,item_sk] - HashAggregate [cs_bill_customer_sk,cs_item_sk] [customer_sk,item_sk] - InputAdapter - Exchange [cs_bill_customer_sk,cs_item_sk] #4 - WholeStageCodegen (5) - HashAggregate [cs_bill_customer_sk,cs_item_sk] - Project [cs_bill_customer_sk,cs_item_sk] - BroadcastHashJoin [cs_sold_date_sk,d_date_sk] - Filter [cs_sold_date_sk] - ColumnarToRow - InputAdapter - Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk] - InputAdapter - ReusedExchange 
[d_date_sk] #3 +WholeStageCodegen (8) + HashAggregate [sum,sum,sum] [sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),sum(cast(CASE WHEN (isnotnull(customer_sk) AND isnotnull(customer_sk)) THEN 1 ELSE 0 END as bigint)),store_only,catalog_only,store_and_catalog,sum,sum,sum] + InputAdapter + Exchange #1 + WholeStageCodegen (7) + HashAggregate [customer_sk,customer_sk] [sum,sum,sum,sum,sum,sum] + Project [customer_sk,customer_sk] + InputAdapter + SortMergeJoin [customer_sk,item_sk,customer_sk,item_sk] + WholeStageCodegen (3) + Sort [customer_sk,item_sk] + HashAggregate [ss_customer_sk,ss_item_sk] [customer_sk,item_sk] + InputAdapter + Exchange [ss_customer_sk,ss_item_sk] #2 + WholeStageCodegen (2) + HashAggregate [ss_customer_sk,ss_item_sk] + Project [ss_item_sk,ss_customer_sk] + BroadcastHashJoin [ss_sold_date_sk,d_date_sk] + Filter [ss_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.store_sales [ss_sold_date_sk,ss_item_sk,ss_customer_sk] + InputAdapter + BroadcastExchange #3 + WholeStageCodegen (1) + Project [d_date_sk] + Filter [d_month_seq,d_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.date_dim [d_date_sk,d_month_seq] + WholeStageCodegen (6) + Sort [customer_sk,item_sk] + HashAggregate [cs_bill_customer_sk,cs_item_sk] [customer_sk,item_sk] + InputAdapter + Exchange [cs_bill_customer_sk,cs_item_sk] #4 + WholeStageCodegen (5) + HashAggregate [cs_bill_customer_sk,cs_item_sk] + Project [cs_bill_customer_sk,cs_item_sk] + BroadcastHashJoin [cs_sold_date_sk,d_date_sk] + Filter [cs_sold_date_sk] + ColumnarToRow + InputAdapter + Scan parquet default.catalog_sales [cs_sold_date_sk,cs_bill_customer_sk,cs_item_sk] + InputAdapter + ReusedExchange [d_date_sk] #3 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 321f4966178d7..d34dcb4fe0c01 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -26,7 +26,6 @@ import java.util.concurrent.atomic.AtomicLong import scala.reflect.runtime.universe.TypeTag import scala.util.Random -import org.scalatest.matchers.must.Matchers import org.scalatest.matchers.should.Matchers._ import org.apache.spark.SparkException diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala index 8b0f46b9d1ddb..4fdaeb57ad50e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameTimeWindowingSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql -import org.scalatest.BeforeAndAfterEach - import org.apache.spark.sql.catalyst.plans.logical.Expand import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala index 8c998290b5044..fd408c37ef6cd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql -import java.sql.Date - import 
org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala index de791383326f1..35e732e0840e4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import org.scalatest.BeforeAndAfter -import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} import org.apache.spark.sql.connector.{InMemoryTable, InMemoryTableCatalog} import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala index 2be86b9ad6208..ac51634febc99 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql import scala.collection.immutable.{HashSet => HSet} import scala.collection.immutable.Queue import scala.collection.mutable.{LinkedHashMap => LHMap} -import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.test.SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index 80346b350c142..861a001b190aa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -27,7 +27,6 @@ import org.scalatest.Assertions._ import org.apache.spark.TestUtils import org.apache.spark.api.python.{PythonBroadcast, PythonEvalType, PythonFunction, PythonUtils} import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.config.Tests import org.apache.spark.sql.catalyst.expressions.{Cast, Expression} import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.execution.python.UserDefinedPythonFunction diff --git a/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala index c2aee0ad4c9a1..76204c504c0ed 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala @@ -30,7 +30,6 @@ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.DisableAdaptiveExecutionSuite import org.apache.spark.sql.execution.exchange.{Exchange, ReusedExchangeExec} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.tags.ExtendedSQLTest // scalastyle:off line.size.limit /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index b016cc3f57e0d..65377594f083c 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -25,7 +25,6 @@ import java.util.concurrent.TimeUnit import scala.collection.mutable import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.catalog.CatalogColumnStat import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.util.DateTimeTestUtils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala index 6b25d7c61663c..46112d40f08ba 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala @@ -30,7 +30,6 @@ import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap class DataSourceV2DataFrameSessionCatalogSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = false) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index db3f11dbda51a..ddafa1bb5070a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NamespaceAlreadyExistsException, NoSuchDatabaseException, NoSuchNamespaceException, NoSuchPartitionException, NoSuchPartitionsException, NoSuchTableException, PartitionsAlreadyExistException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NamespaceAlreadyExistsException, NoSuchDatabaseException, NoSuchNamespaceException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.connector.catalog._ import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME @@ -139,6 +139,10 @@ class DataSourceV2SQLSuite Array("# Partitioning", "", ""), Array("Part 0", "id", ""), Array("", "", ""), + Array("# Metadata Columns", "", ""), + Array("index", "string", "Metadata column used to conflict with a data column"), + Array("_partition", "string", "Partition key used to store the row"), + Array("", "", ""), Array("# Detailed Table Information", "", ""), Array("Name", "testcat.table_name", ""), Array("Comment", "this is a test table", ""), @@ -1909,21 +1913,6 @@ class DataSourceV2SQLSuite } } - test("SHOW PARTITIONS") { - val t = "testcat.ns1.ns2.tbl" - withTable(t) { - sql( - s""" - |CREATE TABLE $t (id bigint, data string) - |USING foo - |PARTITIONED BY (id) - """.stripMargin) - - testV1Command("SHOW PARTITIONS", t) - testV1Command("SHOW PARTITIONS", s"$t PARTITION(id='1')") 
- } - } - test("LOAD DATA INTO TABLE") { val t = "testcat.ns1.ns2.tbl" withTable(t) { @@ -2485,6 +2474,45 @@ class DataSourceV2SQLSuite } } + test("SPARK-31255: Project a metadata column") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + + checkAnswer( + spark.sql(s"SELECT id, data, _partition FROM $t1"), + Seq(Row(1, "a", "3/1"), Row(2, "b", "2/2"), Row(3, "c", "2/3"))) + } + } + + test("SPARK-31255: Projects data column when metadata column has the same name") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (index bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, index), index)") + sql(s"INSERT INTO $t1 VALUES (3, 'c'), (2, 'b'), (1, 'a')") + + checkAnswer( + spark.sql(s"SELECT index, data, _partition FROM $t1"), + Seq(Row(3, "c", "2/3"), Row(2, "b", "2/2"), Row(1, "a", "3/1"))) + } + } + + test("SPARK-31255: * expansion does not include metadata columns") { + val t1 = s"${catalogAndNamespace}table" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (3, 'c'), (2, 'b'), (1, 'a')") + + checkAnswer( + spark.sql(s"SELECT * FROM $t1"), + Seq(Row(3, "c"), Row(2, "b"), Row(1, "a"))) + } + } + private def testNotSupportedV2Command(sqlCommand: String, sqlParams: String): Unit = { val e = intercept[AnalysisException] { sql(s"$sqlCommand $sqlParams") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala index eacdb9e2fcd7b..3aad644655aa6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression} import org.apache.spark.sql.connector.catalog.{Identifier, SupportsCatalogOptions, TableCatalog} import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME -import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform} +import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala index 2d75a35215866..bad21aac41712 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.analysis.{AnalysisSuite, NamedRelation} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2 -import 
org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.catalog.{Table, TableCapability} import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, TableCapabilityCheck} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala index 74f2ca14234d2..9beef690cba32 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala @@ -23,7 +23,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession, SQLContext} -import org.apache.spark.sql.connector.catalog.{Identifier, SupportsRead, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.catalog.{Identifier, SupportsRead, Table, TableCapability} import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, V1Scan} import org.apache.spark.sql.execution.RowDataSourceScanExec diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala index e6029400997a2..81f292809df4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala @@ -28,7 +28,6 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.{SparkException, TaskContext, TestUtils} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, GenericInternalRow} import org.apache.spark.sql.catalyst.plans.physical.Partitioning diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index be29acb6d3a7c..6de81cc414d7d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.{execution, DataFrame, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range, Repartition, Sort, Union} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range, Repartition, Union} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecution} import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} @@ -895,6 +895,201 @@ class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { } } + test("SPARK-33399: aliases should be handled properly in PartitioningCollection output" 
+ + " partitioning") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t1", "t2", "t3") { + spark.range(10).repartition($"id").createTempView("t1") + spark.range(20).repartition($"id").createTempView("t2") + spark.range(30).repartition($"id").createTempView("t3") + val planned = sql( + """ + |SELECT t3.id as t3id + |FROM ( + | SELECT t1.id as t1id, t2.id as t2id + | FROM t1, t2 + | WHERE t1.id = t2.id + |) t12, t3 + |WHERE t1id = t3.id + """.stripMargin).queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.size == 3) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputPartitioning match { + case PartitioningCollection(Seq(HashPartitioning(Seq(k1: AttributeReference), _), + HashPartitioning(Seq(k2: AttributeReference), _))) if k1.name == "t1id" => + true + case _ => false + })) + } + } + } + + test("SPARK-33399: aliases should be handled properly in HashPartitioning") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t1", "t2", "t3") { + spark.range(10).repartition($"id").createTempView("t1") + spark.range(20).repartition($"id").createTempView("t2") + spark.range(30).repartition($"id").createTempView("t3") + val planned = sql( + """ + |SELECT t1id, t3.id as t3id + |FROM ( + | SELECT t1.id as t1id + | FROM t1 LEFT SEMI JOIN t2 + | ON t1.id = t2.id + |) t12 INNER JOIN t3 + |WHERE t1id = t3.id + """.stripMargin).queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.size == 3) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputPartitioning match { + case HashPartitioning(Seq(a: AttributeReference), _) => a.name == "t1id" + case _ => false + })) + } + } + } + + test("SPARK-33399: alias handling should happen properly for RangePartitioning") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df = spark.range(1, 100) + .select(col("id").as("id1")).groupBy("id1").count() + // Plan for this will be Range -> ProjectWithAlias -> HashAggregate -> HashAggregate + // if Project normalizes alias in its Range outputPartitioning, then no Exchange should come + // in between HashAggregates + val planned = df.queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.isEmpty) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputPartitioning match { + case RangePartitioning(Seq(SortOrder(ar: AttributeReference, _, _, _)), _) => + ar.name == "id1" + case _ => false + })) + } + } + + test("SPARK-33399: aliased should be handled properly " + + "for partitioning and sortorder involving complex expressions") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t1", "t2", "t3") { + spark.range(10).select(col("id").as("id1")).createTempView("t1") + spark.range(20).select(col("id").as("id2")).createTempView("t2") + spark.range(30).select(col("id").as("id3")).createTempView("t3") + val planned = sql( + """ + |SELECT t3.id3 as t3id + |FROM ( + | SELECT t1.id1 as t1id, t2.id2 as t2id + | FROM t1, t2 + | WHERE t1.id1 * 10 = t2.id2 * 10 + |) t12, t3 + |WHERE t1id * 10 = t3.id3 * 10 + """.stripMargin).queryExecution.executedPlan + val sortNodes = planned.collect { case s: SortExec => s } + assert(sortNodes.size == 3) + val exchangeNodes = planned.collect { case e: ShuffleExchangeExec => 
e } + assert(exchangeNodes.size == 3) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputPartitioning match { + case PartitioningCollection(Seq(HashPartitioning(Seq(Multiply(ar1, _, _)), _), + HashPartitioning(Seq(Multiply(ar2, _, _)), _))) => + Seq(ar1, ar2) match { + case Seq(ar1: AttributeReference, ar2: AttributeReference) => + ar1.name == "t1id" && ar2.name == "id2" + case _ => + false + } + case _ => false + })) + + } + } + } + + test("SPARK-33399: alias handling should happen properly for SinglePartition") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df = spark.range(1, 100, 1, 1) + .select(col("id").as("id1")).groupBy("id1").count() + val planned = df.queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.isEmpty) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputPartitioning match { + case SinglePartition => true + case _ => false + })) + } + } + + test("SPARK-33399: No extra exchanges in case of" + + " [Inner Join -> Project with aliases -> HashAggregate]") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t1", "t2") { + spark.range(10).repartition($"id").createTempView("t1") + spark.range(20).repartition($"id").createTempView("t2") + val planned = sql( + """ + |SELECT t1id, t2id + |FROM ( + | SELECT t1.id as t1id, t2.id as t2id + | FROM t1 INNER JOIN t2 + | WHERE t1.id = t2.id + |) t12 + |GROUP BY t1id, t2id + """.stripMargin).queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.size == 2) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputPartitioning match { + case PartitioningCollection(Seq(HashPartitioning(Seq(k1: AttributeReference), _), + HashPartitioning(Seq(k2: AttributeReference), _))) => + k1.name == "t1id" && k2.name == "t2id" + case _ => false + })) + } + } + } + + test("SPARK-33400: Normalization of sortOrder should take care of sameOrderExprs") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("t1", "t2", "t3") { + spark.range(10).repartition($"id").createTempView("t1") + spark.range(20).repartition($"id").createTempView("t2") + spark.range(30).repartition($"id").createTempView("t3") + val planned = sql( + """ + |SELECT t2id, t3.id as t3id + |FROM ( + | SELECT t1.id as t1id, t2.id as t2id + | FROM t1, t2 + | WHERE t1.id = t2.id + |) t12, t3 + |WHERE t2id = t3.id + """.stripMargin).queryExecution.executedPlan + + val sortNodes = planned.collect { case s: SortExec => s } + assert(sortNodes.size == 3) + + val projects = planned.collect { case p: ProjectExec => p } + assert(projects.exists(_.outputOrdering match { + case Seq(SortOrder(_, Ascending, NullsFirst, sameOrderExprs)) => + sameOrderExprs.size == 1 && sameOrderExprs.head.isInstanceOf[AttributeReference] && + sameOrderExprs.head.asInstanceOf[AttributeReference].name == "t2id" + case _ => false + })) + } + } + } + test("aliases to expressions should not be replaced") { withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { withTempView("df1", "df2") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala index 54c5a33441900..751078d08fda9 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/RemoveRedundantSortsSuite.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{DataFrame, QueryTest} +import org.apache.spark.sql.catalyst.plans.physical.{RangePartitioning, UnknownPartitioning} import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite} +import org.apache.spark.sql.execution.joins.ShuffledJoin import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -135,6 +137,32 @@ abstract class RemoveRedundantSortsSuiteBase } } } + + test("SPARK-33472: shuffled join with different left and right side partition numbers") { + withTempView("t1", "t2") { + spark.range(0, 100, 1, 2).select('id as "key").createOrReplaceTempView("t1") + (0 to 100).toDF("key").createOrReplaceTempView("t2") + + val queryTemplate = """ + |SELECT /*+ %s(t1) */ t1.key + |FROM t1 JOIN t2 ON t1.key = t2.key + |WHERE t1.key > 10 AND t2.key < 50 + |ORDER BY t1.key ASC + """.stripMargin + + Seq(("MERGE", 3), ("SHUFFLE_HASH", 1)).foreach { case (hint, count) => + val query = queryTemplate.format(hint) + val df = sql(query) + val sparkPlan = df.queryExecution.sparkPlan + val join = sparkPlan.collect { case j: ShuffledJoin => j }.head + val leftPartitioning = join.left.outputPartitioning + assert(leftPartitioning.isInstanceOf[RangePartitioning]) + assert(leftPartitioning.numPartitions == 2) + assert(join.right.outputPartitioning == UnknownPartitioning(0)) + checkSorts(query, count, count) + } + } + } } class RemoveRedundantSortsSuite extends RemoveRedundantSortsSuiteBase diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 87a5cb9f73355..792f920ee0217 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -181,7 +181,6 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql(s"SHOW CREATE TABLE $viewName") }.getMessage assert(e3.contains(s"$viewName is a temp view not table or permanent view")) - assertNoSuchTable(s"SHOW PARTITIONS $viewName") val e4 = intercept[AnalysisException] { sql(s"ANALYZE TABLE $viewName COMPUTE STATISTICS") }.getMessage diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala index ddaa2687eaf1a..18d36670306b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SameResultSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{DataFrame, QueryTest} import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project} -import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, FileScan} +import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index 7ddf9d87a6aca..f1fcf3bc5125e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import scala.language.implicitConversions import scala.util.control.NonFatal import org.apache.spark.SparkFunSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala index 48f85ae76cd8c..ad3ec85e984c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveTestUtils.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.adaptive -import java.io.{PrintWriter, StringWriter} - import org.scalactic.source.Position import org.scalatest.Tag diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala index 9ade8b14f59b0..a98ca7f5d8f88 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.functions.{monotonically_increasing_id, timestamp_seconds} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType -import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType, TimestampType} +import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType} /** * Benchmark to measure read performance with Filter pushdown. 
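The PlannerSuite additions above (SPARK-33399/SPARK-33400) all follow the same pattern: build a small query whose Project aliases a partitioning or ordering column, then assert on the number of ShuffleExchangeExec/SortExec nodes in the executed plan. Below is a minimal standalone sketch of that pattern, not part of the patch itself; it assumes only a local SparkSession, and the object name and app name are made up for illustration.

// Illustrative sketch only -- not part of the diff above. It mirrors the
// "SPARK-33399: alias handling should happen properly for RangePartitioning"
// test added to PlannerSuite: group by an aliased column and check that no
// extra ShuffleExchangeExec is planned once aliases are normalized in the
// Project's outputPartitioning.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
import org.apache.spark.sql.functions.col

object AliasPartitioningCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("alias-partitioning-check")
      .getOrCreate()

    // Physical plan: Range -> Project (id AS id1) -> partial/final HashAggregate.
    // With the normalization exercised by the new tests, the Project reports its
    // child's partitioning under the alias id1, so no Exchange is needed between
    // the two HashAggregates.
    val df = spark.range(1, 100).select(col("id").as("id1")).groupBy("id1").count()
    val exchanges = df.queryExecution.executedPlan.collect {
      case e: ShuffleExchangeExec => e
    }
    println(s"ShuffleExchangeExec nodes: ${exchanges.size}") // expected 0 with SPARK-33399
    spark.stop()
  }
}

The same collect-and-count approach is what the RemoveRedundantSortsSuite change below uses for ShuffledJoin and sort nodes, so this one sketch covers both families of assertions.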
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala index d2bd962b50654..f89fe2e64c778 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ParquetNestedPredicatePushDownBenchmark.scala @@ -17,9 +17,8 @@ package org.apache.spark.sql.execution.benchmark -import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} +import org.apache.spark.sql.{DataFrame, SaveMode} import org.apache.spark.sql.internal.SQLConf /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index 43bc7c12937ec..f931914b19c6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark -import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala index 847e0ec4f3195..0abb3cb6a2ed0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.CalendarInterval class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index 7fd7040f0f51d..8ce4bcbadc223 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -359,14 +359,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(e.contains("Found duplicate keys 'a'")) } - test("empty values in non-optional partition specs") { - val e = intercept[ParseException] { - parser.parsePlan( - "SHOW PARTITIONS dbx.tab1 PARTITION (a='1', b)") - }.getMessage - assert(e.contains("Found an empty partition key 'b'")) - } - test("Test CTAS #1") { val s1 = """ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 348cf94dfc629..9d0147048dbb8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala 
@@ -2026,7 +2026,6 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("SPARK-30312: truncate table - keep acl/permission") { - import testImplicits._ val ignorePermissionAcl = Seq(true, false) ignorePermissionAcl.foreach { ignore => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index d5820b016736a..fd1978c5137a5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -26,7 +26,7 @@ import org.mockito.invocation.InvocationOnMock import org.apache.spark.sql.{AnalysisException, SaveMode} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveInlineTables, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar, UnresolvedSubqueryColumnAliases, UnresolvedV2Relation} +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveInlineTables, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedSubqueryColumnAliases, UnresolvedV2Relation} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, StringLiteral} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -151,7 +151,7 @@ class PlanResolutionSuite extends AnalysisTest { } else { catalogManagerWithoutDefault } - val analyzer = new Analyzer(catalogManager, conf) + val analyzer = new Analyzer(catalogManager) // TODO: run the analyzer directly. val rules = Seq( CTESubstitution, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala new file mode 100644 index 0000000000000..bc75528b9644c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsParserSuite.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.catalyst.analysis.AnalysisTest +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.catalyst.plans.logical.ShowPartitionsStatement +import org.apache.spark.sql.execution.SparkSqlParser +import org.apache.spark.sql.test.SharedSparkSession + +class ShowPartitionsParserSuite extends AnalysisTest with SharedSparkSession { + test("SHOW PARTITIONS") { + Seq( + "SHOW PARTITIONS t1" -> ShowPartitionsStatement(Seq("t1"), None), + "SHOW PARTITIONS db1.t1" -> ShowPartitionsStatement(Seq("db1", "t1"), None), + "SHOW PARTITIONS t1 PARTITION(partcol1='partvalue', partcol2='partvalue')" -> + ShowPartitionsStatement( + Seq("t1"), + Some(Map("partcol1" -> "partvalue", "partcol2" -> "partvalue"))), + "SHOW PARTITIONS a.b.c" -> ShowPartitionsStatement(Seq("a", "b", "c"), None), + "SHOW PARTITIONS a.b.c PARTITION(ds='2017-06-10')" -> + ShowPartitionsStatement(Seq("a", "b", "c"), Some(Map("ds" -> "2017-06-10"))) + ).foreach { case (sql, expected) => + val parsed = parsePlan(sql) + comparePlans(parsed, expected) + } + } + + test("empty values in non-optional partition specs") { + val e = intercept[ParseException] { + new SparkSqlParser().parsePlan( + "SHOW PARTITIONS dbx.tab1 PARTITION (a='1', b)") + }.getMessage + assert(e.contains("Found an empty partition key 'b'")) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala new file mode 100644 index 0000000000000..413e170326eea --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/ShowPartitionsSuiteBase.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command + +import org.scalactic.source.Position +import org.scalatest.Tag + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.test.SQLTestUtils + +trait ShowPartitionsSuiteBase extends QueryTest with SQLTestUtils { + protected def version: String + protected def catalog: String + protected def defaultNamespace: Seq[String] + protected def defaultUsing: String + + override def test(testName: String, testTags: Tag*)(testFun: => Any) + (implicit pos: Position): Unit = { + super.test(s"SHOW PARTITIONS $version: " + testName, testTags: _*)(testFun) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala new file mode 100644 index 0000000000000..bcc71e9b7241c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/ShowPartitionsSuite.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command.v1 + +import org.apache.spark.sql.{AnalysisException, Row, SaveMode} +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.execution.command +import org.apache.spark.sql.test.SharedSparkSession + +trait ShowPartitionsSuiteBase extends command.ShowPartitionsSuiteBase { + override def version: String = "V1" + override def catalog: String = CatalogManager.SESSION_CATALOG_NAME + override def defaultNamespace: Seq[String] = Seq("default") + override def defaultUsing: String = "USING parquet" + + private def createDateTable(table: String): Unit = { + sql(s""" + |CREATE TABLE $table (price int, qty int, year int, month int) + |$defaultUsing + |partitioned by (year, month)""".stripMargin) + sql(s"INSERT INTO $table PARTITION(year = 2015, month = 1) SELECT 1, 1") + sql(s"INSERT INTO $table PARTITION(year = 2015, month = 2) SELECT 2, 2") + sql(s"INSERT INTO $table PARTITION(year = 2016, month = 2) SELECT 3, 3") + sql(s"INSERT INTO $table PARTITION(year = 2016, month = 3) SELECT 3, 3") + } + + test("show everything") { + val table = "dateTable" + withTable(table) { + createDateTable(table) + checkAnswer( + sql(s"show partitions $table"), + Row("year=2015/month=1") :: + Row("year=2015/month=2") :: + Row("year=2016/month=2") :: + Row("year=2016/month=3") :: Nil) + + checkAnswer( + sql(s"show partitions default.$table"), + Row("year=2015/month=1") :: + Row("year=2015/month=2") :: + Row("year=2016/month=2") :: + Row("year=2016/month=3") :: Nil) + } + } + + test("filter by partitions") { + val table = "dateTable" + withTable(table) { + createDateTable(table) + checkAnswer( + sql(s"show partitions default.$table PARTITION(year=2015)"), + Row("year=2015/month=1") :: + Row("year=2015/month=2") :: Nil) + checkAnswer( + sql(s"show partitions default.$table PARTITION(year=2015, month=1)"), + Row("year=2015/month=1") :: Nil) + checkAnswer( + sql(s"show partitions default.$table PARTITION(month=2)"), + Row("year=2015/month=2") :: + Row("year=2016/month=2") :: Nil) + } + } + + test("show everything more than 5 part keys") { + val table = "wideTable" + withTable(table) { + sql(s""" + |CREATE TABLE $table ( + | price int, qty int, + | year int, month int, hour int, minute int, sec int, extra int) + |$defaultUsing + |PARTITIONED BY (year, month, hour, minute, sec, extra)""".stripMargin) + sql(s""" + |INSERT INTO $table + |PARTITION(year = 2016, month = 3, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 + """.stripMargin) + sql(s""" + |INSERT INTO $table + |PARTITION(year = 2016, month = 4, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 + """.stripMargin) + checkAnswer( + sql(s"show partitions $table"), + Row("year=2016/month=3/hour=10/minute=10/sec=10/extra=1") :: + Row("year=2016/month=4/hour=10/minute=10/sec=10/extra=1") :: Nil) + } + } + + test("non-partitioning columns") { + val table = "dateTable" + withTable(table) { + createDateTable(table) + val errMsg = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $table PARTITION(abcd=2015, xyz=1)") + }.getMessage + assert(errMsg.contains("Non-partitioning column(s) [abcd, xyz] are specified")) + } + } + + test("show partitions of non-partitioned table") { + val table = "not_partitioned_table" + withTable(table) { + sql(s"CREATE TABLE $table (col1 int) $defaultUsing") + val errMsg = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $table") + }.getMessage + 
assert(errMsg.contains("not allowed on a table that is not partitioned")) + } + } + + test("show partitions of a view") { + val table = "dateTable" + withTable(table) { + createDateTable(table) + val view = "view1" + withView(view) { + sql(s"CREATE VIEW $view as select * from $table") + val errMsg = intercept[AnalysisException] { + sql(s"SHOW PARTITIONS $view") + }.getMessage + assert(errMsg.contains("is not allowed on a view")) + } + } + } + + test("show partitions of a temporary view") { + val viewName = "test_view" + withTempView(viewName) { + spark.range(10).createTempView(viewName) + val errMsg = intercept[NoSuchTableException] { + sql(s"SHOW PARTITIONS $viewName") + }.getMessage + assert(errMsg.contains(s"Table or view '$viewName' not found")) + } + } +} + +class ShowPartitionsSuite extends ShowPartitionsSuiteBase with SharedSparkSession { + // The test is placed here because it fails with `USING HIVE`: + // org.apache.spark.sql.AnalysisException: + // Hive data source can only be used with tables, you can't use it with CREATE TEMP VIEW USING + test("issue exceptions on the temporary view") { + val viewName = "test_view" + withTempView(viewName) { + sql(s""" + |CREATE TEMPORARY VIEW $viewName (c1 INT, c2 STRING) + |$defaultUsing""".stripMargin) + val errMsg = intercept[NoSuchTableException] { + sql(s"SHOW PARTITIONS $viewName") + }.getMessage + assert(errMsg.contains(s"Table or view '$viewName' not found")) + } + } + + test("show partitions from a datasource") { + import testImplicits._ + withTable("part_datasrc") { + val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c") + df.write + .partitionBy("a") + .format("parquet") + .mode(SaveMode.Overwrite) + .saveAsTable("part_datasrc") + + assert(sql("SHOW PARTITIONS part_datasrc").count() == 3) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala new file mode 100644 index 0000000000000..8a63cd49e89e9 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/ShowPartitionsSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.command.v2 + +import org.apache.spark.SparkConf +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.connector.InMemoryTableCatalog +import org.apache.spark.sql.execution.command +import org.apache.spark.sql.test.SharedSparkSession + +class ShowPartitionsSuite extends command.ShowPartitionsSuiteBase with SharedSparkSession { + override def version: String = "V2" + override def catalog: String = "test_catalog" + override def defaultNamespace: Seq[String] = Nil + override def defaultUsing: String = "USING _" + + override def sparkConf: SparkConf = super.sparkConf + .set(s"spark.sql.catalog.$catalog", classOf[InMemoryTableCatalog].getName) + + // TODO(SPARK-33452): Create a V2 SHOW PARTITIONS execution node + test("not supported SHOW PARTITIONS") { + def testV1Command(sqlCommand: String, sqlParams: String): Unit = { + val e = intercept[AnalysisException] { + sql(s"$sqlCommand $sqlParams") + } + assert(e.message.contains(s"$sqlCommand is only supported with v1 tables")) + } + val t = s"$catalog.ns1.ns2.tbl" + withTable(t) { + sql( + s""" + |CREATE TABLE $t (id bigint, data string) + |$defaultUsing + |PARTITIONED BY (id) + """.stripMargin) + + testV1Command("SHOW PARTITIONS", t) + testV1Command("SHOW PARTITIONS", s"$t PARTITION(id='1')") + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaTest.scala index fd70b6529ff51..22db55afc27c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaTest.scala @@ -21,7 +21,7 @@ import java.io.File import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} +import org.apache.spark.sql.test.SharedSparkSession /** * The reader schema is said to be evolved (or projected) when it changed after the data is diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/RowDataSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/RowDataSourceStrategySuite.scala index 6420081a9757b..3e8a4fe290502 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/RowDataSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/RowDataSourceStrategySuite.scala @@ -22,15 +22,10 @@ import java.util.Properties import org.scalatest.BeforeAndAfter -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.sources._ import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types._ import org.apache.spark.util.Utils class RowDataSourceStrategySuite extends SharedSparkSession with BeforeAndAfter { - import testImplicits._ val url = "jdbc:h2:mem:testdb0" val urlWithUserAndPass = "jdbc:h2:mem:testdb0;user=testUser;password=testPass" diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala index 233978289f068..e843d1d328425 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SaveIntoDataSourceCommandSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.datasources -import org.apache.spark.SparkConf import org.apache.spark.sql.SaveMode import org.apache.spark.sql.test.SharedSparkSession diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala index 8462916daaab8..86ff026d7b1e9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf.SOURCES_BINARY_FILE_MAX_LENGTH import org.apache.spark.sql.sources._ -import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} +import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.util.Utils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index d27b5c4737a11..7cc3a1cf9f3b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -18,12 +18,11 @@ package org.apache.spark.sql.execution.datasources.json import org.apache.spark.sql.{QueryTest, Row} -import org.apache.spark.sql.catalyst.json.JSONOptions import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{DoubleType, StringType, StructType} +import org.apache.spark.sql.types.{StringType, StructType} /** - * Test cases for various [[JSONOptions]]. + * Test cases for various [[org.apache.spark.sql.catalyst.json.JSONOptions]]. 
*/ class JsonParsingOptionsSuite extends QueryTest with SharedSparkSession { import testImplicits._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala index 6c9bd32913178..378b52f9c6c8c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.orc import org.apache.spark.SparkConf -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.SchemaPruningSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala index 4b2437803d645..7f408dbba5099 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCommitterSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.io.FileNotFoundException - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} @@ -149,7 +147,7 @@ private object MarkingFileOutput { * @param outputPath destination directory * @param conf configuration to create the FS with * @return the status of the marker - * @throws FileNotFoundException if the marker is absent + * @throws java.io.FileNotFoundException if the marker is absent */ def checkMarker(outputPath: Path, conf: Configuration): FileStatus = { outputPath.getFileSystem(conf).getFileStatus(new Path(outputPath, "marker")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 34bdef7bdb402..d13b3e58a30ff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.datasources.parquet import java.nio.file.{Files, Paths, StandardCopyOption} import java.sql.{Date, Timestamp} -import java.time._ import java.util.Locale import scala.collection.JavaConverters._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 8c4eedfde76cd..8c5f7bed7c50d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -21,7 +21,7 @@ import java.io.File import java.time.ZoneOffset import org.apache.commons.io.FileUtils -import org.apache.hadoop.fs.{FileSystem, Path, 
PathFilter} +import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index accd04592bec5..5c41614c45b6f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -23,8 +23,6 @@ import java.sql.{Date, Timestamp} import java.time.{ZoneId, ZoneOffset} import java.util.{Calendar, Locale} -import scala.collection.mutable.ArrayBuffer - import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetOutputFormat diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index 7990b1c27437a..e97c6cd29709c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -23,7 +23,7 @@ import scala.reflect.runtime.universe.TypeTag import org.apache.parquet.io.ParquetDecodingException import org.apache.parquet.schema.{MessageType, MessageTypeParser} -import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala index c3bcf86c1ed27..1a4f08418f8d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala @@ -29,7 +29,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, NamespaceChange, SupportsNamespaces, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, NamespaceChange, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField, StructType, TimestampType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -46,7 +46,7 @@ abstract class V2SessionCatalogBaseSuite extends SharedSparkSession with BeforeA val testIdent: Identifier = Identifier.of(testNs, "test_table") def newCatalog(): V2SessionCatalog = { - val newCatalog = new 
V2SessionCatalog(spark.sessionState.catalog, spark.sessionState.conf) + val newCatalog = new V2SessionCatalog(spark.sessionState.catalog) newCatalog.initialize("test", CaseInsensitiveStringMap.empty()) newCatalog } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala index c53617b40e09d..622d69e188821 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/FileStreamSinkLogSuite.scala @@ -22,7 +22,6 @@ import java.lang.{Long => JLong} import java.net.URI import java.nio.charset.StandardCharsets.UTF_8 import java.util.concurrent.ConcurrentHashMap -import java.util.concurrent.atomic.AtomicLong import scala.util.Random diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala index 3ead91fcf712a..014840d758c0c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkSuite.scala @@ -24,7 +24,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.streaming.sources._ -import org.apache.spark.sql.streaming.{OutputMode, StreamTest} +import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.apache.spark.util.Utils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelperSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelperSuite.scala index dec30fd01f7e2..ea6fd8ab312c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelperSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithStateExecHelperSuite.scala @@ -23,7 +23,6 @@ import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.execution.streaming.GroupStateImpl._ -import org.apache.spark.sql.streaming.FlatMapGroupsWithStateSuite._ import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala index 298820349b683..6eb070138c3b8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalog.{Column, Database, Function, Table} import org.apache.spark.sql.catalyst.{FunctionIdentifier, ScalaReflection, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo} +import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.Range import 
org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 77a5d12cd8c95..580e7df6ef63e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.internal import java.util.TimeZone -import scala.language.reflectiveCalls - import org.apache.hadoop.fs.Path import org.apache.log4j.Level diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 4832386e553db..167e87dd3d5cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning -import org.apache.spark.sql.execution.{DataSourceScanExec, FileSourceScanExec, SortExec, SparkPlan} +import org.apache.spark.sql.execution.{FileSourceScanExec, SortExec, SparkPlan} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import org.apache.spark.sql.execution.datasources.BucketingUtils import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala index 1fdd3be88f782..179cdeb976391 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanHelper, DisableAdaptiveExecutionSuite, EnableAdaptiveExecutionSuite} -import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 4686a0c69de63..aaf8765c04425 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -756,6 +756,47 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } } + test("SPARK-33354: Throw exceptions on inserting invalid cast with ANSI casting policy") { + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> SQLConf.StoreAssignmentPolicy.ANSI.toString) { + withTable("t") { + sql("CREATE TABLE t(i int, t timestamp) USING parquet") + val msg = intercept[AnalysisException] { + sql("INSERT INTO t VALUES (TIMESTAMP('2010-09-02 14:10:10'), 1)") + }.getMessage + 
assert(msg.contains("Cannot safely cast 'i': timestamp to int")) + assert(msg.contains("Cannot safely cast 't': int to timestamp")) + } + + withTable("t") { + sql("CREATE TABLE t(i int, d date) USING parquet") + val msg = intercept[AnalysisException] { + sql("INSERT INTO t VALUES (date('2010-09-02'), 1)") + }.getMessage + assert(msg.contains("Cannot safely cast 'i': date to int")) + assert(msg.contains("Cannot safely cast 'd': int to date")) + } + + withTable("t") { + sql("CREATE TABLE t(b boolean, t timestamp) USING parquet") + val msg = intercept[AnalysisException] { + sql("INSERT INTO t VALUES (TIMESTAMP('2010-09-02 14:10:10'), true)") + }.getMessage + assert(msg.contains("Cannot safely cast 'b': timestamp to boolean")) + assert(msg.contains("Cannot safely cast 't': boolean to timestamp")) + } + + withTable("t") { + sql("CREATE TABLE t(b boolean, d date) USING parquet") + val msg = intercept[AnalysisException] { + sql("INSERT INTO t VALUES (date('2010-09-02'), true)") + }.getMessage + assert(msg.contains("Cannot safely cast 'b': date to boolean")) + assert(msg.contains("Cannot safely cast 'd': boolean to date")) + } + } + } + test("SPARK-30844: static partition should also follow StoreAssignmentPolicy") { SQLConf.StoreAssignmentPolicy.values.foreach { policy => withSQLConf( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala index 9b26a5659df49..48d717daf00d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.sources import java.net.URI -import org.apache.hadoop.fs.Path - import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession, SQLContext} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogUtils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index f97c9386f9488..788be539fe073 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -21,7 +21,6 @@ import java.io.File import java.sql.Date import org.apache.commons.io.FileUtils -import org.scalatest.BeforeAndAfterAll import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkException @@ -34,7 +33,7 @@ import org.apache.spark.sql.catalyst.plans.physical.UnknownPartitioning import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution.RDDScanExec import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.state.{FlatMapGroupsWithStateExecHelper, MemoryStateStore, StateStore, StateStoreId, StateStoreMetrics, UnsafeRowPair} +import org.apache.spark.sql.execution.streaming.state.{FlatMapGroupsWithStateExecHelper, MemoryStateStore, StateStore} import org.apache.spark.sql.functions.timestamp_seconds import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.StreamManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 8797e5ad64149..e64d5f6f3587e 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -1134,7 +1134,7 @@ class StreamSuite extends StreamTest { verifyLocalLimit(inputDF.toDF("value").join(staticDF, "value"), expectStreamingLimit = false) verifyLocalLimit( - inputDF.groupBy().count().limit(1), + inputDF.groupBy("value").count().limit(1), expectStreamingLimit = false, outputMode = OutputMode.Complete()) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 7a2e29f1258ae..624b630401f47 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.streaming import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -import scala.language.experimental.macros import scala.reflect.ClassTag import scala.util.Random import scala.util.control.NonFatal diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 4a57cc27b1d59..0524e29662014 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -20,8 +20,6 @@ package org.apache.spark.sql.streaming import java.io.File import java.util.{Locale, TimeZone} -import scala.collection.mutable - import org.apache.commons.io.FileUtils import org.scalatest.Assertions diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala index e1505acf3ecda..ac9cd1a12d06f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala @@ -17,13 +17,9 @@ package org.apache.spark.sql.streaming -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning, SinglePartition} +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ -import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingDeduplicateExec} -import org.apache.spark.sql.execution.streaming.state.StateStore +import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index b2bb00b704a69..a25616af360b1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -26,19 +26,11 @@ import scala.util.Random import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfter -import org.apache.spark.SparkContext import org.apache.spark.scheduler.ExecutorCacheTaskLocation -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SparkSession} -import 
org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter} -import org.apache.spark.sql.catalyst.trees.TreeNode -import org.apache.spark.sql.execution.{FileSourceScanExec, LogicalRDD} -import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.execution.streaming.{MemoryStream, StatefulOperatorStateInfo, StreamingSymmetricHashJoinExec, StreamingSymmetricHashJoinHelper} import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreProviderId} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types._ import org.apache.spark.util.Utils abstract class StreamingJoinSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index 0d17f2e0bc7fb..02f91399fce1c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -22,7 +22,6 @@ import java.sql.Timestamp import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} import org.apache.spark.sql._ -import org.apache.spark.sql.execution.datasources.v2.ContinuousScanExec import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryStream diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index 8d39704c61d4e..bdc714d49fcc9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -89,8 +89,6 @@ class DefaultSource extends StreamSourceProvider with StreamSinkProvider { override def getOffset: Option[Offset] = Some(new LongOffset(0)) override def getBatch(start: Option[Offset], end: Offset): DataFrame = { - import spark.implicits._ - spark.internalCreateDataFrame(spark.sparkContext.emptyRDD, schema, isStreaming = true) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala index 1b6724054a3ad..d15e5c42732d1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/GenericFunSpecSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.test import org.scalatest.funspec.AnyFunSpec -import org.apache.spark.sql.Dataset - /** * The purpose of this suite is to make sure that generic FunSpec-based scala * tests work with a shared spark session diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala index e58357a415545..45cfa86ba9343 100644 --- 
a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala @@ -17,10 +17,8 @@ package org.apache.spark.sql.hive.thriftserver -import java.util.UUID import java.util.regex.Pattern -import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetSchemasOperation @@ -29,7 +27,6 @@ import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext -import org.apache.spark.util.{Utils => SparkUtils} /** * Spark's own GetSchemasOperation diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala index bccad865be27a..bddf5eb82012f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala @@ -30,7 +30,6 @@ import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.catalog.CatalogTableType._ -import org.apache.spark.sql.hive.HiveUtils /** * Spark's own GetTablesOperation diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala index c39d2ecdd7923..df0fa514ccff3 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala @@ -24,7 +24,6 @@ import javax.security.auth.login.LoginException import scala.collection.JavaConverters._ import scala.util.control.NonFatal -import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.shims.Utils @@ -37,7 +36,6 @@ import org.apache.hive.service.server.HiveServer2 import org.slf4j.Logger import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLContext) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index e4559e69e7585..856edede0b85f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -17,11 +17,7 @@ package org.apache.spark.sql.hive.thriftserver -import java.util.concurrent.Executors - -import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.SessionHandle import 
org.apache.hive.service.cli.session.SessionManager import org.apache.hive.service.rpc.thrift.TProtocolVersion diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala index 8efbdb30c605c..54a40e3990f09 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.hive.thriftserver.ui import java.net.URLEncoder import java.nio.charset.StandardCharsets.UTF_8 -import java.util.Calendar import javax.servlet.http.HttpServletRequest import scala.xml.Node diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index f5ce21f2af335..d39b94503fe40 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -27,7 +27,7 @@ import scala.concurrent.Promise import scala.concurrent.duration._ import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} +import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index be42497113469..4a87be5f61195 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -23,7 +23,6 @@ import java.util.{Locale, MissingFormatArgumentException} import scala.util.control.NonFatal -import org.apache.commons.io.FileUtils import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.spark.SparkException diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index a685549290f0e..d9b6bb43c2b47 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -22,7 +22,6 @@ import java.io.File import org.scalatest.BeforeAndAfter import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index f01a03996821a..907bb86ad0c1c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -41,7 +41,6 @@ import 
org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions} import org.apache.spark.sql.hive.client.HiveClient diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 8a248a251820f..f60bad180a710 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.{Cast, Expression} import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DecimalType, DoubleType} import org.apache.spark.util.Utils @@ -44,7 +43,6 @@ private[sql] class HiveSessionCatalog( globalTempViewManagerBuilder: () => GlobalTempViewManager, val metastoreCatalog: HiveMetastoreCatalog, functionRegistry: FunctionRegistry, - conf: SQLConf, hadoopConf: Configuration, parser: ParserInterface, functionResourceLoader: FunctionResourceLoader) @@ -52,7 +50,6 @@ private[sql] class HiveSessionCatalog( externalCatalogBuilder, globalTempViewManagerBuilder, functionRegistry, - conf, hadoopConf, parser, functionResourceLoader) { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index f79aaa464de81..b30492802495f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -62,7 +62,6 @@ class HiveSessionStateBuilder( () => session.sharedState.globalTempViewManager, new HiveMetastoreCatalog(session), functionRegistry, - conf, SessionState.newHadoopConf(session.sparkContext.hadoopConfiguration, conf), sqlParser, resourceLoader) @@ -73,7 +72,7 @@ class HiveSessionStateBuilder( /** * A logical query plan `Analyzer` with rules specific to Hive. */ - override protected def analyzer: Analyzer = new Analyzer(catalogManager, conf) { + override protected def analyzer: Analyzer = new Analyzer(catalogManager) { override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = new ResolveHiveSerdeTable(session) +: new FindDataSourceTable(session) +: @@ -98,7 +97,7 @@ class HiveSessionStateBuilder( PreWriteCheck +: PreReadCheck +: TableCapabilityCheck +: - CommandCheck(conf) +: + CommandCheck +: customCheckRules } @@ -109,7 +108,7 @@ class HiveSessionStateBuilder( * Planner that takes into account Hive-specific strategies. 
 */ override protected def planner: SparkPlanner = { - new SparkPlanner(session, conf, experimentalMethods) with HiveStrategies { + new SparkPlanner(session, experimentalMethods) with HiveStrategies { override val sparkSession: SparkSession = session override def extraPlanningStrategies: Seq[Strategy] = diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index 399f8911ef679..46a8e9660a207 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -24,7 +24,6 @@ import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap -import scala.language.implicitConversions import org.apache.commons.lang3.{JavaVersion, SystemUtils} import org.apache.hadoop.conf.Configuration @@ -96,17 +95,18 @@ private[spark] object HiveUtils extends Logging { .createWithDefault("builtin") val HIVE_METASTORE_JARS_PATH = buildStaticConf("spark.sql.hive.metastore.jars.path") - .doc(s"Comma separated URL of Hive jars, support both local and remote paths," + - s"Such as: " + - s" 1. file://path/to/jar/xxx.jar\n" + - s" 2. hdfs://nameservice/path/to/jar/xxx.jar\n" + - s" 3. /path/to/jar/ (path without URI scheme follow conf `fs.defaultFS`'s URI schema)\n" + - s" 4. [http/https/ftp]://path/to/jar/xxx.jar\n" + - s"Notice: `http/https/ftp` doesn't support wildcard, but other URLs support" + - s"nested path wildcard, Such as: " + - s" 1. file://path/to/jar/*, file://path/to/jar/*/*\n" + - s" 2. hdfs://nameservice/path/to/jar/*, hdfs://nameservice/path/to/jar/*/*\n" + - s"When ${HIVE_METASTORE_JARS.key} is set to `path`, we will use Hive jars configured by this") + .doc(s""" + | Comma-separated paths of the jars that are used to instantiate the HiveMetastoreClient. + | This configuration is useful only when `${HIVE_METASTORE_JARS.key}` is set as `path`. + | The paths can be any of the following formats: + | 1. file://path/to/jar/foo.jar + | 2. hdfs://nameservice/path/to/jar/foo.jar + | 3. /path/to/jar/ (path without URI scheme follow conf `fs.defaultFS`'s URI schema) + | 4. [http/https/ftp]://path/to/jar/foo.jar + | Note that 1, 2, and 3 support wildcard. For example: + | 1. file://path/to/jar/*,file://path2/to/jar/*/*.jar + | 2. 
hdfs://nameservice/path/to/jar/*,hdfs://nameservice2/path/to/jar/*/*.jar + """.stripMargin) .version("3.1.0") .stringConf .toSequence diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index 3e0d44160c8a1..eb9ce877fc8d2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -39,7 +39,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, NewHadoopRDD, RDD, UnionRDD} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.analysis.CastSupport import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -68,7 +68,7 @@ class HadoopTableReader( @transient private val tableDesc: TableDesc, @transient private val sparkSession: SparkSession, hadoopConf: Configuration) - extends TableReader with CastSupport with Logging { + extends TableReader with CastSupport with SQLConfHelper with Logging { // Hadoop honors "mapreduce.job.maps" as hint, // but will ignore when mapreduce.jobtracker.address is "local". diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index a78e1cebc588c..9bc99b08c2cc8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -57,7 +57,6 @@ import org.apache.spark.sql.connector.catalog.SupportsNamespaces._ import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveExternalCatalog.{DATASOURCE_SCHEMA, DATASOURCE_SCHEMA_NUMPARTS, DATASOURCE_SCHEMA_PART_PREFIX} -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.{CircularBuffer, Utils} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index bf67ae6bfe92e..d989f0154ea95 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -41,11 +41,11 @@ import org.apache.hadoop.hive.serde.serdeConstants import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchPermanentFunctionException import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, CatalogTablePartition, CatalogUtils, FunctionResource, FunctionResourceType} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{AtomicType, IntegralType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -724,12 +724,13 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } val useAdvanced = SQLConf.get.advancedPartitionPredicatePushdownEnabled + val inSetThreshold = 
SQLConf.get.metastorePartitionPruningInSetThreshold object ExtractAttribute { def unapply(expr: Expression): Option[Attribute] = { expr match { case attr: Attribute => Some(attr) - case Cast(child @ AtomicType(), dt: AtomicType, _) + case Cast(child @ IntegralType(), dt: IntegralType, _) if Cast.canUpCast(child.dataType.asInstanceOf[AtomicType], dt) => unapply(child) case _ => None } @@ -741,6 +742,12 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { if useAdvanced => Some(convertInToOr(name, values)) + case InSet(child, values) if useAdvanced && values.size > inSetThreshold => + val dataType = child.dataType + val sortedValues = values.toSeq.sorted(TypeUtils.getInterpretedOrdering(dataType)) + convert(And(GreaterThanOrEqual(child, Literal(sortedValues.head, dataType)), + LessThanOrEqual(child, Literal(sortedValues.last, dataType)))) + case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)) if useAdvanced => Some(convertInToOr(name, values)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala index 4be3cd45454c6..c712a4a2b7c23 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive.execution -import java.io.{File, IOException} +import java.io.IOException import java.net.URI import java.text.SimpleDateFormat import java.util.{Date, Locale, Random} diff --git a/sql/hive/src/test/resources/data/scripts/test_transform.py b/sql/hive/src/test/resources/data/scripts/test_transform.py index ac6d11d8b919c..dedb370f6c90e 100755 --- a/sql/hive/src/test/resources/data/scripts/test_transform.py +++ b/sql/hive/src/test/resources/data/scripts/test_transform.py @@ -1,3 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# import sys delim = sys.argv[1] diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/InsertIntoHiveTableBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/InsertIntoHiveTableBenchmark.scala index da34c54cb36a2..e71b11e7a3f41 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/InsertIntoHiveTableBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/InsertIntoHiveTableBenchmark.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 270595b0011e9..e413e0ee73cb9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.hive -import java.net.URI - import org.apache.hadoop.conf.Configuration import org.apache.spark.SparkConf diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala index 86fc32cd8ca63..b3ea54a7bc931 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala @@ -25,7 +25,6 @@ import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils /** * A suite of tests for the Parquet support through the data sources API. 
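The HiveShim hunk above (SPARK-33416) stops expanding a very large InSet predicate into a long chain of OR-ed equalities: once the set size exceeds the HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD config, the values are sorted and the predicate is collapsed into a single closed range, so the filter string pushed to the Hive metastore stays small. The following is a minimal standalone Scala sketch of that idea only; the object and method names are hypothetical, and the string output merely mimics the shape of the metastore filter rather than using Spark's internal Shim_v0_13 conversion.

// Sketch (hypothetical names): collapse a large IN-set into one closed range.
object InSetPruningSketch {

  // Below `threshold`, emit an OR of equality checks; above it, emit a single
  // "col >= min and col <= max" range over the sorted values.
  def collapseInSet(col: String, values: Set[Int], threshold: Int): String = {
    if (values.size > threshold) {
      val sorted = values.toSeq.sorted
      s"($col >= ${sorted.head} and $col <= ${sorted.last})"
    } else {
      values.toSeq.sorted.map(v => s"$col = $v").mkString("(", " or ", ")")
    }
  }

  def main(args: Array[String]): Unit = {
    // 19 values with a threshold of 15 collapse into a range, matching the shape
    // asserted by the FiltersSuite test further down in this diff.
    println(collapseInSet("intcol", (1 until 20).toSet, threshold = 15))
    // (intcol >= 1 and intcol <= 19)

    // A small set stays as an OR of equalities.
    println(collapseInSet("intcol", Set(1, 2, 3), threshold = 15))
    // (intcol = 1 or intcol = 2 or intcol = 3)
  }
}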
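The InsertSuite test added earlier in this diff (SPARK-33354) checks that, under the ANSI store assignment policy, an INSERT whose values cannot be safely cast to the target column types fails analysis instead of being silently coerced. Below is a small, self-contained sketch of that behaviour; the local SparkSession setup and the table name are illustrative assumptions, not part of the patch.

// Sketch: with spark.sql.storeAssignmentPolicy=ANSI, an unsafe insert cast is rejected
// at analysis time instead of being applied implicitly.
import org.apache.spark.sql.{AnalysisException, SparkSession}

object AnsiInsertSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("ansi-insert-sketch")
      .config("spark.sql.storeAssignmentPolicy", "ANSI")
      .getOrCreate()

    spark.sql("CREATE TABLE t(i int, t timestamp) USING parquet")
    try {
      // The values are swapped relative to the declared schema, so ANSI assignment refuses
      // the timestamp-to-int and int-to-timestamp casts, as the new test expects.
      spark.sql("INSERT INTO t VALUES (TIMESTAMP('2010-09-02 14:10:10'), 1)")
    } catch {
      case e: AnalysisException => println(e.getMessage)
    } finally {
      spark.sql("DROP TABLE IF EXISTS t")
      spark.stop()
    }
  }
}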
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShimSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShimSuite.scala index 54c64a4eeb190..89131a79e59de 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShimSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShimSuite.scala @@ -16,9 +16,6 @@ */ package org.apache.spark.sql.hive -import scala.collection.JavaConverters._ -import scala.language.implicitConversions - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.serde2.ColumnProjectionUtils diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala index 446923ad23201..3e7c3e6799724 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.{AnalysisException, ShowCreateTableSuite} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} +import org.apache.spark.sql.internal.HiveSerDe class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSingleton { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index 501a877e8b7fb..77d54ed45a5de 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -770,8 +770,6 @@ object SPARK_14244 extends QueryTest { val hiveContext = new TestHiveContext(sparkContext) spark = hiveContext.sparkSession - import hiveContext.implicits._ - try { val window = Window.orderBy("id") val df = spark.range(2).select(cume_dist().over(window).as("cdist")).orderBy("cdist") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala index ca1af73b038a7..d0af8dc7ae49f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.hive import scala.collection.JavaConverters._ -import scala.util.Random import org.apache.hadoop.hive.ql.udf.generic.GenericUDF import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, StandardListObjectInspector} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala index 4ad97eaa2b1c8..d8e1e01292820 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala @@ -23,9 +23,8 @@ import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.execution.HiveResult import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} +import 
org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.util.ChildFirstURLClassLoader class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 1e396553c9c52..483622b16762a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -17,12 +17,8 @@ package org.apache.spark.sql.hive -import java.io.File import java.sql.Timestamp -import com.google.common.io.Files -import org.apache.hadoop.fs.FileSystem - import org.apache.spark.internal.config._ import org.apache.spark.sql._ import org.apache.spark.sql.hive.test.TestHiveSingleton diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 7d5a200606356..43d1ba04c561d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatistics, HiveTableRelation} -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, HistogramBin, HistogramSerializer} +import org.apache.spark.sql.catalyst.plans.logical.HistogramBin import org.apache.spark.sql.catalyst.util.{DateTimeUtils, StringUtils} import org.apache.spark.sql.execution.command.{AnalyzeColumnCommand, CommandUtils, DDLUtils} import org.apache.spark.sql.execution.datasources.LogicalRelation diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 2a4efd0cce6e0..12b409e487061 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -114,5 +114,33 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { } } + test("SPARK-33416: Avoid Hive metastore stack overflow when InSet predicate have many values") { + def checkConverted(inSet: InSet, result: String): Unit = { + assert(shim.convertFilters(testTable, inSet :: Nil) == result) + } + + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "15") { + checkConverted( + InSet(a("intcol", IntegerType), + Range(1, 20).map(s => Literal(s).eval(EmptyRow)).toSet), + "(intcol >= 1 and intcol <= 19)") + + checkConverted( + InSet(a("stringcol", StringType), + Range(1, 20).map(s => Literal(s.toString).eval(EmptyRow)).toSet), + "(stringcol >= \"1\" and stringcol <= \"9\")") + + checkConverted( + InSet(a("intcol", IntegerType).cast(LongType), + Range(1, 20).map(s => Literal(s.toLong).eval(EmptyRow)).toSet), + "(intcol >= 1 and intcol <= 19)") + + checkConverted( + InSet(a("doublecol", DoubleType), + Range(1, 20).map(s => Literal(s.toDouble).eval(EmptyRow)).toSet), + "") + } + } + private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala index 77956f4fe69da..b94d517e89e30 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala @@ -21,7 +21,6 @@ import java.security.PrivilegedExceptionAction import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.UserGroupInformation -import org.scalatest.{BeforeAndAfterAll, PrivateMethodTester} import org.apache.spark.util.Utils diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index daa785bf110c5..81186909bb167 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StructType} +import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StringType, StructType} import org.apache.spark.util.Utils class HivePartitionFilteringSuite(version: String) @@ -290,6 +290,13 @@ class HivePartitionFilteringSuite(version: String) (20170101 to 20170103, 0 to 4, Seq("ab", "bb")) :: Nil) } + test("getPartitionsByFilter: chunk in ('ab', 'ba') and ((cast(ds as string)>'20170102')") { + val day = (20170101 to 20170103, 0 to 4, Seq("ab", "ba")) + testMetastorePartitionFiltering( + attr("chunk").in("ab", "ba") && (attr("ds").cast(StringType) > "20170102"), + day :: Nil) + } + private def testMetastorePartitionFiltering( filterExpr: Expression, expectedDs: Seq[Int], diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index a78fd506b752e..d3398842afb21 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -22,9 +22,8 @@ import java.io.File import com.google.common.io.Files import org.apache.hadoop.fs.{FileContext, FsConstants, Path} -import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} +import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.command.LoadDataCommand import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -33,7 +32,6 @@ import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { - import testImplicits._ protected override def beforeAll(): Unit = { super.beforeAll() @@ -58,27 +56,11 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |STORED AS PARQUET |TBLPROPERTIES('prop1Key'="prop1Val", '`prop2Key`'="prop2Val") """.stripMargin) - sql("CREATE TABLE parquet_tab3(col1 int, `col 2` 
int) USING hive") sql("CREATE TABLE parquet_tab4 (price int, qty int) partitioned by (year int, month int)") sql("INSERT INTO parquet_tab4 PARTITION(year = 2015, month = 1) SELECT 1, 1") sql("INSERT INTO parquet_tab4 PARTITION(year = 2015, month = 2) SELECT 2, 2") sql("INSERT INTO parquet_tab4 PARTITION(year = 2016, month = 2) SELECT 3, 3") sql("INSERT INTO parquet_tab4 PARTITION(year = 2016, month = 3) SELECT 3, 3") - sql( - """ - |CREATE TABLE parquet_tab5 (price int, qty int) - |PARTITIONED BY (year int, month int, hour int, minute int, sec int, extra int) - """.stripMargin) - sql( - """ - |INSERT INTO parquet_tab5 - |PARTITION(year = 2016, month = 3, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 - """.stripMargin) - sql( - """ - |INSERT INTO parquet_tab5 - |PARTITION(year = 2016, month = 4, hour = 10, minute = 10, sec = 10, extra = 1) SELECT 3, 3 - """.stripMargin) sql("CREATE VIEW parquet_view1 as select * from parquet_tab4") } @@ -86,10 +68,8 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto try { sql("DROP TABLE IF EXISTS parquet_tab1") sql("DROP TABLE IF EXISTS parquet_tab2") - sql("DROP TABLE IF EXISTS parquet_tab3") sql("DROP VIEW IF EXISTS parquet_view1") sql("DROP TABLE IF EXISTS parquet_tab4") - sql("DROP TABLE IF EXISTS parquet_tab5") } finally { super.afterAll() } @@ -393,88 +373,6 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } } - - test("show partitions - show everything") { - checkAnswer( - sql("show partitions parquet_tab4"), - Row("year=2015/month=1") :: - Row("year=2015/month=2") :: - Row("year=2016/month=2") :: - Row("year=2016/month=3") :: Nil) - - checkAnswer( - sql("show partitions default.parquet_tab4"), - Row("year=2015/month=1") :: - Row("year=2015/month=2") :: - Row("year=2016/month=2") :: - Row("year=2016/month=3") :: Nil) - } - - test("show partitions - show everything more than 5 part keys") { - checkAnswer( - sql("show partitions parquet_tab5"), - Row("year=2016/month=3/hour=10/minute=10/sec=10/extra=1") :: - Row("year=2016/month=4/hour=10/minute=10/sec=10/extra=1") :: Nil) - } - - test("show partitions - filter") { - checkAnswer( - sql("show partitions default.parquet_tab4 PARTITION(year=2015)"), - Row("year=2015/month=1") :: - Row("year=2015/month=2") :: Nil) - - checkAnswer( - sql("show partitions default.parquet_tab4 PARTITION(year=2015, month=1)"), - Row("year=2015/month=1") :: Nil) - - checkAnswer( - sql("show partitions default.parquet_tab4 PARTITION(month=2)"), - Row("year=2015/month=2") :: - Row("year=2016/month=2") :: Nil) - } - - test("show partitions - empty row") { - withTempView("parquet_temp") { - sql( - """ - |CREATE TEMPORARY VIEW parquet_temp (c1 INT, c2 STRING) - |USING org.apache.spark.sql.parquet.DefaultSource - """.stripMargin) - // An empty sequence of row is returned for session temporary table. 
- intercept[NoSuchTableException] { - sql("SHOW PARTITIONS parquet_temp") - } - - val message1 = intercept[AnalysisException] { - sql("SHOW PARTITIONS parquet_tab3") - }.getMessage - assert(message1.contains("not allowed on a table that is not partitioned")) - - val message2 = intercept[AnalysisException] { - sql("SHOW PARTITIONS parquet_tab4 PARTITION(abcd=2015, xyz=1)") - }.getMessage - assert(message2.contains("Non-partitioning column(s) [abcd, xyz] are specified")) - - val message3 = intercept[AnalysisException] { - sql("SHOW PARTITIONS parquet_view1") - }.getMessage - assert(message3.contains("is not allowed on a view")) - } - } - - test("show partitions - datasource") { - withTable("part_datasrc") { - val df = (1 to 3).map(i => (i, s"val_$i", i * 2)).toDF("a", "b", "c") - df.write - .partitionBy("a") - .format("parquet") - .mode(SaveMode.Overwrite) - .saveAsTable("part_datasrc") - - assert(sql("SHOW PARTITIONS part_datasrc").count() == 3) - } - } - test("SPARK-25918: LOAD DATA LOCAL INPATH should handle a relative path") { val localFS = FileContext.getLocalFSFileContext() val workingDir = localFS.getWorkingDirectory diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index cea7c5686054a..1cabf6033e8d8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.catalyst.expressions.Cast import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec -import org.apache.spark.sql.hive._ import org.apache.spark.sql.hive.test.{HiveTestJars, TestHive} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.internal.SQLConf diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala index da7dfd05f33d6..8aae7a1545b1a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSQLViewSuite.scala @@ -17,11 +17,11 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.SQLViewSuite -import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} +import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.types.{NullType, StructType} /** diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala index a8b10fc94d880..1018ae5b68895 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveScriptTransformationSuite.scala @@ -17,10 +17,8 @@ package org.apache.spark.sql.hive.execution -import java.io.File import java.sql.Timestamp -import 
org.apache.commons.io.FileUtils import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.scalatest.exceptions.TestFailedException @@ -28,7 +26,6 @@ import org.apache.spark.{SparkException, TestUtils} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} import org.apache.spark.sql.execution._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index ba6dbb01d5901..4a50621d89d4e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -21,7 +21,6 @@ import java.io.{File, IOException} import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index dd797b39e0939..9e8046b9ef544 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.command.FunctionsCommand import org.apache.spark.sql.functions.max -import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala index 993a730524f6f..8e35cd034311d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PrunePartitionSuiteBase.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.QueryTest -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BinaryOperator, EqualTo, Expression, IsNotNull, Literal} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BinaryOperator, Expression, IsNotNull, Literal} import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index a69a949e3a3a2..712f81d98753e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -45,7 +45,6 @@ import 
org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ import org.apache.spark.tags.SlowHiveTest -import org.apache.spark.util.Utils case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala index 1f1a5568b0201..50f13efccc915 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala @@ -17,23 +17,15 @@ package org.apache.spark.sql.hive.execution -import java.lang.{Double => jlDouble, Integer => jlInt, Long => jlLong} - -import scala.collection.JavaConverters._ -import scala.util.Random - -import test.org.apache.spark.sql.MyDoubleAvg -import test.org.apache.spark.sql.MyDoubleSum +import java.lang.{Double => jlDouble, Long => jlLong} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.sql.expressions.{Aggregator} +import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowPartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowPartitionsSuite.scala new file mode 100644 index 0000000000000..a92478faf0e16 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/command/ShowPartitionsSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution.command + +import org.apache.spark.sql.execution.command.v1 +import org.apache.spark.sql.hive.test.TestHiveSingleton + +class ShowPartitionsSuite extends v1.ShowPartitionsSuiteBase with TestHiveSingleton { + override def version: String = "Hive V1" + override def defaultUsing: String = "USING HIVE" +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala index 5669cb757a678..f7c13ea047da7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -23,7 +23,6 @@ import java.util.{Set => JavaSet} import scala.collection.JavaConverters._ import scala.collection.mutable -import scala.language.implicitConversions import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path diff --git a/streaming/src/main/scala/org/apache/spark/status/api/v1/streaming/ApiStreamingRootResource.scala b/streaming/src/main/scala/org/apache/spark/status/api/v1/streaming/ApiStreamingRootResource.scala index a2571b910f615..99d59e4a1447a 100644 --- a/streaming/src/main/scala/org/apache/spark/status/api/v1/streaming/ApiStreamingRootResource.scala +++ b/streaming/src/main/scala/org/apache/spark/status/api/v1/streaming/ApiStreamingRootResource.scala @@ -23,9 +23,7 @@ import javax.ws.rs.core.MediaType import org.apache.spark.status.api.v1.NotFoundException import org.apache.spark.streaming.Time -import org.apache.spark.streaming.ui.StreamingJobProgressListener import org.apache.spark.streaming.ui.StreamingJobProgressListener._ -import org.apache.spark.ui.SparkUI @Produces(Array(MediaType.APPLICATION_JSON)) private[v1] class ApiStreamingRootResource extends BaseStreamingAppResource { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/State.scala b/streaming/src/main/scala/org/apache/spark/streaming/State.scala index 734c6ef42696e..c4cd1a9dc336b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/State.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/State.scala @@ -17,8 +17,6 @@ package org.apache.spark.streaming -import scala.language.implicitConversions - import org.apache.spark.annotation.Experimental /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index d038021e93e73..4ac1c62822e7a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -21,7 +21,6 @@ import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.collection.mutable -import scala.language.implicitConversions import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala index 6b332206e8f6d..9d4b67bccecaf 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala @@ -22,7 +22,6 @@ import scala.util.Random import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.rdd.BlockRDD import 
org.apache.spark.storage.{StorageLevel, StreamBlockId} -import org.apache.spark.streaming.StreamingConf.RECEIVER_WAL_ENABLE_CONF_KEY import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.rdd.WriteAheadLogBackedBlockRDD import org.apache.spark.streaming.receiver.{BlockManagerBasedStoreResult, Receiver, WriteAheadLogBasedStoreResult} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala index 55c2950261a07..7ce4343acbdac 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -17,11 +17,10 @@ package org.apache.spark.streaming -import java.io.{File, IOException, ObjectInputStream} +import java.io.{IOException, ObjectInputStream} import java.util.concurrent.{ConcurrentLinkedQueue, TimeUnit} import scala.collection.JavaConverters._ -import scala.language.implicitConversions import scala.reflect.ClassTag import org.scalatest.BeforeAndAfterEach diff --git a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala index cd867aa8132bc..31456b0b95b18 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/receiver/BlockGeneratorSuite.scala @@ -25,7 +25,6 @@ import scala.collection.mutable import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} import org.scalatest.concurrent.Eventually._ -import org.scalatest.matchers.must.Matchers import org.scalatest.matchers.should.Matchers._ import org.scalatest.time.SpanSugar._ diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala index 293498ae5c37b..c2b039244d01f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.scheduler import org.mockito.ArgumentMatchers.{any, eq => meq} import org.mockito.Mockito.{never, reset, times, verify, when} -import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester} +import org.scalatest.PrivateMethodTester import org.scalatest.concurrent.Eventually.{eventually, timeout} import org.scalatest.time.SpanSugar._ import org.scalatestplus.mockito.MockitoSugar
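Note (not part of the patch): a minimal, self-contained sketch of the SHOW PARTITIONS behaviour that the tests removed from HiveCommandSuite above cover, and that the new ShowPartitionsSuite presumably continues to exercise through v1.ShowPartitionsSuiteBase. The table and partition values mirror parquet_tab4 from the suite's beforeAll; the local Hive-enabled SparkSession and the object name ShowPartitionsSketch are illustrative assumptions, not code from this change.

// Sketch only: assumes spark-hive is on the classpath so enableHiveSupport() works,
// and mirrors the parquet_tab4 setup used by the suite above.
import org.apache.spark.sql.SparkSession

object ShowPartitionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("show-partitions-sketch")
      .enableHiveSupport()
      .getOrCreate()

    // Same shape as the suite's table: two data columns, partitioned by (year, month).
    spark.sql("CREATE TABLE parquet_tab4 (price int, qty int) PARTITIONED BY (year int, month int)")
    spark.sql("INSERT INTO parquet_tab4 PARTITION(year = 2015, month = 1) SELECT 1, 1")
    spark.sql("INSERT INTO parquet_tab4 PARTITION(year = 2016, month = 2) SELECT 2, 2")

    // Lists every partition as 'year=.../month=...' rows.
    spark.sql("SHOW PARTITIONS parquet_tab4").show()

    // A partial partition spec narrows the listing, which is what the filter tests assert.
    spark.sql("SHOW PARTITIONS parquet_tab4 PARTITION(year = 2015)").show()

    spark.stop()
  }
}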