Commit e4a4776

Merge remote-tracking branch 'origin/master' into interval-literals-as-ansi
2 parents: a408ab9 + 71133e1

File tree: 65 files changed, +7773 −494 lines (only a subset of the changed files is shown below).

.github/workflows/build_and_test.yml

Lines changed: 6 additions & 3 deletions
@@ -90,7 +90,8 @@ jobs:
       run: |
         apache_spark_ref=`git rev-parse HEAD`
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
-        git merge --progress --ff-only FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
         echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT and Maven
@@ -186,7 +187,8 @@ jobs:
       run: |
         apache_spark_ref=`git rev-parse HEAD`
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
-        git merge --progress --ff-only FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
         echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT and Maven
@@ -261,7 +263,8 @@ jobs:
       run: |
         apache_spark_ref=`git rev-parse HEAD`
         git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF##*/}
-        git merge --progress --ff-only FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit"
         echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref"
     # Cache local repositories. Note that GitHub Actions cache has a 2G limit.
     - name: Cache Scala, SBT and Maven

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 2 additions & 14 deletions
@@ -1583,15 +1583,7 @@ class SparkContext(config: SparkConf) extends Logging {
   private def addFile(
       path: String, recursive: Boolean, addedOnSubmit: Boolean, isArchive: Boolean = false
   ): Unit = {
-    val uri = if (!isArchive) {
-      if (Utils.isAbsoluteURI(path) && path.contains("%")) {
-        new URI(path)
-      } else {
-        new Path(path).toUri
-      }
-    } else {
-      Utils.resolveURI(path)
-    }
+    val uri = Utils.resolveURI(path)
     val schemeCorrectedURI = uri.getScheme match {
       case null => new File(path).getCanonicalFile.toURI
       case "local" =>
@@ -1979,11 +1971,7 @@ class SparkContext(config: SparkConf) extends Logging {
       // For local paths with backslashes on Windows, URI throws an exception
       (addLocalJarFile(new File(path)), "local")
     } else {
-      val uri = if (Utils.isAbsoluteURI(path) && path.contains("%")) {
-        new URI(path)
-      } else {
-        new Path(path).toUri
-      }
+      val uri = Utils.resolveURI(path)
      // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
      Utils.validateURL(uri)
      val uriScheme = uri.getScheme
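For context, a minimal sketch of how the consolidated `Utils.resolveURI` call treats the two kinds of inputs the deleted branches used to distinguish. This is based on `Utils.resolveURI`'s general contract rather than on this diff; the object name is hypothetical, and the snippet sits in the `org.apache.spark` package because `Utils` is `private[spark]`.

    package org.apache.spark

    import org.apache.spark.util.Utils

    // Sketch: a schemeless local path is resolved to an absolute file: URI,
    // while a string that already carries a scheme keeps that scheme.
    object ResolveUriSketch {
      def main(args: Array[String]): Unit = {
        println(Utils.resolveURI("jars/my-lib.jar"))            // e.g. file:/<cwd>/jars/my-lib.jar
        println(Utils.resolveURI("hdfs://nn:8020/libs/my.jar")) // scheme preserved
      }
    }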

core/src/main/scala/org/apache/spark/TestUtils.scala

Lines changed: 22 additions & 1 deletion
@@ -18,7 +18,7 @@
 package org.apache.spark

 import java.io.{ByteArrayInputStream, File, FileInputStream, FileOutputStream}
-import java.net.{HttpURLConnection, URI, URL}
+import java.net.{HttpURLConnection, InetSocketAddress, URI, URL}
 import java.nio.charset.StandardCharsets
 import java.nio.file.{Files => JavaFiles, Paths}
 import java.nio.file.attribute.PosixFilePermission.{OWNER_EXECUTE, OWNER_READ, OWNER_WRITE}
@@ -41,6 +41,11 @@ import scala.util.Try
 import com.google.common.io.{ByteStreams, Files}
 import org.apache.commons.lang3.StringUtils
 import org.apache.log4j.PropertyConfigurator
+import org.eclipse.jetty.server.Handler
+import org.eclipse.jetty.server.Server
+import org.eclipse.jetty.server.handler.DefaultHandler
+import org.eclipse.jetty.server.handler.HandlerList
+import org.eclipse.jetty.server.handler.ResourceHandler
 import org.json4s.JsonAST.JValue
 import org.json4s.jackson.JsonMethods.{compact, render}

@@ -364,6 +369,22 @@ private[spark] object TestUtils {
     }
   }

+  def withHttpServer(resBaseDir: String = ".")(body: URL => Unit): Unit = {
+    // 0 as port means choosing randomly from the available ports
+    val server = new Server(new InetSocketAddress(Utils.localCanonicalHostName, 0))
+    val resHandler = new ResourceHandler()
+    resHandler.setResourceBase(resBaseDir)
+    val handlers = new HandlerList()
+    handlers.setHandlers(Array[Handler](resHandler, new DefaultHandler()))
+    server.setHandler(handlers)
+    server.start()
+    try {
+      body(server.getURI.toURL)
+    } finally {
+      server.stop()
+    }
+  }
+
   /**
    * Wait until at least `numExecutors` executors are up, or throw `TimeoutException` if the waiting
    * time elapsed before `numExecutors` executors up. Exposed for testing.
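A possible usage sketch for the new `TestUtils.withHttpServer` helper added above. The object name and the served file are assumptions, and the code sits in the `org.apache.spark` package because `TestUtils` is `private[spark]`.

    package org.apache.spark

    import java.net.URL

    import scala.io.Source

    object WithHttpServerSketch {
      def main(args: Array[String]): Unit = {
        // Serve the current working directory over HTTP on an ephemeral port,
        // run the body against the server's base URL, then stop the server.
        TestUtils.withHttpServer() { baseUrl: URL =>
          // "README.md" is just an assumed file in the served directory.
          val text = Source.fromURL(new URL(baseUrl, "README.md")).mkString
          assert(text.nonEmpty)
        }
      }
    }

The try/finally inside the helper guarantees the Jetty server is stopped even if the body throws.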

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 3 additions & 0 deletions
@@ -852,6 +852,9 @@ private[spark] class SparkSubmit extends Logging {
     }
     sparkConf.set(SUBMIT_PYTHON_FILES, formattedPyFiles.split(",").toSeq)

+    if (args.verbose) {
+      childArgs ++= Seq("--verbose")
+    }
     (childArgs.toSeq, childClasspath.toSeq, sparkConf, childMainClass)
   }

dev/sparktestsupport/modules.py

Lines changed: 12 additions & 0 deletions
@@ -630,6 +630,18 @@ def __hash__(self):
         "pyspark.pandas.tests.test_series_conversion",
         "pyspark.pandas.tests.test_series_datetime",
         "pyspark.pandas.tests.test_series_string",
+        "pyspark.pandas.tests.test_categorical",
+        "pyspark.pandas.tests.test_csv",
+        "pyspark.pandas.tests.test_groupby",
+        "pyspark.pandas.tests.test_expanding",
+        "pyspark.pandas.tests.test_indexing",
+        "pyspark.pandas.tests.test_namespace",
+        "pyspark.pandas.tests.test_repr",
+        "pyspark.pandas.tests.test_reshape",
+        "pyspark.pandas.tests.test_rolling",
+        "pyspark.pandas.tests.test_sql",
+        "pyspark.pandas.tests.test_stats",
+        "pyspark.pandas.tests.test_window",
         "pyspark.pandas.tests.plot.test_frame_plot",
         "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
         "pyspark.pandas.tests.plot.test_frame_plot_plotly",

docs/sql-migration-guide.md

Lines changed: 2 additions & 0 deletions
@@ -77,6 +77,8 @@ license: |

 - In Spark 3.2, `CREATE TABLE .. LIKE ..` command can not use reserved properties. You need their specific clauses to specify them, for example, `CREATE TABLE test1 LIKE test LOCATION 'some path'`. You can set `spark.sql.legacy.notReserveProperties` to `true` to ignore the `ParseException`, in this case, these properties will be silently removed, for example: `TBLPROPERTIES('owner'='yao')` will have no effect. In Spark version 3.1 and below, the reserved properties can be used in `CREATE TABLE .. LIKE ..` command but have no side effects, for example, `TBLPROPERTIES('location'='/tmp')` does not change the location of the table but only create a headless property just like `'a'='b'`.

+- In Spark 3.2, `TRANSFORM` operator can't support alias in inputs. In Spark 3.1 and earlier, we can write script transform like `SELECT TRANSFORM(a AS c1, b AS c2) USING 'cat' FROM TBL`.
+
 ## Upgrading from Spark SQL 3.0 to 3.1

 - In Spark 3.1, statistical aggregation function includes `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, `corr` will return `NULL` instead of `Double.NaN` when `DivideByZero` occurs during expression evaluation, for example, when `stddev_samp` applied on a single element set. In Spark version 3.0 and earlier, it will return `Double.NaN` in such case. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`.
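To make the added `TRANSFORM` note concrete, a hedged Scala sketch. The object and temp-view names are hypothetical; per the note, the aliased form parses on Spark 3.1 and earlier and is expected to be rejected on Spark 3.2 (the exact error surface may vary by build).

    import org.apache.spark.sql.SparkSession

    object TransformAliasSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[1]").appName("transform-alias").getOrCreate()
        spark.range(3).selectExpr("id AS a", "id AS b").createOrReplaceTempView("tbl")

        // Aliases inside the TRANSFORM input list, as described in the migration note.
        try {
          spark.sql("SELECT TRANSFORM(a AS c1, b AS c2) USING 'cat' FROM tbl")
          println("accepted (Spark 3.1 and earlier behavior)")
        } catch {
          case e: Exception => println(s"rejected: ${e.getMessage}")
        }
        spark.stop()
      }
    }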

python/pyspark/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -60,7 +60,7 @@
 from pyspark.serializers import MarshalSerializer, PickleSerializer
 from pyspark.taskcontext import TaskContext, BarrierTaskContext, BarrierTaskInfo
 from pyspark.profiler import Profiler, BasicProfiler
-from pyspark.version import __version__  # noqa: F401
+from pyspark.version import __version__
 from pyspark._globals import _NoValue  # noqa: F401


@@ -125,4 +125,5 @@ def wrapper(self, *args, **kwargs):
     "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer",
     "StatusTracker", "SparkJobInfo", "SparkStageInfo", "Profiler", "BasicProfiler", "TaskContext",
     "RDDBarrier", "BarrierTaskContext", "BarrierTaskInfo", "InheritableThread",
+    "__version__",
 ]

python/pyspark/__init__.pyi

Lines changed: 1 addition & 2 deletions
@@ -52,6 +52,7 @@ from pyspark.taskcontext import (  # noqa: F401
     TaskContext as TaskContext,
 )
 from pyspark.util import InheritableThread as InheritableThread  # noqa: F401
+from pyspark.version import __version__ as __version__

 # Compatibility imports
 from pyspark.sql import (  # noqa: F401
@@ -71,5 +72,3 @@ def copy_func(
     doc: Optional[str] = ...,
 ) -> F: ...
 def keyword_only(func: F) -> F: ...
-
-__version__: str

python/pyspark/pandas/__init__.py

Lines changed: 7 additions & 6 deletions
@@ -42,11 +42,12 @@ def assert_python_version():

     if sys.version_info[:2] <= deprecated_version:
         warnings.warn(
-            "Koalas support for Python {dep_ver} is deprecated and will be dropped in "
+            "pandas-on-Spark support for Python {dep_ver} is deprecated and will be dropped in "
             "the future release. At that point, existing Python {dep_ver} workflows "
-            "that use Koalas will continue to work without modification, but Python {dep_ver} "
-            "users will no longer get access to the latest Koalas features and bugfixes. "
-            "We recommend that you upgrade to Python {min_ver} or newer.".format(
+            "that use pandas-on-Spark will continue to work without modification, but "
+            "Python {dep_ver} users will no longer get access to the latest pandas-on-Spark "
+            "features and bugfixes. We recommend that you upgrade to Python {min_ver} or "
+            "newer.".format(
                 dep_ver=".".join(map(str, deprecated_version)),
                 min_ver=".".join(map(str, min_supported_version)),
             ),
@@ -68,8 +69,8 @@ def assert_python_version():
             "'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to "
             "set this environment variable to '1' in both driver and executor sides if you use "
             "pyarrow>=2.0.0. "
-            "Koalas will set it for you but it does not work if there is a Spark context already "
-            "launched."
+            "pandas-on-Spark will set it for you but it does not work if there is a Spark context "
+            "already launched."
         )
         os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"


python/pyspark/pandas/accessors.py

Lines changed: 20 additions & 20 deletions
@@ -15,7 +15,7 @@
 # limitations under the License.
 #
 """
-Koalas specific features.
+pandas-on-Spark specific features.
 """
 import inspect
 from typing import Any, Optional, Tuple, Union, TYPE_CHECKING, cast
@@ -47,8 +47,8 @@
     from pyspark.pandas.series import Series  # noqa: F401 (SPARK-34943)


-class KoalasFrameMethods(object):
-    """ Koalas specific features for DataFrame. """
+class PandasOnSparkFrameMethods(object):
+    """ pandas-on-Spark specific features for DataFrame. """

     def __init__(self, frame: "DataFrame"):
         self._kdf = frame
@@ -194,10 +194,10 @@ def apply_batch(self, func, args=(), **kwds) -> "DataFrame":
         See also `Transform and apply a function
         <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

-        .. note:: the `func` is unable to access to the whole input frame. Koalas internally
-            splits the input series into multiple batches and calls `func` with each batch multiple
-            times. Therefore, operations such as global aggregations are impossible. See the example
-            below.
+        .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark
+            internally splits the input series into multiple batches and calls `func` with each
+            batch multiple times. Therefore, operations such as global aggregations are impossible.
+            See the example below.

         >>> # This case does not return the length of whole frame but of the batch internally
         ... # used.
@@ -286,7 +286,7 @@ def apply_batch(self, func, args=(), **kwds) -> "DataFrame":
            A  B
         0  1  2

-        You can also omit the type hints so Koalas infers the return schema as below:
+        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

         >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
            A  B
@@ -400,10 +400,10 @@ def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]
         See also `Transform and apply a function
         <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

-        .. note:: the `func` is unable to access to the whole input frame. Koalas internally
-            splits the input series into multiple batches and calls `func` with each batch multiple
-            times. Therefore, operations such as global aggregations are impossible. See the example
-            below.
+        .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark
+            internally splits the input series into multiple batches and calls `func` with each
+            batch multiple times. Therefore, operations such as global aggregations are impossible.
+            See the example below.

         >>> # This case does not return the length of whole frame but of the batch internally
         ... # used.
@@ -497,7 +497,7 @@ def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]
         2    7
         dtype: int64

-        You can also omit the type hints so Koalas infers the return schema as below:
+        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

         >>> df.koalas.transform_batch(lambda pdf: pdf + 1)
            A  B
@@ -699,8 +699,8 @@ def pandas_frame_func(f, field_name):
             return DataFrame(internal)


-class KoalasSeriesMethods(object):
-    """ Koalas specific features for Series. """
+class PandasOnSparkSeriesMethods(object):
+    """ pandas-on-Spark specific features for Series. """

     def __init__(self, series: "Series"):
         self._kser = series
@@ -713,10 +713,10 @@ def transform_batch(self, func, *args, **kwargs) -> "Series":
         See also `Transform and apply a function
         <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

-        .. note:: the `func` is unable to access to the whole input series. Koalas internally
-            splits the input series into multiple batches and calls `func` with each batch multiple
-            times. Therefore, operations such as global aggregations are impossible. See the example
-            below.
+        .. note:: the `func` is unable to access to the whole input series. pandas-on-Spark
+            internally splits the input series into multiple batches and calls `func` with each
+            batch multiple times. Therefore, operations such as global aggregations are impossible.
+            See the example below.

         >>> # This case does not return the length of whole frame but of the batch internally
         ... # used.
@@ -774,7 +774,7 @@ def transform_batch(self, func, *args, **kwargs) -> "Series":
         2    6
         Name: A, dtype: int64

-        You can also omit the type hints so Koalas infers the return schema as below:
+        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

         >>> df.A.koalas.transform_batch(lambda pser: pser + 1)
         0    2
