
Commit a432da7

Merge remote-tracking branch 'upstream/master' into SPARK-29680
2 parents: 91c8e58 + 1e1b730

80 files changed: 1,792 additions and 500 deletions


common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java

Lines changed: 23 additions & 16 deletions

@@ -18,6 +18,7 @@
 package org.apache.spark.unsafe.types;
 
 import java.io.Serializable;
+import java.util.Objects;
 
 /**
  * The internal representation of interval type.
@@ -31,45 +32,50 @@ public final class CalendarInterval implements Serializable {
   public static final long MICROS_PER_WEEK = MICROS_PER_DAY * 7;
 
   public final int months;
+  public final int days;
   public final long microseconds;
 
   public long milliseconds() {
     return this.microseconds / MICROS_PER_MILLI;
   }
 
-  public CalendarInterval(int months, long microseconds) {
+  public CalendarInterval(int months, int days, long microseconds) {
     this.months = months;
+    this.days = days;
     this.microseconds = microseconds;
   }
 
   public CalendarInterval add(CalendarInterval that) {
     int months = this.months + that.months;
+    int days = this.days + that.days;
     long microseconds = this.microseconds + that.microseconds;
-    return new CalendarInterval(months, microseconds);
+    return new CalendarInterval(months, days, microseconds);
   }
 
   public CalendarInterval subtract(CalendarInterval that) {
     int months = this.months - that.months;
+    int days = this.days - that.days;
     long microseconds = this.microseconds - that.microseconds;
-    return new CalendarInterval(months, microseconds);
+    return new CalendarInterval(months, days, microseconds);
  }
 
   public CalendarInterval negate() {
-    return new CalendarInterval(-this.months, -this.microseconds);
+    return new CalendarInterval(-this.months, -this.days, -this.microseconds);
   }
 
   @Override
-  public boolean equals(Object other) {
-    if (this == other) return true;
-    if (other == null || !(other instanceof CalendarInterval)) return false;
-
-    CalendarInterval o = (CalendarInterval) other;
-    return this.months == o.months && this.microseconds == o.microseconds;
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    CalendarInterval that = (CalendarInterval) o;
+    return months == that.months &&
+      days == that.days &&
+      microseconds == that.microseconds;
   }
 
   @Override
   public int hashCode() {
-    return 31 * months + (int) microseconds;
+    return Objects.hash(months, days, microseconds);
   }
 
   @Override
@@ -81,12 +87,13 @@ public String toString() {
       appendUnit(sb, months % 12, "month");
     }
 
+    if (days != 0) {
+      appendUnit(sb, days / 7, "week");
+      appendUnit(sb, days % 7, "day");
+    }
+
     if (microseconds != 0) {
       long rest = microseconds;
-      appendUnit(sb, rest / MICROS_PER_WEEK, "week");
-      rest %= MICROS_PER_WEEK;
-      appendUnit(sb, rest / MICROS_PER_DAY, "day");
-      rest %= MICROS_PER_DAY;
       appendUnit(sb, rest / MICROS_PER_HOUR, "hour");
       rest %= MICROS_PER_HOUR;
       appendUnit(sb, rest / MICROS_PER_MINUTE, "minute");
@@ -96,7 +103,7 @@ public String toString() {
       appendUnit(sb, rest / MICROS_PER_MILLI, "millisecond");
       rest %= MICROS_PER_MILLI;
       appendUnit(sb, rest, "microsecond");
-    } else if (months == 0) {
+    } else if (months == 0 && days == 0) {
       sb.append(" 0 microseconds");
     }
 

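The change above splits the day component out of the microseconds field, so an interval is now (months, days, microseconds) and toString derives weeks and days from the days field rather than from microseconds. A minimal usage sketch follows; it is illustrative only, not part of the commit, and assumes the spark-unsafe module is on the classpath (the IntervalDemo object is hypothetical):

// Illustrative only: exercises the new (months, days, microseconds) constructor.
import org.apache.spark.unsafe.types.CalendarInterval

object IntervalDemo {
  def main(args: Array[String]): Unit = {
    val threeHours = 3L * 60 * 60 * 1000 * 1000        // microseconds
    val i = new CalendarInterval(34, 31, threeHours)   // 2 years 10 months, 31 days, 3 hours

    // Weeks and days now come from the days field, not from microseconds.
    // Expected output: "interval 2 years 10 months 4 weeks 3 days 3 hours"
    println(i.toString)

    // add() combines each component separately: months, days and microseconds.
    val j = new CalendarInterval(0, 7, 0)
    println(i.add(j))   // 38 days -> "... 5 weeks 3 days 3 hours"
  }
}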
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java

Lines changed: 39 additions & 26 deletions

@@ -26,59 +26,72 @@ public class CalendarIntervalSuite {
 
   @Test
   public void equalsTest() {
-    CalendarInterval i1 = new CalendarInterval(3, 123);
-    CalendarInterval i2 = new CalendarInterval(3, 321);
-    CalendarInterval i3 = new CalendarInterval(1, 123);
-    CalendarInterval i4 = new CalendarInterval(3, 123);
+    CalendarInterval i1 = new CalendarInterval(3, 2, 123);
+    CalendarInterval i2 = new CalendarInterval(3, 2, 321);
+    CalendarInterval i3 = new CalendarInterval(3, 4, 123);
+    CalendarInterval i4 = new CalendarInterval(1, 2, 123);
+    CalendarInterval i5 = new CalendarInterval(1, 4, 321);
+    CalendarInterval i6 = new CalendarInterval(3, 2, 123);
 
     assertNotSame(i1, i2);
     assertNotSame(i1, i3);
+    assertNotSame(i1, i4);
     assertNotSame(i2, i3);
-    assertEquals(i1, i4);
+    assertNotSame(i2, i4);
+    assertNotSame(i3, i4);
+    assertNotSame(i1, i5);
+    assertEquals(i1, i6);
   }
 
   @Test
   public void toStringTest() {
     CalendarInterval i;
 
-    i = new CalendarInterval(0, 0);
+    i = new CalendarInterval(0, 0, 0);
     assertEquals("interval 0 microseconds", i.toString());
 
-    i = new CalendarInterval(34, 0);
+    i = new CalendarInterval(34, 0, 0);
     assertEquals("interval 2 years 10 months", i.toString());
 
-    i = new CalendarInterval(-34, 0);
+    i = new CalendarInterval(-34, 0, 0);
     assertEquals("interval -2 years -10 months", i.toString());
 
-    i = new CalendarInterval(0, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
-    assertEquals("interval 3 weeks 13 hours 123 microseconds", i.toString());
+    i = new CalendarInterval(0, 31, 0);
+    assertEquals("interval 4 weeks 3 days", i.toString());
 
-    i = new CalendarInterval(0, -3 * MICROS_PER_WEEK - 13 * MICROS_PER_HOUR - 123);
-    assertEquals("interval -3 weeks -13 hours -123 microseconds", i.toString());
+    i = new CalendarInterval(0, -31, 0);
+    assertEquals("interval -4 weeks -3 days", i.toString());
 
-    i = new CalendarInterval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
-    assertEquals("interval 2 years 10 months 3 weeks 13 hours 123 microseconds", i.toString());
+    i = new CalendarInterval(0, 0, 3 * MICROS_PER_HOUR + 13 * MICROS_PER_MINUTE + 123);
+    assertEquals("interval 3 hours 13 minutes 123 microseconds", i.toString());
+
+    i = new CalendarInterval(0, 0, -3 * MICROS_PER_HOUR - 13 * MICROS_PER_MINUTE - 123);
+    assertEquals("interval -3 hours -13 minutes -123 microseconds", i.toString());
+
+    i = new CalendarInterval(34, 31, 3 * MICROS_PER_HOUR + 13 * MICROS_PER_MINUTE + 123);
+    assertEquals("interval 2 years 10 months 4 weeks 3 days 3 hours 13 minutes 123 microseconds",
+      i.toString());
   }
 
   @Test
   public void addTest() {
-    CalendarInterval input1 = new CalendarInterval(3, 1 * MICROS_PER_HOUR);
-    CalendarInterval input2 = new CalendarInterval(2, 100 * MICROS_PER_HOUR);
-    assertEquals(input1.add(input2), new CalendarInterval(5, 101 * MICROS_PER_HOUR));
+    CalendarInterval input1 = new CalendarInterval(3, 1, 1 * MICROS_PER_HOUR);
+    CalendarInterval input2 = new CalendarInterval(2, 4, 100 * MICROS_PER_HOUR);
+    assertEquals(input1.add(input2), new CalendarInterval(5, 5, 101 * MICROS_PER_HOUR));
 
-    input1 = new CalendarInterval(-10, -81 * MICROS_PER_HOUR);
-    input2 = new CalendarInterval(75, 200 * MICROS_PER_HOUR);
-    assertEquals(input1.add(input2), new CalendarInterval(65, 119 * MICROS_PER_HOUR));
+    input1 = new CalendarInterval(-10, -30, -81 * MICROS_PER_HOUR);
+    input2 = new CalendarInterval(75, 150, 200 * MICROS_PER_HOUR);
+    assertEquals(input1.add(input2), new CalendarInterval(65, 120, 119 * MICROS_PER_HOUR));
   }
 
   @Test
   public void subtractTest() {
-    CalendarInterval input1 = new CalendarInterval(3, 1 * MICROS_PER_HOUR);
-    CalendarInterval input2 = new CalendarInterval(2, 100 * MICROS_PER_HOUR);
-    assertEquals(input1.subtract(input2), new CalendarInterval(1, -99 * MICROS_PER_HOUR));
+    CalendarInterval input1 = new CalendarInterval(3, 1, 1 * MICROS_PER_HOUR);
+    CalendarInterval input2 = new CalendarInterval(2, 4, 100 * MICROS_PER_HOUR);
+    assertEquals(input1.subtract(input2), new CalendarInterval(1, -3, -99 * MICROS_PER_HOUR));
 
-    input1 = new CalendarInterval(-10, -81 * MICROS_PER_HOUR);
-    input2 = new CalendarInterval(75, 200 * MICROS_PER_HOUR);
-    assertEquals(input1.subtract(input2), new CalendarInterval(-85, -281 * MICROS_PER_HOUR));
+    input1 = new CalendarInterval(-10, -30, -81 * MICROS_PER_HOUR);
+    input2 = new CalendarInterval(75, 150, 200 * MICROS_PER_HOUR);
+    assertEquals(input1.subtract(input2), new CalendarInterval(-85, -180, -281 * MICROS_PER_HOUR));
   }
 }

core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala

Lines changed: 13 additions & 1 deletion

@@ -25,6 +25,7 @@ import scala.xml.Node
 import org.apache.spark.status.{AppStatusStore, StreamBlockData}
 import org.apache.spark.status.api.v1
 import org.apache.spark.ui._
+import org.apache.spark.ui.storage.ToolTips._
 import org.apache.spark.util.Utils
 
 /** Page showing list of RDD's currently stored in the cluster */
@@ -56,7 +57,8 @@ private[ui] class StoragePage(parent: SparkUITab, store: AppStatusStore) extends
         rddHeader,
         rddRow(request, _: v1.RDDStorageInfo),
         rdds,
-        id = Some("storage-by-rdd-table"))}
+        id = Some("storage-by-rdd-table"),
+        tooltipHeaders = tooltips)}
       </div>
     </div>
   }
@@ -72,6 +74,16 @@ private[ui] class StoragePage(parent: SparkUITab, store: AppStatusStore) extends
     "Size in Memory",
     "Size on Disk")
 
+  /** Tooltips for header fields of the RDD table */
+  val tooltips = Seq(
+    None,
+    Some(RDD_NAME),
+    Some(STORAGE_LEVEL),
+    Some(CACHED_PARTITIONS),
+    Some(FRACTION_CACHED),
+    Some(SIZE_IN_MEMORY),
+    Some(SIZE_ON_DISK))
+
   /** Render an HTML row representing an RDD */
   private def rddRow(request: HttpServletRequest, rdd: v1.RDDStorageInfo): Seq[Node] = {
     // scalastyle:off
Lines changed: 42 additions & 0 deletions (new file)

@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ui.storage
+
+private[ui] object ToolTips {
+
+  val RDD_NAME =
+    "Name of the persisted RDD"
+
+  val STORAGE_LEVEL =
+    "StorageLevel displays where the persisted RDD is stored, " +
+      "format of the persisted RDD (serialized or de-serialized) and" +
+      "replication factor of the persisted RDD"
+
+  val CACHED_PARTITIONS =
+    "Number of partitions cached"
+
+  val FRACTION_CACHED =
+    "Fraction of total partitions cached"
+
+  val SIZE_IN_MEMORY =
+    "Total size of partitions in memory"
+
+  val SIZE_ON_DISK =
+    "Total size of partitions on the disk"
+}
+

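Each entry in the tooltips sequence above lines up positionally with one column header: a Some wraps the header text in a tooltip span, a None leaves it plain. Below is a rough, self-contained sketch of that pairing, with illustrative names (TooltipHeaderSketch and headerCell are not part of the commit, and this is not the actual UIUtils.listingTable code); it assumes the scala-xml module is available. The suite in the next file asserts the same structure against the rendered page:

// Illustrative sketch: pair each header with its optional tooltip and render a <th> cell.
import scala.xml.{Node, Text}

object TooltipHeaderSketch {
  def headerCell(header: String, tooltip: Option[String]): Node = tooltip match {
    case Some(tip) =>
      <th><span data-toggle="tooltip" title={tip}>{Text(header)}</span></th>
    case None =>
      <th>{Text(header)}</th>
  }

  def main(args: Array[String]): Unit = {
    val headers = Seq("", "RDD Name")
    val tooltips = Seq(None, Some("Name of the persisted RDD"))
    headers.zip(tooltips).map { case (h, t) => headerCell(h, t) }.foreach(println)
  }
}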
core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala

Lines changed: 16 additions & 1 deletion

@@ -20,6 +20,7 @@ package org.apache.spark.ui.storage
 import javax.servlet.http.HttpServletRequest
 
 import org.mockito.Mockito._
+import scala.xml.{Node, Text}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.status.StreamBlockData
@@ -74,7 +75,21 @@ class StoragePageSuite extends SparkFunSuite {
       "Fraction Cached",
       "Size in Memory",
       "Size on Disk")
-    assert((xmlNodes \\ "th").map(_.text) === headers)
+
+    val headerRow: Seq[Node] = {
+      headers.view.zipWithIndex.map { x =>
+        storagePage.tooltips(x._2) match {
+          case Some(tooltip) =>
+            <th width={""} class={""}>
+              <span data-toggle="tooltip" title={tooltip}>
+                {Text(x._1)}
+              </span>
+            </th>
+          case None => <th width={""} class={""}>{Text(x._1)}</th>
+        }
+      }.toList
+    }
+    assert((xmlNodes \\ "th").map(_.text) === headerRow.map(_.text))
 
     assert((xmlNodes \\ "tr").size === 3)
     assert(((xmlNodes \\ "tr")(0) \\ "td").map(_.text.trim) ===

docs/pyspark-migration-guide.md

Lines changed: 3 additions & 0 deletions

@@ -84,6 +84,9 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide.
 
 - Since Spark 3.0, `createDataFrame(..., verifySchema=True)` validates `LongType` as well in PySpark. Previously, `LongType` was not verified and resulted in `None` in case the value overflows. To restore this behavior, `verifySchema` can be set to `False` to disable the validation.
 
+- Since Spark 3.0, `Column.getItem` is fixed such that it does not call `Column.apply`. Consequently, if `Column` is used as an argument to `getItem`, the indexing operator should be used.
+  For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`.
+
 ## Upgrading from PySpark 2.3 to 2.4
 
 - In PySpark, when Arrow optimization is enabled, previously `toPandas` just failed when Arrow optimization is unable to be used whereas `createDataFrame` from Pandas DataFrame allowed the fallback to non-optimization. Now, both `toPandas` and `createDataFrame` from Pandas DataFrame allow the fallback by default, which can be switched off by `spark.sql.execution.arrow.fallback.enabled`.

mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala

Lines changed: 6 additions & 0 deletions

@@ -37,6 +37,7 @@ import org.apache.spark.ml.util.Instrumentation.instrumented
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.storage.StorageLevel
 
 /** Params for linear SVM Classifier. */
 private[classification] trait LinearSVCParams extends ClassifierParams with HasRegParam
@@ -159,7 +160,10 @@
   override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
 
   override protected def train(dataset: Dataset[_]): LinearSVCModel = instrumented { instr =>
+    val handlePersistence = dataset.storageLevel == StorageLevel.NONE
+
     val instances = extractInstances(dataset)
+    if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
 
     instr.logPipelineStage(this)
     instr.logDataset(dataset)
@@ -268,6 +272,8 @@
       (Vectors.dense(coefficientArray), intercept, scaledObjectiveHistory.result())
     }
 
+    if (handlePersistence) instances.unpersist()
+
     copyValues(new LinearSVCModel(uid, coefficientVector, interceptVector))
   }
 }

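The LinearSVC change applies a common MLlib caching guard: persist the extracted instances only when the caller has not already cached the input dataset, and release that cache once training is done. A minimal sketch of the pattern follows; names like withCachedInstances and CachingPatternSketch are illustrative (not part of the commit), and the try/finally is an extra safety measure, whereas the committed code simply unpersists before building the model:

// Illustrative helper showing the persist-if-not-already-cached pattern.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.storage.StorageLevel

object CachingPatternSketch {
  // Persist `instances` only when `dataset` is not already cached, and unpersist afterwards.
  def withCachedInstances[T, R](dataset: Dataset[_], instances: RDD[T])(train: RDD[T] => R): R = {
    val handlePersistence = dataset.storageLevel == StorageLevel.NONE
    if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
    try {
      train(instances)
    } finally {
      if (handlePersistence) instances.unpersist()
    }
  }
}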
project/SparkBuild.scala

Lines changed: 2 additions & 1 deletion

@@ -475,7 +475,8 @@ object SparkParallelTestGrouping {
     "org.apache.spark.ml.classification.LogisticRegressionSuite",
     "org.apache.spark.ml.classification.LinearSVCSuite",
     "org.apache.spark.sql.SQLQueryTestSuite",
-    "org.apache.spark.sql.hive.thriftserver.ThriftServerQueryTestSuite"
+    "org.apache.spark.sql.hive.thriftserver.ThriftServerQueryTestSuite",
+    "org.apache.spark.sql.hive.thriftserver.SparkSQLEnvSuite"
   )
 
   private val DEFAULT_TEST_GROUP = "default_test_group"

python/pyspark/sql/column.py

Lines changed: 5 additions & 7 deletions

@@ -296,14 +296,12 @@ def getItem(self, key):
         +----+------+
         |   1| value|
         +----+------+
-        >>> df.select(df.l[0], df.d["key"]).show()
-        +----+------+
-        |l[0]|d[key]|
-        +----+------+
-        |   1| value|
-        +----+------+
+
+        .. versionchanged:: 3.0
+            If `key` is a `Column` object, the indexing operator should be used instead.
+            For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`.
         """
-        return self[key]
+        return _bin_op("getItem")(self, key)
 
     @since(1.3)
     def getField(self, name):

python/pyspark/sql/context.py

Lines changed: 1 addition & 1 deletion

@@ -318,7 +318,7 @@ def registerDataFrameAsTable(self, df, tableName):
 
     @since(1.6)
     def dropTempTable(self, tableName):
-        """ Remove the temp table from catalog.
+        """ Remove the temporary table from catalog.
 
         >>> sqlContext.registerDataFrameAsTable(df, "table1")
         >>> sqlContext.dropTempTable("table1")