Commit 12d2757

[SPARK-31892][SQL] Disable week-based date field for parsing
1 parent b806fc4 commit 12d2757
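
For context: week-based fields such as 'Y' (week-based year) follow the locale's week rules and can disagree with the corresponding calendar fields near year boundaries, which is what makes them error-prone for parsing. A standalone java.time sketch (not part of this commit) showing the mismatch:

    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import java.util.Locale

    // 2019-12-29 is a Sunday; under the US locale's week rules it already belongs
    // to week 1 of 2020, so week-based year 'Y' disagrees with calendar year 'y'.
    val date = LocalDate.of(2019, 12, 29)
    DateTimeFormatter.ofPattern("YYYY", Locale.US).format(date)  // "2020"
    DateTimeFormatter.ofPattern("yyyy", Locale.US).format(date)  // "2019"

Round-tripping a string through such a pattern can silently shift a date by a year, so this commit keeps these letters for formatting but rejects them when parsing.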

File tree: 17 files changed (+88, -48 lines)


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
Lines changed: 1 addition & 1 deletion

@@ -525,7 +525,7 @@ object CatalogColumnStat extends Logging {
     TimestampFormatter(
       format = "yyyy-MM-dd HH:mm:ss.SSSSSS",
       zoneId = ZoneOffset.UTC,
-      needVarLengthSecondFraction = isParsing)
+      isParsing = isParsing)
   }

   /**

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)

   private val decimalParser = if (options.locale == Locale.US) {
     // Special handling the default locale for backward compatibility

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala
Lines changed: 3 additions & 2 deletions

@@ -47,12 +47,13 @@ class UnivocityGenerator(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = false)
+    isParsing = false)
   private val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = false)

   private def makeConverter(dataType: DataType): ValueConverter = dataType match {
     case DateType =>

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
Lines changed: 3 additions & 2 deletions

@@ -90,12 +90,13 @@ class UnivocityParser(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)
   private lazy val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = true)

   private val csvFilters = new CSVFilters(filters, requiredSchema)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
Lines changed: 6 additions & 6 deletions

@@ -734,7 +734,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
       format.toString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = false)
+      isParsing = false)
     }
   } else None
 }
@@ -745,7 +745,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
       format.toString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = false)
+      isParsing = false)
   } else {
     formatter.get
   }
@@ -890,7 +890,7 @@ abstract class ToTimestamp
       constFormat.toString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = true)
+      isParsing = true)
   } catch {
     case e: SparkUpgradeException => throw e
     case NonFatal(_) => null
@@ -929,7 +929,7 @@ abstract class ToTimestamp
       formatString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = true)
+      isParsing = true)
       .parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
   } catch {
     case e: SparkUpgradeException => throw e
@@ -1072,7 +1072,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
       constFormat.toString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = false)
+      isParsing = false)
   } catch {
     case e: SparkUpgradeException => throw e
     case NonFatal(_) => null
@@ -1105,7 +1105,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
       f.toString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = false)
+      isParsing = false)
       .format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
   } catch {
     case e: SparkUpgradeException => throw e
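
The asymmetry in this file is deliberate: the formatting paths (DateFormatClass, FromUnixTime) pass isParsing = false, while the parsing paths (ToTimestamp and its subclasses) pass true. Roughly the user-visible effect, sketched against a running SparkSession named spark:

    // Formatting with a week-based letter still works:
    spark.sql("SELECT date_format(TIMESTAMP'2019-12-29 00:00:00', 'YYYY')").show()
    // 2020 (week-based year, with the default US locale)

    // Parsing with the same letter now fails fast instead of returning shifted data:
    spark.sql("SELECT to_timestamp('2020', 'YYYY')").show()
    // SparkUpgradeException: ... Fail to recognize 'YYYY' pattern ... (exact message may vary)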

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala
Lines changed: 3 additions & 2 deletions

@@ -83,12 +83,13 @@ private[sql] class JacksonGenerator(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = false)
+    isParsing = false)
   private val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = false)

   private def makeWriter(dataType: DataType): ValueWriter = dataType match {
     case NullType =>

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
Lines changed: 3 additions & 2 deletions

@@ -61,12 +61,13 @@ class JacksonParser(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)
   private lazy val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = true)

   /**
    * Create a converter which converts the JSON documents held by the `JsonParser`

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
Lines changed: 1 addition & 1 deletion

@@ -43,7 +43,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)

   /**
    * Infer the type of a collection of json records in three stages:

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
Lines changed: 8 additions & 6 deletions

@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.util

 import java.text.SimpleDateFormat
 import java.time.{LocalDate, ZoneId}
-import java.time.format.DateTimeFormatter
 import java.util.{Date, Locale}

 import org.apache.commons.lang3.time.FastDateFormat
@@ -42,7 +41,8 @@ class Iso8601DateFormatter(
     pattern: String,
     zoneId: ZoneId,
     locale: Locale,
-    legacyFormat: LegacyDateFormats.LegacyDateFormat)
+    legacyFormat: LegacyDateFormats.LegacyDateFormat,
+    isParsing: Boolean)
   extends DateFormatter with DateTimeFormatterHelper {

   @transient
@@ -125,12 +125,13 @@ object DateFormatter {
      format: Option[String],
      zoneId: ZoneId,
      locale: Locale = defaultLocale,
-     legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = {
+     legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
+     isParsing: Boolean = true): DateFormatter = {
     val pattern = format.getOrElse(defaultPattern)
     if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
       getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
     } else {
-      val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat)
+      val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat, isParsing)
       df.validatePatternString()
       df
     }
@@ -153,8 +154,9 @@ object DateFormatter {
      format: String,
      zoneId: ZoneId,
      locale: Locale,
-     legacyFormat: LegacyDateFormat): DateFormatter = {
-    getFormatter(Some(format), zoneId, locale, legacyFormat)
+     legacyFormat: LegacyDateFormat,
+     isParsing: Boolean): DateFormatter = {
+    getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
   }

   def apply(format: String, zoneId: ZoneId): DateFormatter = {
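
Note the differing defaults: DateFormatter.getFormatter now defaults isParsing to true, so date-formatter call sites that only write values must opt out explicitly, as the CSV and JSON generator changes above do. A hedged sketch against this internal catalyst API (subject to change between releases):

    import java.time.ZoneId
    import java.util.Locale
    import org.apache.spark.sql.catalyst.util.DateFormatter
    import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT

    // Parser-side construction rejects week-based letters such as 'E' (day-of-week):
    DateFormatter("EEE yyyy-MM-dd", ZoneId.of("UTC"), Locale.US,
      legacyFormat = FAST_DATE_FORMAT, isParsing = true)
    // throws, surfaced as a SparkUpgradeException that suggests
    // spark.sql.legacy.timeParserPolicy

    // Writer-side construction of the same pattern is still accepted:
    DateFormatter("EEE yyyy-MM-dd", ZoneId.of("UTC"), Locale.US,
      legacyFormat = FAST_DATE_FORMAT, isParsing = false)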

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
Lines changed: 11 additions & 5 deletions

@@ -89,9 +89,9 @@ trait DateTimeFormatterHelper {
   protected def getOrCreateFormatter(
       pattern: String,
       locale: Locale,
-      needVarLengthSecondFraction: Boolean = false): DateTimeFormatter = {
-    val newPattern = convertIncompatiblePattern(pattern)
-    val useVarLen = needVarLengthSecondFraction && newPattern.contains('S')
+      isParsing: Boolean = false): DateTimeFormatter = {
+    val newPattern = convertIncompatiblePattern(pattern, isParsing)
+    val useVarLen = isParsing && newPattern.contains('S')
     val key = (newPattern, locale, useVarLen)
     var formatter = cache.getIfPresent(key)
     if (formatter == null) {
@@ -227,6 +227,11 @@ private object DateTimeFormatterHelper {
     formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
   }
   final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
+  // SPARK-31892: The week-based date fields are rarely used and really confusing for parsing values
+  // to timestamp, especially when they are mixed with other non-week-based ones. we have tried our
+  // best to restore the behavior change between 2.4 and 3.0 and failed, see
+  // https://github.com/apache/spark/pull/28674
+  final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F')
   final val unsupportedPatternLengths = {
     // SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
     // 2.4
@@ -246,7 +251,7 @@ private object DateTimeFormatterHelper {
    * @param pattern The input pattern.
    * @return The pattern for new parser
    */
-  def convertIncompatiblePattern(pattern: String): String = {
+  def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
     val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
       case (patternPart, index) =>
         // Text can be quoted using single quotes, we only check the non-quote parts.
@@ -255,7 +260,8 @@ private object DateTimeFormatterHelper {
     (pattern + " ").split("'").zipWithIndex.map {
       case (patternPart, index) =>
         if (index % 2 == 0) {
-          for (c <- patternPart if unsupportedLetters.contains(c)) {
+          for (c <- patternPart if unsupportedLetters.contains(c) ||
+            (isParsing && unsupportedLettersForParsing.contains(c))) {
            throw new IllegalArgumentException(s"Illegal pattern character: $c")
           }
           for (style <- unsupportedPatternLengths if patternPart.contains(style)) {
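
The new guard only fires on the parsing path; the same pattern still converts cleanly for formatting. Since convertIncompatiblePattern is package-private, it can be exercised exactly as the updated suite below does, from code in org.apache.spark.sql.catalyst.util:

    import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._

    convertIncompatiblePattern("YYYY-ww")                    // accepted: formatting path
    convertIncompatiblePattern("YYYY-ww", isParsing = true)
    // IllegalArgumentException: Illegal pattern character: Y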

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
Lines changed: 8 additions & 8 deletions

@@ -287,13 +287,13 @@ object TimestampFormatter {
      zoneId: ZoneId,
      locale: Locale = defaultLocale,
      legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
-     needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
+     isParsing: Boolean = false): TimestampFormatter = {
     val pattern = format.getOrElse(defaultPattern)
     if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
       getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
     } else {
       val tf = new Iso8601TimestampFormatter(
-        pattern, zoneId, locale, legacyFormat, needVarLengthSecondFraction)
+        pattern, zoneId, locale, legacyFormat, isParsing)
       tf.validatePatternString()
       tf
     }
@@ -319,23 +319,23 @@ object TimestampFormatter {
      zoneId: ZoneId,
      locale: Locale,
      legacyFormat: LegacyDateFormat,
-     needVarLengthSecondFraction: Boolean): TimestampFormatter = {
-    getFormatter(Some(format), zoneId, locale, legacyFormat, needVarLengthSecondFraction)
+     isParsing: Boolean): TimestampFormatter = {
+    getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
   }

   def apply(
       format: String,
       zoneId: ZoneId,
       legacyFormat: LegacyDateFormat,
-      needVarLengthSecondFraction: Boolean): TimestampFormatter = {
-    getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, needVarLengthSecondFraction)
+      isParsing: Boolean): TimestampFormatter = {
+    getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, isParsing)
   }

   def apply(
       format: String,
       zoneId: ZoneId,
-      needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
-    getFormatter(Some(format), zoneId, needVarLengthSecondFraction = needVarLengthSecondFraction)
+      isParsing: Boolean = false): TimestampFormatter = {
+    getFormatter(Some(format), zoneId, isParsing = isParsing)
   }

   def apply(zoneId: ZoneId): TimestampFormatter = {
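
The rename from needVarLengthSecondFraction to isParsing is not purely cosmetic: the flag now drives two parse-only behaviors, the variable-length second fraction (useVarLen in DateTimeFormatterHelper above) and the week-based-letter rejection. A minimal sketch against the internal API, assuming a default SQLConf:

    import java.time.ZoneOffset
    import org.apache.spark.sql.catalyst.util.TimestampFormatter

    // isParsing = true tolerates a shorter second fraction than the pattern's 'SSSSSS':
    val parser = TimestampFormatter(
      "yyyy-MM-dd HH:mm:ss.SSSSSS", ZoneOffset.UTC, isParsing = true)
    parser.parse("2020-06-05 12:00:00.1")  // microseconds since the epoch

    // isParsing = true also rejects week-based letters at construction time:
    TimestampFormatter("YYYY", ZoneOffset.UTC, isParsing = true)  // throws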

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
Lines changed: 18 additions & 0 deletions

@@ -1168,4 +1168,22 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkExceptionInExpression[ArithmeticException](
       MillisToTimestamp(Literal(-92233720368547758L)), "long overflow")
   }
+
+  test("Disable week-based date filed for parsing") {
+
+    def checkSparkUpgrade(c: Char): Unit = {
+      checkExceptionInExpression[SparkUpgradeException](
+        new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, "3.0")
+      checkExceptionInExpression[SparkUpgradeException](
+        new ParseToDate(Literal("1"), Literal(c.toString)).child, "3.0")
+      checkExceptionInExpression[SparkUpgradeException](
+        ToUnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
+      checkExceptionInExpression[SparkUpgradeException](
+        UnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
+    }
+
+    Seq('Y', 'W', 'w', 'E', 'u', 'F').foreach { l =>
+      checkSparkUpgrade(l)
+    }
+  }
 }
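
The test pins down the user-facing contract: each parsing expression (to_timestamp, to_date, to_unix_timestamp, unix_timestamp) raises a SparkUpgradeException mentioning the 3.0 upgrade for every one of 'Y', 'W', 'w', 'E', 'u', 'F'. The exception message points users at the legacy policy; a hedged example of that escape hatch, assuming a SparkSession named spark:

    // Restore the pre-3.0, SimpleDateFormat-based parser for such patterns:
    spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
    spark.sql("SELECT unix_timestamp('2020-22', 'YYYY-ww')").show()  // parses again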

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelperSuite.scala
Lines changed: 8 additions & 1 deletion

@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.util

-import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._

 class DateTimeFormatterHelperSuite extends SparkFunSuite {
@@ -40,6 +40,13 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
       val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
       assert(e.getMessage === s"Illegal pattern character: $l")
     }
+    unsupportedLettersForParsing.foreach { l =>
+      val e = intercept[IllegalArgumentException] {
+        convertIncompatiblePattern(s"$l", isParsing = true)
+      }
+      assert(e.getMessage === s"Illegal pattern character: $l")
+      assert(convertIncompatiblePattern(s"$l").nonEmpty)
+    }
     unsupportedPatternLengths.foreach { style =>
       val e1 = intercept[IllegalArgumentException] {
         convertIncompatiblePattern(s"yyyy-MM-dd $style")

sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala
Lines changed: 6 additions & 3 deletions

@@ -72,7 +72,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
           DateFormatter.defaultPattern,
           getZoneId(timeZone),
           DateFormatter.defaultLocale,
-          legacyFormat)
+          legacyFormat,
+          isParsing = false)
         val days = formatter.parse(date)
         assert(date === formatter.format(days))
         assert(date === formatter.format(daysToLocalDate(days)))
@@ -106,7 +107,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
           DateFormatter.defaultPattern,
           getZoneId(timeZone),
           DateFormatter.defaultLocale,
-          legacyFormat)
+          legacyFormat,
+          isParsing = false)
         val date = formatter.format(days)
         val parsed = formatter.parse(date)
         assert(days === parsed)
@@ -173,7 +175,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
       DateFormatter.defaultPattern,
       getZoneId(timeZone),
       DateFormatter.defaultLocale,
-      legacyFormat)
+      legacyFormat,
+      isParsing = false)
     assert(LocalDate.ofEpochDay(formatter.parse("1000-01-01")) === LocalDate.of(1000, 1, 1))
     assert(formatter.format(LocalDate.of(1000, 1, 1)) === "1000-01-01")
    assert(formatter.format(localDateToDays(LocalDate.of(1000, 1, 1))) === "1000-01-01")
