Skip to content

[SPARK-31892][SQL] Disable week-based date field for parsing #28706

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sql-ref-datetime-pattern.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ The count of pattern letters determines the format.
During formatting, all valid data will be output even it is in the optional section.
During parsing, the whole section may be missing from the parsed string.
An optional section is started by `[` and ended using `]` (or at the end of the pattern).

- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.

More details for the text style:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not related to this PR, but can we embed this section into Text: ...?


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ object CatalogColumnStat extends Logging {
TimestampFormatter(
format = "yyyy-MM-dd HH:mm:ss.SSSSSS",
zoneId = ZoneOffset.UTC,
needVarLengthSecondFraction = isParsing)
isParsing = isParsing)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)

private val decimalParser = if (options.locale == Locale.US) {
// Special handling the default locale for backward compatibility
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,13 @@ class UnivocityGenerator(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
private val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)

private def makeConverter(dataType: DataType): ValueConverter = dataType match {
case DateType =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,13 @@ class UnivocityParser(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)

private val csvFilters = new CSVFilters(filters, requiredSchema)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
format.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
}
} else None
}
Expand All @@ -745,7 +745,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
format.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
} else {
formatter.get
}
Expand Down Expand Up @@ -890,7 +890,7 @@ abstract class ToTimestamp
constFormat.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(_) => null
Expand Down Expand Up @@ -929,7 +929,7 @@ abstract class ToTimestamp
formatString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
.parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
} catch {
case e: SparkUpgradeException => throw e
Expand Down Expand Up @@ -1072,7 +1072,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
constFormat.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
} catch {
case e: SparkUpgradeException => throw e
case NonFatal(_) => null
Expand Down Expand Up @@ -1105,7 +1105,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
f.toString,
zoneId,
legacyFormat = SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
.format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
} catch {
case e: SparkUpgradeException => throw e
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,13 @@ private[sql] class JacksonGenerator(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = false)
isParsing = false)
private val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = false)

private def makeWriter(dataType: DataType): ValueWriter = dataType match {
case NullType =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,13 @@ class JacksonParser(
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)
private lazy val dateFormatter = DateFormatter(
options.dateFormat,
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT)
legacyFormat = FAST_DATE_FORMAT,
isParsing = true)

/**
* Create a converter which converts the JSON documents held by the `JsonParser`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
options.zoneId,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
needVarLengthSecondFraction = true)
isParsing = true)

/**
* Infer the type of a collection of json records in three stages:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.util

import java.text.SimpleDateFormat
import java.time.{LocalDate, ZoneId}
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}

import org.apache.commons.lang3.time.FastDateFormat
Expand All @@ -42,7 +41,8 @@ class Iso8601DateFormatter(
pattern: String,
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormats.LegacyDateFormat)
legacyFormat: LegacyDateFormats.LegacyDateFormat,
isParsing: Boolean)
extends DateFormatter with DateTimeFormatterHelper {

@transient
Expand Down Expand Up @@ -125,12 +125,13 @@ object DateFormatter {
format: Option[String],
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = {
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean = true): DateFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
} else {
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat)
val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat, isParsing)
df.validatePatternString()
df
}
Expand All @@ -153,8 +154,9 @@ object DateFormatter {
format: String,
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat): DateFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat)
legacyFormat: LegacyDateFormat,
isParsing: Boolean): DateFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}

def apply(format: String, zoneId: ZoneId): DateFormatter = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ trait DateTimeFormatterHelper {
protected def getOrCreateFormatter(
pattern: String,
locale: Locale,
needVarLengthSecondFraction: Boolean = false): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern)
val useVarLen = needVarLengthSecondFraction && newPattern.contains('S')
isParsing: Boolean = false): DateTimeFormatter = {
val newPattern = convertIncompatiblePattern(pattern, isParsing)
val useVarLen = isParsing && newPattern.contains('S')
val key = (newPattern, locale, useVarLen)
var formatter = cache.getIfPresent(key)
if (formatter == null) {
Expand Down Expand Up @@ -227,6 +227,12 @@ private object DateTimeFormatterHelper {
formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
}
final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
// SPARK-31892: The week-based date fields are rarely used and are really confusing when parsing
// values to datetimes, especially when they are mixed with other non-week-based fields.
// The quarter fields are also parsed strangely: e.g. when the pattern contains `yMd` and can
// be directly resolved, `q` does check whether the month is valid, but if the date
// fields are incomplete, e.g. `yM`, the check is bypassed.
final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
final val unsupportedPatternLengths = {
// SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
// 2.4
Expand All @@ -246,7 +252,7 @@ private object DateTimeFormatterHelper {
* @param pattern The input pattern.
* @return The pattern for new parser
*/
def convertIncompatiblePattern(pattern: String): String = {
def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
case (patternPart, index) =>
// Text can be quoted using single quotes, we only check the non-quote parts.
Expand All @@ -255,7 +261,8 @@ private object DateTimeFormatterHelper {
(pattern + " ").split("'").zipWithIndex.map {
case (patternPart, index) =>
if (index % 2 == 0) {
for (c <- patternPart if unsupportedLetters.contains(c)) {
for (c <- patternPart if unsupportedLetters.contains(c) ||
(isParsing && unsupportedLettersForParsing.contains(c))) {
throw new IllegalArgumentException(s"Illegal pattern character: $c")
}
for (style <- unsupportedPatternLengths if patternPart.contains(style)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,13 +287,13 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
isParsing: Boolean = false): TimestampFormatter = {
val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
} else {
val tf = new Iso8601TimestampFormatter(
pattern, zoneId, locale, legacyFormat, needVarLengthSecondFraction)
pattern, zoneId, locale, legacyFormat, isParsing)
tf.validatePatternString()
tf
}
Expand All @@ -319,23 +319,23 @@ object TimestampFormatter {
zoneId: ZoneId,
locale: Locale,
legacyFormat: LegacyDateFormat,
needVarLengthSecondFraction: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, needVarLengthSecondFraction)
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
}

def apply(
format: String,
zoneId: ZoneId,
legacyFormat: LegacyDateFormat,
needVarLengthSecondFraction: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, needVarLengthSecondFraction)
isParsing: Boolean): TimestampFormatter = {
getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, isParsing)
}

def apply(
format: String,
zoneId: ZoneId,
needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
getFormatter(Some(format), zoneId, needVarLengthSecondFraction = needVarLengthSecondFraction)
isParsing: Boolean = false): TimestampFormatter = {
getFormatter(Some(format), zoneId, isParsing = isParsing)
}

def apply(zoneId: ZoneId): TimestampFormatter = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1168,4 +1168,33 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkExceptionInExpression[ArithmeticException](
MillisToTimestamp(Literal(-92233720368547758L)), "long overflow")
}

// SPARK-31892: parsing patterns containing week-based ('Y', 'W', 'w'), day-of-week
// ('E', 'u') or 'F' letters must fail with a SparkUpgradeException pointing at the
// 3.0 behavior change, while quarter letters ('q', 'Q') make the parse evaluate to
// null instead of raising.
test("Disable week-based date fields and quarter fields for parsing") {

// Asserts that each of the four parsing expressions (to_timestamp, to_date,
// to_unix_timestamp, unix_timestamp) throws SparkUpgradeException — with "3.0" in
// the message — when the format pattern is the single letter `c`.
def checkSparkUpgrade(c: Char): Unit = {
checkExceptionInExpression[SparkUpgradeException](
new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, "3.0")
checkExceptionInExpression[SparkUpgradeException](
new ParseToDate(Literal("1"), Literal(c.toString)).child, "3.0")
checkExceptionInExpression[SparkUpgradeException](
ToUnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
checkExceptionInExpression[SparkUpgradeException](
UnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
}

// Asserts that the same four expressions evaluate to null (rather than throwing)
// when the format pattern is the single letter `c`.
def checkNullify(c: Char): Unit = {
checkEvaluation(new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, null)
checkEvaluation(new ParseToDate(Literal("1"), Literal(c.toString)).child, null)
checkEvaluation(ToUnixTimestamp(Literal("1"), Literal(c.toString)), null)
checkEvaluation(UnixTimestamp(Literal("1"), Literal(c.toString)), null)
}

// Week-based and day-of-week letters: hard failure with upgrade guidance.
Seq('Y', 'W', 'w', 'E', 'u', 'F').foreach { l =>
checkSparkUpgrade(l)
}

// Quarter letters: lenient nullification instead of an exception.
Seq('q', 'Q').foreach { l =>
checkNullify(l)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

package org.apache.spark.sql.catalyst.util

import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._

class DateTimeFormatterHelperSuite extends SparkFunSuite {
Expand All @@ -40,6 +40,13 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
assert(e.getMessage === s"Illegal pattern character: $l")
}
unsupportedLettersForParsing.foreach { l =>
val e = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"$l", isParsing = true)
}
assert(e.getMessage === s"Illegal pattern character: $l")
assert(convertIncompatiblePattern(s"$l").nonEmpty)
}
unsupportedPatternLengths.foreach { style =>
val e1 = intercept[IllegalArgumentException] {
convertIncompatiblePattern(s"yyyy-MM-dd $style")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
val days = formatter.parse(date)
assert(date === formatter.format(days))
assert(date === formatter.format(daysToLocalDate(days)))
Expand Down Expand Up @@ -106,7 +107,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
val date = formatter.format(days)
val parsed = formatter.parse(date)
assert(days === parsed)
Expand Down Expand Up @@ -173,7 +175,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
DateFormatter.defaultPattern,
getZoneId(timeZone),
DateFormatter.defaultLocale,
legacyFormat)
legacyFormat,
isParsing = false)
assert(LocalDate.ofEpochDay(formatter.parse("1000-01-01")) === LocalDate.of(1000, 1, 1))
assert(formatter.format(LocalDate.of(1000, 1, 1)) === "1000-01-01")
assert(formatter.format(localDateToDays(LocalDate.of(1000, 1, 1))) === "1000-01-01")
Expand Down
Loading