
Commit 31ab8bc

sadikovi authored and HyukjinKwon committed
[SPARK-39904][SQL] Rename inferDate to prefersDate and clarify semantics of the option in CSV data source
### What changes were proposed in this pull request? This is a follow-up for #36871. PR renames `inferDate` to `prefersDate` to avoid confusion when dates inference would change the column type and result in confusion when the user meant to infer timestamps. The patch also updates semantics of the option: `prefersDate` is allowed to be used during schema inference (`inferSchema`) as well as user-provided schema where it could be used as a fallback mechanism when parsing timestamps. ### Why are the changes needed? Fixes ambiguity when setting `prefersDate` to true and clarifies semantics of the option. ### Does this PR introduce _any_ user-facing change? Although it is an option rename, the original PR was merged a few days ago and the config option has not been included in a Spark release. ### How was this patch tested? I added a unit test for prefersDate = true with a user schema. Closes #37327 from sadikovi/rename_config. Authored-by: Ivan Sadikov <ivan.sadikov@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
1 parent e9cc102 commit 31ab8bc

File tree

8 files changed: +72 −30 lines changed

docs/sql-data-sources-csv.md

Lines changed: 4 additions & 4 deletions

```diff
@@ -109,9 +109,9 @@ Data source options of CSV can be set via:
   <td>read</td>
 </tr>
 <tr>
-  <td><code>inferDate</code></td>
+  <td><code>prefersDate</code></td>
   <td>false</td>
-  <td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code>. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
+  <td>During schema inference (<code>inferSchema</code>), attempts to infer string columns that contain dates or timestamps as <code>Date</code> if the values satisfy the <code>dateFormat</code> option and failed to be parsed by the respective formatter. With a user-provided schema, attempts to parse timestamp columns as dates using <code>dateFormat</code> if they fail to conform to <code>timestampFormat</code>, in this case the parsed values will be cast to timestamp type afterwards.</td>
   <td>read</td>
 </tr>
 <tr>
@@ -176,8 +176,8 @@ Data source options of CSV can be set via:
 </tr>
 <tr>
   <td><code>enableDateTimeParsingFallback</code></td>
-  <td>Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided</td>
-  <td>Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
+  <td>Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided.</td>
+  <td>Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
   <td>read</td>
 </tr>
 <tr>
```

docs/sql-data-sources-json.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -204,8 +204,8 @@ Data source options of JSON can be set via:
 </tr>
 <tr>
   <td><code>enableDateTimeParsingFallback</code></td>
-  <td>Enabled if the time parser policy is legacy or no custom date or timestamp pattern was provided</td>
-  <td>Allows to fall back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
+  <td>Enabled if the time parser policy has legacy settings or if no custom date or timestamp pattern was provided.</td>
+  <td>Allows falling back to the backward compatible (Spark 1.x and 2.0) behavior of parsing dates and timestamps if values do not match the set patterns.</td>
   <td>read</td>
 </tr>
 <tr>
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala

Lines changed: 3 additions & 3 deletions

```diff
@@ -124,9 +124,9 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
       case _: DecimalType => tryParseDecimal(field)
       case DoubleType => tryParseDouble(field)
       case DateType => tryParseDateTime(field)
-      case TimestampNTZType if options.inferDate => tryParseDateTime(field)
+      case TimestampNTZType if options.prefersDate => tryParseDateTime(field)
       case TimestampNTZType => tryParseTimestampNTZ(field)
-      case TimestampType if options.inferDate => tryParseDateTime(field)
+      case TimestampType if options.prefersDate => tryParseDateTime(field)
       case TimestampType => tryParseTimestamp(field)
       case BooleanType => tryParseBoolean(field)
       case StringType => StringType
@@ -178,7 +178,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
   private def tryParseDouble(field: String): DataType = {
     if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) {
       DoubleType
-    } else if (options.inferDate) {
+    } else if (options.prefersDate) {
       tryParseDateTime(field)
     } else {
       tryParseTimestampNTZ(field)
```
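The date-before-timestamp ordering in the hunk above can be sketched in plain Scala, independent of Spark. This is a minimal illustration with hypothetical names (`InferSketch`, `inferField` are not Spark's API) and assumed `yyyy-MM-dd` / `yyyy-MM-dd'T'HH:mm:ss` patterns: with `prefersDate` enabled, the strict date pattern is tried first, and only on failure does the field fall through to the timestamp pattern.

```scala
import java.time.{LocalDate, LocalDateTime}
import java.time.format.{DateTimeFormatter, DateTimeParseException}

// Minimal sketch (not Spark's CSVInferSchema) of the prefersDate ordering:
// try the strict date pattern first, then the timestamp pattern.
object InferSketch {
  private val dateFmt = DateTimeFormatter.ofPattern("yyyy-MM-dd")
  private val tsFmt   = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss")

  // Returns true if the by-name parse expression succeeds.
  private def parses(p: => Any): Boolean =
    try { p; true } catch { case _: DateTimeParseException => false }

  def inferField(field: String, prefersDate: Boolean): String =
    if (prefersDate && parses(LocalDate.parse(field, dateFmt))) "date"
    else if (parses(LocalDateTime.parse(field, tsFmt))) "timestamp"
    else "string"
}
```

Because `DateTimeFormatter` requires the whole input to be consumed, a value like `2018-12-03T11:00:00` fails the date pattern and still resolves as a timestamp, which mirrors the expectations in `CSVInferSchemaSuite`.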

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala

Lines changed: 15 additions & 9 deletions

```diff
@@ -149,23 +149,29 @@ class CSVOptions(
   val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US)
 
   /**
-   * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type).
-   * Disabled by default for backwards compatibility and performance. When enabled, date entries in
-   * timestamp columns will be cast to timestamp upon parsing. Not compatible with
-   * legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters
+   * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type)
+   * if schema inference is enabled. When being used with user-provided schema, tries to parse
+   * timestamp values as dates if the values do not conform to the timestamp formatter before
+   * falling back to the backward compatible parsing - the parsed values will be cast to timestamp
+   * afterwards.
+   *
+   * Disabled by default for backwards compatibility and performance.
+   *
+   * Not compatible with legacyTimeParserPolicy == LEGACY since legacy date parser will accept
+   * extra trailing characters.
    */
-  val inferDate = {
-    val inferDateFlag = getBool("inferDate")
-    if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) {
+  val prefersDate = {
+    val inferDateFlag = getBool("prefersDate")
+    if (inferDateFlag && SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
       throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
     }
     inferDateFlag
   }
 
-  // Provide a default value for dateFormatInRead when inferDate. This ensures that the
+  // Provide a default value for dateFormatInRead when prefersDate. This ensures that the
   // Iso8601DateFormatter (with strict date parsing) is used for date inference
   val dateFormatInRead: Option[String] =
-    if (inferDate) {
+    if (prefersDate) {
       Option(parameters.getOrElse("dateFormat", DateFormatter.defaultPattern))
     } else {
       parameters.get("dateFormat")
```
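As a standalone sketch of the option resolution above (names and the `"yyyy-MM-dd"` default pattern are assumptions for illustration; Spark's version reads `SQLConf` and throws via `QueryExecutionErrors`): the renamed option is read from the parameter map, rejected when the legacy time parser policy is in effect, and the date pattern is defaulted only when the option is enabled.

```scala
// Hypothetical standalone version of the CSVOptions logic above.
object OptionsSketch {
  // Read "prefersDate" and reject it under the legacy parser policy.
  def resolvePrefersDate(params: Map[String, String], legacyPolicy: Boolean): Boolean = {
    val flag = params.get("prefersDate").exists(_.toBoolean)
    if (flag && legacyPolicy) {
      // Stands in for QueryExecutionErrors.inferDateWithLegacyTimeParserError()
      throw new IllegalArgumentException(
        "CANNOT_INFER_DATE: prefersDate is not supported with the legacy time parser")
    }
    flag
  }

  // Default the date pattern only when prefersDate is enabled, so a strict
  // date formatter is always available for date inference.
  def dateFormatInRead(params: Map[String, String], prefersDate: Boolean): Option[String] =
    if (prefersDate) Some(params.getOrElse("dateFormat", "yyyy-MM-dd"))
    else params.get("dateFormat")
}
```

Checking the flag before consulting the policy (as the reordered `if` in the diff does) keeps the common disabled path free of any policy lookup.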

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -235,7 +235,7 @@ class UnivocityParser(
       } catch {
         case NonFatal(e) =>
           // There may be date type entries in timestamp column due to schema inference
-          if (options.inferDate) {
+          if (options.prefersDate) {
             daysToMicros(dateFormatter.parse(datum), options.zoneId)
           } else {
             // If fails to parse, then tries the way used in 2.0 and 1.x for backwards
@@ -254,7 +254,7 @@ class UnivocityParser(
       try {
         timestampNTZFormatter.parseWithoutTimeZone(datum, false)
       } catch {
-        case NonFatal(e) if (options.inferDate) =>
+        case NonFatal(e) if options.prefersDate =>
           daysToMicros(dateFormatter.parse(datum), TimeZoneUTC.toZoneId)
       }
     }
```
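The fallback in the hunk above can be illustrated with a self-contained java.time sketch (object and method names are hypothetical; Spark uses its own formatters and `daysToMicros`): a value in a timestamp column that fails the timestamp pattern is re-parsed with the date pattern, and the resulting day is promoted to a midnight timestamp.

```scala
import java.time.{LocalDate, LocalDateTime, ZoneOffset}
import java.time.format.{DateTimeFormatter, DateTimeParseException}

// Sketch of the prefersDate parse fallback: timestamp pattern first, then
// date pattern, with the parsed date cast to a midnight UTC timestamp
// (microseconds since the epoch, mirroring daysToMicros).
object ParseSketch {
  private val tsFmt   = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss")
  private val dateFmt = DateTimeFormatter.ofPattern("yyyy-MM-dd")

  def parseTimestampMicros(datum: String): Option[Long] =
    try {
      Some(LocalDateTime.parse(datum, tsFmt).toInstant(ZoneOffset.UTC).toEpochMilli * 1000L)
    } catch {
      case _: DateTimeParseException =>
        // prefersDate fallback: parse as a date, cast to timestamp afterwards
        try Some(LocalDate.parse(datum, dateFmt).atStartOfDay
          .toInstant(ZoneOffset.UTC).toEpochMilli * 1000L)
        catch { case _: DateTimeParseException => None }
    }
}
```

This is the behavior the new `CSVSuite` test below exercises end to end: `"2020-02-02"` in a `TimestampType` column becomes `2020-02-02 00:00:00`, while `"invalid"` becomes null.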

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala

Lines changed: 5 additions & 5 deletions

```diff
@@ -201,30 +201,30 @@ class CSVInferSchemaSuite extends SparkFunSuite with SQLHelper {
 
   test("SPARK-39469: inferring date type") {
     // "yyyy/MM/dd" format
-    var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "inferDate" -> "true"),
+    var options = new CSVOptions(Map("dateFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"),
       false, "UTC")
     var inferSchema = new CSVInferSchema(options)
     assert(inferSchema.inferField(NullType, "2018/12/02") == DateType)
     // "MMM yyyy" format
-    options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "inferDate" -> "true"),
+    options = new CSVOptions(Map("dateFormat" -> "MMM yyyy", "prefersDate" -> "true"),
       false, "GMT")
     inferSchema = new CSVInferSchema(options)
     assert(inferSchema.inferField(NullType, "Dec 2018") == DateType)
     // Field should strictly match date format to infer as date
     options = new CSVOptions(
       Map("dateFormat" -> "yyyy-MM-dd", "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
-        "inferDate" -> "true"),
+        "prefersDate" -> "true"),
       columnPruning = false,
       defaultTimeZoneId = "GMT")
     inferSchema = new CSVInferSchema(options)
     assert(inferSchema.inferField(NullType, "2018-12-03T11:00:00") == TimestampType)
     assert(inferSchema.inferField(NullType, "2018-12-03") == DateType)
   }
 
-  test("SPARK-39469: inferring date and timestamp types in a mixed column with inferDate=true") {
+  test("SPARK-39469: inferring date and timestamp types in a mixed column with prefersDate=true") {
     var options = new CSVOptions(
       Map("dateFormat" -> "yyyy_MM_dd", "timestampFormat" -> "yyyy|MM|dd",
-        "timestampNTZFormat" -> "yyyy/MM/dd", "inferDate" -> "true"),
+        "timestampNTZFormat" -> "yyyy/MM/dd", "prefersDate" -> "true"),
       columnPruning = false,
       defaultTimeZoneId = "UTC")
     var inferSchema = new CSVInferSchema(options)
```

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala

Lines changed: 2 additions & 2 deletions

```diff
@@ -373,10 +373,10 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
     assert(err.getMessage.contains("Illegal pattern character: n"))
   }
 
-  test("SPARK-39469: dates should be parsed correctly in a timestamp column when inferDate=true") {
+  test("SPARK-39469: dates should be parsed correctly in timestamp column when prefersDate=true") {
     def checkDate(dataType: DataType): Unit = {
       val timestampsOptions =
-        new CSVOptions(Map("inferDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm",
+        new CSVOptions(Map("prefersDate" -> "true", "timestampFormat" -> "dd/MM/yyyy HH:mm",
           "timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"),
           false, DateTimeUtils.getZoneId("-08:00").toString)
       // Use CSVOption ZoneId="-08:00" (PST) to test that Dates in TimestampNTZ column are always
```

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 39 additions & 3 deletions

```diff
@@ -2797,13 +2797,13 @@ abstract class CSVSuite
       "inferSchema" -> "true",
       "timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
       "dateFormat" -> "yyyy-MM-dd",
-      "inferDate" -> "true")
+      "prefersDate" -> "true")
     val options2 = Map(
       "header" -> "true",
       "inferSchema" -> "true",
-      "inferDate" -> "true")
+      "prefersDate" -> "true")
 
-    // Error should be thrown when attempting to inferDate with Legacy parser
+    // Error should be thrown when attempting to prefersDate with Legacy parser
     if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
       val msg = intercept[IllegalArgumentException] {
         spark.read
@@ -2840,6 +2840,42 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-39904: Parse incorrect timestamp values with prefersDate=true") {
+    withTempPath { path =>
+      Seq(
+        "2020-02-01 12:34:56",
+        "2020-02-02",
+        "invalid"
+      ).toDF()
+        .repartition(1)
+        .write.text(path.getAbsolutePath)
+
+      val schema = new StructType()
+        .add("ts", TimestampType)
+
+      val output = spark.read
+        .schema(schema)
+        .option("prefersDate", "true")
+        .csv(path.getAbsolutePath)
+
+      if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
+        val msg = intercept[IllegalArgumentException] {
+          output.collect()
+        }.getMessage
+        assert(msg.contains("CANNOT_INFER_DATE"))
+      } else {
+        checkAnswer(
+          output,
+          Seq(
+            Row(Timestamp.valueOf("2020-02-01 12:34:56")),
+            Row(Timestamp.valueOf("2020-02-02 00:00:00")),
+            Row(null)
+          )
+        )
+      }
+    }
+  }
+
   test("SPARK-39731: Correctly parse dates and timestamps with yyyyMMdd pattern") {
     withTempPath { path =>
       Seq(
```
0 commit comments
