-
Notifications
You must be signed in to change notification settings - Fork 28.6k
[SPARK-34953][CORE][SQL] Add the code change for adding the DateType in the infer schema while reading in CSV and JSON #32558
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6e65207
0aac7a0
f4959e1
8946bff
275ed99
100d40a
4e272cd
3d46a3c
78a7356
2689fd1
c93b873
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,8 +24,8 @@ import scala.util.control.Exception.allCatch | |
import org.apache.spark.rdd.RDD | ||
import org.apache.spark.sql.catalyst.analysis.TypeCoercion | ||
import org.apache.spark.sql.catalyst.expressions.ExprUtils | ||
import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyFastDateFormatter, TimestampFormatter} | ||
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT | ||
import org.apache.spark.sql.catalyst.util.TimestampFormatter | ||
import org.apache.spark.sql.errors.QueryExecutionErrors | ||
import org.apache.spark.sql.types._ | ||
|
||
|
@@ -38,6 +38,12 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
legacyFormat = FAST_DATE_FORMAT, | ||
isParsing = true) | ||
|
||
// Formatter built from the configured date pattern and locale; schema inference
// uses it to test whether a field parses as a date.
private val dateFormatter = DateFormatter(
  options.dateFormat, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true)
|
||
private val decimalParser = if (options.locale == Locale.US) { | ||
// Special handling the default locale for backward compatibility | ||
s: String => new java.math.BigDecimal(s) | ||
|
@@ -109,6 +115,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
case LongType => tryParseLong(field) | ||
case _: DecimalType => tryParseDecimal(field) | ||
case DoubleType => tryParseDouble(field) | ||
case DateType => tryParseDateFormat(field) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just curious why you try to infer dates before timestamps? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is as per the suggestion in one of the review comments to use |
||
case TimestampType => tryParseTimestamp(field) | ||
case BooleanType => tryParseBoolean(field) | ||
case StringType => StringType | ||
|
@@ -160,6 +167,16 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable { | |
/**
 * Infers `DoubleType` when the field converts with `toDouble` or is accepted by
 * `isInfOrNan`; otherwise defers to the date-format check.
 */
private def tryParseDouble(field: String): DataType = {
  // Any exception thrown by toDouble is absorbed and treated as "not a double".
  val parsesAsDouble = (allCatch opt field.toDouble).isDefined
  if (parsesAsDouble || isInfOrNan(field)) DoubleType else tryParseDateFormat(field)
}
|
||
private def tryParseDateFormat(field: String): DataType = { | ||
if (options.inferDateType | ||
&& !dateFormatter.isInstanceOf[LegacyFastDateFormatter] | ||
&& (allCatch opt dateFormatter.parse(field)).isDefined) { | ||
DateType | ||
} else { | ||
tryParseTimestamp(field) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -206,6 +206,12 @@ class CSVOptions( | |
sep | ||
} | ||
|
||
/** | ||
* Option to infer DateType columns during schema inference; disabled unless explicitly set.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For me as a user, the relation between the `inferSchema` and `inferDateType` options is not clear. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `inferDateType` option was added to stay in sync with older versions of Spark and to prevent migration issues when moving to Spark 3.2.0 from an older version. If someone wants DateType to be inferred, they can enable it through this option; if they don't enable `inferDateType`, the column will be inferred as StringType. `inferSchema`, on the other hand, is what enables schema inference in the first place. |
||
*/ | ||
// A missing "inferDateType" key yields false; a present value is converted with toBoolean.
val inferDateType: Boolean = parameters.get("inferDateType").exists(_.toBoolean)
|
||
val lineSeparatorInRead: Option[Array[Byte]] = lineSeparator.map { lineSep => | ||
lineSep.getBytes(charset) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
cc @MaxGekk FYI