Skip to content

Commit b025271

Browse files
committed
1 parent dfa3978 commit b025271

File tree

4 files changed

+46
-1
lines changed

4 files changed

+46
-1
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,12 @@ class CSVOptions(
213213
}
214214
val lineSeparatorInWrite: Option[String] = lineSeparator
215215

216+
/**
217+
* The handling method to be used when unescaped quotes are found in the input.
218+
*/
219+
val unescapedQuoteHandling: UnescapedQuoteHandling = UnescapedQuoteHandling.valueOf(parameters
220+
.getOrElse("unescapedQuoteHandling", "STOP_AT_DELIMITER").toUpperCase(Locale.ROOT))
221+
216222
def asWriterSettings: CsvWriterSettings = {
217223
val writerSettings = new CsvWriterSettings()
218224
val format = writerSettings.getFormat
@@ -258,7 +264,7 @@ class CSVOptions(
258264
settings.setNullValue(nullValue)
259265
settings.setEmptyValue(emptyValueInRead)
260266
settings.setMaxCharsPerColumn(maxCharsPerColumn)
261-
settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER)
267+
settings.setUnescapedQuoteHandling(unescapedQuoteHandling)
262268
settings.setLineSeparatorDetectionEnabled(lineSeparatorInRead.isEmpty && multiLine)
263269
lineSeparatorInRead.foreach { _ =>
264270
settings.setNormalizeLineEndingsWithinQuotes(!multiLine)

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,27 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
727727
* a record can have.</li>
728728
* <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
729729
* for any given value being read. By default, it is -1 meaning unlimited length</li>
730+
* <li>`unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser
731+
* will handle values with unescaped quotes.
732+
* <ul>
733+
* <li>`STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate
734+
* the quote character and proceed parsing the value as a quoted value, until a closing
735+
* quote is found.</li>
736+
* <li>`BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the value
737+
* as an unquoted value. This will make the parser accumulate all characters of the current
738+
* parsed value until the delimiter is found. If no
739+
* delimiter is found in the value, the parser will continue accumulating characters from
740+
* the input until a delimiter or line ending is found.</li>
741+
* <li>`STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the value
742+
* as an unquoted value. This will make the parser accumulate all characters until the
743+
* delimiter or a line ending is found in the input.</li>
744+
 * <li>`SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed
745+
* for the given value will be skipped and the value set in nullValue will be produced
746+
* instead.</li>
747+
* <li>`RAISE_ERROR`: If unescaped quotes are found in the input, a TextParsingException
748+
* will be thrown.</li>
749+
* </ul>
750+
* </li>
730751
* <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
731752
* during parsing. It supports the following case-insensitive modes. Note that Spark tries
732753
* to parse only required columns in CSV under column pruning. Therefore, corrupt records
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
c1,c2
2+
"a,""b,c","xyz"
3+
"a,b,c","x""yz"

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ abstract class CSVSuite
7575
private val valueMalformedFile = "test-data/value-malformed.csv"
7676
private val badAfterGoodFile = "test-data/bad_after_good.csv"
7777
private val malformedRowFile = "test-data/malformedRow.csv"
78+
private val unescapedQuotesAndUnescapedDelimiterFile =
79+
"test-data/unescaped-quotes-unescaped-delimiter.csv"
7880

7981
/** Verifies data and schema. */
8082
private def verifyCars(
@@ -2428,6 +2430,19 @@ abstract class CSVSuite
24282430
assert(readback.collect sameElements Array(Row("0"), Row("1"), Row("2")))
24292431
}
24302432
}
2433+
2434+
test("SPARK-33566: configure UnescapedQuoteHandling to parse " +
2435+
"unescapedQuotesAndUnescapedDelimiterFile correctly") {
2436+
    // Without configuring UnescapedQuoteHandling to STOP_AT_CLOSING_QUOTE,
2437+
// the result will be Row(""""a,""b""", """c""""), Row("""a,b,c""", """"x""yz"""")
2438+
val result = spark.read
2439+
.option("inferSchema", "true")
2440+
.option("header", "true")
2441+
.option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
2442+
.csv(testFile(unescapedQuotesAndUnescapedDelimiterFile)).collect()
2443+
val exceptResults = Array(Row("""a,""b,c""", "xyz"), Row("""a,b,c""", """x""yz"""))
2444+
assert(result.sameElements(exceptResults))
2445+
}
24312446
}
24322447

24332448
class CSVv1Suite extends CSVSuite {

0 commit comments

Comments
 (0)