Skip to content

Commit 1e6c1d8

Browse files
Justin Uang authored and HyukjinKwon committed
[SPARK-25493][SQL] Use auto-detection for CRLF in CSV datasource multiline mode
## What changes were proposed in this pull request?

CSVs with Windows-style CRLF ('\r\n') line endings don't work in multiline mode. They work fine in single-line mode because the line separation is done by Hadoop, which can handle all the different types of line separators. This PR fixes it by enabling Univocity's line separator detection in multiline mode, which will detect '\r\n', '\r', or '\n' automatically, as is done by Hadoop in single-line mode.

## How was this patch tested?

Unit test with a file with CRLF line endings.

Closes #22503 from justinuang/fix-clrf-multiline.

Authored-by: Justin Uang <juang@palantir.com>
Signed-off-by: hyukjinkwon <gurwls223@apache.org>
1 parent d0ecff2 commit 1e6c1d8

File tree

3 files changed

+21
-0
lines changed

3 files changed

+21
-0
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,8 @@ class CSVOptions(
212212
settings.setEmptyValue(emptyValueInRead)
213213
settings.setMaxCharsPerColumn(maxCharsPerColumn)
214214
settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER)
215+
settings.setLineSeparatorDetectionEnabled(multiLine == true)
216+
215217
settings
216218
}
217219
}
Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
2+
year,make,model,comment,blank
3+
"2012","Tesla","S","No comment",
4+
5+
1997,Ford,E350,"Go get one now they are going fast",
6+
2015,Chevy,Volt
7+

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
5252
private val carsNullFile = "test-data/cars-null.csv"
5353
private val carsEmptyValueFile = "test-data/cars-empty-value.csv"
5454
private val carsBlankColName = "test-data/cars-blank-column-name.csv"
55+
private val carsCrlf = "test-data/cars-crlf.csv"
5556
private val emptyFile = "test-data/empty.csv"
5657
private val commentsFile = "test-data/comments.csv"
5758
private val disableCommentsFile = "test-data/disable_comments.csv"
@@ -220,6 +221,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
220221
}
221222
}
222223

224+
test("crlf line separators in multiline mode") {
225+
val cars = spark
226+
.read
227+
.format("csv")
228+
.option("multiLine", "true")
229+
.option("header", "true")
230+
.load(testFile(carsCrlf))
231+
232+
verifyCars(cars, withHeader = true)
233+
}
234+
223235
test("test aliases sep and encoding for delimiter and charset") {
224236
// scalastyle:off
225237
val cars = spark

0 commit comments

Comments
 (0)