Skip to content

Commit 0869b81

Browse files
committed
Tests for lineSep in different encodings
1 parent bb8a13b commit 0869b81

File tree

1 file changed

+45
-15
lines changed
  • sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv

1 file changed

+45
-15
lines changed

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1894,8 +1894,8 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
18941894
}
18951895
}
18961896

1897-
def testLineSeparator(lineSep: String): Unit = {
1898-
test(s"Support line separator - lineSep: '$lineSep'") {
1897+
def testLineSeparator(lineSep: String, encoding: String, inferSchema: Boolean, id: Int): Unit = {
1898+
test(s"Support line separator in ${encoding} #${id}") {
18991899
// Read
19001900
val data =
19011901
s""""a",1$lineSep
@@ -1905,17 +1905,23 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
19051905

19061906
Seq(data, dataWithTrailingLineSep).foreach { lines =>
19071907
withTempPath { path =>
1908-
Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
1909-
val schema = StructType(StructField("f", StringType)
1910-
:: StructField("f0", LongType) :: Nil)
1908+
Files.write(path.toPath, lines.getBytes(encoding))
1909+
val schema = StructType(StructField("_c0", StringType)
1910+
:: StructField("_c1", LongType) :: Nil)
19111911

1912-
val expected = Seq(("a", 1), ("\nc", 2), ("\nd", 3)).toDF()
1912+
val expected = Seq(("a", 1), ("\nc", 2), ("\nd", 3))
1913+
.toDF("_c0", "_c1")
19131914
Seq(false, true).foreach { multiLine =>
1914-
val df = spark.read
1915-
.schema(schema)
1915+
val reader = spark
1916+
.read
19161917
.option("lineSep", lineSep)
19171918
.option("multiLine", multiLine)
1918-
.csv(path.getAbsolutePath)
1919+
.option("encoding", encoding)
1920+
val df = if (inferSchema) {
1921+
reader.option("inferSchema", true).csv(path.getAbsolutePath)
1922+
} else {
1923+
reader.schema(schema).csv(path.getAbsolutePath)
1924+
}
19191925
checkAnswer(df, expected)
19201926
}
19211927
}
@@ -1924,26 +1930,50 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
19241930
// Write
19251931
withTempPath { path =>
19261932
Seq("a", "b", "c").toDF("value").coalesce(1)
1927-
.write.option("lineSep", lineSep).csv(path.getAbsolutePath)
1933+
.write
1934+
.option("lineSep", lineSep)
1935+
.option("encoding", encoding)
1936+
.csv(path.getAbsolutePath)
19281937
val partFile = TestUtils.recursiveList(path).filter(f => f.getName.startsWith("part-")).head
1929-
val readBack = new String(Files.readAllBytes(partFile.toPath), StandardCharsets.UTF_8)
1938+
val readBack = new String(Files.readAllBytes(partFile.toPath), encoding)
19301939
assert(
19311940
readBack === s"a${lineSep}b${lineSep}c${lineSep}")
19321941
}
19331942

19341943
// Roundtrip
19351944
withTempPath { path =>
19361945
val df = Seq("a", "b", "c").toDF()
1937-
df.write.option("lineSep", lineSep).csv(path.getAbsolutePath)
1938-
val readBack = spark.read.option("lineSep", lineSep).csv(path.getAbsolutePath)
1946+
df.write
1947+
.option("lineSep", lineSep)
1948+
.option("encoding", encoding)
1949+
.csv(path.getAbsolutePath)
1950+
val readBack = spark
1951+
.read
1952+
.option("lineSep", lineSep)
1953+
.option("encoding", encoding)
1954+
.csv(path.getAbsolutePath)
19391955
checkAnswer(df, readBack)
19401956
}
19411957
}
19421958
}
19431959

19441960
// scalastyle:off nonascii
1945-
Seq("|", "^", "::", 0x1E.toChar.toString).foreach { lineSep =>
1946-
testLineSeparator(lineSep)
1961+
List(
1962+
(0, "|", "UTF-8", false),
1963+
(1, "^", "UTF-16BE", true),
1964+
(2, "::", "ISO-8859-1", true),
1965+
(3, "!!", "UTF-32LE", false),
1966+
(4, 0x1E.toChar.toString, "UTF-8", true),
1967+
(5, "아", "UTF-32BE", false),
1968+
(6, "ку", "CP1251", true),
1969+
(8, "\r\n", "UTF-16LE", true),
1970+
(9, "\r\n", "utf-16be", false),
1971+
(10, "\u000d\u000a", "UTF-32BE", false),
1972+
(11, "\u000a\u000d", "UTF-8", true),
1973+
(12, "==", "US-ASCII", false),
1974+
(13, "$^", "utf-32le", true)
1975+
).foreach { case (testNum, sep, encoding, inferSchema) =>
1976+
testLineSeparator(sep, encoding, inferSchema, testNum)
19471977
}
19481978
// scalastyle:on nonascii
19491979

0 commit comments

Comments
 (0)