@@ -1894,8 +1894,8 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
1894
1894
}
1895
1895
}
1896
1896
1897
- def testLineSeparator (lineSep : String ): Unit = {
1898
- test(s " Support line separator - lineSep: ' $lineSep ' " ) {
1897
+ def testLineSeparator (lineSep : String , encoding : String , inferSchema : Boolean , id : Int ): Unit = {
1898
+ test(s " Support line separator in ${encoding} # ${id} " ) {
1899
1899
// Read
1900
1900
val data =
1901
1901
s """ "a",1 $lineSep
@@ -1905,17 +1905,23 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
1905
1905
1906
1906
Seq (data, dataWithTrailingLineSep).foreach { lines =>
1907
1907
withTempPath { path =>
1908
- Files .write(path.toPath, lines.getBytes(StandardCharsets . UTF_8 ))
1909
- val schema = StructType (StructField (" f " , StringType )
1910
- :: StructField (" f0 " , LongType ) :: Nil )
1908
+ Files .write(path.toPath, lines.getBytes(encoding ))
1909
+ val schema = StructType (StructField (" _c0 " , StringType )
1910
+ :: StructField (" _c1 " , LongType ) :: Nil )
1911
1911
1912
- val expected = Seq ((" a" , 1 ), (" \n c" , 2 ), (" \n d" , 3 )).toDF()
1912
+ val expected = Seq ((" a" , 1 ), (" \n c" , 2 ), (" \n d" , 3 ))
1913
+ .toDF(" _c0" , " _c1" )
1913
1914
Seq (false , true ).foreach { multiLine =>
1914
- val df = spark.read
1915
- .schema(schema)
1915
+ val reader = spark
1916
+ .read
1916
1917
.option(" lineSep" , lineSep)
1917
1918
.option(" multiLine" , multiLine)
1918
- .csv(path.getAbsolutePath)
1919
+ .option(" encoding" , encoding)
1920
+ val df = if (inferSchema) {
1921
+ reader.option(" inferSchema" , true ).csv(path.getAbsolutePath)
1922
+ } else {
1923
+ reader.schema(schema).csv(path.getAbsolutePath)
1924
+ }
1919
1925
checkAnswer(df, expected)
1920
1926
}
1921
1927
}
@@ -1924,26 +1930,50 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
1924
1930
// Write
1925
1931
withTempPath { path =>
1926
1932
Seq (" a" , " b" , " c" ).toDF(" value" ).coalesce(1 )
1927
- .write.option(" lineSep" , lineSep).csv(path.getAbsolutePath)
1933
+ .write
1934
+ .option(" lineSep" , lineSep)
1935
+ .option(" encoding" , encoding)
1936
+ .csv(path.getAbsolutePath)
1928
1937
val partFile = TestUtils .recursiveList(path).filter(f => f.getName.startsWith(" part-" )).head
1929
- val readBack = new String (Files .readAllBytes(partFile.toPath), StandardCharsets . UTF_8 )
1938
+ val readBack = new String (Files .readAllBytes(partFile.toPath), encoding )
1930
1939
assert(
1931
1940
readBack === s " a ${lineSep}b ${lineSep}c ${lineSep}" )
1932
1941
}
1933
1942
1934
1943
// Roundtrip
1935
1944
withTempPath { path =>
1936
1945
val df = Seq (" a" , " b" , " c" ).toDF()
1937
- df.write.option(" lineSep" , lineSep).csv(path.getAbsolutePath)
1938
- val readBack = spark.read.option(" lineSep" , lineSep).csv(path.getAbsolutePath)
1946
+ df.write
1947
+ .option(" lineSep" , lineSep)
1948
+ .option(" encoding" , encoding)
1949
+ .csv(path.getAbsolutePath)
1950
+ val readBack = spark
1951
+ .read
1952
+ .option(" lineSep" , lineSep)
1953
+ .option(" encoding" , encoding)
1954
+ .csv(path.getAbsolutePath)
1939
1955
checkAnswer(df, readBack)
1940
1956
}
1941
1957
}
1942
1958
}
1943
1959
1944
1960
// scalastyle:off nonascii
1945
- Seq (" |" , " ^" , " ::" , 0x1E .toChar.toString).foreach { lineSep =>
1946
- testLineSeparator(lineSep)
1961
+ List (
1962
+ (0 , " |" , " UTF-8" , false ),
1963
+ (1 , " ^" , " UTF-16BE" , true ),
1964
+ (2 , " ::" , " ISO-8859-1" , true ),
1965
+ (3 , " !!" , " UTF-32LE" , false ),
1966
+ (4 , 0x1E .toChar.toString, " UTF-8" , true ),
1967
+ (5 , " 아" , " UTF-32BE" , false ),
1968
+ (6 , " ку" , " CP1251" , true ),
1969
+ (8 , " \r\n " , " UTF-16LE" , true ),
1970
+ (9 , " \r\n " , " utf-16be" , false ),
1971
+ (10 , " \u000d\u000a " , " UTF-32BE" , false ),
1972
+ (11 , " \u000a\u000d " , " UTF-8" , true ),
1973
+ (12 , " ==" , " US-ASCII" , false ),
1974
+ (13 , " $^" , " utf-32le" , true )
1975
+ ).foreach { case (testNum, sep, encoding, inferSchema) =>
1976
+ testLineSeparator(sep, encoding, inferSchema, testNum)
1947
1977
}
1948
1978
// scalastyle:on nonascii
1949
1979
0 commit comments