
Commit 8ef323c

dongjoon-hyun authored and cloud-fan committed
[SPARK-23072][SQL][TEST] Add a Unicode schema test for file-based data sources
## What changes were proposed in this pull request?

After [SPARK-20682](#19651), Apache Spark 2.3 is able to read ORC files with a Unicode schema. Previously, such a read raised `org.apache.spark.sql.catalyst.parser.ParseException`. This PR adds a Unicode schema test for the CSV/JSON/ORC/Parquet file-based data sources. Note that the TEXT data source only has [a single column with the fixed name 'value'](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala#L71).

## How was this patch tested?

Pass the newly added test case.

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #20266 from dongjoon-hyun/SPARK-23072.

(cherry picked from commit a0aedb0)
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
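For context, the round trip that the new test exercises can be reproduced directly in `spark-shell`; a minimal sketch, not part of this commit (the temp path is an assumption for illustration):

```scala
// Runnable in spark-shell on Spark 2.3+, where `spark` is predefined;
// the temp path below is an assumption for illustration only.
import spark.implicits._

val df = Seq("a").toDF("한글")   // a DataFrame with a Unicode column name

val path = java.nio.file.Files.createTempDirectory("spark-23072").toString + "/orc"
df.write.orc(path)               // write an ORC file whose schema holds the non-ASCII name
val back = spark.read.orc(path)  // on Spark 2.2 this step raised ParseException

assert(back.schema.fieldNames.head == "한글")  // the Unicode name survives the round trip
```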
1 parent 00c744e commit 8ef323c

File tree

4 files changed (+81, -38 lines)
sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.spark.sql.test.SharedSQLContext
+
+class FileBasedDataSourceSuite extends QueryTest with SharedSQLContext {
+  import testImplicits._
+
+  private val allFileBasedDataSources = Seq("orc", "parquet", "csv", "json", "text")
+
+  allFileBasedDataSources.foreach { format =>
+    test(s"Writing empty datasets should not fail - $format") {
+      withTempPath { dir =>
+        Seq("str").toDS().limit(0).write.format(format).save(dir.getCanonicalPath)
+      }
+    }
+  }
+
+  // `TEXT` data source always has a single column whose name is `value`.
+  allFileBasedDataSources.filterNot(_ == "text").foreach { format =>
+    test(s"SPARK-23072 Write and read back unicode column names - $format") {
+      withTempPath { path =>
+        val dir = path.getCanonicalPath
+
+        // scalastyle:off nonascii
+        val df = Seq("a").toDF("한글")
+        // scalastyle:on nonascii
+
+        df.write.format(format).option("header", "true").save(dir)
+        val answerDf = spark.read.format(format).option("header", "true").load(dir)
+
+        assert(df.schema.sameType(answerDf.schema))
+        checkAnswer(df, answerDf)
+      }
+    }
+  }
+
+  // Only ORC/Parquet support this. `CSV` and `JSON` return an empty schema.
+  // `TEXT` data source always has a single column whose name is `value`.
+  Seq("orc", "parquet").foreach { format =>
test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") {
+      withTempPath { file =>
+        val path = file.getCanonicalPath
+        val emptyDf = Seq((true, 1, "str")).toDF().limit(0)
+        emptyDf.write.format(format).save(path)
+
+        val df = spark.read.format(format).load(path)
+        assert(df.schema.sameType(emptyDf.schema))
+        checkAnswer(df, emptyDf)
+      }
+    }
+  }
+
+  allFileBasedDataSources.foreach { format =>
+    test(s"SPARK-22146 read files containing special characters using $format") {
+      val nameWithSpecialChars = s"sp&cial%chars"
+      withTempDir { dir =>
+        val tmpFile = s"$dir/$nameWithSpecialChars"
+        spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
+        val fileContent = spark.read.format(format).load(tmpFile)
+        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
+      }
+    }
+  }
+}
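One detail in the suite above worth calling out: CSV files carry no schema of their own, so the SPARK-23072 test sets `header` to `true` on both write and read (the option is effectively a no-op for the other formats). A small spark-shell sketch, not part of this commit, of what happens without it (the path is an assumption for illustration):

```scala
// spark-shell sketch, not part of the commit: without header = true the CSV
// writer emits no header row, so the reader synthesizes a positional name.
import spark.implicits._

val path = java.nio.file.Files.createTempDirectory("csv-no-header").toString + "/csv"
Seq("a").toDF("한글").write.csv(path)           // no header row written

val back = spark.read.csv(path)
println(back.schema.fieldNames.mkString(", "))  // prints "_c0", not "한글"
```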

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 0 additions & 16 deletions
@@ -2757,20 +2757,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
-
-  // Only New OrcFileFormat supports this
-  Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName,
-    "parquet").foreach { format =>
-    test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") {
-      withTempPath { file =>
-        val path = file.getCanonicalPath
-        val emptyDf = Seq((true, 1, "str")).toDF.limit(0)
-        emptyDf.write.format(format).save(path)
-
-        val df = spark.read.format(format).load(path)
-        assert(df.schema.sameType(emptyDf.schema))
-        checkAnswer(df, emptyDf)
-      }
-    }
-  }
 }

sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala

Lines changed: 0 additions & 14 deletions
@@ -23,14 +23,12 @@ import scala.collection.mutable.ArrayBuffer
 
 import org.apache.hadoop.fs.Path
 
-import org.apache.spark.SparkContext
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
 import org.apache.spark.sql.execution.command.CreateTableCommand
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.hive.HiveExternalCatalog._
-import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf._
@@ -1344,18 +1342,6 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
     }
   }
 
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
-    test(s"SPARK-22146: read files containing special characters using $format") {
-      val nameWithSpecialChars = s"sp&cial%chars"
-      withTempDir { dir =>
-        val tmpFile = s"$dir/$nameWithSpecialChars"
-        spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
-        val fileContent = spark.read.format(format).load(tmpFile)
-        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
-      }
-    }
-  }
-
   private def withDebugMode(f: => Unit): Unit = {
     val previousValue = sparkSession.sparkContext.conf.get(DEBUG_MODE)
     try {

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 0 additions & 8 deletions
@@ -2159,12 +2159,4 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
-
-  Seq("orc", "parquet", "csv", "json", "text").foreach { format =>
-    test(s"Writing empty datasets should not fail - $format") {
-      withTempDir { dir =>
-        Seq("str").toDS.limit(0).write.format(format).save(dir.getCanonicalPath + "/tmp")
-      }
-    }
-  }
 }
