Skip to content

Commit 3a80f92

Browse files
gatorsmile authored and cloud-fan committed
[SPARK-17492][SQL] Fix Reading Cataloged Data Sources without Extending SchemaRelationProvider
### What changes were proposed in this pull request?

For data sources that do not extend `SchemaRelationProvider`, we expect users not to specify schemas when they create tables. If the schema is input from users, an exception is issued.

Since Spark 2.1, for any data source, to avoid inferring the schema every time, we store the schema in the metastore catalog. Thus, when reading a cataloged data source table, the schema could be read from the metastore catalog. In this case, we also got an exception. For example,

```Scala
sql(
  s"""
    |CREATE TABLE relationProvierWithSchema
    |USING org.apache.spark.sql.sources.SimpleScanSource
    |OPTIONS (
    |  From '1',
    |  To '10'
    |)
  """.stripMargin)
spark.table(tableName).show()
```

```
org.apache.spark.sql.sources.SimpleScanSource does not allow user-specified schemas.;
```

This PR is to fix the above issue. When building a data source, we introduce a flag `isSchemaFromUsers` to indicate whether the schema is really input from users. If true, we issue an exception. Otherwise, we will call the `createRelation` of `RelationProvider` to generate the `BaseRelation`, which contains the actual schema.

### How was this patch tested?

Added a few cases.

Author: gatorsmile <gatorsmile@gmail.com>

Closes #15046 from gatorsmile/tempViewCases.
1 parent cb324f6 commit 3a80f92

File tree

4 files changed

+102
-24
lines changed

4 files changed

+102
-24
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,13 @@ case class DataSource(
333333
dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions)
334334
case (_: SchemaRelationProvider, None) =>
335335
throw new AnalysisException(s"A schema needs to be specified when using $className.")
336-
case (_: RelationProvider, Some(_)) =>
337-
throw new AnalysisException(s"$className does not allow user-specified schemas.")
336+
case (dataSource: RelationProvider, Some(schema)) =>
337+
val baseRelation =
338+
dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions)
339+
if (baseRelation.schema != schema) {
340+
throw new AnalysisException(s"$className does not allow user-specified schemas.")
341+
}
342+
baseRelation
338343

339344
// We are reading from the results of a streaming query. Load files from the metadata log
340345
// instead of listing them using HDFS APIs.

sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,26 @@ class InsertSuite extends DataSourceTest with SharedSQLContext {
6565
)
6666
}
6767

68+
test("insert into a temp view that does not point to an insertable data source") {
69+
import testImplicits._
70+
withTempView("t1", "t2") {
71+
sql(
72+
"""
73+
|CREATE TEMPORARY VIEW t1
74+
|USING org.apache.spark.sql.sources.SimpleScanSource
75+
|OPTIONS (
76+
| From '1',
77+
| To '10')
78+
""".stripMargin)
79+
sparkContext.parallelize(1 to 10).toDF("a").createOrReplaceTempView("t2")
80+
81+
val message = intercept[AnalysisException] {
82+
sql("INSERT INTO TABLE t1 SELECT a FROM t2")
83+
}.getMessage
84+
assert(message.contains("does not allow insertion"))
85+
}
86+
}
87+
6888
test("PreInsert casting and renaming") {
6989
sql(
7090
s"""

sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -348,31 +348,51 @@ class TableScanSuite extends DataSourceTest with SharedSQLContext {
348348
test("exceptions") {
349349
// Make sure we do throw correct exception when users use a relation provider that
350350
// only implements the RelationProvider or the SchemaRelationProvider.
351-
val schemaNotAllowed = intercept[Exception] {
352-
sql(
353-
"""
354-
|CREATE TEMPORARY VIEW relationProvierWithSchema (i int)
355-
|USING org.apache.spark.sql.sources.SimpleScanSource
356-
|OPTIONS (
357-
| From '1',
358-
| To '10'
359-
|)
360-
""".stripMargin)
351+
Seq("TEMPORARY VIEW", "TABLE").foreach { tableType =>
352+
val schemaNotAllowed = intercept[Exception] {
353+
sql(
354+
s"""
355+
|CREATE $tableType relationProvierWithSchema (i int)
356+
|USING org.apache.spark.sql.sources.SimpleScanSource
357+
|OPTIONS (
358+
| From '1',
359+
| To '10'
360+
|)
361+
""".stripMargin)
362+
}
363+
assert(schemaNotAllowed.getMessage.contains("does not allow user-specified schemas"))
364+
365+
val schemaNeeded = intercept[Exception] {
366+
sql(
367+
s"""
368+
|CREATE $tableType schemaRelationProvierWithoutSchema
369+
|USING org.apache.spark.sql.sources.AllDataTypesScanSource
370+
|OPTIONS (
371+
| From '1',
372+
| To '10'
373+
|)
374+
""".stripMargin)
375+
}
376+
assert(schemaNeeded.getMessage.contains("A schema needs to be specified when using"))
361377
}
362-
assert(schemaNotAllowed.getMessage.contains("does not allow user-specified schemas"))
378+
}
363379

364-
val schemaNeeded = intercept[Exception] {
365-
sql(
366-
"""
367-
|CREATE TEMPORARY VIEW schemaRelationProvierWithoutSchema
368-
|USING org.apache.spark.sql.sources.AllDataTypesScanSource
369-
|OPTIONS (
370-
| From '1',
371-
| To '10'
372-
|)
373-
""".stripMargin)
380+
test("read the data source tables that do not extend SchemaRelationProvider") {
381+
Seq("TEMPORARY VIEW", "TABLE").foreach { tableType =>
382+
val tableName = "relationProvierWithSchema"
383+
withTable (tableName) {
384+
sql(
385+
s"""
386+
|CREATE $tableType $tableName
387+
|USING org.apache.spark.sql.sources.SimpleScanSource
388+
|OPTIONS (
389+
| From '1',
390+
| To '10'
391+
|)
392+
""".stripMargin)
393+
checkAnswer(spark.table(tableName), spark.range(1, 11).toDF())
394+
}
374395
}
375-
assert(schemaNeeded.getMessage.contains("A schema needs to be specified when using"))
376396
}
377397

378398
test("SPARK-5196 schema field with comment") {

sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,39 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
293293
Option(dir).map(spark.read.format("org.apache.spark.sql.test").load)
294294
}
295295

296+
test("read a data source that does not extend SchemaRelationProvider") {
297+
val dfReader = spark.read
298+
.option("from", "1")
299+
.option("TO", "10")
300+
.format("org.apache.spark.sql.sources.SimpleScanSource")
301+
302+
// when users do not specify the schema
303+
checkAnswer(dfReader.load(), spark.range(1, 11).toDF())
304+
305+
// when users specify the schema
306+
val inputSchema = new StructType().add("s", IntegerType, nullable = false)
307+
val e = intercept[AnalysisException] { dfReader.schema(inputSchema).load() }
308+
assert(e.getMessage.contains(
309+
"org.apache.spark.sql.sources.SimpleScanSource does not allow user-specified schemas"))
310+
}
311+
312+
test("read a data source that does not extend RelationProvider") {
313+
val dfReader = spark.read
314+
.option("from", "1")
315+
.option("TO", "10")
316+
.option("option_with_underscores", "someval")
317+
.option("option.with.dots", "someval")
318+
.format("org.apache.spark.sql.sources.AllDataTypesScanSource")
319+
320+
// when users do not specify the schema
321+
val e = intercept[AnalysisException] { dfReader.load() }
322+
assert(e.getMessage.contains("A schema needs to be specified when using"))
323+
324+
// when users specify the schema
325+
val inputSchema = new StructType().add("s", StringType, nullable = false)
326+
assert(dfReader.schema(inputSchema).load().count() == 10)
327+
}
328+
296329
test("text - API and behavior regarding schema") {
297330
// Writer
298331
spark.createDataset(data).write.mode(SaveMode.Overwrite).text(dir)

0 commit comments

Comments
 (0)