@@ -26,8 +26,10 @@ import org.apache.spark.sql.{QueryTest, SQLConf, SaveMode}
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.hive.execution.HiveTableScan
+import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
+import org.apache.spark.sql.json.JSONRelation
 import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation}
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
 import org.apache.spark.sql.SaveMode
@@ -389,6 +391,116 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {
 
     sql("DROP TABLE ms_convert")
   }
+
+  test("Caching converted data source Parquet Relations") {
+    def checkCached(tableIdentifier: catalog.QualifiedTableName): Unit = {
+      // Converted test_parquet should be cached.
+      catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match {
+        case null => fail("Converted test_parquet should be cached in the cache.")
+        case logical @ LogicalRelation(parquetRelation: ParquetRelation2) => // OK
+        case other =>
+          fail(
+            "The cached test_parquet should be a Parquet Relation. " +
+              s"However, $other is returned from the cache.")
+      }
+    }
+
+    sql("DROP TABLE IF EXISTS test_insert_parquet")
+    sql("DROP TABLE IF EXISTS test_parquet_partitioned_cache_test")
+
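+    // Create a Hive metastore Parquet table (ParquetHiveSerDe); this suite converts such tables
+    // to data source ParquetRelation2s, which are what should end up in cachedDataSourceTables.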
+    sql(
+      """
+        |create table test_insert_parquet
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+
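+    // The cache is keyed by the table's qualified name (database plus table name).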
+    var tableIdentifier = catalog.QualifiedTableName("default", "test_insert_parquet")
+
+    // First, make sure the converted test_parquet is not cached.
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    // Table lookup will make the table cached.
+    table("test_insert_parquet")
+    checkCached(tableIdentifier)
+    // For insert into a non-partitioned table, we will do the conversion,
+    // so the converted test_insert_parquet should be cached.
+    invalidateTable("test_insert_parquet")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    sql(
+      """
+        |INSERT INTO TABLE test_insert_parquet
+        |select a, b from jt
+      """.stripMargin)
+    checkCached(tableIdentifier)
+    // Make sure we can read the data.
+    checkAnswer(
+      sql("select * from test_insert_parquet"),
+      sql("select a, b from jt").collect())
+    // Invalidate the cache.
+    invalidateTable("test_insert_parquet")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+
+    // Create a partitioned table.
+    sql(
+      """
+        |create table test_parquet_partitioned_cache_test
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |PARTITIONED BY (date string)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+
+    tableIdentifier = catalog.QualifiedTableName("default", "test_parquet_partitioned_cache_test")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    sql(
+      """
+        |INSERT INTO TABLE test_parquet_partitioned_cache_test
+        |PARTITION (date='2015-04-01')
+        |select a, b from jt
+      """.stripMargin)
+    // Right now, insert into a partitioned Parquet table is not supported in data source Parquet,
+    // so we expect it is not cached.
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
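+    // Temporarily turn the data source API off so this insert goes through Hive's SerDe path;
+    // no converted relation should be cached for it either.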
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    sql(
+      """
+        |INSERT INTO TABLE test_parquet_partitioned_cache_test
+        |PARTITION (date='2015-04-02')
+        |select a, b from jt
+      """.stripMargin)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+
+    // Make sure we can cache the partitioned table.
+    table("test_parquet_partitioned_cache_test")
+    checkCached(tableIdentifier)
+    // Make sure we can read the data.
+    checkAnswer(
+      sql("select STRINGField, date, intField from test_parquet_partitioned_cache_test"),
+      sql(
+        """
+          |select b, '2015-04-01', a FROM jt
+          |UNION ALL
+          |select b, '2015-04-02', a FROM jt
+        """.stripMargin).collect())
+
+    invalidateTable("test_parquet_partitioned_cache_test")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+
+    sql("DROP TABLE test_insert_parquet")
+    sql("DROP TABLE test_parquet_partitioned_cache_test")
+  }
 }
 
 class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase {