Commit 4b82bd7

yhuai authored and marmbrus committed
[SPARK-6575][SQL] Converted Parquet Metastore tables no longer cache metadata
JIRA: https://issues.apache.org/jira/browse/SPARK-6575

Author: Yin Huai <yhuai@databricks.com>

Closes #5339 from yhuai/parquetRelationCache and squashes the following commits:

83d9846 [Yin Huai] Remove unnecessary change.
c0dc7a4 [Yin Huai] Cache converted parquet relations.
1 parent 45134ec commit 4b82bd7
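
The regression being fixed: a converted Parquet Metastore table was rebuilt as a fresh ParquetRelation2 on every lookup, because the conversion result was never written back to cachedDataSourceTables. The patch closes the loop with a lookup-or-create step. Below is a minimal sketch of that pattern against a Guava Cache; TableKey and ConvertedRelation are hypothetical stand-ins for Spark's QualifiedTableName and LogicalRelation, not real classes.

import com.google.common.cache.{Cache, CacheBuilder}

// Hypothetical stand-ins for QualifiedTableName and the converted relation.
case class TableKey(database: String, table: String)
case class ConvertedRelation(paths: Seq[String])

// A bounded Guava cache, similar in spirit to cachedDataSourceTables.
val cache: Cache[TableKey, ConvertedRelation] =
  CacheBuilder.newBuilder().maximumSize(1000).build[TableKey, ConvertedRelation]()

def lookupOrConvert(key: TableKey)(convert: => ConvertedRelation): ConvertedRelation =
  Option(cache.getIfPresent(key)).getOrElse { // getIfPresent returns null on a miss
    val created = convert                     // the expensive conversion runs only here
    cache.put(key, created)                   // remember the result for later queries
    created
  }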

File tree

2 files changed: +167 -6 lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 55 additions & 6 deletions
@@ -116,7 +116,9 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
   }

   override def refreshTable(databaseName: String, tableName: String): Unit = {
-    cachedDataSourceTables.refresh(QualifiedTableName(databaseName, tableName).toLowerCase)
+    // refreshTable does not eagerly reload the cache. It just invalidates the cache.
+    // Next time when we use the table, it will be populated in the cache.
+    invalidateTable(databaseName, tableName)
   }

   def invalidateTable(databaseName: String, tableName: String): Unit = {
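
Note what this swap buys: cachedDataSourceTables is a Guava LoadingCache (which is why the old code could call refresh), and refresh versus invalidate differ in when the reload cost is paid. A minimal sketch of that difference, with a hypothetical load body standing in for the real metastore conversion:

import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}

val cache: LoadingCache[String, String] =
  CacheBuilder.newBuilder().maximumSize(100).build(
    new CacheLoader[String, String] {
      // Hypothetical stand-in for the expensive metastore-to-relation conversion.
      override def load(key: String): String = s"converted($key)"
    })

cache.refresh("db.tbl")    // eager: recomputes the entry right away
cache.invalidate("db.tbl") // lazy: drops the entry; recomputed on the next get

Routing refreshTable through invalidateTable makes a refresh free until the table is actually used again.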
@@ -229,13 +231,42 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
   private def convertToParquetRelation(metastoreRelation: MetastoreRelation): LogicalRelation = {
     val metastoreSchema = StructType.fromAttributes(metastoreRelation.output)
     val mergeSchema = hive.convertMetastoreParquetWithSchemaMerging
-    val parquetOptions = Map(
-      ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json,
-      ParquetRelation2.MERGE_SCHEMA -> mergeSchema.toString)

     // NOTE: Instead of passing Metastore schema directly to `ParquetRelation2`, we have to
     // serialize the Metastore schema to JSON and pass it as a data source option because of the
     // evil case insensitivity issue, which is reconciled within `ParquetRelation2`.
+    val parquetOptions = Map(
+      ParquetRelation2.METASTORE_SCHEMA -> metastoreSchema.json,
+      ParquetRelation2.MERGE_SCHEMA -> mergeSchema.toString)
+    val tableIdentifier =
+      QualifiedTableName(metastoreRelation.databaseName, metastoreRelation.tableName)
+
+    def getCached(
+        tableIdentifier: QualifiedTableName,
+        pathsInMetastore: Seq[String],
+        schemaInMetastore: StructType,
+        partitionSpecInMetastore: Option[PartitionSpec]): Option[LogicalRelation] = {
+      cachedDataSourceTables.getIfPresent(tableIdentifier) match {
+        case null => None // Cache miss
+        case logical @ LogicalRelation(parquetRelation: ParquetRelation2) =>
+          // If we have the same paths, same schema, and same partition spec,
+          // we will use the cached Parquet Relation.
+          val useCached =
+            parquetRelation.paths == pathsInMetastore &&
+              logical.schema.sameType(metastoreSchema) &&
+              parquetRelation.maybePartitionSpec == partitionSpecInMetastore
+
+          if (useCached) Some(logical) else None
+        case other =>
+          logWarning(
+            s"${metastoreRelation.databaseName}.${metastoreRelation.tableName} should be stored " +
+            s"as Parquet. However, we are getting a ${other} from the metastore cache. " +
+            s"This cached entry will be invalidated.")
+          cachedDataSourceTables.invalidate(tableIdentifier)
+          None
+      }
+    }
+
     if (metastoreRelation.hiveQlTable.isPartitioned) {
       val partitionSchema = StructType.fromAttributes(metastoreRelation.partitionKeys)
       val partitionColumnDataTypes = partitionSchema.map(_.dataType)
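
The NOTE above is the reason the Metastore schema travels as a JSON string inside parquetOptions rather than as a StructType object. A small sketch of that round trip using StructType.json and DataType.fromJson from org.apache.spark.sql.types (the field names are made up):

import org.apache.spark.sql.types._

// Serialize a schema the way the METASTORE_SCHEMA option does...
val metastoreSchema = StructType(Seq(
  StructField("intField", IntegerType),
  StructField("stringField", StringType)))
val asJson = metastoreSchema.json

// ...and rebuild the identical StructType on the other side.
val roundTripped = DataType.fromJson(asJson).asInstanceOf[StructType]
assert(roundTripped == metastoreSchema)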
@@ -248,10 +279,28 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
       }
       val partitionSpec = PartitionSpec(partitionSchema, partitions)
       val paths = partitions.map(_.path)
-      LogicalRelation(ParquetRelation2(paths, parquetOptions, None, Some(partitionSpec))(hive))
+
+      val cached = getCached(tableIdentifier, paths, metastoreSchema, Some(partitionSpec))
+      val parquetRelation = cached.getOrElse {
+        val created =
+          LogicalRelation(ParquetRelation2(paths, parquetOptions, None, Some(partitionSpec))(hive))
+        cachedDataSourceTables.put(tableIdentifier, created)
+        created
+      }
+
+      parquetRelation
     } else {
       val paths = Seq(metastoreRelation.hiveQlTable.getDataLocation.toString)
-      LogicalRelation(ParquetRelation2(paths, parquetOptions)(hive))
+
+      val cached = getCached(tableIdentifier, paths, metastoreSchema, None)
+      val parquetRelation = cached.getOrElse {
+        val created =
+          LogicalRelation(ParquetRelation2(paths, parquetOptions)(hive))
+        cachedDataSourceTables.put(tableIdentifier, created)
+        created
+      }
+
+      parquetRelation
     }
   }

sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala

Lines changed: 112 additions & 0 deletions
@@ -26,8 +26,10 @@ import org.apache.spark.sql.{QueryTest, SQLConf, SaveMode}
 import org.apache.spark.sql.catalyst.expressions.Row
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
 import org.apache.spark.sql.hive.execution.HiveTableScan
+import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
+import org.apache.spark.sql.json.JSONRelation
 import org.apache.spark.sql.sources.{InsertIntoDataSource, LogicalRelation}
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
 import org.apache.spark.sql.SaveMode
@@ -390,6 +392,116 @@ class ParquetDataSourceOnMetastoreSuite extends ParquetMetastoreSuiteBase {

     sql("DROP TABLE ms_convert")
   }
+
+  test("Caching converted data source Parquet Relations") {
+    def checkCached(tableIdentifier: catalog.QualifiedTableName): Unit = {
+      // Converted test_parquet should be cached.
+      catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) match {
+        case null => fail("Converted test_parquet should be cached.")
+        case logical @ LogicalRelation(parquetRelation: ParquetRelation2) => // OK
+        case other =>
+          fail(
+            "The cached test_parquet should be a Parquet Relation. " +
+            s"However, $other is returned from the cache.")
+      }
+    }
+
+    sql("DROP TABLE IF EXISTS test_insert_parquet")
+    sql("DROP TABLE IF EXISTS test_parquet_partitioned_cache_test")
+
+    sql(
+      """
+        |create table test_insert_parquet
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+
+    var tableIdentifier = catalog.QualifiedTableName("default", "test_insert_parquet")
+
+    // First, make sure the converted test_insert_parquet is not cached.
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    // Table lookup will make the table cached.
+    table("test_insert_parquet")
+    checkCached(tableIdentifier)
+    // For insert into a non-partitioned table, we will do the conversion,
+    // so the converted test_insert_parquet should be cached.
+    invalidateTable("test_insert_parquet")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    sql(
+      """
+        |INSERT INTO TABLE test_insert_parquet
+        |select a, b from jt
+      """.stripMargin)
+    checkCached(tableIdentifier)
+    // Make sure we can read the data.
+    checkAnswer(
+      sql("select * from test_insert_parquet"),
+      sql("select a, b from jt").collect())
+    // Invalidate the cache.
+    invalidateTable("test_insert_parquet")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+
+    // Create a partitioned table.
+    sql(
+      """
+        |create table test_parquet_partitioned_cache_test
+        |(
+        |  intField INT,
+        |  stringField STRING
+        |)
+        |PARTITIONED BY (date string)
+        |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+        |STORED AS
+        |  INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+        |  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+      """.stripMargin)
+
+    tableIdentifier = catalog.QualifiedTableName("default", "test_parquet_partitioned_cache_test")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    sql(
+      """
+        |INSERT INTO TABLE test_parquet_partitioned_cache_test
+        |PARTITION (date='2015-04-01')
+        |select a, b from jt
+      """.stripMargin)
+    // Right now, insert into a partitioned Parquet table is not supported in data source Parquet.
+    // So, we expect it is not cached.
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "false")
+    sql(
+      """
+        |INSERT INTO TABLE test_parquet_partitioned_cache_test
+        |PARTITION (date='2015-04-02')
+        |select a, b from jt
+      """.stripMargin)
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+    conf.setConf(SQLConf.PARQUET_USE_DATA_SOURCE_API, "true")
+
+    // Make sure we can cache the partitioned table.
+    table("test_parquet_partitioned_cache_test")
+    checkCached(tableIdentifier)
+    // Make sure we can read the data.
+    checkAnswer(
+      sql("select STRINGField, date, intField from test_parquet_partitioned_cache_test"),
+      sql(
+        """
+          |select b, '2015-04-01', a FROM jt
+          |UNION ALL
+          |select b, '2015-04-02', a FROM jt
+        """.stripMargin).collect())
+
+    invalidateTable("test_parquet_partitioned_cache_test")
+    assert(catalog.cachedDataSourceTables.getIfPresent(tableIdentifier) === null)
+
+    sql("DROP TABLE test_insert_parquet")
+    sql("DROP TABLE test_parquet_partitioned_cache_test")
+  }
 }

 class ParquetDataSourceOffMetastoreSuite extends ParquetMetastoreSuiteBase {
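
From user code, the behavior this suite pins down looks roughly as follows; this assumes a HiveContext named hiveCtx and its single-argument refreshTable(tableName: String) wrapper over the catalog method changed above.

// Assuming an existing HiveContext `hiveCtx` and the table created above.
val df1 = hiveCtx.table("test_insert_parquet") // first lookup converts and caches
val df2 = hiveCtx.table("test_insert_parquet") // served from cachedDataSourceTables

hiveCtx.refreshTable("test_insert_parquet")    // drops the cached relation
val df3 = hiveCtx.table("test_insert_parquet") // re-converted on next access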
