Skip to content

Commit 568f321

Browse files
committed
fix statistics for external table
1 parent 45ce327 commit 568f321

File tree

3 files changed

+12
-3
lines changed

3 files changed

+12
-3
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -447,16 +447,21 @@ private[hive] case class MetastoreRelation
447447

448448
@transient override lazy val statistics = Statistics(
449449
sizeInBytes = {
450+
val totalSize = hiveQlTable.getParameters.get(HiveShim.getStatsSetupConstTotalSize)
451+
val rawDataSize = hiveQlTable.getParameters.get(HiveShim.getStatsSetupConstRawDataSize)
450452
// TODO: check if this estimate is valid for tables after partition pruning.
451453
// NOTE: getting `totalSize` directly from params is kind of hacky, but this should be
452454
// relatively cheap if parameters for the table are populated into the metastore. An
453455
// alternative would be going through Hadoop's FileSystem API, which can be expensive if a lot
454456
// of RPCs are involved. Besides `totalSize`, there are also `numFiles`, `numRows`,
455457
// `rawDataSize` keys (see StatsSetupConst in Hive) that we can look at in the future.
456458
BigInt(
457-
Option(hiveQlTable.getParameters.get(HiveShim.getStatsSetupConstTotalSize))
458-
.map(_.toLong)
459-
.getOrElse(sqlContext.defaultSizeInBytes))
459+
// When the table is external, `totalSize` is always zero, which will influence the join strategy,
460+
// so when `totalSize` is zero, use `rawDataSize` instead
461+
// if the size is still not greater than zero, we use the default size
462+
Option(totalSize).map(_.toLong).filter(_ > 0)
463+
.getOrElse(Option(rawDataSize).map(_.toLong).filter(_ > 0)
464+
.getOrElse(sqlContext.defaultSizeInBytes)))
460465
}
461466
)
462467

sql/hive/v0.12.0/src/main/scala/org/apache/spark/sql/hive/Shim12.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ private[hive] object HiveShim {
136136

137137
def getStatsSetupConstTotalSize = StatsSetupConst.TOTAL_SIZE
138138

139+
def getStatsSetupConstRawDataSize = StatsSetupConst.RAW_DATA_SIZE
140+
139141
def createDefaultDBIfNeeded(context: HiveContext) = { }
140142

141143
def getCommandProcessor(cmd: Array[String], conf: HiveConf) = {

sql/hive/v0.13.1/src/main/scala/org/apache/spark/sql/hive/Shim13.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ private[hive] object HiveShim {
154154

155155
def getStatsSetupConstTotalSize = StatsSetupConst.TOTAL_SIZE
156156

157+
def getStatsSetupConstRawDataSize = StatsSetupConst.RAW_DATA_SIZE
158+
157159
def createDefaultDBIfNeeded(context: HiveContext) = {
158160
context.runSqlHive("CREATE DATABASE default")
159161
context.runSqlHive("USE default")

0 commit comments

Comments (0)