Skip to content

Commit b6c655a

Browse files
author
wangzhenhua
committed
fix bug about size
1 parent aa438c4 commit b6c655a

File tree

5 files changed

+64
-37
lines changed

5 files changed

+64
-37
lines changed

sql/core/src/main/scala/org/apache/spark/sql/catalyst/SQLBuilder.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ class SQLBuilder private (
591591
object ExtractSQLTable {
592592
def unapply(plan: LogicalPlan): Option[SQLTable] = plan match {
593593
case l @ LogicalRelation(_, _, Some(catalogTable))
594-
if catalogTable.identifier.database.isDefined =>
594+
if catalogTable.identifier.database.isDefined =>
595595
Some(SQLTable(
596596
catalogTable.identifier.database.get,
597597
catalogTable.identifier.table,

sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeTableCommand.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,11 @@ case class AnalyzeTableCommand(tableName: String, noscan: Boolean = true) extend
9393

9494
// data source tables have been converted into LogicalRelations
9595
case logicalRel: LogicalRelation if logicalRel.catalogTable.isDefined =>
96+
val table = logicalRel.catalogTable.get
9697
updateTableStats(
97-
logicalRel.catalogTable.get,
98-
oldTotalSize = logicalRel.statistics.sizeInBytes.toLong,
99-
oldRowCount = logicalRel.statistics.rowCount.map(_.toLong).getOrElse(-1L),
98+
table,
99+
oldTotalSize = table.stats.map(_.sizeInBytes.toLong).getOrElse(0L),
100+
oldRowCount = table.stats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L),
100101
newTotalSize = logicalRel.relation.sizeInBytes)
101102

102103
case otherRelation =>

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ case class LogicalRelation(
5252

5353
// Logical Relations are distinct if they have different output for the sake of transformations.
5454
override def equals(other: Any): Boolean = other match {
55-
case l @ LogicalRelation(otherRelation, _, _) =>
56-
relation == otherRelation && output == l.output
55+
case l @ LogicalRelation(otherRelation, _, _) => relation == otherRelation && output == l.output
5756
case _ => false
5857
}
5958

@@ -73,7 +72,6 @@ case class LogicalRelation(
7372
// expId can be different but the relation is still the same.
7473
override lazy val cleanArgs: Seq[Any] = Seq(relation)
7574

76-
// inheritedStats is inherited from a CatalogRelation
7775
@transient override lazy val statistics: Statistics = {
7876
catalogTable.flatMap(_.stats.map(_.copy(sizeInBytes = relation.sizeInBytes))).getOrElse(
7977
Statistics(sizeInBytes = relation.sizeInBytes))

sql/core/src/test/scala/org/apache/spark/sql/StatisticsSuite.scala

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,11 @@ class StatisticsSuite extends QueryTest with SharedSQLContext {
7777
}
7878

7979
test("test table-level statistics for data source table created in InMemoryCatalog") {
80-
def checkTableStats(tableName: String, rowCount: Option[BigInt]): Unit = {
80+
def checkTableStats(tableName: String, expectedRowCount: Option[BigInt]): Unit = {
8181
val df = sql(s"SELECT * FROM $tableName")
8282
val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
83-
assert(rel.statistics.sizeInBytes === rel.relation.sizeInBytes)
84-
assert(rel.statistics.rowCount === rowCount)
83+
assert(rel.catalogTable.isDefined)
84+
assert(rel.catalogTable.get.stats.flatMap(_.rowCount) === expectedRowCount)
8585
rel
8686
}
8787
assert(relations.size === 1)
@@ -94,11 +94,11 @@ class StatisticsSuite extends QueryTest with SharedSQLContext {
9494

9595
// noscan won't count the number of rows
9696
sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan")
97-
checkTableStats(tableName, None)
97+
checkTableStats(tableName, expectedRowCount = None)
9898

9999
// without noscan, we count the number of rows
100100
sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS")
101-
checkTableStats(tableName, Some(2))
101+
checkTableStats(tableName, expectedRowCount = Some(2))
102102
}
103103
}
104104
}

sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import scala.reflect.ClassTag
2323

2424
import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
2525
import org.apache.spark.sql.catalyst.TableIdentifier
26+
import org.apache.spark.sql.catalyst.plans.logical.Statistics
2627
import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DDLUtils}
2728
import org.apache.spark.sql.execution.datasources.LogicalRelation
2829
import org.apache.spark.sql.execution.joins._
@@ -171,12 +172,18 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
171172

172173
private def checkMetastoreRelationStats(
173174
tableName: String,
174-
expectedTotalSize: Long,
175-
expectedRowCount: Option[BigInt]): Unit = {
175+
expectedStats: Option[Statistics]): Unit = {
176176
val df = sql(s"SELECT * FROM $tableName")
177177
val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation =>
178-
assert(rel.statistics.sizeInBytes === expectedTotalSize)
179-
assert(rel.statistics.rowCount === expectedRowCount)
178+
expectedStats match {
179+
case Some(es) =>
180+
assert(rel.catalogTable.stats.isDefined)
181+
val stats = rel.catalogTable.stats.get
182+
assert(stats.sizeInBytes === es.sizeInBytes)
183+
assert(stats.rowCount === es.rowCount)
184+
case None =>
185+
assert(rel.catalogTable.stats.isEmpty)
186+
}
180187
rel
181188
}
182189
assert(relations.size === 1)
@@ -185,21 +192,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
185192
test("test table-level statistics for hive tables created in HiveExternalCatalog") {
186193
val textTable = "textTable"
187194
withTable(textTable) {
195+
// Currently Spark's statistics are self-contained, we don't have statistics until we use
196+
// the `ANALYZE TABLE` command.
188197
sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
189-
checkMetastoreRelationStats(textTable,
190-
expectedTotalSize = spark.sessionState.conf.defaultSizeInBytes, expectedRowCount = None)
191-
198+
checkMetastoreRelationStats(textTable, expectedStats = None)
192199
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
193-
// don't have our statistics, MetastoreRelation uses hive's `totalSize`
194-
checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = None)
200+
checkMetastoreRelationStats(textTable, expectedStats = None)
195201

196202
// noscan won't count the number of rows
197203
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
198-
checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = None)
204+
checkMetastoreRelationStats(textTable, expectedStats =
205+
Some(Statistics(sizeInBytes = 5812, rowCount = None)))
199206

200207
// without noscan, we count the number of rows
201208
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
202-
checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = Some(500))
209+
checkMetastoreRelationStats(textTable, expectedStats =
210+
Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
203211
}
204212
}
205213

@@ -209,47 +217,59 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
209217
sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
210218
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
211219
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
212-
checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = Some(500))
220+
checkMetastoreRelationStats(textTable, expectedStats =
221+
Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
213222

214223
sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
215224
sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
216225
// update total size and remove the old and invalid row count
217-
checkMetastoreRelationStats(textTable, expectedTotalSize = 11624, expectedRowCount = None)
226+
checkMetastoreRelationStats(textTable, expectedStats =
227+
Some(Statistics(sizeInBytes = 11624, rowCount = None)))
218228
}
219229
}
220230

221231
private def checkLogicalRelationStats(
222232
tableName: String,
223-
expectedRowCount: Option[BigInt]): Unit = {
233+
expectedStats: Option[Statistics]): Unit = {
224234
val df = sql(s"SELECT * FROM $tableName")
225235
val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
226-
// TODO: We don't have an expected value here because parquet size is different on Windows
227-
// and Linux, we need to find the reason and fix this.
228-
assert(rel.statistics.sizeInBytes === rel.relation.sizeInBytes)
229-
assert(rel.statistics.rowCount === expectedRowCount)
236+
assert(rel.catalogTable.isDefined)
237+
expectedStats match {
238+
case Some(es) =>
239+
assert(rel.catalogTable.get.stats.isDefined)
240+
val stats = rel.catalogTable.get.stats.get
241+
assert(stats.sizeInBytes === es.sizeInBytes)
242+
assert(stats.rowCount === es.rowCount)
243+
case None =>
244+
assert(rel.catalogTable.get.stats.isEmpty)
245+
}
230246
rel
231247
}
232248
assert(relations.size === 1)
233249
}
234250

235-
test("test statistics of LogicalRelation inherited from MetastoreRelation") {
251+
test("test statistics of LogicalRelation converted from MetastoreRelation") {
236252
val parquetTable = "parquetTable"
237253
val orcTable = "orcTable"
238254
withTable(parquetTable, orcTable) {
239255
sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) STORED AS PARQUET")
240256
sql(s"CREATE TABLE $orcTable (key STRING, value STRING) STORED AS ORC")
241257
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
242258
sql(s"INSERT INTO TABLE $orcTable SELECT * FROM src")
243-
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
244-
sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
245259

246260
// the default value for `spark.sql.hive.convertMetastoreParquet` is true, here we just set it
247261
// for robustness
248262
withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") {
249-
checkLogicalRelationStats(parquetTable, expectedRowCount = Some(500))
263+
checkLogicalRelationStats(parquetTable, expectedStats = None)
264+
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
265+
checkLogicalRelationStats(parquetTable, expectedStats =
266+
Some(Statistics(sizeInBytes = 4236, rowCount = Some(500))))
250267
}
251268
withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") {
252-
checkLogicalRelationStats(orcTable, expectedRowCount = Some(500))
269+
checkLogicalRelationStats(orcTable, expectedStats = None)
270+
sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
271+
checkLogicalRelationStats(orcTable, expectedStats =
272+
Some(Statistics(sizeInBytes = 3023, rowCount = Some(500))))
253273
}
254274
}
255275
}
@@ -262,14 +282,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
262282
assert(DDLUtils.isDatasourceTable(catalogTable))
263283

264284
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
285+
checkLogicalRelationStats(parquetTable, expectedStats = None)
265286

266287
// noscan won't count the number of rows
267288
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
268-
checkLogicalRelationStats(parquetTable, expectedRowCount = None)
289+
checkLogicalRelationStats(parquetTable, expectedStats =
290+
Some(Statistics(sizeInBytes = 4236, rowCount = None)))
291+
292+
sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
293+
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
294+
checkLogicalRelationStats(parquetTable, expectedStats =
295+
Some(Statistics(sizeInBytes = 8472, rowCount = None)))
269296

270297
// without noscan, we count the number of rows
271298
sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
272-
checkLogicalRelationStats(parquetTable, expectedRowCount = Some(500))
299+
checkLogicalRelationStats(parquetTable, expectedStats =
300+
Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000))))
273301
}
274302
}
275303

0 commit comments

Comments
 (0)