@@ -23,6 +23,7 @@ import scala.reflect.ClassTag
 
 import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
 import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.plans.logical.Statistics
 import org.apache.spark.sql.execution.command.{AnalyzeTableCommand, DDLUtils}
 import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.execution.joins._
@@ -171,12 +172,18 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
 
   private def checkMetastoreRelationStats(
       tableName: String,
-      expectedTotalSize: Long,
-      expectedRowCount: Option[BigInt]): Unit = {
+      expectedStats: Option[Statistics]): Unit = {
     val df = sql(s"SELECT * FROM $tableName")
     val relations = df.queryExecution.analyzed.collect { case rel: MetastoreRelation =>
-      assert(rel.statistics.sizeInBytes === expectedTotalSize)
-      assert(rel.statistics.rowCount === expectedRowCount)
+      expectedStats match {
+        case Some(es) =>
+          assert(rel.catalogTable.stats.isDefined)
+          val stats = rel.catalogTable.stats.get
+          assert(stats.sizeInBytes === es.sizeInBytes)
+          assert(stats.rowCount === es.rowCount)
+        case None =>
+          assert(rel.catalogTable.stats.isEmpty)
+      }
       rel
     }
     assert(relations.size === 1)
@@ -185,21 +192,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
   test("test table-level statistics for hive tables created in HiveExternalCatalog") {
     val textTable = "textTable"
     withTable(textTable) {
+      // Currently Spark's statistics are self-contained; we don't have statistics until we use
+      // the `ANALYZE TABLE` command.
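+      // Illustrative sketch: the statistics now hang off the catalog entry, so e.g.
+      //   spark.sessionState.catalog.getTableMetadata(TableIdentifier(textTable)).stats
+      // (the standard SessionCatalog lookup) would be None at this point.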
       sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
-      checkMetastoreRelationStats(textTable,
-        expectedTotalSize = spark.sessionState.conf.defaultSizeInBytes, expectedRowCount = None)
-
+      checkMetastoreRelationStats(textTable, expectedStats = None)
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
-      // don't have our statistics, MetastoreRelation uses hive's `totalSize`
-      checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = None)
+      checkMetastoreRelationStats(textTable, expectedStats = None)
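+      // Note: the removed assertion above relied on Hive's auto-gathered `totalSize` (5812);
+      // with this change Hive-side metrics are not exposed as Spark statistics until
+      // `ANALYZE TABLE` runs.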
 
       // noscan won't count the number of rows
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
-      checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = None)
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 5812, rowCount = None)))
 
       // without noscan, we count the number of rows
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
-      checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = Some(500))
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
     }
   }
205213
@@ -209,47 +217,59 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       sql(s"CREATE TABLE $textTable (key STRING, value STRING) STORED AS TEXTFILE")
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS")
-      checkMetastoreRelationStats(textTable, expectedTotalSize = 5812, expectedRowCount = Some(500))
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 5812, rowCount = Some(500))))
 
       sql(s"INSERT INTO TABLE $textTable SELECT * FROM src")
       sql(s"ANALYZE TABLE $textTable COMPUTE STATISTICS noscan")
       // update total size and remove the old and invalid row count
-      checkMetastoreRelationStats(textTable, expectedTotalSize = 11624, expectedRowCount = None)
+      checkMetastoreRelationStats(textTable, expectedStats =
+        Some(Statistics(sizeInBytes = 11624, rowCount = None)))
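+      // Sanity check on the numbers: 11624 is exactly 2 * 5812, since `src` was inserted
+      // twice and the text format's on-disk size grows linearly.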
     }
   }
 
   private def checkLogicalRelationStats(
       tableName: String,
-      expectedRowCount: Option[BigInt]): Unit = {
+      expectedStats: Option[Statistics]): Unit = {
     val df = sql(s"SELECT * FROM $tableName")
     val relations = df.queryExecution.analyzed.collect { case rel: LogicalRelation =>
-      // TODO: We don't have an expected value here because parquet size is different on Windows
-      // and Linux, we need to find the reason and fix this.
-      assert(rel.statistics.sizeInBytes === rel.relation.sizeInBytes)
-      assert(rel.statistics.rowCount === expectedRowCount)
+      assert(rel.catalogTable.isDefined)
+      expectedStats match {
+        case Some(es) =>
+          assert(rel.catalogTable.get.stats.isDefined)
+          val stats = rel.catalogTable.get.stats.get
+          assert(stats.sizeInBytes === es.sizeInBytes)
+          assert(stats.rowCount === es.rowCount)
+        case None =>
+          assert(rel.catalogTable.get.stats.isEmpty)
+      }
       rel
     }
     assert(relations.size === 1)
   }
 
-  test("test statistics of LogicalRelation inherited from MetastoreRelation") {
+  test("test statistics of LogicalRelation converted from MetastoreRelation") {
     val parquetTable = "parquetTable"
     val orcTable = "orcTable"
     withTable(parquetTable, orcTable) {
       sql(s"CREATE TABLE $parquetTable (key STRING, value STRING) STORED AS PARQUET")
       sql(s"CREATE TABLE $orcTable (key STRING, value STRING) STORED AS ORC")
       sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
       sql(s"INSERT INTO TABLE $orcTable SELECT * FROM src")
-      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
-      sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
 
       // the default value for `spark.sql.hive.convertMetastoreParquet` is true; here we just
       // set it for robustness
       withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "true") {
-        checkLogicalRelationStats(parquetTable, expectedRowCount = Some(500))
+        checkLogicalRelationStats(parquetTable, expectedStats = None)
+        sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
+        checkLogicalRelationStats(parquetTable, expectedStats =
+          Some(Statistics(sizeInBytes = 4236, rowCount = Some(500))))
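+        // The hardcoded 4236 bytes reflect this test environment's parquet output; the
+        // TODO removed above noted that parquet sizes can differ between Windows and Linux.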
       }
       withSQLConf("spark.sql.hive.convertMetastoreOrc" -> "true") {
-        checkLogicalRelationStats(orcTable, expectedRowCount = Some(500))
+        checkLogicalRelationStats(orcTable, expectedStats = None)
+        sql(s"ANALYZE TABLE $orcTable COMPUTE STATISTICS")
+        checkLogicalRelationStats(orcTable, expectedStats =
+          Some(Statistics(sizeInBytes = 3023, rowCount = Some(500))))
       }
     }
   }
@@ -262,14 +282,22 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton with SQLTestUtils
       assert(DDLUtils.isDatasourceTable(catalogTable))
 
       sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+      checkLogicalRelationStats(parquetTable, expectedStats = None)
 
       // noscan won't count the number of rows
       sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
-      checkLogicalRelationStats(parquetTable, expectedRowCount = None)
+      checkLogicalRelationStats(parquetTable, expectedStats =
+        Some(Statistics(sizeInBytes = 4236, rowCount = None)))
+
+      sql(s"INSERT INTO TABLE $parquetTable SELECT * FROM src")
+      sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS noscan")
+      checkLogicalRelationStats(parquetTable, expectedStats =
+        Some(Statistics(sizeInBytes = 8472, rowCount = None)))
 
       // without noscan, we count the number of rows
       sql(s"ANALYZE TABLE $parquetTable COMPUTE STATISTICS")
-      checkLogicalRelationStats(parquetTable, expectedRowCount = Some(500))
+      checkLogicalRelationStats(parquetTable, expectedStats =
+        Some(Statistics(sizeInBytes = 8472, rowCount = Some(1000))))
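+      // Both inserts of `src` are reflected here: 8472 = 2 * 4236 bytes and
+      // 1000 = 2 * 500 rows.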
     }
   }
 