Skip to content

Commit 9c32d25

Browse files
committed
More comments.
1 parent 94fb669 commit 9c32d25

File tree

3 files changed

+29
-18
lines changed

3 files changed

+29
-18
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/BasicStatsPlanVisitor.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import org.apache.spark.sql.types.LongType
2626
*/
2727
object BasicStatsPlanVisitor extends LogicalPlanVisitor[Statistics] {
2828

29+
/** Falls back to the estimation computed by [[SizeInBytesOnlyStatsPlanVisitor]]. */
2930
private def fallback(p: LogicalPlan): Statistics = SizeInBytesOnlyStatsPlanVisitor.visit(p)
3031

3132
override def default(p: LogicalPlan): Statistics = fallback(p)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/LogicalPlanStats.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
1919

2020
import org.apache.spark.sql.catalyst.plans.logical._
2121

22-
22+
/**
23+
* A trait to add statistics propagation to [[LogicalPlan]].
24+
*/
2325
trait LogicalPlanStats { self: LogicalPlan =>
2426

2527
/**

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/SizeInBytesOnlyStatsPlanVisitor.scala

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,31 @@ import org.apache.spark.sql.catalyst.plans.logical._
2727
*/
2828
object SizeInBytesOnlyStatsPlanVisitor extends LogicalPlanVisitor[Statistics] {
2929

30+
/**
31+
* A default, commonly used estimation for unary nodes. We assume the number of output rows is
32+
* the same as the number of input rows, and compute sizes based on the column types.
33+
*/
34+
private def visitUnaryNode(p: UnaryNode): Statistics = {
35+
// A Row object has some overhead, so the size should not be zero even when there are
36+
// no columns; this helps to prevent a divide-by-zero error.
37+
val childRowSize = p.child.output.map(_.dataType.defaultSize).sum + 8
38+
val outputRowSize = p.output.map(_.dataType.defaultSize).sum + 8
39+
// Assume there will be the same number of rows as child has.
40+
var sizeInBytes = (p.child.stats.sizeInBytes * outputRowSize) / childRowSize
41+
if (sizeInBytes == 0) {
42+
// sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero
43+
// (product of children).
44+
sizeInBytes = 1
45+
}
46+
47+
// Don't propagate rowCount and attributeStats, since they are not estimated here.
48+
Statistics(sizeInBytes = sizeInBytes, hints = p.child.stats.hints)
49+
}
50+
51+
/**
52+
* For leaf nodes, use their computeStats. For other nodes, we assume the size in bytes is the
52+
* product of all of the children's sizes.
54+
*/
3055
override def default(p: LogicalPlan): Statistics = p match {
3156
case p: LeafNode => p.computeStats()
3257
case _: LogicalPlan => Statistics(sizeInBytes = p.children.map(_.stats.sizeInBytes).product)
@@ -135,21 +160,4 @@ object SizeInBytesOnlyStatsPlanVisitor extends LogicalPlanVisitor[Statistics] {
135160
override def visitUnion(p: Union): Statistics = {
136161
Statistics(sizeInBytes = p.children.map(_.stats.sizeInBytes).sum)
137162
}
138-
139-
private def visitUnaryNode(p: UnaryNode): Statistics = {
140-
// There should be some overhead in Row object, the size should not be zero when there is
141-
// no columns, this help to prevent divide-by-zero error.
142-
val childRowSize = p.child.output.map(_.dataType.defaultSize).sum + 8
143-
val outputRowSize = p.output.map(_.dataType.defaultSize).sum + 8
144-
// Assume there will be the same number of rows as child has.
145-
var sizeInBytes = (p.child.stats.sizeInBytes * outputRowSize) / childRowSize
146-
if (sizeInBytes == 0) {
147-
// sizeInBytes can't be zero, or sizeInBytes of BinaryNode will also be zero
148-
// (product of children).
149-
sizeInBytes = 1
150-
}
151-
152-
// Don't propagate rowCount and attributeStats, since they are not estimated here.
153-
Statistics(sizeInBytes = sizeInBytes, hints = p.child.stats.hints)
154-
}
155163
}

0 commit comments

Comments
 (0)