@@ -24,11 +24,13 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeMap, Attribute, Attri
24
24
import org .apache .spark .sql .catalyst .types ._
25
25
26
26
private [sql] class ColumnStatisticsSchema (a : Attribute ) extends Serializable {
27
- val upperBound = AttributeReference (a.name + " .upperBound" , a.dataType, nullable = false )()
28
- val lowerBound = AttributeReference (a.name + " .lowerBound" , a.dataType, nullable = false )()
29
- val nullCount = AttributeReference (a.name + " .nullCount" , IntegerType , nullable = false )()
27
+ val upperBound = AttributeReference (a.name + " .upperBound" , a.dataType, nullable = true )()
28
+ val lowerBound = AttributeReference (a.name + " .lowerBound" , a.dataType, nullable = true )()
29
+ val nullCount = AttributeReference (a.name + " .nullCount" , IntegerType , nullable = false )()
30
+ val count = AttributeReference (a.name + " .count" , IntegerType , nullable = false )()
31
+ val sizeInBytes = AttributeReference (a.name + " .sizeInBytes" , LongType , nullable = false )()
30
32
31
- val schema = Seq (lowerBound, upperBound, nullCount)
33
+ val schema = Seq (lowerBound, upperBound, nullCount, count, sizeInBytes )
32
34
}
33
35
34
36
private [sql] class PartitionStatistics (tableSchema : Seq [Attribute ]) extends Serializable {
@@ -45,6 +47,10 @@ private[sql] class PartitionStatistics(tableSchema: Seq[Attribute]) extends Seri
45
47
* brings significant performance penalty.
46
48
*/
47
49
private [sql] sealed trait ColumnStats extends Serializable {
50
+ protected var count = 0
51
+ protected var nullCount = 0
52
+ protected var sizeInBytes = 0L
53
+
48
54
/**
49
55
* Gathers statistics information from `row(ordinal)`.
50
56
*/
@@ -65,9 +71,8 @@ private[sql] class NoopColumnStats extends ColumnStats {
65
71
}
66
72
67
73
private [sql] class ByteColumnStats extends ColumnStats {
68
- var upper = Byte .MinValue
69
- var lower = Byte .MaxValue
70
- var nullCount = 0
74
+ protected var upper = Byte .MinValue
75
+ protected var lower = Byte .MaxValue
71
76
72
77
override def gatherStats (row : Row , ordinal : Int ): Unit = {
73
78
if (! row.isNullAt(ordinal)) {
@@ -77,15 +82,16 @@ private[sql] class ByteColumnStats extends ColumnStats {
77
82
} else {
78
83
nullCount += 1
79
84
}
85
+ count += 1
86
+ sizeInBytes += BYTE .defaultSize
80
87
}
81
88
82
- def collectedStatistics = Row (lower, upper, nullCount)
89
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
83
90
}
84
91
85
92
private [sql] class ShortColumnStats extends ColumnStats {
86
- var upper = Short .MinValue
87
- var lower = Short .MaxValue
88
- var nullCount = 0
93
+ protected var upper = Short .MinValue
94
+ protected var lower = Short .MaxValue
89
95
90
96
override def gatherStats (row : Row , ordinal : Int ): Unit = {
91
97
if (! row.isNullAt(ordinal)) {
@@ -95,15 +101,16 @@ private[sql] class ShortColumnStats extends ColumnStats {
95
101
} else {
96
102
nullCount += 1
97
103
}
104
+ count += 1
105
+ sizeInBytes += SHORT .defaultSize
98
106
}
99
107
100
- def collectedStatistics = Row (lower, upper, nullCount)
108
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
101
109
}
102
110
103
111
private [sql] class LongColumnStats extends ColumnStats {
104
- var upper = Long .MinValue
105
- var lower = Long .MaxValue
106
- var nullCount = 0
112
+ protected var upper = Long .MinValue
113
+ protected var lower = Long .MaxValue
107
114
108
115
override def gatherStats (row : Row , ordinal : Int ): Unit = {
109
116
if (! row.isNullAt(ordinal)) {
@@ -113,15 +120,16 @@ private[sql] class LongColumnStats extends ColumnStats {
113
120
} else {
114
121
nullCount += 1
115
122
}
123
+ count += 1
124
+ sizeInBytes += LONG .defaultSize
116
125
}
117
126
118
- def collectedStatistics = Row (lower, upper, nullCount)
127
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
119
128
}
120
129
121
130
private [sql] class DoubleColumnStats extends ColumnStats {
122
- var upper = Double .MinValue
123
- var lower = Double .MaxValue
124
- var nullCount = 0
131
+ protected var upper = Double .MinValue
132
+ protected var lower = Double .MaxValue
125
133
126
134
override def gatherStats (row : Row , ordinal : Int ): Unit = {
127
135
if (! row.isNullAt(ordinal)) {
@@ -131,15 +139,16 @@ private[sql] class DoubleColumnStats extends ColumnStats {
131
139
} else {
132
140
nullCount += 1
133
141
}
142
+ count += 1
143
+ sizeInBytes += DOUBLE .defaultSize
134
144
}
135
145
136
- def collectedStatistics = Row (lower, upper, nullCount)
146
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
137
147
}
138
148
139
149
private [sql] class FloatColumnStats extends ColumnStats {
140
- var upper = Float .MinValue
141
- var lower = Float .MaxValue
142
- var nullCount = 0
150
+ protected var upper = Float .MinValue
151
+ protected var lower = Float .MaxValue
143
152
144
153
override def gatherStats (row : Row , ordinal : Int ): Unit = {
145
154
if (! row.isNullAt(ordinal)) {
@@ -149,15 +158,16 @@ private[sql] class FloatColumnStats extends ColumnStats {
149
158
} else {
150
159
nullCount += 1
151
160
}
161
+ count += 1
162
+ sizeInBytes += FLOAT .defaultSize
152
163
}
153
164
154
- def collectedStatistics = Row (lower, upper, nullCount)
165
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
155
166
}
156
167
157
168
private [sql] class IntColumnStats extends ColumnStats {
158
- var upper = Int .MinValue
159
- var lower = Int .MaxValue
160
- var nullCount = 0
169
+ protected var upper = Int .MinValue
170
+ protected var lower = Int .MaxValue
161
171
162
172
override def gatherStats (row : Row , ordinal : Int ): Unit = {
163
173
if (! row.isNullAt(ordinal)) {
@@ -167,15 +177,16 @@ private[sql] class IntColumnStats extends ColumnStats {
167
177
} else {
168
178
nullCount += 1
169
179
}
180
+ count += 1
181
+ sizeInBytes += INT .defaultSize
170
182
}
171
183
172
- def collectedStatistics = Row (lower, upper, nullCount)
184
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
173
185
}
174
186
175
187
private [sql] class StringColumnStats extends ColumnStats {
176
- var upper : String = null
177
- var lower : String = null
178
- var nullCount = 0
188
+ protected var upper : String = null
189
+ protected var lower : String = null
179
190
180
191
override def gatherStats (row : Row , ordinal : Int ): Unit = {
181
192
if (! row.isNullAt(ordinal)) {
@@ -185,15 +196,16 @@ private[sql] class StringColumnStats extends ColumnStats {
185
196
} else {
186
197
nullCount += 1
187
198
}
199
+ count += 1
200
+ sizeInBytes += STRING .actualSize(row, ordinal)
188
201
}
189
202
190
- def collectedStatistics = Row (lower, upper, nullCount)
203
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
191
204
}
192
205
193
206
private [sql] class DateColumnStats extends ColumnStats {
194
- var upper : Date = null
195
- var lower : Date = null
196
- var nullCount = 0
207
+ protected var upper : Date = null
208
+ protected var lower : Date = null
197
209
198
210
override def gatherStats (row : Row , ordinal : Int ) {
199
211
if (! row.isNullAt(ordinal)) {
@@ -203,15 +215,16 @@ private[sql] class DateColumnStats extends ColumnStats {
203
215
} else {
204
216
nullCount += 1
205
217
}
218
+ count += 1
219
+ sizeInBytes += DATE .defaultSize
206
220
}
207
221
208
- def collectedStatistics = Row (lower, upper, nullCount)
222
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
209
223
}
210
224
211
225
private [sql] class TimestampColumnStats extends ColumnStats {
212
- var upper : Timestamp = null
213
- var lower : Timestamp = null
214
- var nullCount = 0
226
+ protected var upper : Timestamp = null
227
+ protected var lower : Timestamp = null
215
228
216
229
override def gatherStats (row : Row , ordinal : Int ): Unit = {
217
230
if (! row.isNullAt(ordinal)) {
@@ -221,7 +234,9 @@ private[sql] class TimestampColumnStats extends ColumnStats {
221
234
} else {
222
235
nullCount += 1
223
236
}
237
+ count += 1
238
+ sizeInBytes += TIMESTAMP .defaultSize
224
239
}
225
240
226
- def collectedStatistics = Row (lower, upper, nullCount)
241
+ def collectedStatistics = Row (lower, upper, nullCount, count, sizeInBytes )
227
242
}
0 commit comments