package org.apache.spark.sql.parquet

 import scala.collection.mutable.{Buffer, ArrayBuffer, HashMap}
-import scala.reflect.ClassTag
-import scala.reflect.runtime.universe.runtimeMirror

 import parquet.io.api.{PrimitiveConverter, GroupConverter, Binary, Converter}
 import parquet.schema.MessageType

 import org.apache.spark.sql.catalyst.types._
-import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row, Attribute}
+import org.apache.spark.sql.catalyst.expressions.{NativeRow, GenericRow, Row, Attribute}
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
-import org.apache.spark.util.Utils

 private[parquet] object CatalystConverter {
   // The type internally used for fields
@@ -83,7 +80,7 @@ private[parquet] object CatalystConverter {
     val attributes = ParquetTypesConverter.convertToAttributes(parquetSchema)
     // For non-nested types we use the optimized Row converter
     if (attributes.forall(a => ParquetTypesConverter.isPrimitiveType(a.dataType))) {
-      new MutableRowGroupConverter(attributes)
+      new PrimitiveRowGroupConverter(attributes)
     } else {
       new CatalystGroupConverter(attributes)
     }
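
Aside on the dispatch above: the factory only takes the optimized flat path when every top-level field is primitive; any nested field forces the general group converter. A minimal standalone sketch of that selection logic, using simplified stand-in types rather than the actual Catalyst classes:

```scala
// Stand-in types, not the real Spark SQL DataType hierarchy.
sealed trait DataType
case object IntType extends DataType
case object StringType extends DataType
case class ArrayType(element: DataType) extends DataType

object ConverterSelection {
  // A field is "primitive" when it carries no nested structure.
  def isPrimitive(t: DataType): Boolean = t match {
    case ArrayType(_) => false
    case _            => true
  }

  def chooseConverter(fields: Seq[DataType]): String =
    if (fields.forall(isPrimitive)) "PrimitiveRowGroupConverter" // optimized flat path
    else "CatalystGroupConverter"                                // general nested path

  def main(args: Array[String]): Unit = {
    println(chooseConverter(Seq(IntType, StringType)))         // optimized
    println(chooseConverter(Seq(IntType, ArrayType(IntType)))) // general
  }
}
```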
@@ -170,6 +167,9 @@ private[parquet] class CatalystGroupConverter(
   def getCurrentRecord: Row = {
     assert(isRootConverter, "getCurrentRecord should only be called in root group converter!")
     // TODO: use iterators if possible
+    // Note: this will only ever be called in the root converter, once the record has been
+    // fully processed. It would therefore be difficult to use mutable rows instead, since
+    // a non-root converter could never be sure when it is safe to re-use its buffer.
     new GenericRow(current.toArray)
   }
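
A short sketch of the aliasing hazard that new comment describes, with plain collections as hypothetical stand-ins for parent and child converters: once the parent holds a reference to the child's storage, re-using that storage corrupts the already-delivered record, which is why the code copies into a fresh `GenericRow`:

```scala
import scala.collection.mutable.ArrayBuffer

object ReuseHazard {
  def main(args: Array[String]): Unit = {
    val childBuffer = ArrayBuffer[Any]("a", "b")
    val parentFields = ArrayBuffer[Any]()

    parentFields += childBuffer // the parent stores a live reference
    childBuffer.clear()         // the child "re-uses" its storage...
    childBuffer += "c"

    println(parentFields.head)  // ...and the parent now sees ArrayBuffer(c)
  }
}
```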
@@ -180,14 +180,9 @@ private[parquet] class CatalystGroupConverter(
     current.update(fieldIndex, value)
   }

-  override protected[parquet] def clearBuffer(): Unit = {
-    // TODO: reuse buffer?
-    buffer = new ArrayBuffer[Row](CatalystArrayConverter.INITIAL_ARRAY_SIZE)
-  }
+  override protected[parquet] def clearBuffer(): Unit = buffer.clear()

   override def start(): Unit = {
-    // TODO: reuse buffer?
-    // Allocate new array in the root converter (others will be called clearBuffer() on)
     current = ArrayBuffer.fill(schema.length)(null)
     converters.foreach {
       converter => if (!converter.isPrimitive) {
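
The rewritten `clearBuffer` above trades a fresh allocation per record group for `clear()`, which resets the length while (in the standard library's `ArrayBuffer`) retaining the already-grown backing array. A small illustrative sketch, independent of the Spark classes:

```scala
import scala.collection.mutable.ArrayBuffer

object BufferReuse {
  def main(args: Array[String]): Unit = {
    val buffer = new ArrayBuffer[Any](16)
    buffer ++= Seq(1, 2, 3)

    // clear() zeroes the length but keeps the backing storage, so the
    // next record group appends without re-allocating from scratch.
    buffer.clear()
    assert(buffer.isEmpty)

    buffer += 4
    println(buffer) // ArrayBuffer(4)
  }
}
```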
@@ -196,12 +191,10 @@ private[parquet] class CatalystGroupConverter(
     }
   }

-  // TODO: think about reusing the buffer
   override def end(): Unit = {
     if (!isRootConverter) {
       assert(current != null) // there should be no empty groups
       buffer.append(new GenericRow(current.toArray))
-      // TODO: use iterators if possible, avoid Row wrapping
       parent.updateField(index, new GenericRow(buffer.toArray.asInstanceOf[Array[Any]]))
     }
   }
@@ -212,7 +205,7 @@ private[parquet] class CatalystGroupConverter(
  * to a [[org.apache.spark.sql.catalyst.expressions.Row]] object. Note that this
  * converter is optimized for rows of primitive types (non-nested records).
  */
-private[parquet] class MutableRowGroupConverter(
+private[parquet] class PrimitiveRowGroupConverter(
     protected[parquet] val schema: Seq[FieldType],
     protected[parquet] var current: ParquetRelation.RowType)
   extends GroupConverter with CatalystConverter {
@@ -334,7 +327,7 @@ object CatalystArrayConverter {
  * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
  * [[org.apache.spark.sql.catalyst.types.ArrayType]].
  *
- * @param elementType The type of the array elements
+ * @param elementType The type of the array elements (complex or primitive)
  * @param index The position of this (array) field inside its parent converter
  * @param parent The parent converter
  * @param buffer A data buffer
@@ -345,8 +338,6 @@ private[parquet] class CatalystArrayConverter(
     protected[parquet] val parent: CatalystConverter,
     protected[parquet] var buffer: Buffer[Any])
   extends GroupConverter with CatalystConverter {
-  // TODO: In the future consider using native arrays instead of buffer for
-  // primitive types for performance reasons

   def this(elementType: DataType, index: Int, parent: CatalystConverter) =
     this(
@@ -374,8 +365,7 @@ private[parquet] class CatalystArrayConverter(
   }

   override protected[parquet] def clearBuffer(): Unit = {
-    // TODO: reuse buffer?
-    buffer = new ArrayBuffer[Any](CatalystArrayConverter.INITIAL_ARRAY_SIZE)
+    buffer.clear()
   }

   override def start(): Unit = {
@@ -384,10 +374,8 @@ private[parquet] class CatalystArrayConverter(
     }
   }

-  // TODO: think about reusing the buffer
   override def end(): Unit = {
     assert(parent != null)
-    // TODO: use iterators if possible, avoid Row wrapping
     parent.updateField(index, new GenericRow(buffer.toArray))
     clearBuffer()
   }
@@ -396,20 +384,27 @@ private[parquet] class CatalystArrayConverter(
   override def getCurrentRecord: Row = throw new UnsupportedOperationException
 }

-private[parquet] class CatalystNativeArrayConverter[T <: NativeType](
+/**
+ * A `parquet.io.api.GroupConverter` that converts single-element groups that
+ * match the characteristics of an array (see
+ * [[org.apache.spark.sql.parquet.ParquetTypesConverter]]) into an
+ * [[org.apache.spark.sql.catalyst.types.ArrayType]].
+ *
+ * @param elementType The type of the array elements (native)
+ * @param index The position of this (array) field inside its parent converter
+ * @param parent The parent converter
+ * @param capacity The (initial) capacity of the buffer
+ */
+private[parquet] class CatalystNativeArrayConverter(
     val elementType: NativeType,
     val index: Int,
     protected[parquet] val parent: CatalystConverter,
     protected[parquet] var capacity: Int = CatalystArrayConverter.INITIAL_ARRAY_SIZE)
   extends GroupConverter with CatalystConverter {

-  // similar comment as in [[Decoder]]: this should probably be in NativeType
-  private val classTag = {
-    val mirror = runtimeMirror(Utils.getSparkClassLoader)
-    ClassTag[T#JvmType](mirror.runtimeClass(elementType.tag.tpe))
-  }
+  type nativeType = elementType.JvmType

-  private var buffer: Array[T#JvmType] = classTag.newArray(capacity)
+  private var buffer: Array[nativeType] = elementType.classTag.newArray(capacity)

   private var elements: Int = 0
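
The runtime-mirror machinery removed above is replaced by a `classTag` assumed to be provided by `NativeType` itself. The mechanism it relies on is standard: `ClassTag.newArray` allocates an array of the captured runtime class, so primitive element types get genuine unboxed JVM arrays. A self-contained sketch:

```scala
import scala.reflect.ClassTag

object ClassTagArrays {
  // newArray uses the runtime class captured by the ClassTag, so for Int it
  // returns a genuine JVM int[] rather than an array of boxed Integers.
  def allocate[T: ClassTag](capacity: Int): Array[T] =
    implicitly[ClassTag[T]].newArray(capacity)

  def main(args: Array[String]): Unit = {
    val ints: Array[Int] = allocate[Int](4)
    ints(0) = 42
    println(ints.mkString(",")) // 42,0,0,0
  }
}
```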
@@ -432,43 +427,43 @@ private[parquet] class CatalystNativeArrayConverter[T <: NativeType](
   // Overridden here to avoid auto-boxing for primitive types
   override protected[parquet] def updateBoolean(fieldIndex: Int, value: Boolean): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[T#JvmType]
+    buffer(elements) = value.asInstanceOf[nativeType]
     elements += 1
   }

   override protected[parquet] def updateInt(fieldIndex: Int, value: Int): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[T#JvmType]
+    buffer(elements) = value.asInstanceOf[nativeType]
     elements += 1
   }

   override protected[parquet] def updateLong(fieldIndex: Int, value: Long): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[T#JvmType]
+    buffer(elements) = value.asInstanceOf[nativeType]
     elements += 1
   }

   override protected[parquet] def updateDouble(fieldIndex: Int, value: Double): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[T#JvmType]
+    buffer(elements) = value.asInstanceOf[nativeType]
     elements += 1
   }

   override protected[parquet] def updateFloat(fieldIndex: Int, value: Float): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.asInstanceOf[T#JvmType]
+    buffer(elements) = value.asInstanceOf[nativeType]
     elements += 1
   }

   override protected[parquet] def updateBinary(fieldIndex: Int, value: Binary): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.getBytes.asInstanceOf[T#JvmType]
+    buffer(elements) = value.getBytes.asInstanceOf[nativeType]
     elements += 1
   }

   override protected[parquet] def updateString(fieldIndex: Int, value: Binary): Unit = {
     checkGrowBuffer()
-    buffer(elements) = value.toStringUsingUTF8.asInstanceOf[T#JvmType]
+    buffer(elements) = value.toStringUsingUTF8.asInstanceOf[nativeType]
     elements += 1
   }
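
The per-type overloads above exist so values land in the primitive-array buffer without passing through `Any`. A hypothetical side-by-side sketch of the difference (method names here are illustrative, not Spark's):

```scala
object BoxingDemo {
  private val buffer = new Array[Int](8)
  private var elements = 0

  // A single generic update forces every Int through a boxed Any...
  def updateGeneric(value: Any): Unit = {
    buffer(elements) = value.asInstanceOf[Int] // unboxed again on the way in
    elements += 1
  }

  // ...while a typed overload keeps the value an unboxed JVM int throughout.
  def updateInt(value: Int): Unit = {
    buffer(elements) = value
    elements += 1
  }

  def main(args: Array[String]): Unit = {
    updateInt(1)
    updateGeneric(2)
    println(buffer.take(elements).mkString(",")) // 1,2
  }
}
```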
@@ -482,12 +477,7 @@ private[parquet] class CatalystNativeArrayConverter[T <: NativeType](
     assert(parent != null)
     parent.updateField(
       index,
-      new GenericRow {
-        // TODO: it would be much nicer to use a view here but GenericRow requires an Array
-        // TODO: we should avoid using GenericRow as a wrapper but [[GetField]] currently
-        // requires that
-        override val values = buffer.slice(0, elements).map(_.asInstanceOf[Any])
-      })
+      new NativeRow[nativeType](buffer.slice(0, elements)))
     clearBuffer()
   }
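
`NativeRow`'s actual interface isn't shown in this diff; as a rough, assumed sketch, the point of such a wrapper is to hold the primitive array directly instead of eagerly copying each element into the boxed `Array[Any]` that `GenericRow` requires:

```scala
// Hypothetical stand-in, not the actual NativeRow class.
class NativeRowSketch[T](private val values: Array[T]) {
  def length: Int = values.length
  def apply(i: Int): Any = values(i) // boxing happens per access, not per build
}

object NativeRowDemo {
  def main(args: Array[String]): Unit = {
    val row = new NativeRowSketch(Array(1, 2, 3)) // wraps the int[] directly
    println(row(1)) // 2
  }
}
```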
@@ -497,7 +487,7 @@ private[parquet] class CatalystNativeArrayConverter[T <: NativeType](
   private def checkGrowBuffer(): Unit = {
     if (elements >= capacity) {
       val newCapacity = 2 * capacity
-      val tmp: Array[T#JvmType] = classTag.newArray(newCapacity)
+      val tmp: Array[nativeType] = elementType.classTag.newArray(newCapacity)
       Array.copy(buffer, 0, tmp, 0, capacity)
       buffer = tmp
       capacity = newCapacity
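
`checkGrowBuffer` is the classic copy-and-swap doubling strategy, giving amortized constant-time appends. The same logic as a standalone sketch with a fixed `Int` element type:

```scala
object GrowBuffer {
  private var capacity = 4
  private var elements = 0
  private var buffer = new Array[Int](capacity)

  // Double the capacity when full: n appends trigger O(log n) resizes moving
  // O(n) elements in total, i.e. amortized constant time per append.
  private def checkGrow(): Unit =
    if (elements >= capacity) {
      val newCapacity = 2 * capacity
      val tmp = new Array[Int](newCapacity)
      Array.copy(buffer, 0, tmp, 0, capacity)
      buffer = tmp
      capacity = newCapacity
    }

  def append(v: Int): Unit = {
    checkGrow()
    buffer(elements) = v
    elements += 1
  }

  def main(args: Array[String]): Unit = {
    (1 to 10).foreach(append)
    println(s"elements=$elements capacity=$capacity") // elements=10 capacity=16
  }
}
```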