18
18
package org .apache .spark .sql .execution
19
19
20
20
import java .util .Locale
21
+ import java .util .function .Supplier
21
22
22
23
import scala .collection .mutable
23
24
@@ -414,6 +415,58 @@ object WholeStageCodegenExec {
414
415
}
415
416
}
416
417
418
/**
 * Hands out codegenStageId values: IDs for codegen stages within a query plan.
 *
 * The ID does not affect equality, nor does it participate in destructuring pattern matching
 * of WholeStageCodegenExec.
 *
 * The ID helps differentiate between codegen stages. It is included as a part of the explain
 * output for physical plans, e.g.
 *
 * {{{
 * == Physical Plan ==
 * *(5) SortMergeJoin [x#3L], [y#9L], Inner
 * :- *(2) Sort [x#3L ASC NULLS FIRST], false, 0
 * :  +- Exchange hashpartitioning(x#3L, 200)
 * :     +- *(1) Project [(id#0L % 2) AS x#3L]
 * :        +- *(1) Filter isnotnull((id#0L % 2))
 * :           +- *(1) Range (0, 5, step=1, splits=8)
 * +- *(4) Sort [y#9L ASC NULLS FIRST], false, 0
 *    +- Exchange hashpartitioning(y#9L, 200)
 *       +- *(3) Project [(id#6L % 2) AS y#9L]
 *          +- *(3) Filter isnotnull((id#6L % 2))
 *             +- *(3) Range (0, 5, step=1, splits=8)
 * }}}
 *
 * where the ID makes it obvious that not all adjacent codegen'd plan operators are of the
 * same codegen stage.
 *
 * The codegen stage ID is also optionally included in the name of the generated classes as
 * a suffix, so that it's easier to associate a generated class back to the physical operator.
 * This is controlled by SQLConf: spark.sql.codegen.useIdInClassName
 *
 * The ID is also included in various log messages.
 *
 * Within a query, a codegen stage in a plan starts counting from 1, in "insertion order".
 * WholeStageCodegenExec operators are inserted into a plan in depth-first post-order.
 * See CollapseCodegenStages.insertWholeStageCodegen for the definition of insertion order.
 *
 * 0 is reserved as a special ID value to indicate a temporary WholeStageCodegenExec object
 * is created, e.g. for special fallback handling when an existing WholeStageCodegenExec
 * failed to generate/compile code.
 */
object WholeStageCodegenId {

  // Per-thread counter holding the next ID to hand out; every thread starts at 1.
  private val codegenStageCounter: ThreadLocal[Integer] =
    ThreadLocal.withInitial(new Supplier[Integer] {
      // TODO: change to Scala lambda syntax when upgraded to Scala 2.12+
      override def get(): Integer = 1
    })

  /** Rewinds the calling thread's counter so that the next ID handed out is 1 again. */
  def resetPerQuery(): Unit = {
    codegenStageCounter.set(1)
  }

  /** Returns the calling thread's current ID and advances its counter by one. */
  def getNextStageId(): Int = {
    val current: Int = codegenStageCounter.get()
    codegenStageCounter.set(current + 1)
    current
  }
}
469
+
417
470
/**
418
471
* WholeStageCodegen compiles a subtree of plans that support codegen together into single Java
419
472
* function.
@@ -442,7 +495,8 @@ object WholeStageCodegenExec {
442
495
* `doCodeGen()` will create a `CodeGenContext`, which will hold a list of variables for input,
443
496
* used to generate code for [[BoundReference]].
444
497
*/
445
- case class WholeStageCodegenExec (child : SparkPlan ) extends UnaryExecNode with CodegenSupport {
498
+ case class WholeStageCodegenExec (child : SparkPlan )(val codegenStageId : Int )
499
+ extends UnaryExecNode with CodegenSupport {
446
500
447
501
override def output : Seq [Attribute ] = child.output
448
502
@@ -454,6 +508,12 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
454
508
" pipelineTime" -> SQLMetrics .createTimingMetric(sparkContext,
455
509
WholeStageCodegenExec .PIPELINE_DURATION_METRIC ))
456
510
511
+ def generatedClassName (): String = if (conf.wholeStageUseIdInClassName) {
512
+ s " GeneratedIteratorForCodegenStage $codegenStageId"
513
+ } else {
514
+ " GeneratedIterator"
515
+ }
516
+
457
517
/**
458
518
* Generates code for this subtree.
459
519
*
@@ -471,19 +531,23 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
471
531
}
472
532
""" , inlineToOuterClass = true )
473
533
534
+ val className = generatedClassName()
535
+
474
536
val source = s """
475
537
public Object generate(Object[] references) {
476
- return new GeneratedIterator (references);
538
+ return new $className (references);
477
539
}
478
540
479
- ${ctx.registerComment(s """ Codegend pipeline for\n ${child.treeString.trim}""" )}
480
- final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator {
541
+ ${ctx.registerComment(
542
+ s """ Codegend pipeline for stage (id= $codegenStageId)
543
+ | ${this .treeString.trim}""" .stripMargin)}
544
+ final class $className extends ${classOf [BufferedRowIterator ].getName} {
481
545
482
546
private Object[] references;
483
547
private scala.collection.Iterator[] inputs;
484
548
${ctx.declareMutableStates()}
485
549
486
- public GeneratedIterator (Object[] references) {
550
+ public $className (Object[] references) {
487
551
this.references = references;
488
552
}
489
553
@@ -516,7 +580,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
516
580
} catch {
517
581
case _ : Exception if ! Utils .isTesting && sqlContext.conf.codegenFallback =>
518
582
// We should have already seen the error message
519
- logWarning(s " Whole-stage codegen disabled for this plan: \n $treeString" )
583
+ logWarning(s " Whole-stage codegen disabled for plan (id= $codegenStageId ) :\n $treeString" )
520
584
return child.execute()
521
585
}
522
586
@@ -525,7 +589,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
525
589
logInfo(s " Found too long generated codes and JIT optimization might not work: " +
526
590
s " the bytecode size ( $maxCodeSize) is above the limit " +
527
591
s " ${sqlContext.conf.hugeMethodLimit}, and the whole-stage codegen was disabled " +
528
- s " for this plan. To avoid this, you can raise the limit " +
592
+ s " for this plan (id= $codegenStageId ) . To avoid this, you can raise the limit " +
529
593
s " ` ${SQLConf .WHOLESTAGE_HUGE_METHOD_LIMIT .key}`: \n $treeString" )
530
594
child match {
531
595
// The fallback solution of batch file source scan still uses WholeStageCodegenExec
@@ -603,10 +667,12 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
603
667
verbose : Boolean ,
604
668
prefix : String = " " ,
605
669
addSuffix : Boolean = false ): StringBuilder = {
606
- child.generateTreeString(depth, lastChildren, builder, verbose, " * " )
670
+ child.generateTreeString(depth, lastChildren, builder, verbose, s " *( $codegenStageId ) " )
607
671
}
608
672
609
673
// This node unconditionally reports that a stop check is needed.
override def needStopCheck : Boolean = true
674
+
675
+ override protected def otherCopyArgs : Seq [AnyRef ] = Seq (codegenStageId.asInstanceOf [Integer ])
610
676
}
611
677
612
678
@@ -657,13 +723,14 @@ case class CollapseCodegenStages(conf: SQLConf) extends Rule[SparkPlan] {
657
723
case plan if plan.output.length == 1 && plan.output.head.dataType.isInstanceOf [ObjectType ] =>
658
724
plan.withNewChildren(plan.children.map(insertWholeStageCodegen))
659
725
case plan : CodegenSupport if supportCodegen(plan) =>
660
- WholeStageCodegenExec (insertInputAdapter(plan))
726
+ WholeStageCodegenExec (insertInputAdapter(plan))( WholeStageCodegenId .getNextStageId())
661
727
case other =>
662
728
other.withNewChildren(other.children.map(insertWholeStageCodegen))
663
729
}
664
730
665
731
def apply (plan : SparkPlan ): SparkPlan = {
666
732
if (conf.wholeStageEnabled) {
733
+ WholeStageCodegenId .resetPerQuery()
667
734
insertWholeStageCodegen(plan)
668
735
} else {
669
736
plan
0 commit comments