apache · bdrillard · Oct 17, 2017 · Oct 17, 2017 · cloud-fan · Nov 10, 2017
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -801,12 +801,12 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
   private[this] def castToByteCode(from: DataType, ctx: CodegenContext): CastFunction = from match {
     case StringType =>
       val wrapper = ctx.freshName("wrapper")
-      ctx.addMutableState("UTF8String.IntWrapper", wrapper,
+      val wrapperAccessor = ctx.addMutableState("UTF8String.IntWrapper", wrapper,
         s"$wrapper = new UTF8String.IntWrapper();")
       (c, evPrim, evNull) =>
         s"""
-          if ($c.toByte($wrapper)) {
-            $evPrim = (byte) $wrapper.value;
+          if ($c.toByte($wrapperAccessor)) {
+            $evPrim = (byte) $wrapperAccessor.value;
           } else {
             $evNull = true;
           }
@@ -828,12 +828,12 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
       ctx: CodegenContext): CastFunction = from match {
     case StringType =>
       val wrapper = ctx.freshName("wrapper")
-      ctx.addMutableState("UTF8String.IntWrapper", wrapper,
+      val wrapperAccessor = ctx.addMutableState("UTF8String.IntWrapper", wrapper,
         s"$wrapper = new UTF8String.IntWrapper();")
       (c, evPrim, evNull) =>
         s"""
-          if ($c.toShort($wrapper)) {
-            $evPrim = (short) $wrapper.value;
+          if ($c.toShort($wrapperAccessor)) {
+            $evPrim = (short) $wrapperAccessor.value;
           } else {
             $evNull = true;
           }
@@ -853,12 +853,12 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
   private[this] def castToIntCode(from: DataType, ctx: CodegenContext): CastFunction = from match {
     case StringType =>
       val wrapper = ctx.freshName("wrapper")
-      ctx.addMutableState("UTF8String.IntWrapper", wrapper,
+      val wrapperAccessor = ctx.addMutableState("UTF8String.IntWrapper", wrapper,
         s"$wrapper = new UTF8String.IntWrapper();")
       (c, evPrim, evNull) =>
         s"""
-          if ($c.toInt($wrapper)) {
-            $evPrim = $wrapper.value;
+          if ($c.toInt($wrapperAccessor)) {
+            $evPrim = $wrapperAccessor.value;
           } else {
             $evNull = true;
           }
@@ -878,13 +878,13 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
   private[this] def castToLongCode(from: DataType, ctx: CodegenContext): CastFunction = from match {
     case StringType =>
       val wrapper = ctx.freshName("wrapper")
-      ctx.addMutableState("UTF8String.LongWrapper", wrapper,
+      val wrapperAccessor = ctx.addMutableState("UTF8String.LongWrapper", wrapper,
         s"$wrapper = new UTF8String.LongWrapper();")
 
       (c, evPrim, evNull) =>
         s"""
-          if ($c.toLong($wrapper)) {
-            $evPrim = $wrapper.value;
+          if ($c.toLong($wrapperAccessor)) {
+            $evPrim = $wrapperAccessor.value;
           } else {
             $evNull = true;
           }

diff --git a/.../src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala b/.../src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala
@@ -67,14 +67,15 @@ case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterminis
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val countTerm = ctx.freshName("count")
     val partitionMaskTerm = ctx.freshName("partitionMask")
-    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "")
-    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "")
-    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
-    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")
+    val countTermAccessor = ctx.addMutableState(ctx.JAVA_LONG, countTerm, "")
+    val partitionMaskTermAccessor = ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "")
+    ctx.addPartitionInitializationStatement(s"$countTermAccessor = 0L;")
+    ctx.addPartitionInitializationStatement(
+      s"$partitionMaskTermAccessor = ((long) partitionIndex) << 33;")
 
     ev.copy(code = s"""
-      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
-      $countTerm++;""", isNull = "false")
+      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTermAccessor + $countTermAccessor;
+      $countTermAccessor++;""", isNull = "false")
   }
 
   override def prettyName: String = "monotonically_increasing_id"

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -991,11 +991,11 @@ case class ScalaUDF(
 
     val converterTerm = ctx.freshName("converter")
     val expressionIdx = ctx.references.size - 1
-    ctx.addMutableState(converterClassName, converterTerm,
+    val converterTermAccessor = ctx.addMutableState(converterClassName, converterTerm,
       s"$converterTerm = ($converterClassName)$typeConvertersClassName" +
         s".createToScalaConverter(((${expressionClassName})((($scalaUDFClassName)" +
           s"references[$expressionIdx]).getChildren().apply($index))).dataType());")
-    converterTerm
+    converterTermAccessor
   }
 
   override def doGenCode(
@@ -1008,8 +1008,9 @@ case class ScalaUDF(
 
     // Generate codes used to convert the returned value of user-defined functions to Catalyst type
     val catalystConverterTerm = ctx.freshName("catalystConverter")
-    ctx.addMutableState(converterClassName, catalystConverterTerm,
-      s"$catalystConverterTerm = ($converterClassName)$typeConvertersClassName" +
+    val catalystConverterTermAccessor =
+      ctx.addMutableState(converterClassName, catalystConverterTerm,
+        s"$catalystConverterTerm = ($converterClassName)$typeConvertersClassName" +
         s".createToCatalystConverter($scalaUDF.dataType());")
 
     val resultTerm = ctx.freshName("result")
@@ -1022,7 +1023,7 @@ case class ScalaUDF(
     val funcClassName = s"scala.Function${children.size}"
 
     val funcTerm = ctx.freshName("udf")
-    ctx.addMutableState(funcClassName, funcTerm,
+    val funcTermAccessor = ctx.addMutableState(funcClassName, funcTerm,
       s"$funcTerm = ($funcClassName)$scalaUDF.userDefinedFunc();")
 
     // codegen for children expressions
@@ -1040,12 +1041,13 @@ case class ScalaUDF(
       (convert, argTerm)
     }.unzip
 
-    val getFuncResult = s"$funcTerm.apply(${funcArguments.mkString(", ")})"
+    val getFuncResult = s"$funcTermAccessor.apply(${funcArguments.mkString(", ")})"
     val callFunc =
       s"""
          ${ctx.boxedType(dataType)} $resultTerm = null;
          try {
-           $resultTerm = (${ctx.boxedType(dataType)})$catalystConverterTerm.apply($getFuncResult);
+           $resultTerm = (${ctx.boxedType(dataType)}) $catalystConverterTermAccessor
+             .apply($getFuncResult);
          } catch (Exception e) {
            throw new org.apache.spark.SparkException($scalaUDF.udfErrorMessage(), e);
          }

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala
@@ -44,8 +44,11 @@ case class SparkPartitionID() extends LeafExpression with Nondeterministic {
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val idTerm = ctx.freshName("partitionId")
-    ctx.addMutableState(ctx.JAVA_INT, idTerm, "")
-    ctx.addPartitionInitializationStatement(s"$idTerm = partitionIndex;")
-    ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = $idTerm;", isNull = "false")
+    // `addMutableState` may compact this state into a shared array, in which case `idTerm` is
+    // never declared; every generated statement must go through the returned accessor.
+    val idTermAccessor = ctx.addMutableState(ctx.JAVA_INT, idTerm, "")
+    ctx.addPartitionInitializationStatement(s"$idTermAccessor = partitionIndex;")
+    ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = $idTermAccessor;",
+      isNull = "false")
   }
 }
diff --git a/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/...lyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
@@ -113,8 +113,9 @@ class CodegenContext {
     val idx = references.length
     references += obj
     val clsName = Option(className).getOrElse(obj.getClass.getName)
-    addMutableState(clsName, term, s"$term = ($clsName) references[$idx];")
-    term
+    val termAccessor = addMutableState(clsName, term, s"$term = ($clsName) references[$idx];")
+
+    termAccessor
   }
 
   /**
@@ -148,44 +149,150 @@ class CodegenContext {
    *
    * They will be kept as member variables in generated classes like `SpecificProjection`.
    */
-  val mutableStates: mutable.ArrayBuffer[(String, String, String)] =
-    mutable.ArrayBuffer.empty[(String, String, String)]
+  val mutableStates: mutable.ListBuffer[(String, String, String)] =
+    mutable.ListBuffer.empty[(String, String, String)]
+
+  // A map keyed by the tuple of mutable states' type and initialization code; holds the
+  // current max index of the corresponding state array
+  var mutableStateArrayIdx: mutable.Map[(String, String), Int] =
+    mutable.Map.empty[(String, String), Int]
+
+  // A map keyed by the tuple of mutable states' type and initialization code; holds the name
+  // of the mutableStateArray into which state of the given key will be compacted
+  var mutableStateArrayNames: mutable.Map[(String, String), String] =
+    mutable.Map.empty[(String, String), String]
+
+  // A map keyed by the tuple of mutable states' type and initialization code; holds the code
+  // that initializes one element (`arrayName[i]`) of the mutableStateArray inside a loop
+  var mutableStateArrayInitCodes: mutable.Map[(String, String), String] =
+    mutable.Map.empty[(String, String), String]
+
+  /**
+   * Adds an instance of globally-accessible mutable state. Mutable state may either be inlined
+   * as a private member variable to the class, or it may be compacted into arrays of the same
+   * type and initialization in order to avoid Constant Pool limit errors for both state declaration
+   * and initialization.
+   *
+   * We compact state into arrays when we can anticipate variables of the same type and `initCode`
+   * may appear numerous times. Variable names with integer suffixes (as given by the `freshName`
+   * function), that are either simply assigned (null, to the empty/base constructor of the type, or
+   * having no initialization) or are primitive are workable candidates for array compaction, as
+   * these variable types are likely to appear numerous times, and can be easily initialized in
+   * loops.
+   *
+   * @param javaType the Java type of the state variable, as written in the generated code
+   * @param variableName the requested name; it is only declared when the state is inlined
+   * @param initCode the initialization code for the variable, written against `variableName`
+   * @param inline whether the declaration and initialization code should be inlined rather than
+   *               compacted
+   * @return the term generated code must use to access the state: the original name if the
+   *         variable is inlined to the class, or an array access if the variable is stored
+   *         in an array of variables of the same type and initialization.
+   */
+  def addMutableState(
+    javaType: String,
+    variableName: String,
+    initCode: String,
+    inline: Boolean = false): String = {
+    if (!inline &&
+      // the name must look like `freshName` output: word characters (underscores included)
+      // that end in at least one digit.
+      variableName.matches("[\\w_]+\\d+") &&
+      // init code is empty, `x = null;` or `x = new T();`, or the state has a primitive type
+      (initCode.matches("(^[\\w_]+\\d+\\s*=\\s*null;|"
+        + "^[\\w_]+\\d+\\s*=\\s*new\\s*[\\w\\.]+\\(\\);$|"
+        + "^$)")
+        || isPrimitiveType(javaType))) {
+
+      // Swap the variable name for a placeholder so identical init code shares one map key
+      val initCodeKey = initCode.replaceAll(variableName, "*VALUE*")
+
+      if (mutableStateArrayIdx.contains((javaType, initCodeKey))) {
+        // a mutableStateArray for the given type and initialization has already been declared:
+        // bump the max index of the array and return the array-based alias for the variable
+        val arrayName = mutableStateArrayNames((javaType, initCodeKey))
+        val idx = mutableStateArrayIdx((javaType, initCodeKey)) + 1
+
+        mutableStateArrayIdx.update((javaType, initCodeKey), idx)
+
+        s"$arrayName[$idx]"
+      } else {
+        // no mutableStateArray has been declared yet for the given type and initialization code.
+        // Create a new name for the array, and record the array name, its current max index (0),
+        // and the init code rewritten against `arrayName[i]` for use inside a loop
+        val arrayName = freshName("mutableStateArray")
+        val qualifiedInitCode = initCode.replaceAll(variableName, s"$arrayName[i]")
+        mutableStateArrayNames += (javaType, initCodeKey) -> arrayName
+        mutableStateArrayIdx += (javaType, initCodeKey) -> 0
+        mutableStateArrayInitCodes += (javaType, initCodeKey) -> qualifiedInitCode
+
+        s"$arrayName[0]"
+      }
+    } else {
+      // not compacted (forced inline, no numeric name suffix, or complex init): declare inline
+      mutableStates += Tuple3(javaType, variableName, initCode)
 
-  def addMutableState(javaType: String, variableName: String, initCode: String): Unit = {
-    mutableStates += ((javaType, variableName, initCode))
+      variableName
+    }
   }
 
+
   /**
    * Add buffer variable which stores data coming from an [[InternalRow]]. This methods guarantees
    * that the variable is safely stored, which is important for (potentially) byte array backed
    * data types like: UTF8String, ArrayData, MapData & InternalRow.
    */
   def addBufferedState(dataType: DataType, variableName: String, initCode: String): ExprCode = {
-    val value = freshName(variableName)
-    addMutableState(javaType(dataType), value, "")
+    val valueAccessor = addMutableState(javaType(dataType), freshName(variableName), "")
     val code = dataType match {
-      case StringType => s"$value = $initCode.clone();"
-      case _: StructType | _: ArrayType | _: MapType => s"$value = $initCode.copy();"
-      case _ => s"$value = $initCode;"
+      case StringType => s"$valueAccessor = $initCode.clone();"
+      case _: StructType | _: ArrayType | _: MapType => s"$valueAccessor = $initCode.copy();"
+      case _ => s"$valueAccessor = $initCode;"
     }
-    ExprCode(code, "false", value)
+    ExprCode(code, "false", valueAccessor)
   }
 
   def declareMutableStates(): String = {
     // It's possible that we add same mutable state twice, e.g. the `mergeExpressions` in
     // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones.
-    mutableStates.distinct.map { case (javaType, variableName, _) =>
+    val inlinedStates = mutableStates.distinct.map { case (javaType, variableName, _) =>
       s"private $javaType $variableName;"
-    }.mkString("\n")
+    }
+
+    val arrayStates = mutableStateArrayNames.map { case ((javaType, initCode), arrayName) =>
+      val length = mutableStateArrayIdx((javaType, initCode)) + 1
+      // Java array creation needs the sized dimension first (`new int[3][][]`, never
+      // `new int[][3][]`), so strip every trailing "[]" from the element type and re-append
+      // those empty dimensions after the length.
+      val baseType = javaType.replaceAll("(\\[\\])+$", "")
+      val elementDims = javaType.substring(baseType.length)
+      s"private $javaType[] $arrayName = new $baseType[$length]$elementDims;"
+    }
+
+    (inlinedStates ++ arrayStates).mkString("\n")
   }
 
   def initMutableStates(): String = {
     // It's possible that we add same mutable state twice, e.g. the `mergeExpressions` in
     // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones.
     val initCodes = mutableStates.distinct.map(_._3 + "\n")
+    // array state is initialized in loops
+    val arrayInitCodes = mutableStateArrayNames.map { case ((javaType, initCode), arrayName) =>
+      val qualifiedInitCode = mutableStateArrayInitCodes((javaType, initCode))
+      if (qualifiedInitCode.equals("")) {
+        ""
+      } else {
+        s"""
+           for (int i = 0; i < $arrayName.length; i++) {
+             $qualifiedInitCode
+           }
+         """
+      }
+    }
+
     // The generated initialization code may exceed 64kb function size limit in JVM if there are too
     // many mutable states, so split it into multiple functions.
-    splitExpressions(initCodes, "init", Nil)
+    splitExpressions(initCodes ++ arrayInitCodes, "init", Nil)
   }
 
   /**
@@ -761,15 +868,15 @@ class CodegenContext {
    * @param arguments the list of (type, name) of the arguments of the split function.
    * @param returnType the return type of the split function.
    * @param makeSplitFunction makes split function body, e.g. add preparation or cleanup.
-   * @param foldFunctions folds the split function calls.
+   * @param transformFunctions processes the function calls with an additional transformation.
    */
   def splitExpressions(
       expressions: Seq[String],
       funcName: String,
       arguments: Seq[(String, String)],
       returnType: String = "void",
       makeSplitFunction: String => String = identity,
-      foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = {
+      transformFunctions: Seq[String] => Seq[String] = _.map(s => s + ";\n")): String = {
     val blocks = new ArrayBuffer[String]()
     val blockBuilder = new StringBuilder()
     for (code <- expressions) {
@@ -801,7 +908,10 @@ class CodegenContext {
         addNewFunction(name, code)
       }
 
-      foldFunctions(functions.map(name => s"$name(${arguments.map(_._2).mkString(", ")})"))
+      val exprs = transformFunctions(functions.map(name =>
+        s"$name(${arguments.map(_._2).mkString(", ")})"))
+
+      splitExpressions(exprs, funcName, arguments)
     }
   }
 
@@ -895,12 +1005,12 @@ class CodegenContext {
       //   2. Less code.
       // Currently, we will do this for all non-leaf only expression trees (i.e. expr trees with
       // at least two nodes) as the cost of doing it is expected to be low.
-      addMutableState("boolean", isNull, s"$isNull = false;")
-      addMutableState(javaType(expr.dataType), value,
+      val isNullAccessor = addMutableState("boolean", isNull, s"$isNull = false;")
+      val valueAccessor = addMutableState(javaType(expr.dataType), value,
         s"$value = ${defaultValue(expr.dataType)};")
 
       subexprFunctions += s"${addNewFunction(fnName, fn)}($INPUT_ROW);"
-      val state = SubExprEliminationState(isNull, value)
+      val state = SubExprEliminationState(isNullAccessor, valueAccessor)
       e.foreach(subExprEliminationExprs.put(_, state))
     }
   }