Skip to content

Commit ca3efd9

Browse files
committed
Add docs for shuffle manager properties, and allow short names for them
1 parent d0ae3c5 commit ca3efd9

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

core/src/main/scala/org/apache/spark/SparkEnv.scala

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ object SparkEnv extends Logging {
158158

159159
// Create an instance of the class named by the given SparkConf property, or by
160160
// defaultClassName if the property is not set, and return it as a T
161-
def instantiateClass[T](propertyName: String, defaultClassName: String): T = {
162-
val name = conf.get(propertyName, defaultClassName)
161+
def instantiateClass[T](propertyName: String, defaultClassName: String = null): T = {
162+
val name = conf.get(propertyName, defaultClassName)
163163
val cls = Class.forName(name, true, Utils.getContextOrSparkClassLoader)
164164
// Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
165165
// SparkConf, then one taking no arguments
@@ -246,8 +246,13 @@ object SparkEnv extends Logging {
246246
"."
247247
}
248248

249-
val shuffleManager = instantiateClass[ShuffleManager](
250-
"spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")
249+
// Let the user specify short names for shuffle managers
250+
val shortShuffleMgrNames = Map(
251+
"HASH" -> "org.apache.spark.shuffle.hash.HashShuffleManager",
252+
"SORT" -> "org.apache.spark.shuffle.sort.SortShuffleManager")
253+
val shuffleMgrName = conf.get("spark.shuffle.manager", "HASH")
254+
val shuffleMgrClass = shortShuffleMgrNames.getOrElse(shuffleMgrName, shuffleMgrName)
255+
val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass)
251256

252257
val shuffleMemoryManager = new ShuffleMemoryManager(conf)
253258

docs/configuration.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,24 @@ Apart from these, the following properties are also available, and may be useful
281281
overhead per reduce task, so keep it small unless you have a large amount of memory.
282282
</td>
283283
</tr>
284+
<tr>
285+
<td><code>spark.shuffle.manager</code></td>
286+
<td>HASH</td>
287+
<td>
288+
Implementation to use for shuffling data. A hash-based shuffle manager is the default, but
289+
starting in Spark 1.1 there is an experimental sort-based shuffle manager that is more
290+
memory-efficient in environments with small executors, such as YARN. To use that, change
291+
this value to <code>SORT</code>.
292+
</td>
293+
</tr>
294+
<tr>
295+
<td><code>spark.shuffle.sort.bypassMergeThreshold</code></td>
296+
<td>200</td>
297+
<td>
298+
(Advanced) In the sort-based shuffle manager, avoid merge-sorting data if there is no
299+
map-side aggregation and there are at most this many reduce partitions.
300+
</td>
301+
</tr>
284302
</table>
285303

286304
#### Spark UI

0 commit comments

Comments
 (0)