apache
diff --git a/‎.rat-excludes‎
Lines changed: 1 addition & 0 deletions b/‎.rat-excludes‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/api/r/RUtils.scala‎
Lines changed: 6 additions & 2 deletions b/‎core/src/main/scala/org/apache/spark/api/r/RUtils.scala‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala‎
Lines changed: 1 addition & 1 deletion b/‎core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala‎
Lines changed: 12 additions & 6 deletions b/‎core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala‎
Lines changed: 12 additions & 6 deletions
diff --git a/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala‎
Lines changed: 3 additions & 1 deletion b/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala‎
Lines changed: 112 additions & 16 deletions b/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala‎
Lines changed: 112 additions & 16 deletions
diff --git a/‎sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala‎
Lines changed: 3 additions & 1 deletion b/‎sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister‎
Lines changed: 3 additions & 0 deletions b/‎sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala‎
Lines changed: 1 addition & 1 deletion b/‎sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala‎
Lines changed: 1 addition & 1 deletion
@@ -93,3 +93,4 @@ INDEX
 .lintr
 gen-java.*
 .*avpr
+org.apache.spark.sql.sources.DataSourceRegister
@@ -67,7 +67,11 @@ private[spark] object RUtils {
 
   /** Check if R is installed before running tests that use R commands. */
   def isRInstalled: Boolean = {
-    val builder = new ProcessBuilder(Seq("R", "--version"))
-    builder.start().waitFor() == 0
+    try {
+      val builder = new ProcessBuilder(Seq("R", "--version"))
+      builder.start().waitFor() == 0
+    } catch {
+      case e: Exception => false
+    }
   }
 }
@@ -284,7 +284,7 @@ object SparkSubmit {
         Nil
       }
     val resolvedMavenCoordinates = SparkSubmitUtils.resolveMavenCoordinates(args.packages,
-      Some(args.repositories), Some(args.ivyRepoPath), exclusions = exclusions)
+      Option(args.repositories), Option(args.ivyRepoPath), exclusions = exclusions)
     if (!StringUtils.isBlank(resolvedMavenCoordinates)) {
       args.jars = mergeFileLists(args.jars, resolvedMavenCoordinates)
       if (args.isPython) {
 
@@ -176,10 +176,15 @@ class BernoulliSampler[T: ClassTag](fraction: Double) extends RandomSampler[T, T
  * A sampler for sampling with replacement, based on values drawn from Poisson distribution.
  *
  * @param fraction the sampling fraction (with replacement)
+ * @param useGapSamplingIfPossible if true, use gap sampling when sampling ratio is low.
  * @tparam T item type
  */
 @DeveloperApi
-class PoissonSampler[T: ClassTag](fraction: Double) extends RandomSampler[T, T] {
+class PoissonSampler[T: ClassTag](
+    fraction: Double,
+    useGapSamplingIfPossible: Boolean) extends RandomSampler[T, T] {
+
+  def this(fraction: Double) = this(fraction, useGapSamplingIfPossible = true)
 
   /** Epsilon slop to avoid failure from floating point jitter. */
   require(
@@ -199,17 +204,18 @@ class PoissonSampler[T: ClassTag](fraction: Double) extends RandomSampler[T, T]
   override def sample(items: Iterator[T]): Iterator[T] = {
     if (fraction <= 0.0) {
       Iterator.empty
-    } else if (fraction <= RandomSampler.defaultMaxGapSamplingFraction) {
-        new GapSamplingReplacementIterator(items, fraction, rngGap, RandomSampler.rngEpsilon)
+    } else if (useGapSamplingIfPossible &&
+               fraction <= RandomSampler.defaultMaxGapSamplingFraction) {
+      new GapSamplingReplacementIterator(items, fraction, rngGap, RandomSampler.rngEpsilon)
     } else {
-      items.flatMap { item => {
+      items.flatMap { item =>
         val count = rng.sample()
         if (count == 0) Iterator.empty else Iterator.fill(count)(item)
-      }}
+      }
     }
   }
 
-  override def clone: PoissonSampler[T] = new PoissonSampler[T](fraction)
+  override def clone: PoissonSampler[T] = new PoissonSampler[T](fraction, useGapSamplingIfPossible)
 }
 
 
 
@@ -408,7 +408,7 @@ class Analyzer(
     /**
      * Returns true if `exprs` contains a [[Star]].
      */
-    protected def containsStar(exprs: Seq[Expression]): Boolean =
+    def containsStar(exprs: Seq[Expression]): Boolean =
       exprs.exists(_.collect { case _: Star => true }.nonEmpty)
   }
 
@@ -602,6 +602,8 @@ class Analyzer(
    */
   object ResolveGenerate extends Rule[LogicalPlan] {
     def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
+      case g: Generate if ResolveReferences.containsStar(g.generator.children) =>
+        failAnalysis("Cannot explode *, explode can only be applied on a specific column.")
       case p: Generate if !p.child.resolved || !p.generator.resolved => p
       case g: Generate if !g.resolved =>
         g.copy(generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name)))
 
@@ -75,6 +75,37 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution {
   def clustering: Set[Expression] = ordering.map(_.child).toSet
 }
 
+/**
+ * Describes how an operator's output is split across partitions. The `compatibleWith`,
+ * `guarantees`, and `satisfies` methods describe relationships between child partitionings,
+ * target partitionings, and [[Distribution]]s. These relations are described more precisely in
+ * their individual method docs, but at a high level:
+ *
+ *  - `satisfies` is a relationship between partitionings and distributions.
+ *  - `compatibleWith` is relationships between an operator's child output partitionings.
+ *  - `guarantees` is a relationship between a child's existing output partitioning and a target
+ *     output partitioning.
+ *
+ *  Diagrammatically:
+ *
+ *            +--------------+
+ *            | Distribution |
+ *            +--------------+
+ *                    ^
+ *                    |
+ *               satisfies
+ *                    |
+ *            +--------------+                  +--------------+
+ *            |    Child     |                  |    Target    |
+ *       +----| Partitioning |----guarantees--->| Partitioning |
+ *       |    +--------------+                  +--------------+
+ *       |            ^
+ *       |            |
+ *       |     compatibleWith
+ *       |            |
+ *       +------------+
+ *
+ */
 sealed trait Partitioning {
   /** Returns the number of partitions that the data is split across */
   val numPartitions: Int
@@ -90,9 +121,66 @@ sealed trait Partitioning {
   /**
    * Returns true iff we can say that the partitioning scheme of this [[Partitioning]]
    * guarantees the same partitioning scheme described by `other`.
+   *
+   * Compatibility of partitionings is only checked for operators that have multiple children
+   * and that require a specific child output [[Distribution]], such as joins.
+   *
+   * Intuitively, partitionings are compatible if they route the same partitioning key to the same
+   * partition. For instance, two hash partitionings are only compatible if they produce the same
+   * number of output partitionings and hash records according to the same hash function and
+   * same partitioning key schema.
+   *
+   * Put another way, two partitionings are compatible with each other if they satisfy all of the
+   * same distribution guarantees.
    */
-  // TODO: Add an example once we have the `nullSafe` concept.
-  def guarantees(other: Partitioning): Boolean
+  def compatibleWith(other: Partitioning): Boolean
+
+  /**
+   * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] guarantees
+   * the same partitioning scheme described by `other`. If a `A.guarantees(B)`, then repartitioning
+   * the child's output according to `B` will be unnecessary. `guarantees` is used as a performance
+   * optimization to allow the exchange planner to avoid redundant repartitionings. By default,
+   * a partitioning only guarantees partitionings that are equal to itself (i.e. the same number
+   * of partitions, same strategy (range or hash), etc).
+   *
+   * In order to enable more aggressive optimization, this strict equality check can be relaxed.
+   * For example, say that the planner needs to repartition all of an operator's children so that
+   * they satisfy the [[AllTuples]] distribution. One way to do this is to repartition all children
+   * to have the [[SinglePartition]] partitioning. If one of the operator's children already happens
+   * to be hash-partitioned with a single partition then we do not need to re-shuffle this child;
+   * this repartitioning can be avoided if a single-partition [[HashPartitioning]] `guarantees`
+   * [[SinglePartition]].
+   *
+   * The SinglePartition example given above is not particularly interesting; guarantees' real
+   * value occurs for more advanced partitioning strategies. SPARK-7871 will introduce a notion
+   * of null-safe partitionings, under which partitionings can specify whether rows whose
+   * partitioning keys contain null values will be grouped into the same partition or whether they
+   * will have an unknown / random distribution. If a partitioning does not require nulls to be
+   * clustered then a partitioning which _does_ cluster nulls will guarantee the null clustered
+   * partitioning. The converse is not true, however: a partitioning which clusters nulls cannot
+   * be guaranteed by one which does not cluster them. Thus, in general `guarantees` is not a
+   * symmetric relation.
+   *
+   * Another way to think about `guarantees`: if `A.guarantees(B)`, then any partitioning of rows
+   * produced by `A` could have also been produced by `B`.
+   */
+  def guarantees(other: Partitioning): Boolean = this == other
+}
+
+object Partitioning {
+  def allCompatible(partitionings: Seq[Partitioning]): Boolean = {
+    // Note: this assumes transitivity
+    partitionings.sliding(2).map {
+      case Seq(a) => true
+      case Seq(a, b) =>
+        if (a.numPartitions != b.numPartitions) {
+          assert(!a.compatibleWith(b) && !b.compatibleWith(a))
+          false
+        } else {
+          a.compatibleWith(b) && b.compatibleWith(a)
+        }
+    }.forall(_ == true)
+  }
 }
 
 case class UnknownPartitioning(numPartitions: Int) extends Partitioning {
@@ -101,6 +189,8 @@ case class UnknownPartitioning(numPartitions: Int) extends Partitioning {
     case _ => false
   }
 
+  override def compatibleWith(other: Partitioning): Boolean = false
+
   override def guarantees(other: Partitioning): Boolean = false
 }
 
@@ -109,21 +199,9 @@ case object SinglePartition extends Partitioning {
 
   override def satisfies(required: Distribution): Boolean = true
 
-  override def guarantees(other: Partitioning): Boolean = other match {
-    case SinglePartition => true
-    case _ => false
-  }
-}
-
-case object BroadcastPartitioning extends Partitioning {
-  val numPartitions = 1
+  override def compatibleWith(other: Partitioning): Boolean = other.numPartitions == 1
 
-  override def satisfies(required: Distribution): Boolean = true
-
-  override def guarantees(other: Partitioning): Boolean = other match {
-    case BroadcastPartitioning => true
-    case _ => false
-  }
+  override def guarantees(other: Partitioning): Boolean = other.numPartitions == 1
 }
 
 /**
@@ -147,6 +225,12 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
     case _ => false
   }
 
+  override def compatibleWith(other: Partitioning): Boolean = other match {
+    case o: HashPartitioning =>
+      this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions
+    case _ => false
+  }
+
   override def guarantees(other: Partitioning): Boolean = other match {
     case o: HashPartitioning =>
       this.clusteringSet == o.clusteringSet && this.numPartitions == o.numPartitions
@@ -185,6 +269,11 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
     case _ => false
   }
 
+  override def compatibleWith(other: Partitioning): Boolean = other match {
+    case o: RangePartitioning => this == o
+    case _ => false
+  }
+
   override def guarantees(other: Partitioning): Boolean = other match {
     case o: RangePartitioning => this == o
     case _ => false
@@ -228,6 +317,13 @@ case class PartitioningCollection(partitionings: Seq[Partitioning])
   override def satisfies(required: Distribution): Boolean =
     partitionings.exists(_.satisfies(required))
 
+  /**
+   * Returns true if any `partitioning` of this collection is compatible with
+   * the given [[Partitioning]].
+   */
+  override def compatibleWith(other: Partitioning): Boolean =
+    partitionings.exists(_.compatibleWith(other))
+
   /**
    * Returns true if any `partitioning` of this collection guarantees
    * the given [[Partitioning]].
 
@@ -71,6 +71,8 @@ trait AnalysisTest extends PlanTest {
     val e = intercept[Exception] {
       analyzer.checkAnalysis(analyzer.execute(inputPlan))
     }
-    expectedErrors.forall(e.getMessage.contains)
+    assert(expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains),
+      s"Expected to throw Exception contains: ${expectedErrors.mkString(", ")}, " +
+        s"actually we get ${e.getMessage}")
   }
 }
@@ -0,0 +1,3 @@
+org.apache.spark.sql.jdbc.DefaultSource
+org.apache.spark.sql.json.DefaultSource
+org.apache.spark.sql.parquet.DefaultSource
@@ -168,7 +168,7 @@ class DataFrame private[sql](
   }
 
   /**
-   * Internal API for Python
+   * Compose the string representing rows for output
    * @param _numRows Number of rows to show
    * @param truncate Whether truncate long strings and align cells right
    */
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,11 @@ private[spark] object RUtils {`
`67`	`67`
`68`	`68`	`/** Check if R is installed before running tests that use R commands. */`
`69`	`69`	`def isRInstalled: Boolean = {`
`70`		`- val builder = new ProcessBuilder(Seq("R", "--version"))`
`71`		`- builder.start().waitFor() == 0`
	`70`	`+ try {`
	`71`	`+ val builder = new ProcessBuilder(Seq("R", "--version"))`
	`72`	`+ builder.start().waitFor() == 0`
	`73`	`+ } catch {`
	`74`	`+ case e: Exception => false`
	`75`	`+ }`
`72`	`76`	`}`
`73`	`77`	`}`
Original file line number	Diff line number	Diff line change
`@@ -284,7 +284,7 @@ object SparkSubmit {`
`284`	`284`	`Nil`
`285`	`285`	`}`
`286`	`286`	`val resolvedMavenCoordinates = SparkSubmitUtils.resolveMavenCoordinates(args.packages,`
`287`		`- Some(args.repositories), Some(args.ivyRepoPath), exclusions = exclusions)`
	`287`	`+ Option(args.repositories), Option(args.ivyRepoPath), exclusions = exclusions)`
`288`	`288`	`if (!StringUtils.isBlank(resolvedMavenCoordinates)) {`
`289`	`289`	`args.jars = mergeFileLists(args.jars, resolvedMavenCoordinates)`
`290`	`290`	`if (args.isPython) {`
Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,8 @@ trait AnalysisTest extends PlanTest {`
`71`	`71`	`val e = intercept[Exception] {`
`72`	`72`	`analyzer.checkAnalysis(analyzer.execute(inputPlan))`
`73`	`73`	`}`
`74`		`- expectedErrors.forall(e.getMessage.contains)`
	`74`	`+ assert(expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains),`
	`75`	`+ s"Expected to throw Exception contains: ${expectedErrors.mkString(", ")}, " +`
	`76`	`+ s"actually we get ${e.getMessage}")`
`75`	`77`	`}`
`76`	`78`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+org.apache.spark.sql.jdbc.DefaultSource`
	`2`	`+org.apache.spark.sql.json.DefaultSource`
	`3`	`+org.apache.spark.sql.parquet.DefaultSource`
Original file line number	Diff line number	Diff line change
`@@ -168,7 +168,7 @@ class DataFrame private[sql](`
`168`	`168`	`}`
`169`	`169`
`170`	`170`	`/**`
`171`		`- * Internal API for Python`
	`171`	`+ * Compose the string representing rows for output`
`172`	`172`	`* @param _numRows Number of rows to show`
`173`	`173`	`* @param truncate Whether truncate long strings and align cells right`
`174`	`174`	`*/`