@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, UnresolvedAli
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.plans.logical.{Pivot, Rollup, Cube, Aggregate}
-import org.apache.spark.sql.types.{StringType, NumericType}
+import org.apache.spark.sql.types.NumericType


/**
@@ -282,74 +282,96 @@ class GroupedData protected[sql](
  }

  /**
-   * (Scala-specific) Pivots a column of the current [[DataFrame]] and preform the specified
-   * aggregation.
-   * {{{
-   *   // Compute the sum of earnings for each year by course with each course as a separate column
-   *   df.groupBy($"year").pivot($"course", "dotNET", "Java").agg(sum($"earnings"))
-   *   // Or without specifying column values
-   *   df.groupBy($"year").pivot($"course").agg(sum($"earnings"))
-   * }}}
-   * @param pivotColumn Column to pivot
-   * @param values Optional list of values of pivotColumn that will be translated to columns in the
-   *               output data frame. If values are not provided the method with do an immediate
-   *               call to .distinct() on the pivot column.
-   * @since 1.6.0
-   */
-  @scala.annotation.varargs
-  def pivot(pivotColumn: Column, values: Column*): GroupedData = groupType match {
-    case _: GroupedData.PivotType =>
-      throw new UnsupportedOperationException("repeated pivots are not supported")
-    case GroupedData.GroupByType =>
-      val pivotValues = if (values.nonEmpty) {
-        values.map {
-          case Column(literal: Literal) => literal
-          case other =>
-            throw new UnsupportedOperationException(
-              s"The values of a pivot must be literals, found $other")
-        }
-      } else {
-        // This is to prevent unintended OOM errors when the number of distinct values is large
-        val maxValues = df.sqlContext.conf.getConf(SQLConf.DATAFRAME_PIVOT_MAX_VALUES)
-        // Get the distinct values of the column and sort them so its consistent
-        val values = df.select(pivotColumn)
-          .distinct()
-          .sort(pivotColumn)
-          .map(_.get(0))
-          .take(maxValues + 1)
-          .map(Literal(_)).toSeq
-        if (values.length > maxValues) {
-          throw new RuntimeException(
-            s"The pivot column $pivotColumn has more than $maxValues distinct values, " +
-              "this could indicate an error. " +
-              "If this was intended, set \"" + SQLConf.DATAFRAME_PIVOT_MAX_VALUES.key + "\" " +
-              s"to at least the number of distinct values of the pivot column.")
-        }
-        values
-      }
-      new GroupedData(df, groupingExprs, GroupedData.PivotType(pivotColumn.expr, pivotValues))
-    case _ =>
-      throw new UnsupportedOperationException("pivot is only supported after a groupBy")
+   * Pivots a column of the current [[DataFrame]] and performs the specified aggregation.
+   * There are two versions of the pivot function: one that requires the caller to specify the list
+   * of distinct values to pivot on, and one that does not. The latter is more concise but less
+   * efficient, because Spark needs to first compute the list of distinct values internally.
+   *
+   * {{{
+   *   // Compute the sum of earnings for each year by course with each course as a separate column
+   *   df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
+   *
+   *   // Or without specifying column values (less efficient)
+   *   df.groupBy("year").pivot("course").sum("earnings")
+   * }}}
+   *
+   * @param pivotColumn Name of the column to pivot.
+   * @since 1.6.0
+   */
+  def pivot(pivotColumn: String): GroupedData = {
+    // This is to prevent unintended OOM errors when the number of distinct values is large
+    val maxValues = df.sqlContext.conf.getConf(SQLConf.DATAFRAME_PIVOT_MAX_VALUES)
+    // Get the distinct values of the column and sort them so it's consistent
+    val values = df.select(pivotColumn)
+      .distinct()
+      .sort(pivotColumn)
+      .map(_.get(0))
+      .take(maxValues + 1)
+      .toSeq
+
+    if (values.length > maxValues) {
+      throw new AnalysisException(
+        s"The pivot column $pivotColumn has more than $maxValues distinct values, " +
+          "this could indicate an error. " +
+          s"If this was intended, set ${SQLConf.DATAFRAME_PIVOT_MAX_VALUES.key} " +
+          "to at least the number of distinct values of the pivot column.")
+    }
+
+    pivot(pivotColumn, values)
  }

  /**
-   * Pivots a column of the current [[DataFrame]] and preform the specified aggregation.
-   * {{{
-   *   // Compute the sum of earnings for each year by course with each course as a separate column
-   *   df.groupBy("year").pivot("course", "dotNET", "Java").sum("earnings")
-   *   // Or without specifying column values
-   *   df.groupBy("year").pivot("course").sum("earnings")
-   * }}}
-   * @param pivotColumn Column to pivot
-   * @param values Optional list of values of pivotColumn that will be translated to columns in the
-   *               output data frame. If values are not provided the method with do an immediate
-   *               call to .distinct() on the pivot column.
-   * @since 1.6.0
-   */
-  @scala.annotation.varargs
-  def pivot(pivotColumn: String, values: Any*): GroupedData = {
-    val resolvedPivotColumn = Column(df.resolve(pivotColumn))
-    pivot(resolvedPivotColumn, values.map(functions.lit): _*)
+   * Pivots a column of the current [[DataFrame]] and performs the specified aggregation.
+   * There are two versions of the pivot function: one that requires the caller to specify the list
+   * of distinct values to pivot on, and one that does not. The latter is more concise but less
+   * efficient, because Spark needs to first compute the list of distinct values internally.
+   *
+   * {{{
+   *   // Compute the sum of earnings for each year by course with each course as a separate column
+   *   df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings")
+   *
+   *   // Or without specifying column values (less efficient)
+   *   df.groupBy("year").pivot("course").sum("earnings")
+   * }}}
+   *
+   * @param pivotColumn Name of the column to pivot.
+   * @param values List of values that will be translated to columns in the output DataFrame.
+   * @since 1.6.0
+   */
+  def pivot(pivotColumn: String, values: Seq[Any]): GroupedData = {
+    groupType match {
+      case GroupedData.GroupByType =>
+        new GroupedData(
+          df,
+          groupingExprs,
+          GroupedData.PivotType(df.resolve(pivotColumn), values.map(Literal.apply)))
+      case _: GroupedData.PivotType =>
+        throw new UnsupportedOperationException("repeated pivots are not supported")
+      case _ =>
+        throw new UnsupportedOperationException("pivot is only supported after a groupBy")
+    }
+  }
+
+  /**
+   * Pivots a column of the current [[DataFrame]] and performs the specified aggregation.
+   * There are two versions of the pivot function: one that requires the caller to specify the list
+   * of distinct values to pivot on, and one that does not. The latter is more concise but less
+   * efficient, because Spark needs to first compute the list of distinct values internally.
+   *
+   * {{{
+   *   // Compute the sum of earnings for each year by course with each course as a separate column
+   *   df.groupBy("year").pivot("course", Arrays.<Object>asList("dotNET", "Java")).sum("earnings");
+   *
+   *   // Or without specifying column values (less efficient)
+   *   df.groupBy("year").pivot("course").sum("earnings");
+   * }}}
+   *
+   * @param pivotColumn Name of the column to pivot.
+   * @param values List of values that will be translated to columns in the output DataFrame.
+   * @since 1.6.0
+   */
+  def pivot(pivotColumn: String, values: java.util.List[Any]): GroupedData = {
+    pivot(pivotColumn, values.asScala)
  }

}
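
For reference, a minimal usage sketch of the pivot API added above. It assumes a Spark 1.6-style SQLContext named `sqlContext` in scope with its implicits imported; the column names follow the examples in the Scaladoc, and the dataset itself is purely illustrative.

// Sketch only: toy data, assuming a SQLContext `sqlContext` is available.
import sqlContext.implicits._

val df = Seq(
  (2012, "dotNET", 10000), (2012, "Java", 20000),
  (2013, "dotNET", 5000), (2013, "Java", 30000)
).toDF("year", "course", "earnings")

// With explicit values, no extra job is run to discover them.
df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings").show()

// Without values, Spark first collects the distinct, sorted values of the
// pivot column, throwing an AnalysisException if there are more of them
// than the DATAFRAME_PIVOT_MAX_VALUES conf allows. Calling pivot twice, or
// before a groupBy, throws an UnsupportedOperationException.
df.groupBy("year").pivot("course").sum("earnings").show()

Design-wise, taking values as a Seq / java.util.List rather than the old Column* and Any* varargs keeps the Scala and Java signatures symmetric and removes the need to wrap each value in a literal Column.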