Commit 430fe20

Group By

1 parent 997cfde

File tree

2 files changed (+77, -4 lines changed)
src/main/scala/com/sparkbyexamples/spark/dataframe/GroupbyExample.scala

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
package com.sparkbyexamples.spark.dataframe

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object GroupbyExample extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  spark.sparkContext.setLogLevel("ERROR")

  import spark.implicits._

  val simpleData = Seq(("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  )
  val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")
  df.show()
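  // df.show() above prints all nine seeded rows with the columns
  // employee_name, department, state, salary, age and bonus.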

  //Group By on single column
  df.groupBy("department").count().show(false)
  df.groupBy("department").avg("salary").show(false)
  df.groupBy("department").sum("salary").show(false)
  df.groupBy("department").min("salary").show(false)
  df.groupBy("department").max("salary").show(false)
  df.groupBy("department").mean("salary").show(false)
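  // Hand-checked expectation from the seed data: count() per department is
  // Sales -> 3, Finance -> 4, Marketing -> 2 (row order of show(false) may vary).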

  //GroupBy on multiple columns
  df.groupBy("department","state")
    .sum("salary","bonus")
    .show(false)
  df.groupBy("department","state")
    .avg("salary","bonus")
    .show(false)
  df.groupBy("department","state")
    .max("salary","bonus")
    .show(false)
  df.groupBy("department","state")
    .min("salary","bonus")
    .show(false)
  df.groupBy("department","state")
    .mean("salary","bonus")
    .show(false)
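  // Note: these shorthand aggregations name their result columns in the
  // "func(column)" pattern, e.g. sum("salary","bonus") produces columns
  // named "sum(salary)" and "sum(bonus)".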

  //Running Filter
  df.groupBy("department","state")
    .sum("salary","bonus")
    .where(col("sum(bonus)") >= 50000) // filter on the aggregated column; 50000 is an illustrative threshold
    .show(false)

  //using agg function
  df.groupBy("department")
    .agg(
      sum("salary").as("sum_salary"),
      avg("salary").as("avg_salary"),
      sum("bonus").as("sum_bonus"),
      max("bonus").as("max_bonus"))
    .show(false)

  df.groupBy("department")
    .agg(
      sum("salary").as("sum_salary"),
      avg("salary").as("avg_salary"),
      sum("bonus").as("sum_bonus"),
      stddev("bonus").as("stddev_bonus"))
    .where(col("sum_bonus") > 50000)
    .show(false)
}
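
For reference, a hand-computed sketch (an editorial illustration, not part of the commit) of what the final filtered aggregation should print given the seed data: Marketing's total bonus is 18000 + 21000 = 39000, which fails sum_bonus > 50000, so only Sales and Finance remain. Row order and decimal formatting of the actual show(false) output may differ, and the stddev values below are rounded:

+----------+----------+----------+---------+------------+
|department|sum_salary|avg_salary|sum_bonus|stddev_bonus|
+----------+----------+----------+---------+------------+
|Sales     |257000    |~85666.67 |53000    |~6806.86    |
|Finance   |351000    |87750.0   |81000    |~4112.99    |
+----------+----------+----------+---------+------------+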

src/main/scala/com/sparkbyexamples/spark/dataframe/HandleNullExample.scala

Lines changed: 0 additions & 4 deletions
@@ -28,9 +28,5 @@ object HandleNullExample extends App{
     .na.fill("",Array("type"))
     .show(false)
 
-
-
   // Array and map columns
-
-
 }
