Skip to content

Commit 5f694f9

Browse files
Spark Examples
1 parent 8110950 commit 5f694f9

21 files changed

+616
-12
lines changed

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<packaging>jar</packaging>
1212
<properties>
1313
<scala.version>2.11.11</scala.version>
14-
<spark.version>2.4.0</spark.version>
14+
<spark.version>2.4.4</spark.version>
1515
</properties>
1616

1717
<repositories>
@@ -71,7 +71,7 @@
7171
<dependency>
7272
<groupId>org.apache.spark</groupId>
7373
<artifactId>spark-avro_2.11</artifactId>
74-
<version>2.4.0</version>
74+
<version>2.4.4</version>
7575
</dependency>
7676

7777
</dependencies>

src/main/resources/test.txt

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,115 @@ at no cost and with
1212
This eBook is for the use
1313
of anyone anywhere
1414
at no cost and with
15+
Project Gutenberg’s
16+
Alice’s Adventures in Wonderland
17+
by Lewis Carroll
18+
This eBook is for the use
19+
of anyone anywhere
20+
at no cost and with
21+
Alice’s Adventures in Wonderland
22+
by Lewis Carroll
23+
This eBook is for the use
24+
of anyone anywhere
25+
at no cost and with
26+
This eBook is for the use
27+
of anyone anywhere
28+
at no cost and with
29+
Project Gutenberg’s
30+
Alice’s Adventures in Wonderland
31+
by Lewis Carroll
32+
This eBook is for the use
33+
of anyone anywhere
34+
at no cost and with
35+
Alice’s Adventures in Wonderland
36+
by Lewis Carroll
37+
This eBook is for the use
38+
of anyone anywhere
39+
at no cost and with
40+
This eBook is for the use
41+
of anyone anywhere
42+
at no cost and with
43+
Project Gutenberg’s
44+
Alice’s Adventures in Wonderland
45+
by Lewis Carroll
46+
This eBook is for the use
47+
of anyone anywhere
48+
at no cost and with
49+
Alice’s Adventures in Wonderland
50+
by Lewis Carroll
51+
This eBook is for the use
52+
of anyone anywhere
53+
at no cost and with
54+
This eBook is for the use
55+
of anyone anywhere
56+
at no cost and with
57+
Project Gutenberg’s
58+
Alice’s Adventures in Wonderland
59+
by Lewis Carroll
60+
This eBook is for the use
61+
of anyone anywhere
62+
at no cost and with
63+
Alice’s Adventures in Wonderland
64+
by Lewis Carroll
65+
This eBook is for the use
66+
of anyone anywhere
67+
at no cost and with
68+
This eBook is for the use
69+
of anyone anywhere
70+
at no cost and with
71+
Project Gutenberg’s
72+
Alice’s Adventures in Wonderland
73+
by Lewis Carroll
74+
This eBook is for the use
75+
of anyone anywhere
76+
at no cost and with
77+
Alice’s Adventures in Wonderland
78+
by Lewis Carroll
79+
This eBook is for the use
80+
of anyone anywhere
81+
at no cost and with
82+
This eBook is for the use
83+
of anyone anywhere
84+
at no cost and with
85+
Project Gutenberg’s
86+
Alice’s Adventures in Wonderland
87+
by Lewis Carroll
88+
This eBook is for the use
89+
of anyone anywhere
90+
at no cost and with
91+
Alice’s Adventures in Wonderland
92+
by Lewis Carroll
93+
This eBook is for the use
94+
of anyone anywhere
95+
at no cost and with
96+
This eBook is for the use
97+
of anyone anywhere
98+
at no cost and with
99+
Project Gutenberg’s
100+
Alice’s Adventures in Wonderland
101+
by Lewis Carroll
102+
This eBook is for the use
103+
of anyone anywhere
104+
at no cost and with
105+
Alice’s Adventures in Wonderland
106+
by Lewis Carroll
107+
This eBook is for the use
108+
of anyone anywhere
109+
at no cost and with
110+
This eBook is for the use
111+
of anyone anywhere
112+
at no cost and with
113+
Project Gutenberg’s
114+
Alice’s Adventures in Wonderland
115+
by Lewis Carroll
116+
This eBook is for the use
117+
of anyone anywhere
118+
at no cost and with
119+
Alice’s Adventures in Wonderland
120+
by Lewis Carroll
121+
This eBook is for the use
122+
of anyone anywhere
123+
at no cost and with
124+
This eBook is for the use
125+
of anyone anywhere
126+
at no cost and with
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
package com.sparkbyexamples.spark

import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * Demonstrates obtaining the legacy [[SQLContext]] from a [[SparkSession]],
 * reading a CSV through it, and querying a registered temp view with SQL.
 */
object SQLContextExample extends App {

  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  // Keep console output readable for the example.
  spark.sparkContext.setLogLevel("ERROR")

  // The session exposes its SQLContext directly; no separate construction needed.
  val sqlContext: SQLContext = spark.sqlContext

  // Read the CSV, inferring column types and treating the first row as a header.
  val df = sqlContext.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("src/main/resources/zipcodes.csv")

  df.show()
  df.printSchema()

  // Register the DataFrame as a temporary view and query it via SQL.
  df.createOrReplaceTempView("TAB")
  sqlContext.sql("select * from TAB").show(false)
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
package com.sparkbyexamples.spark

import org.apache.spark.SparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * Demonstrates the relationship between SparkSession, SparkContext and
 * SQLContext, and shows that a second builder().getOrCreate() reuses the
 * already-running context rather than creating a new one.
 *
 * Fix: removed the unused `import ...SortExample.spark`, which was
 * immediately shadowed by the local `val spark` and only misled readers.
 */
object SparkContextExample extends App {

  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  spark.sparkContext.setLogLevel("ERROR")

  // Handles exposed by the session itself.
  val sparkContext: SparkContext = spark.sparkContext
  val sqlCon: SQLContext = spark.sqlContext

  // Pre-2.0 style construction; spark.sqlContext above is the preferred form.
  val sqlContext = new org.apache.spark.sql.SQLContext(spark.sparkContext)

  println("First SparkContext:")
  println("APP Name :" + spark.sparkContext.appName)
  println("Deploy Mode :" + spark.sparkContext.deployMode)
  println("Master :" + spark.sparkContext.master)

  // getOrCreate() returns the existing active session's context, so the
  // values printed below come from the first session, not "SparkByExample-test".
  val sparkSession2 = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExample-test")
    .getOrCreate()

  println("Second SparkContext:")
  println("APP Name :" + sparkSession2.sparkContext.appName)
  println("Deploy Mode :" + sparkSession2.sparkContext.deployMode)
  println("Master :" + sparkSession2.sparkContext.master)
}

src/main/scala/com/sparkbyexamples/spark/SparkSessionTest.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@ object SparkSessionTest {
66

77
def main(args:Array[String]): Unit ={
88

9+
910
val spark = SparkSession.builder()
1011
.master("local[1]")
1112
.appName("SparkByExample")
1213
.getOrCreate();
13-
14+
1415
println("First SparkContext:")
1516
println("APP Name :"+spark.sparkContext.appName);
1617
println("Deploy Mode :"+spark.sparkContext.deployMode);
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
package com.sparkbyexamples.spark.dataframe

import org.apache.spark.sql.SparkSession

/**
 * Demonstrates broadcast variables: two small lookup Maps (state and country
 * codes -> full names) are broadcast to executors and used inside a row-level
 * map to expand abbreviated codes.
 *
 * Fix: replaced the `map.get(key).get` anti-pattern with direct `map(key)`.
 * Behavior is identical (both throw NoSuchElementException on a missing key,
 * which is acceptable here because the sample data only uses known codes),
 * but the intent — a required lookup — is now explicit.
 */
object BroadcastExample extends App {

  val spark = SparkSession.builder()
    .appName("SparkByExamples.com")
    .master("local")
    .getOrCreate()

  val states = Map(("NY", "New York"), ("CA", "California"), ("FL", "Florida"))
  val countries = Map(("USA", "United States of America"), ("IN", "India"))

  // Broadcast the lookup tables once instead of shipping them with every task.
  val broadcastStates = spark.sparkContext.broadcast(states)
  val broadcastCountries = spark.sparkContext.broadcast(countries)

  val data = Seq(("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL")
  )

  val columns = Seq("firstname", "lastname", "country", "state")
  import spark.sqlContext.implicits._
  val df = data.toDF(columns: _*)

  // Expand the country/state codes via the broadcast maps.
  val df2 = df.map(row => {
    val country = row.getString(2)
    val state = row.getString(3)

    // Direct apply: the codes are expected to exist in the broadcast maps.
    val fullCountry = broadcastCountries.value(country)
    val fullState = broadcastStates.value(state)
    (row.getString(0), row.getString(1), fullCountry, fullState)
  }).toDF(columns: _*)

  df2.show(false)
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
package com.sparkbyexamples.spark.dataframe

import org.apache.spark.sql.SparkSession

/**
 * Demonstrates show(false): printing DataFrame rows without truncating long
 * string columns (show() would cut values at 20 characters by default).
 *
 * Fix: removed two unused imports — `SQLContextExample.spark` (immediately
 * shadowed by the local `val spark`) and `org.apache.log4j.lf5.LogLevel`
 * (never referenced).
 */
object ColumnTruncate extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  import spark.implicits._

  val columns = Seq("Seqno", "Quote")
  val data = Seq(("1", "Be the change that you wish to see in the world"),
    ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
    ("3", "The purpose of our lives is to be happy."))
  val df = data.toDF(columns: _*)

  // truncate = false: print full Quote values instead of the 20-char default.
  df.show(false)
}

src/main/scala/com/sparkbyexamples/spark/dataframe/FromJsonFile.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,6 @@ object FromJsonFile {
6868
//Write json file
6969

7070
df2.write
71-
.json("/tmp/spark_output/zipcodes.json")
71+
.json("/tmp/spark_output/zipcodes1.json")
7272
}
7373
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
package com.sparkbyexamples.spark

import org.apache.spark.sql.SparkSession

/**
 * Writes a small sample DataFrame to S3 as CSV using the s3a connector.
 *
 * SECURITY FIX: the original committed a live AWS access key and secret key
 * as string literals. Credentials must never be hardcoded in source; they are
 * now read from the standard AWS environment variables. (The leaked key pair
 * should be considered compromised and rotated in AWS IAM.)
 */
object ParquetAWSExample {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .master("local[1]")
      .appName("SparkByExamples.com")
      .getOrCreate()

    // Credentials come from the environment; fail fast with a clear message
    // instead of an opaque S3 403 later.
    val accessKey = sys.env.getOrElse("AWS_ACCESS_KEY_ID",
      throw new IllegalStateException("AWS_ACCESS_KEY_ID is not set"))
    val secretKey = sys.env.getOrElse("AWS_SECRET_ACCESS_KEY",
      throw new IllegalStateException("AWS_SECRET_ACCESS_KEY is not set"))

    spark.sparkContext
      .hadoopConfiguration.set("fs.s3a.access.key", accessKey)
    spark.sparkContext
      .hadoopConfiguration.set("fs.s3a.secret.key", secretKey)
    // Region-specific endpoints (e.g. "s3.us-east-1.amazonaws.com") also work;
    // the global endpoint resolves the bucket's region automatically.
    spark.sparkContext
      .hadoopConfiguration.set("fs.s3a.endpoint", "s3.amazonaws.com")

    val data = Seq(("James ", "Rose", "Smith", "36636", "M", 3000),
      ("Michael", "Rose", "", "40288", "M", 4000),
      ("Robert", "Mary", "Williams", "42114", "M", 4000),
      ("Maria", "Anne", "Jones", "39192", "F", 4000),
      ("Jen", "Mary", "Brown", "1234", "F", -1)
    )

    val columns = Seq("firstname", "middlename", "lastname", "dob", "gender", "salary")
    import spark.sqlContext.implicits._
    val df = data.toDF(columns: _*)

    df.show()
    df.printSchema()

    // Write to S3 via the s3a filesystem; the path is a directory of part files.
    df.write
      .csv("s3a://sparkbyexamples/people1234.csv")
  }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
 * Demonstrates DataFrame caching: the filtered DataFrame is cached after the
 * first action, so the second count() is served from the cache instead of
 * re-reading and re-filtering the CSV.
 *
 * Fix: removed the unused local `df3` — it was a lazy transformation with no
 * action applied, so dropping it is behavior-identical and removes dead code.
 */
object CacheExample extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  // Read the CSV with type inference and a header row.
  val df = spark.read.options(Map("inferSchema" -> "true", "delimiter" -> ",", "header" -> "true"))
    .csv("src/main/resources/zipcodes.csv")

  // cache() is lazy; the data is materialized by the first action below.
  val df2 = df.where(col("State") === "PR").cache()
  df2.show(false)

  // First count: computes the filter and populates the cache.
  println(df2.count())

  // Second count: answered from the cached partitions.
  println(df2.count())
}

0 commit comments

Comments
 (0)