Commit fe89391

Apache Spark Examples

1 parent 6e583d9 commit fe89391

12 files changed: +308 -11 lines changed
src/main/resources/address-multiline.csv

Lines changed: 7 additions & 0 deletions (new file)
Id,Address Line1,City,State,Zipcode
1,9182 Clear Water Rd,Fayetteville,AR,72704
2,"9920 State
Highway 89",Ringling,OK,73456
3,9724 E Landon Ln,Kennewick,WA,99338
src/main/resources/free-zipcode-database.csv

Lines changed: 1 addition & 1 deletion
@@ -47392,7 +47392,7 @@
 47391,"95476","STANDARD","SONOMA","CA","PRIMARY",38.24,-122.46,-0.42,-0.66,0.61,"NA","US","Sonoma, CA","NA-US-CA-SONOMA","false",14900,24893,627942566,
 47392,"95476","STANDARD","AGUA CALIENTE","CA","NOT ACCEPTABLE",38.24,-122.46,-0.42,-0.66,0.61,"NA","US","Agua Caliente, CA","NA-US-CA-AGUA CALIENTE","false",14900,24893,627942566,
 47393,"95476","STANDARD","SCHELLVILLE","CA","NOT ACCEPTABLE",38.24,-122.46,-0.42,-0.66,0.61,"NA","US","Schellville, CA","NA-US-CA-SCHELLVILLE","false",14900,24893,627942566,
-47394,"94080","STANDARD","SOUTH SAN FRANCISCO","CA","PRIMARY",37.65,-122.42,-0.42,-0.66,0.61,"NA","US","South San Francisco, CA","NA-US-CA-SOUTH SAN FRANCISCO","false",30695,53242,1494580412,
+47394,"94080","STANDARD","SOUTH SAN FRANCISCO","HZFB MNHFFGHN ","PRIMARY",37.65,-122.42,-0.42,-0.66,0.61,"NA","US","South San Francisco, CA","NA-US-CA-SOUTH SAN FRANCISCO","false",30695,53242,1494580412,
 47395,"94080","STANDARD","S SAN FRAN","CA","ACCEPTABLE",37.65,-122.42,-0.42,-0.66,0.61,"NA","US","S San Fran, CA","NA-US-CA-S SAN FRAN","false",30695,53242,1494580412,
 47396,"94080","STANDARD","S SAN FRANCISCO","CA","NOT ACCEPTABLE",37.65,-122.42,-0.42,-0.66,0.61,"NA","US","S San Francisco, CA","NA-US-CA-S SAN FRANCISCO","false",30695,53242,1494580412,
 47397,"94080","STANDARD","SSF","CA","NOT ACCEPTABLE",37.65,-122.42,-0.42,-0.66,0.61,"NA","US","Ssf, CA","NA-US-CA-SSF","false",30695,53242,1494580412,
src/main/scala/com/sparkbyexamples/spark/dataframe/FromCSVMultiline.scala

Lines changed: 30 additions & 0 deletions (new file)
package com.sparkbyexamples.spark.dataframe

import org.apache.spark.sql.SparkSession

object FromCSVMultiline extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[3]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  // multiLine lets a quoted field span line breaks; the CSV option name is "quote" (not "quotes")
  val df = spark.read
    .option("header", true)
    .option("delimiter", ",")
    .option("multiLine", true)
    .option("quote", "\"")
    .csv("src/main/resources/address-multiline.csv")

  df.show(false)
}
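Note (not part of the commit): given the three records in address-multiline.csv above, df.show(false) should print roughly the following, with the embedded line break reproduced inside row 2's cell:

+---+---------------------+------------+-----+-------+
|Id |Address Line1        |City        |State|Zipcode|
+---+---------------------+------------+-----+-------+
|1  |9182 Clear Water Rd  |Fayetteville|AR   |72704  |
|2  |9920 State
Highway 89|Ringling    |OK   |73456  |
|3  |9724 E Landon Ln     |Kennewick   |WA   |99338  |
+---+---------------------+------------+-----+-------+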
src/main/scala/com/sparkbyexamples/spark/dataframe/examples/CastStringToInt.scala

Lines changed: 42 additions & 0 deletions (new file)
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.spark.sql.SparkSession

object CastStringToInt extends App {

  val spark = SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  val simpleData = Seq(("James",34,"true","M","3000.6089"),
    ("Michael",33,"true","F","3300.8067"),
    ("Robert",37,"false","M","5000.5034")
  )

  import spark.implicits._
  val df = simpleData.toDF("firstname","age","isGraduated","gender","salary")
  df.printSchema()

  import org.apache.spark.sql.functions.col
  import org.apache.spark.sql.types.IntegerType

  // Convert String to Integer Type
  val df2 = df.withColumn("salary",col("salary").cast(IntegerType))
  df2.printSchema()
  df2.show()

  // cast() also accepts the target type as a string
  df.withColumn("salary",col("salary").cast("int")).printSchema()
  df.withColumn("salary",col("salary").cast("integer")).printSchema()

  // Using select
  df.select(col("salary").cast("int").as("salary")).printSchema()

  // Using selectExpr()
  df.selectExpr("cast(salary as int) salary","isGraduated").printSchema()
  df.selectExpr("INT(salary)","isGraduated").printSchema()

  // Using spark.sql()
  df.createOrReplaceTempView("CastExample")
  spark.sql("SELECT INT(salary),BOOLEAN(isGraduated),gender from CastExample").printSchema()
  spark.sql("SELECT cast(salary as int) salary, BOOLEAN(isGraduated),gender from CastExample").printSchema()
}
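Note (not part of the commit): an integer cast cannot preserve the fractional part of strings like "3000.6089". If the fraction matters, cast to double or DecimalType instead; a minimal sketch:

import org.apache.spark.sql.functions.col
// Same column, but keeping the decimals
df.withColumn("salary", col("salary").cast("double")).printSchema()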
src/main/scala/com/sparkbyexamples/spark/dataframe/examples/MapFlatMap.scala

Lines changed: 62 additions & 0 deletions (new file)
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

object MapFlatMap extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  val data = Seq("Project Gutenberg’s",
    "Alice’s Adventures in Wonderland",
    "Project Gutenberg’s",
    "Adventures in Wonderland",
    "Project Gutenberg’s")

  import spark.implicits._
  val df = data.toDF("data")
  df.show(false)

  //Map transformation: one output row (an Array[String] of words) per input row
  val mapDF = df.map(fun => {
    fun.getString(0).split(" ")
  })
  mapDF.show(false)

  //FlatMap transformation: each input row expands to one output row per word
  val flatMapDF = df.flatMap(fun => {
    fun.getString(0).split(" ")
  })
  flatMapDF.show()

  val arrayStructureData = Seq(
    Row("James,,Smith",List("Java","Scala","C++"),"CA"),
    Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"),
    Row("Robert,,Williams",List("CSharp","VB","R"),"NV")
  )

  val arrayStructureSchema = new StructType()
    .add("name",StringType)
    .add("languagesAtSchool", ArrayType(StringType))
    .add("currentState", StringType)

  val df1 = spark.createDataFrame(
    spark.sparkContext.parallelize(arrayStructureData),arrayStructureSchema)

  //flatMap() usage: expand each languages list into (name, language, state) tuples
  val df2 = df1.flatMap(f => {
    val lang = f.getSeq[String](1)
    lang.map((f.getString(0),_,f.getString(2)))
  })

  val df3 = df2.toDF("Name","language","State")
  df3.show(false)
}
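Note (not part of the commit): a quick check of the difference, derived from the five input lines above:

// map() keeps one row per input row; flatMap() emits one row per word
assert(mapDF.count() == 5)      // five Array[String] rows
assert(flatMapDF.count() == 13) // 2 + 4 + 2 + 3 + 2 words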
src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RangePartition.scala

Lines changed: 27 additions & 0 deletions (new file)
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object RangePartition extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  /**
    * Range partitioning on a single column
    */
  val data = Seq((1,10),(2,20),(3,10),(4,20),(5,10),
    (6,30),(7,50),(8,50),(9,50),(10,30),
    (11,10),(12,10),(13,40),(14,40),(15,40),
    (16,40),(17,50),(18,10),(19,40),(20,40)
  )

  import spark.implicits._
  val dfRange = data.toDF("id","count")
    .repartitionByRange(5,col("count"))

  dfRange.write.option("header",true).csv("c:/tmp/range-partition")
}
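Note (not part of the commit): to see which of the 5 range partitions each row landed in, the built-in spark_partition_id() works; a small sketch:

import org.apache.spark.sql.functions.spark_partition_id
// Each range partition covers a contiguous band of "count" values
dfRange.withColumn("partition", spark_partition_id()).show(20)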
src/main/scala/com/sparkbyexamples/spark/dataframe/examples/RenameDeleteFile.scala

Lines changed: 43 additions & 0 deletions (new file)
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

object RenameDeleteFile extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[3]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  //Create Hadoop FileSystem from Spark's Hadoop configuration
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)

  val srcPath = new Path("/tmp/address_rename_merged.csv")
  val destPath = new Path("/tmp/address_merged.csv")

  //Rename a File
  if(fs.exists(srcPath) && fs.isFile(srcPath))
    fs.rename(srcPath,destPath)

  //Alternatively, you can also create a Hadoop configuration directly
  val hadoopConfig = new Configuration()
  val hdfs = FileSystem.get(hadoopConfig)
  if(hdfs.isFile(srcPath))
    hdfs.rename(srcPath,destPath)

  //Delete a File (the hidden .crc checksum left behind by a copy)
  val crcPath = new Path("/tmp/.address_merged2.csv.crc")
  if(hdfs.exists(crcPath))
    hdfs.delete(crcPath,true)

  import scala.sys.process._
  //Delete a File via the hdfs CLI
  "hdfs dfs -rm /tmp/.address_merged2.csv.crc" !

  //Delete recursively via the hdfs CLI
  "hdfs dfs -rm -r /tmp/.address_merged2.csv.crc" !
}
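Note (not part of the commit): shelling out to the hdfs CLI isn't required; FileSystem.delete covers both cases with the paths already used above:

hdfs.delete(new Path("/tmp/.address_merged2.csv.crc"), false) // a single file
hdfs.delete(new Path("/tmp/address_merged.csv"), true)        // recursive=true also handles directories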

src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SaveSingleFile.scala

Lines changed: 9 additions & 4 deletions
@@ -13,16 +13,19 @@ object SaveSingleFile extends App{
     .appName("SparkByExamples.com")
     .getOrCreate()

-  val df = spark.read.option("header",true).csv("src/main/resources/address.csv")
-  df.repartition(1).write.mode(SaveMode.Overwrite).csv("/tmp/address")
+  val df = spark.read.option("header",true)
+    .csv("src/main/resources/address.csv")
+  df.repartition(1)
+    .write.mode(SaveMode.Overwrite).csv("/tmp/address")

   val hadoopConfig = new Configuration()
   val hdfs = FileSystem.get(hadoopConfig)

   val srcPath=new Path("/tmp/address")
   val destPath= new Path("/tmp/address_merged.csv")
-  val srcFile=FileUtil.listFiles(new File("c:/tmp/address")).filterNot(f=>f.getPath.endsWith(".csv"))(0)
+  val srcFile=FileUtil.listFiles(new File("c:/tmp/address"))
+    .filterNot(f=>f.getPath.endsWith(".csv"))(0)
   //Copy the CSV file outside of Directory and rename
   FileUtil.copy(srcFile,hdfs,destPath,true,hadoopConfig)
   //Remove Directory created by df.write()
@@ -31,10 +34,12 @@ object SaveSingleFile extends App{
   hdfs.delete(new Path("/tmp/.address_merged.csv.crc"),true)

   // Merge using Hadoop API
-  df.repartition(1).write.mode(SaveMode.Overwrite).csv("/tmp/address-tmp")
+  df.repartition(1).write.mode(SaveMode.Overwrite)
+    .csv("/tmp/address-tmp")
   val srcFilePath=new Path("/tmp/address-tmp")
   val destFilePath= new Path("/tmp/address_merged2.csv")
   FileUtil.copyMerge(hdfs, srcFilePath, hdfs, destFilePath, true, hadoopConfig, null)
   //Remove hidden CRC file if not needed.
   hdfs.delete(new Path("/tmp/.address_merged2.csv.crc"),true)
+
 }
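Note (not part of the commit): FileUtil.copyMerge was removed in Hadoop 3.x (HADOOP-12967), so on newer clusters the second approach needs a small stand-in. A minimal sketch, assuming plain-text part files and ignoring duplicate headers:

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils

// Concatenate the part files of srcDir into dstFile, in file-name order
def copyMergeCompat(fs: FileSystem, srcDir: Path, dstFile: Path): Unit = {
  val out = fs.create(dstFile)
  try {
    fs.listStatus(srcDir).filter(_.isFile).sortBy(_.getPath.getName).foreach { status =>
      val in = fs.open(status.getPath)
      try IOUtils.copyBytes(in, out, 4096, false)
      finally in.close()
    }
  } finally out.close()
}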
src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SelectExamples.scala

Lines changed: 82 additions & 0 deletions (new file)
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object SelectExamples extends App {

  val spark = SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  val data = Seq(("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  )

  val columns = Seq("firstname","lastname","country","state")

  import spark.implicits._
  val df = data.toDF(columns:_*)
  df.show(false)

  df.select("firstname","lastname").show()
  //Using Dataframe object name
  df.select(df("firstname"),df("lastname")).show()
  //Using col function
  import org.apache.spark.sql.functions.col
  df.select(col("firstname"),col("lastname")).show()

  //Show all columns
  df.select("*").show()
  val columnsAll = df.columns.map(m=>col(m))
  df.select(columnsAll:_*).show()
  df.select(columns.map(m=>col(m)):_*).show()

  //Show columns from list
  val listCols = List("lastname","country")
  df.select(listCols.map(m=>col(m)):_*).show()

  //Show first few columns
  df.select(df.columns.slice(0,3).map(m=>col(m)):_*).show(1)

  //Show column by index or position
  df.select(df.columns(3)).show(3)

  //Show columns from start and end index
  df.select(df.columns.slice(2,4).map(m=>col(m)):_*).show(3)

  //Show columns by regular expression
  df.select(df.colRegex("`^.*name*`")).show()

  df.select(df.columns.filter(f=>f.startsWith("first")).map(m=>col(m)):_*).show(3)
  df.select(df.columns.filter(f=>f.endsWith("name")).map(m=>col(m)):_*).show(3)

  //Show nested columns
  val data2 = Seq(Row(Row("James","","Smith"),"OH","M"),
    Row(Row("Anna","Rose",""),"NY","F"),
    Row(Row("Julia","","Williams"),"OH","F"),
    Row(Row("Maria","Anne","Jones"),"NY","M"),
    Row(Row("Jen","Mary","Brown"),"NY","M"),
    Row(Row("Mike","Mary","Williams"),"OH","M")
  )

  val schema = new StructType()
    .add("name",new StructType()
      .add("firstname",StringType)
      .add("middlename",StringType)
      .add("lastname",StringType))
    .add("state",StringType)
    .add("gender",StringType)

  val df2 = spark.createDataFrame(
    spark.sparkContext.parallelize(data2),schema)
  df2.printSchema()
  df2.show(false)

  df2.select("name").show(false)
  df2.select("name.firstname","name.lastname").show(false)
  df2.select("name.*").show(false)
}
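Note (not part of the commit): selecting "name.*" flattens the nested struct, so df2.select("name.*").printSchema() should report three top-level string columns:

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)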

src/main/scala/com/sparkbyexamples/spark/dataframe/examples/SelectSelectExpr.scala

Lines changed: 2 additions & 1 deletion
@@ -6,14 +6,15 @@ object SelectSelectExpr extends App {

   val spark:SparkSession = SparkSession.builder()
     .master("local[1]")
-    .appName("SparkByExample")
+    .appName("SparkByExamples.com")
     .getOrCreate()

   val data = Seq(("Java", "20000"), ("Python", "100000"), ("Scala", "3000"))
   val df = spark.createDataFrame(data).toDF("language","users_count")
   df.select("language","users_count as count").show() //Example 1
   df.select(df("language"),df("users_count").as("count")).show() //Example 2
   df.select(col("language"),col("users_count")).show() //Example 3
+  //df.select("language",col("users_count")).show() //Example 3

   df.selectExpr("language","users_count as count").show() //Example 1
   //df.selectExpr(df("language"),df("users_count").as("count")).show() //Example 2
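A note on the commented-out lines (my reading, not stated in the commit): select() is overloaded for either all-Column or all-String arguments, and selectExpr() only accepts SQL expression strings (its signature is selectExpr(exprs: String*)), so mixing forms or passing Columns to selectExpr() does not compile. Aliasing also needs an expression context:

df.selectExpr("users_count as count").show()     // OK: SQL string
df.select(col("users_count").as("count")).show() // OK: Column API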

src/main/scala/com/sparkbyexamples/spark/dataframe/functions/datetime/AddTime.scala

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 package com.sparkbyexamples.spark.dataframe.functions.datetime

 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.functions.{expr,col}
+import org.apache.spark.sql.functions._
 object AddTime extends App {

   val spark:SparkSession = SparkSession.builder()

src/main/scala/com/sparkbyexamples/spark/dataframe/functions/window/WindowFunctions.scala

Lines changed: 2 additions & 4 deletions
@@ -66,12 +66,10 @@ object WindowFunctions extends App {
   //Aggregate Functions
   val windowSpecAgg = Window.partitionBy("department")
   val aggDF = df.withColumn("row",row_number.over(windowSpec))
-  .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
+    .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
     .withColumn("sum", sum(col("salary")).over(windowSpecAgg))
     .withColumn("min", min(col("salary")).over(windowSpecAgg))
     .withColumn("max", max(col("salary")).over(windowSpecAgg))
-  // .where(col("row")===1).select("department","avg","sum","min","max")
+    .where(col("row")===1).select("department","avg","sum","min","max")
     .show()
-
-
 }
