Skip to content

Commit 5f694f9

Browse files
Spark Examples
1 parent 8110950 commit 5f694f9

21 files changed

+616
-12
lines changed

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
<packaging>jar</packaging>
1212
<properties>
1313
<scala.version>2.11.11</scala.version>
14-
<spark.version>2.4.0</spark.version>
14+
<spark.version>2.4.4</spark.version>
1515
</properties>
1616

1717
<repositories>
@@ -71,7 +71,7 @@
7171
<dependency>
7272
<groupId>org.apache.spark</groupId>
7373
<artifactId>spark-avro_2.11</artifactId>
74-
<version>2.4.0</version>
74+
<version>2.4.4</version>
7575
</dependency>
7676

7777
</dependencies>

src/main/resources/test.txt

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,115 @@ at no cost and with
1212
This eBook is for the use
1313
of anyone anywhere
1414
at no cost and with
15+
Project Gutenberg’s
16+
Alice’s Adventures in Wonderland
17+
by Lewis Carroll
18+
This eBook is for the use
19+
of anyone anywhere
20+
at no cost and with
21+
Alice’s Adventures in Wonderland
22+
by Lewis Carroll
23+
This eBook is for the use
24+
of anyone anywhere
25+
at no cost and with
26+
This eBook is for the use
27+
of anyone anywhere
28+
at no cost and with
29+
Project Gutenberg’s
30+
Alice’s Adventures in Wonderland
31+
by Lewis Carroll
32+
This eBook is for the use
33+
of anyone anywhere
34+
at no cost and with
35+
Alice’s Adventures in Wonderland
36+
by Lewis Carroll
37+
This eBook is for the use
38+
of anyone anywhere
39+
at no cost and with
40+
This eBook is for the use
41+
of anyone anywhere
42+
at no cost and with
43+
Project Gutenberg’s
44+
Alice’s Adventures in Wonderland
45+
by Lewis Carroll
46+
This eBook is for the use
47+
of anyone anywhere
48+
at no cost and with
49+
Alice’s Adventures in Wonderland
50+
by Lewis Carroll
51+
This eBook is for the use
52+
of anyone anywhere
53+
at no cost and with
54+
This eBook is for the use
55+
of anyone anywhere
56+
at no cost and with
57+
Project Gutenberg’s
58+
Alice’s Adventures in Wonderland
59+
by Lewis Carroll
60+
This eBook is for the use
61+
of anyone anywhere
62+
at no cost and with
63+
Alice’s Adventures in Wonderland
64+
by Lewis Carroll
65+
This eBook is for the use
66+
of anyone anywhere
67+
at no cost and with
68+
This eBook is for the use
69+
of anyone anywhere
70+
at no cost and with
71+
Project Gutenberg’s
72+
Alice’s Adventures in Wonderland
73+
by Lewis Carroll
74+
This eBook is for the use
75+
of anyone anywhere
76+
at no cost and with
77+
Alice’s Adventures in Wonderland
78+
by Lewis Carroll
79+
This eBook is for the use
80+
of anyone anywhere
81+
at no cost and with
82+
This eBook is for the use
83+
of anyone anywhere
84+
at no cost and with
85+
Project Gutenberg’s
86+
Alice’s Adventures in Wonderland
87+
by Lewis Carroll
88+
This eBook is for the use
89+
of anyone anywhere
90+
at no cost and with
91+
Alice’s Adventures in Wonderland
92+
by Lewis Carroll
93+
This eBook is for the use
94+
of anyone anywhere
95+
at no cost and with
96+
This eBook is for the use
97+
of anyone anywhere
98+
at no cost and with
99+
Project Gutenberg’s
100+
Alice’s Adventures in Wonderland
101+
by Lewis Carroll
102+
This eBook is for the use
103+
of anyone anywhere
104+
at no cost and with
105+
Alice’s Adventures in Wonderland
106+
by Lewis Carroll
107+
This eBook is for the use
108+
of anyone anywhere
109+
at no cost and with
110+
This eBook is for the use
111+
of anyone anywhere
112+
at no cost and with
113+
Project Gutenberg’s
114+
Alice’s Adventures in Wonderland
115+
by Lewis Carroll
116+
This eBook is for the use
117+
of anyone anywhere
118+
at no cost and with
119+
Alice’s Adventures in Wonderland
120+
by Lewis Carroll
121+
This eBook is for the use
122+
of anyone anywhere
123+
at no cost and with
124+
This eBook is for the use
125+
of anyone anywhere
126+
at no cost and with
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
package com.sparkbyexamples.spark

import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * Demonstrates obtaining the legacy [[SQLContext]] from a [[SparkSession]],
 * reading a CSV through it, and querying a registered temp view with SQL.
 */
object SQLContextExample extends App {

  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  // Keep console output readable for the example.
  spark.sparkContext.setLogLevel("ERROR")

  // The session exposes its SQLContext directly; no separate construction needed.
  val sqlContext: SQLContext = spark.sqlContext

  // Read the CSV, inferring column types and treating the first row as a header.
  val df = sqlContext.read
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .option("header", "true")
    .csv("src/main/resources/zipcodes.csv")

  df.show()
  df.printSchema()

  // Register the DataFrame as a temporary view and query it via SQL.
  df.createOrReplaceTempView("TAB")
  sqlContext.sql("select * from TAB").show(false)
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
package com.sparkbyexamples.spark

import org.apache.spark.SparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}

/**
 * Demonstrates the relationship between SparkSession, SparkContext and
 * SQLContext, and shows that a second builder().getOrCreate() reuses the
 * already-running context rather than creating a new one.
 *
 * Fix: removed the unused `import ...SortExample.spark`, which was
 * immediately shadowed by the local `val spark` and only misled readers.
 */
object SparkContextExample extends App {

  val spark = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  spark.sparkContext.setLogLevel("ERROR")

  // Handles exposed by the session itself.
  val sparkContext: SparkContext = spark.sparkContext
  val sqlCon: SQLContext = spark.sqlContext

  // Pre-2.0 style construction; spark.sqlContext above is the preferred form.
  val sqlContext = new org.apache.spark.sql.SQLContext(spark.sparkContext)

  println("First SparkContext:")
  println("APP Name :" + spark.sparkContext.appName)
  println("Deploy Mode :" + spark.sparkContext.deployMode)
  println("Master :" + spark.sparkContext.master)

  // getOrCreate() returns the existing active session's context, so the
  // values printed below come from the first session, not "SparkByExample-test".
  val sparkSession2 = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExample-test")
    .getOrCreate()

  println("Second SparkContext:")
  println("APP Name :" + sparkSession2.sparkContext.appName)
  println("Deploy Mode :" + sparkSession2.sparkContext.deployMode)
  println("Master :" + sparkSession2.sparkContext.master)
}

src/main/scala/com/sparkbyexamples/spark/SparkSessionTest.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,12 @@ object SparkSessionTest {
66

77
def main(args:Array[String]): Unit ={
88

9+
910
val spark = SparkSession.builder()
1011
.master("local[1]")
1112
.appName("SparkByExample")
1213
.getOrCreate();
13-
14+
1415
println("First SparkContext:")
1516
println("APP Name :"+spark.sparkContext.appName);
1617
println("Deploy Mode :"+spark.sparkContext.deployMode);
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
package com.sparkbyexamples.spark.dataframe

import org.apache.spark.sql.SparkSession

/**
 * Demonstrates broadcast variables: two small lookup Maps (state and country
 * codes -> full names) are broadcast to executors and used inside a row-level
 * map to expand abbreviated codes.
 *
 * Fix: replaced the `map.get(key).get` anti-pattern with direct `map(key)`.
 * Behavior is identical (both throw NoSuchElementException on a missing key,
 * which is acceptable here because the sample data only uses known codes),
 * but the intent — a required lookup — is now explicit.
 */
object BroadcastExample extends App {

  val spark = SparkSession.builder()
    .appName("SparkByExamples.com")
    .master("local")
    .getOrCreate()

  val states = Map(("NY", "New York"), ("CA", "California"), ("FL", "Florida"))
  val countries = Map(("USA", "United States of America"), ("IN", "India"))

  // Broadcast the lookup tables once instead of shipping them with every task.
  val broadcastStates = spark.sparkContext.broadcast(states)
  val broadcastCountries = spark.sparkContext.broadcast(countries)

  val data = Seq(("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL")
  )

  val columns = Seq("firstname", "lastname", "country", "state")
  import spark.sqlContext.implicits._
  val df = data.toDF(columns: _*)

  // Expand the country/state codes via the broadcast maps.
  val df2 = df.map(row => {
    val country = row.getString(2)
    val state = row.getString(3)

    // Direct apply: the codes are expected to exist in the broadcast maps.
    val fullCountry = broadcastCountries.value(country)
    val fullState = broadcastStates.value(state)
    (row.getString(0), row.getString(1), fullCountry, fullState)
  }).toDF(columns: _*)

  df2.show(false)
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
package com.sparkbyexamples.spark.dataframe

import org.apache.spark.sql.SparkSession

/**
 * Demonstrates show(false): printing DataFrame rows without truncating long
 * string columns (show() would cut values at 20 characters by default).
 *
 * Fix: removed two unused imports — `SQLContextExample.spark` (immediately
 * shadowed by the local `val spark`) and `org.apache.log4j.lf5.LogLevel`
 * (never referenced).
 */
object ColumnTruncate extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  import spark.implicits._

  val columns = Seq("Seqno", "Quote")
  val data = Seq(("1", "Be the change that you wish to see in the world"),
    ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
    ("3", "The purpose of our lives is to be happy."))
  val df = data.toDF(columns: _*)

  // truncate = false: print full Quote values instead of the 20-char default.
  df.show(false)
}

src/main/scala/com/sparkbyexamples/spark/dataframe/FromJsonFile.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,6 @@ object FromJsonFile {
6868
//Write json file
6969

7070
df2.write
71-
.json("/tmp/spark_output/zipcodes.json")
71+
.json("/tmp/spark_output/zipcodes1.json")
7272
}
7373
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
package com.sparkbyexamples.spark

import org.apache.spark.sql.SparkSession

/**
 * Writes a small sample DataFrame to S3 as CSV using the s3a connector.
 *
 * SECURITY FIX: the original committed a live AWS access key and secret key
 * as string literals. Credentials must never be hardcoded in source; they are
 * now read from the standard AWS environment variables. (The leaked key pair
 * should be considered compromised and rotated in AWS IAM.)
 */
object ParquetAWSExample {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .master("local[1]")
      .appName("SparkByExamples.com")
      .getOrCreate()

    // Credentials come from the environment; fail fast with a clear message
    // instead of an opaque S3 403 later.
    val accessKey = sys.env.getOrElse("AWS_ACCESS_KEY_ID",
      throw new IllegalStateException("AWS_ACCESS_KEY_ID is not set"))
    val secretKey = sys.env.getOrElse("AWS_SECRET_ACCESS_KEY",
      throw new IllegalStateException("AWS_SECRET_ACCESS_KEY is not set"))

    spark.sparkContext
      .hadoopConfiguration.set("fs.s3a.access.key", accessKey)
    spark.sparkContext
      .hadoopConfiguration.set("fs.s3a.secret.key", secretKey)
    // Region-specific endpoints (e.g. "s3.us-east-1.amazonaws.com") also work;
    // the global endpoint resolves the bucket's region automatically.
    spark.sparkContext
      .hadoopConfiguration.set("fs.s3a.endpoint", "s3.amazonaws.com")

    val data = Seq(("James ", "Rose", "Smith", "36636", "M", 3000),
      ("Michael", "Rose", "", "40288", "M", 4000),
      ("Robert", "Mary", "Williams", "42114", "M", 4000),
      ("Maria", "Anne", "Jones", "39192", "F", 4000),
      ("Jen", "Mary", "Brown", "1234", "F", -1)
    )

    val columns = Seq("firstname", "middlename", "lastname", "dob", "gender", "salary")
    import spark.sqlContext.implicits._
    val df = data.toDF(columns: _*)

    df.show()
    df.printSchema()

    // Write to S3 via the s3a filesystem; the path is a directory of part files.
    df.write
      .csv("s3a://sparkbyexamples/people1234.csv")
  }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
package com.sparkbyexamples.spark.dataframe.examples

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

/**
 * Demonstrates DataFrame caching: the filtered DataFrame is cached after the
 * first action, so the second count() is served from the cache instead of
 * re-reading and re-filtering the CSV.
 *
 * Fix: removed the unused local `df3` — it was a lazy transformation with no
 * action applied, so dropping it is behavior-identical and removes dead code.
 */
object CacheExample extends App {

  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()

  // Read the CSV with type inference and a header row.
  val df = spark.read.options(Map("inferSchema" -> "true", "delimiter" -> ",", "header" -> "true"))
    .csv("src/main/resources/zipcodes.csv")

  // cache() is lazy; the data is materialized by the first action below.
  val df2 = df.where(col("State") === "PR").cache()
  df2.show(false)

  // First count: computes the filter and populates the cache.
  println(df2.count())

  // Second count: answered from the cached partitions.
  println(df2.count())
}

0 commit comments

Comments
 (0)