[RP] Files renamed and README updated.
rambabu-posa committed Nov 9, 2019
1 parent 0a5d2f0 commit fbe2035
Showing 32 changed files with 111 additions and 66 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -27,4 +27,7 @@ hs_err_pid*
/bin/
*.DS_Store
.idea/*
*.iml
target/*
project/target/*
project/project/*
59 changes: 42 additions & 17 deletions README.md
@@ -7,34 +7,59 @@ Welcome to Spark with Java, chapter 3. This chapter is all about the dataframe,
Labs:
* #200: `IngestionSchemaManipulationApp`: ingestion of a CSV, manipulation of schema structure post-ingestion

## Running the lab in Java

## Running PySpark
For information on running the Java lab, see chapter 1 in [Spark in Action, 2nd edition](http://jgp.net/sia).


## Running the lab using PySpark

Prerequisites:

You will need:
* `git`.
* Apache Spark (please refer to Appendix P, 'Spark in production: installation and a few tips').

1. Clone this project
Assume that cloned this project to ${MY_HOME_DIR}

2. cd ${MY_HOME_DIR}/src/main/python
git clone https://github.com/jgperrin/net.jgp.books.spark.ch03

2. Go to the lab in the Python directory

cd net.jgp.books.spark.ch03/src/main/python/lab200_ingestion_schema_manipulation/

3. Execute the following spark-submit command to create a jar file to our this application

3. Execute the following spark-submit command to run this application
```
spark-submit net/jgp/books/spark/ch03/lab200_ingestion_schema_manipulation/ingestionSchemaManipulationApp.py
```
```
spark-submit ingestionSchemaManipulationApp.py
```

## Running the lab in Scala

Prerequisites:

You will need:
* `git`.
* Apache Spark (please refer to Appendix P, 'Spark in production: installation and a few tips').


1. Clone this project

## Running Scala
git clone https://github.com/jgperrin/net.jgp.books.spark.ch03

1. Clone this project
Assume that cloned this project to ${MY_HOME_DIR}
2. cd net.jgp.books.spark.ch03

2. cd ${MY_HOME_DIR}
3. Package the application using the sbt command

3. Create application jar file
```mvn clean package```
```
sbt clean assembly
```

4. Execute the following spark-submit command to run this application
```
spark-submit --class net.jgp.books.spark.ch03.lab200_ingestion_schema_manipulation.IngestionSchemaManipulateApp target/sparkInAction2-chapter03-1.0.0-SNAPSHOT.jar
```
4. Run the Spark/Scala application using the spark-submit command shown below:

```
spark-submit --class net.jgp.books.spark.ch03.lab200_ingestion_schema_manipulation.IngestionSchemaManipulationScalaApp target/scala-2.11/SparkInAction2-Chapter03-assembly-1.0.0.jar
```

Notes:
1. Due to renaming the packages to match Java standards more closely, this project is not in sync with the book's MEAP prior to v10.
22 changes: 22 additions & 0 deletions build.sbt
@@ -0,0 +1,22 @@

name := "SparkInAction2-Chapter03"

version := "1.0.0"

scalaVersion := "2.11.11"

val sparkVersion = "2.4.4"

resolvers ++= Seq(
"apache-snapshots" at "http://repository.apache.org/snapshots/"
)

libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % sparkVersion,
"org.apache.spark" %% "spark-sql" % sparkVersion
)

assemblyMergeStrategy in assembly := {
case PathList("META-INF", xs @ _*) => MergeStrategy.discard
case x => MergeStrategy.first
}
23 changes: 0 additions & 23 deletions pom.xml
@@ -13,18 +13,11 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.version>1.8</java.version>
<scala.version>2.11</scala.version>
<scala.full.version>2.11.11</scala.full.version>
<spark.version>2.4.4</spark.version>
<maven-compiler-plugin.version>3.8.0</maven-compiler-plugin.version>
</properties>

<dependencies>
<!-- Scala Language Library -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.full.version}</version>
</dependency>
<!-- Spark -->
<dependency>
<groupId>org.apache.spark</groupId>
@@ -80,22 +73,6 @@
<target>${java.version}</target>
</configuration>
</plugin>
<!--To enable scala features in a java project-->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.full.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
</project>
1 change: 1 addition & 0 deletions project/build.properties
@@ -0,0 +1 @@
sbt.version=1.0.3
1 change: 1 addition & 0 deletions project/plugins.sbt
@@ -0,0 +1 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10")
Empty file removed src/__init__.py
Empty file removed src/main/__init__.py
1 change: 0 additions & 1 deletion src/main/python/__init__.py

This file was deleted.

@@ -3,20 +3,22 @@
@author rambabu.posa
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit,col,concat
import os

current_dir = os.path.dirname(__file__)
relative_path = "../../../../data/Restaurants_in_Wake_County_NC.csv"
absolute_file_path = os.path.join(current_dir, relative_path)

# Creates a session on a local master
spark = SparkSession.builder.appName("Restaurants in Wake County, NC") \
.master("local[*]").getOrCreate()


# Reads a CSV file with header, called
# Restaurants_in_Wake_County_NC.csv,
# stores it in a dataframe
df = spark.read.csv(header=True, inferSchema=True,
path="../../../data/Restaurants_in_Wake_County_NC.csv")
df = spark.read.csv(header=True, inferSchema=True,path=absolute_file_path)

print("*** Right after ingestion")
df.show(5)
@@ -47,11 +49,8 @@
print("*** Dataframe transformed")
df.show(5)


# for book only
drop_cols=["address2","zip","tel","dateStart",
"geoX","geoY","address1","datasetId"]
dfUsedForBook = df.drop(drop_cols)
dfUsedForBook = df.drop("address2","zip","tel","dateStart","geoX","geoY","address1","datasetId")

dfUsedForBook.show(5, 15)
# end
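
The `drop` change above is a genuine fix, not just a cleanup: in PySpark 2.4, `DataFrame.drop` takes column names as separate arguments, not a Python list. A minimal sketch of the difference (column names taken from the lab above):

```
# PySpark 2.4: DataFrame.drop(*cols) expects strings or Columns as
# separate arguments; passing a single Python list raises a TypeError.
dfUsedForBook = df.drop("address2", "zip", "tel", "dateStart",
                        "geoX", "geoY", "address1", "datasetId")  # works
# dfUsedForBook = df.drop(["address2", "zip"])  # TypeError in Spark 2.4
```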
@@ -5,7 +5,12 @@
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit,col,concat
from pyspark.sql import functions as F
import os

current_dir = os.path.dirname(__file__)
relative_path = "../../../../data/Restaurants_in_Wake_County_NC.csv"
absolute_file_path = os.path.join(current_dir, relative_path)

# Creates a session on a local master
spark = SparkSession.builder.appName("Schema introspection for restaurants in Wake County, NC") \
@@ -14,11 +19,10 @@
# Reads a CSV file with header, called
# Restaurants_in_Wake_County_NC.csv,
# stores it in a dataframe
df = spark.read.csv(header=True, inferSchema=True,
path="../../../data/Restaurants_in_Wake_County_NC.csv")
df = spark.read.csv(header=True, inferSchema=True,path=absolute_file_path)

# Let's transform our dataframe
df = df.withColumn("county", lit("Wake")) \
df = df.withColumn("county", F.lit("Wake")) \
.withColumnRenamed("HSISID", "datasetId") \
.withColumnRenamed("NAME", "name") \
.withColumnRenamed("ADDRESS1", "address1") \
@@ -33,7 +37,7 @@
.withColumnRenamed("Y", "geoY")

df = df.withColumn("id",
concat(col("state"), lit("_"), col("county"), lit("_"), col("datasetId")))
F.concat(F.col("state"), F.lit("_"), F.col("county"), F.lit("_"), F.col("datasetId")))

# NEW
#//////////////////////////////////////////////////////////////////
@@ -48,4 +52,5 @@
schemaAsJson = schema.prettyjson
print("*** Schema as JSON: " + schemaAsJson)

df.stop()
# Good to stop SparkSession at the end of the application
spark.stop()
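
One caveat in the lab above: `schema.prettyjson` mirrors Scala's `StructType.prettyJson`, but PySpark's `StructType` exposes `json()` and `jsonValue()` instead, so that line would raise an `AttributeError`. A minimal sketch of pretty-printing a schema in PySpark 2.4, assuming the `df` from the lab above:

```
import json

# PySpark's StructType serializes via json()/jsonValue(); pretty-print
# it with the standard library instead of a prettyJson attribute.
print(json.dumps(df.schema.jsonValue(), indent=2))
```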
@@ -6,14 +6,19 @@

from pyspark.sql import SparkSession
from pyspark.sql.functions import (lit,col,concat,split)
import os

current_dir = os.path.dirname(__file__)
relative_path = "../../../../data/Restaurants_in_Durham_County_NC.json"
absolute_file_path = os.path.join(current_dir, relative_path)

# Creates a session on a local master
spark = SparkSession.builder.appName("Restaurants in Durham County, NC") \
.master("local[*]").getOrCreate()

# Reads a JSON file called Restaurants_in_Durham_County_NC.json, stores
# it in a dataframe
df = spark.read.json("../../../data/Restaurants_in_Durham_County_NC.json")
df = spark.read.json(absolute_file_path)
print("*** Right after ingestion")
df.show(5)
df.printSchema()
@@ -5,14 +5,22 @@
"""
import util
from pyspark.sql import SparkSession
import os

current_dir = os.path.dirname(__file__)
relative_path1 = "../../../../data/Restaurants_in_Wake_County_NC.csv"
absolute_file_path1 = os.path.join(current_dir, relative_path1)

relative_path2 = "../../../../data/Restaurants_in_Durham_County_NC.json"
absolute_file_path2 = os.path.join(current_dir, relative_path2)

# Creates a session on a local master
spark = SparkSession.builder.appName("Union of two dataframes") \
.master("local[*]").getOrCreate()

df1 = spark.read.csv(path="../../../data/Restaurants_in_Wake_County_NC.csv",header=True)
df1 = spark.read.csv(path=absolute_file_path1,header=True,inferSchema=True)

df2 = spark.read.json("../../../data/Restaurants_in_Durham_County_NC.json")
df2 = spark.read.json(absolute_file_path2)


wakeRestaurantsDf = util.build_wake_restaurants_dataframe(df1)
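
The rest of this lab is cut off above; it unions the two normalized dataframes. A minimal sketch of that step, assuming a `durhamRestaurantsDf` produced by the matching `util` helper and a hypothetical `allRestaurantsDf` result name:

```
# A union requires matching schemas; unionByName is the safer choice
# when only the column order might differ between the two dataframes.
allRestaurantsDf = wakeRestaurantsDf.unionByName(durhamRestaurantsDf)
allRestaurantsDf.show(5)
```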
Empty file removed src/main/python/net/__init__.py
@@ -13,7 +13,7 @@ import org.apache.spark.sql.SparkSession
*
* @author rambabu.posa
*/
object IngestionSchemaManipulateApp {
object IngestionSchemaManipulationScalaApp {

/**
* main() is your entry point to the application.
@@ -9,7 +9,7 @@ import org.apache.spark.sql.SparkSession
*
* @author rambabu.posa
*/
object SchemaIntrospectApp {
object SchemaIntrospectionScalaApp {
/**
* main() is your entry point to the application.
*
@@ -8,7 +8,7 @@ import org.apache.spark.sql.SparkSession
*
* @author rambabu.posa
*/
object JsonIngestionSchemaManipulateApp {
object JsonIngestionSchemaManipulationScalaApp {

/**
* main() is your entry point to the application.
@@ -8,7 +8,7 @@ import org.apache.spark.sql.{Dataset, Row, SparkSession}
*
* @author rambabu.posa
*/
object DataframesUnionApp {
object DataframeUnionScalaApp {

/**
* main() is your entry point to the application.
@@ -8,7 +8,7 @@ import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
*
* @author rambabu.posa
*/
object Array2DatasetApp {
object ArrayToDatasetScalaApp {

/**
* main() is your entry point to the application.
@@ -9,7 +9,7 @@ import org.apache.spark.sql.{Dataset, Encoders, Row, SparkSession}
*
* @author rambabu.posa
*/
object Array2Dataset2DataframeApp {
object ArrayToDatasetToDataframeScalaApp {

/**
* main() is your entry point to the application.
@@ -13,7 +13,7 @@ import org.apache.spark.sql.{Dataset, Encoders, Row, SparkSession}
*
* @author rambabu.posa
*/
object Csv2DatasetBook2DataframeApp {
object CsvToDatasetBookToDataframeScalaApp {

/**
* This is a mapper class that will convert a Row to an instance of Book.
