
Commit 1b83cee

afzals2000@yahoo.com authored and committed
Initial Commit
0 parents  commit 1b83cee

File tree

9 files changed: +343 −0 lines changed

.gitignore

Lines changed: 11 additions & 0 deletions
.idea/
spark-warehouse/
target/
*.class
*.iml
*.ipr
*.iws
.idea
out
tmp/

README.md

Lines changed: 68 additions & 0 deletions
## MovieLens

This is an example of how to write Scala Spark/Spark SQL code using Test-Driven Development.

## Prerequisites

1. JDK 1.8
2. Scala 2.11.8
3. SBT
   * Tool for library dependency management.
4. I've used IntelliJ IDEA for development and testing. If using IntelliJ, install the following plugins (see build.sbt for component versions):
   * SBT
   * Scala

#### Data Files

Create the following "::"-delimited files in the /tmp directory. A sketch that generates them appears after the samples.

* movies.dat
```
21::Toy Story (1995)::Animation|Children's|Comedy
22::Jumanji (1995)::Adventure|Children's|Fantasy
23::Grumpier Old Men (1995)::Comedy|Romance
24::Waiting to Exhale (1995)::Comedy|Drama
25::Father of the Bride Part II (1995)::Comedy
```

* users.dat
```
1::F::1::10::48067
2::M::56::16::70072
3::M::25::15::55117
4::M::45::7::02460
6::F::50::9::55117
```

* ratings.dat
```
1::21::5::978300760
1::22::3::978300760
2::22::4::978299026
2::23::5::978299026
2::24::4::978299026
2::25::3::978299026
3::23::2::978297837
4::654321::5::978294008
5::25::3::978245037
```
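
If you'd rather not create them by hand, here is a minimal sketch that writes the samples above to /tmp. `SampleDataWriter` is a hypothetical helper for convenience, not part of this commit:

```scala
import java.io.PrintWriter

// Hypothetical helper (not part of this commit): writes the sample
// "::"-delimited files above to the paths MovieLensJob reads from.
object SampleDataWriter {

  private def write(path: String, lines: Seq[String]): Unit = {
    val pw = new PrintWriter(path)
    try lines.foreach(pw.println) finally pw.close()
  }

  def main(args: Array[String]): Unit = {
    write("/tmp/movies.dat", Seq(
      "21::Toy Story (1995)::Animation|Children's|Comedy",
      "22::Jumanji (1995)::Adventure|Children's|Fantasy",
      "23::Grumpier Old Men (1995)::Comedy|Romance",
      "24::Waiting to Exhale (1995)::Comedy|Drama",
      "25::Father of the Bride Part II (1995)::Comedy"))
    write("/tmp/users.dat", Seq(
      "1::F::1::10::48067",
      "2::M::56::16::70072",
      "3::M::25::15::55117",
      "4::M::45::7::02460",
      "6::F::50::9::55117"))
    write("/tmp/ratings.dat", Seq(
      "1::21::5::978300760",
      "1::22::3::978300760",
      "2::22::4::978299026",
      "2::23::5::978299026",
      "2::24::4::978299026",
      "2::25::3::978299026",
      "3::23::2::978297837",
      "4::654321::5::978294008",
      "5::25::3::978245037"))
  }
}
```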

#### Running Tests

* `sbt clean package test`
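
To run just the service tests, the standard sbt task should work: `sbt "testOnly com.movie.MovieLensServiceTest"`.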

#### Known Limitations

* Exceptions, such as bad rows, are not handled.

## Change Log

* 0.0.1
  * Initial Commit

## Meta

https://github.com/afzals2000/SparkUsingTDD

## Contributing

1. Fork it (https://github.com/afzals2000/SparkUsingTDD)
2. Create your feature branch (`git checkout -b feature/fooBar`)
3. Commit your changes (`git commit -am 'Add some fooBar'`)
4. Push to the branch (`git push origin feature/fooBar`)
5. Create a new Pull Request

build.sbt

Lines changed: 25 additions & 0 deletions
name := "movielens"

version := "1.0"

scalaVersion := "2.11.8"

lazy val spark = "2.0.2"

artifactName := { (sv: ScalaVersion, module: ModuleID, artifact: Artifact) =>
  artifact.name + "." + artifact.extension
}

resolvers ++= Seq(
  "apache-snapshots" at "http://repository.apache.org/snapshots/"
)

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % spark,
  "org.apache.spark" %% "spark-sql" % spark,
  "org.apache.spark" %% "spark-hive" % spark % "test",
  "org.apache.spark" %% "spark-streaming" % spark,
  "log4j" % "log4j" % "1.2.14",
  "org.scalatest" %% "scalatest" % "2.2.1" % "test",
  "com.holdenkarau" %% "spark-testing-base" % "2.0.2_0.4.7"
)
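
A note on the test dependency: spark-testing-base artifacts are versioned as `<sparkVersion>_<libraryVersion>`, so `2.0.2_0.4.7` is spark-testing-base 0.4.7 built against Spark 2.0.2, matching the `spark` value above.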

project/build.properties

Lines changed: 1 addition & 0 deletions
sbt.version = 1.2.1

project/plugins.sbt

Whitespace-only changes.

src/main/scala/com/movie/DataLoader.scala

Lines changed: 57 additions & 0 deletions
package com.movie

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

object DataLoader {

  val userSchema = StructType(
    Seq(
      StructField(name = "USERID", dataType = IntegerType, nullable = true),
      StructField(name = "GENDER", dataType = StringType, nullable = true),
      StructField(name = "AGE", dataType = IntegerType, nullable = true),
      StructField(name = "OCCUPATION", dataType = IntegerType, nullable = true),
      StructField(name = "ZIPCODE", dataType = StringType, nullable = true)
    )
  )

  val movieSchema = StructType(
    Seq(
      StructField(name = "MOVIEID", dataType = IntegerType, nullable = true),
      StructField(name = "TITLE", dataType = StringType, nullable = true),
      StructField(name = "GENRES", dataType = StringType, nullable = true)
    )
  )

  val ratingSchema = StructType(
    Seq(
      StructField(name = "USERID", dataType = IntegerType, nullable = true),
      StructField(name = "MOVIEID", dataType = IntegerType, nullable = true),
      StructField(name = "RATINGS", dataType = IntegerType, nullable = true),
      StructField(name = "TS", dataType = LongType, nullable = true)
    )
  )

  def loadUsers(file: String)(implicit spark: SparkSession): DataFrame = {
    def row(line: Array[String]): Row = Row(line(0).toInt, line(1), line(2).toInt, line(3).toInt, line(4))
    getDataFrame(file, row, userSchema)
  }

  def loadMovies(file: String)(implicit spark: SparkSession): DataFrame = {
    def row(line: Array[String]): Row = Row(line(0).toInt, line(1), line(2))
    getDataFrame(file, row, movieSchema)
  }

  def loadRatings(file: String)(implicit spark: SparkSession): DataFrame = {
    def row(line: Array[String]): Row = Row(line(0).toInt, line(1).toInt, line(2).toInt, line(3).toLong)
    getDataFrame(file, row, ratingSchema)
  }

  // Reads a "::"-delimited text file, converts each line to a Row, and applies the given schema.
  def getDataFrame(file: String, row: (Array[String]) => Row, schema: StructType)(implicit spark: SparkSession): DataFrame = {
    val rawData = spark.sparkContext.textFile(file)
    val rowRDD = rawData.map(line => line.split("::")).map(line => row(line))
    spark.createDataFrame(rowRDD, schema)
  }
}
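
As a quick sanity check outside the tests, the loaders can be driven from a tiny standalone object. A minimal sketch, assuming the sample file from the README exists at /tmp/movies.dat (`DataLoaderDemo` is not part of this commit):

```scala
import org.apache.spark.sql.SparkSession
import com.movie.DataLoader

object DataLoaderDemo {
  def main(args: Array[String]): Unit = {
    // Local session just for exercising the loaders; MovieLensJob builds its own.
    implicit val spark: SparkSession =
      SparkSession.builder().appName("DataLoaderDemo").master("local[*]").getOrCreate()

    val movies = DataLoader.loadMovies("/tmp/movies.dat")
    movies.printSchema() // MOVIEID: int, TITLE: string, GENRES: string
    movies.show()

    spark.stop()
  }
}
```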

src/main/scala/com/movie/MovieLensJob.scala

Lines changed: 40 additions & 0 deletions
package com.movie

import org.apache.spark.sql.SparkSession

object MovieLensJob {

  def main(args: Array[String]): Unit = {

    implicit val spark: SparkSession = getSparkSession("MovieJob")

    // Loaded but not used by the aggregations below.
    val users = DataLoader.loadUsers("/tmp/users.dat")
    val movies = DataLoader.loadMovies("/tmp/movies.dat")
    val ratings = DataLoader.loadRatings("/tmp/ratings.dat")

    val userMovieDF = MovieLensService.userMovieRatings(ratings)
    userMovieDF.show(5)
    userMovieDF.coalesce(1).write.option("header", "true").csv("/tmp/user_output")

    val movieGenres = MovieLensService.movieCountByGenre(movies)
    movieGenres.show(5)
    movieGenres.coalesce(1).write.option("header", "true").csv("/tmp/movie_genres")

    val top100 = MovieLensService.top100Movie(movies, ratings)
    top100.show()
    top100.coalesce(1).write.parquet("/tmp/top100")
  }

  def getSparkSession(appName: String): SparkSession = {
    val spark = SparkSession
      .builder()
      .appName(appName)
      .config("spark.master", "local")
      .getOrCreate()

    spark.sparkContext.setLogLevel("ERROR")
    spark
  }
}
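
During development the job can be launched straight from sbt with the standard `runMain` task: `sbt "runMain com.movie.MovieLensJob"`. This assumes the three .dat files from the README are already in /tmp.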

src/main/scala/com/movie/MovieLensService.scala

Lines changed: 34 additions & 0 deletions
package com.movie

import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType

object MovieLensService {

  // Per user: number of movies rated and average rating, rounded to two decimal places.
  def userMovieRatings(ratingDF: DataFrame)(implicit spark: SparkSession): DataFrame = {
    val groupedByUser = ratingDF.groupBy(col("USERID")).agg(count("MOVIEID").as("NO_OF_MOVIE"), avg("RATINGS").as("AVG_RATINGS"))
    groupedByUser.withColumn("AVG_RATINGS", bround(col("AVG_RATINGS"), 2))
  }

  // Explodes the pipe-delimited GENRES column and counts distinct movies per genre.
  def movieCountByGenre(movieDF: DataFrame)(implicit spark: SparkSession): DataFrame = {
    val explodedGenres = movieDF.withColumn("GENRES", explode(split(col("GENRES"), "[|]")))
    explodedGenres.groupBy(col("GENRES")).agg(countDistinct(col("MOVIEID")).as("TOTAL_MOVIES"))
  }

  // Averages ratings per movie, keeps up to 100 movies, and ranks them by average rating.
  def top100Movie(movieDF: DataFrame, ratingDF: DataFrame)(implicit spark: SparkSession): DataFrame = {
    val groupByRatings = ratingDF.groupBy("MOVIEID").agg(avg("RATINGS").cast(IntegerType).as("AVG_RATINGS"))
    val renameMovieID = groupByRatings.withColumnRenamed("MOVIEID", "GROUPED_MOVIEID")
    val window: WindowSpec = Window.orderBy(col("AVG_RATINGS"), col("MOVIEID"))

    val movieAndRatings = renameMovieID.join(movieDF, col("GROUPED_MOVIEID") === col("MOVIEID"))
      .select(
        col("MOVIEID"),
        col("TITLE"),
        col("AVG_RATINGS")
      ).limit(100)
    movieAndRatings.select(row_number().over(window).as("RANKING"), movieAndRatings.col("*"))
  }
}
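
A subtlety worth noting before the tests: `Window.orderBy(col("AVG_RATINGS"), col("MOVIEID"))` sorts ascending, so RANKING 1 goes to the lowest average rating among the (at most) 100 joined movies, and `avg(...).cast(IntegerType)` truncates the average (e.g. 10/3 becomes 3). The expectations in the "Top 100 movies" test below rely on both behaviors.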

src/test/scala/com/movie/MovieLensServiceTest.scala

Lines changed: 107 additions & 0 deletions
package com.movie

import com.holdenkarau.spark.testing.DatasetSuiteBase
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{StructField, StructType}
import org.scalatest.FunSuite

class MovieLensServiceTest extends FunSuite with DatasetSuiteBase {

  lazy implicit val mySpark: SparkSession = spark
  import spark.implicits._

  test("No of Movies and Avg Rating by User") {
    spark.sparkContext.setLogLevel("ERROR")
    lazy val ratingDF: DataFrame =
      Seq((1, 21, 5), (1, 22, 4), (1, 23, 5), (2, 22, 4), (2, 23, 3), (3, 23, 2)
      ).toDF("USERID", "MOVIEID", "RATINGS")

    val expected = Seq(
      (1, 3L, 4.67), (2, 2L, 3.5), (3, 1L, 2.0)
    ).toDF("USERID", "NO_OF_MOVIE", "AVG_RATINGS")

    val actual = MovieLensService.userMovieRatings(ratingDF)

    actual.show()
    expected.show()

    assertEqualsNullable(actual, expected)
  }

  test("No of Movies by Genre") {
    spark.sparkContext.setLogLevel("ERROR")
    lazy val movieDF: DataFrame =
      Seq((21, "Toy Story (1995)", "Animation|Children's|Comedy")
        , (22, "Jumanji (1995)", "Adventure|Children's|Fantasy")
        , (23, "Grumpier Old Men (1995)", "Comedy|Romance")
      ).toDF("MOVIEID", "TITLE", "GENRES")

    val expected = Seq(
      ("Romance", 1L), ("Adventure", 1L), ("Children's", 2L), ("Fantasy", 1L), ("Animation", 1L), ("Comedy", 2L)
    ).toDF("GENRES", "TOTAL_MOVIES")

    val actual = MovieLensService.movieCountByGenre(movieDF)

    actual.show()
    expected.show()

    assertEqualsNullable(actual, expected)
  }

  test("Top 100 movies") {
    spark.sparkContext.setLogLevel("ERROR")

    lazy val movieDF: DataFrame =
      Seq((21, "Toy Story (1995)", "Animation|Children's|Comedy")
        , (22, "Jumanji (1995)", "Adventure|Children's|Fantasy")
        , (23, "Grumpier Old Men (1995)", "Comedy|Romance")
      ).toDF("MOVIEID", "TITLE", "GENRES")

    lazy val ratingDF: DataFrame =
      Seq((1, 21, 5)
        , (1, 22, 4)
        , (1, 23, 5)
        , (2, 22, 4)
        , (2, 23, 3)
        , (3, 23, 2)
      ).toDF("USERID", "MOVIEID", "RATINGS")

    val expected = Seq(
      (1, 23, "Grumpier Old Men (1995)", 3),
      (2, 22, "Jumanji (1995)", 4),
      (3, 21, "Toy Story (1995)", 5)
    ).toDF("RANKING", "MOVIEID", "TITLE", "AVG_RATINGS")

    val actual = MovieLensService.top100Movie(movieDF, ratingDF)
    actual.show(false)
    expected.show()

    assertEqualsNullable(actual, expected)
  }

  // Sorts columns alphabetically and rows by every column so DataFrames can be
  // compared without depending on partitioning or row order.
  private[this] def sortAndOrderDataFrame(inputDataFrame: DataFrame): DataFrame = {
    val listColNames = inputDataFrame.schema.fieldNames
    scala.util.Sorting.quickSort(listColNames)
    val orderedDF = inputDataFrame.select(listColNames.map(name => col(name)): _*)
    val keys = orderedDF.schema.fieldNames.map(col(_))
    orderedDF.sort(keys: _*)
  }

  // Compares two DataFrames while ignoring nullability differences in their schemas.
  private[this] def assertEqualsNullable(expected: DataFrame, actual: DataFrame): Unit = {
    val left = setNullableTrueForAllColumns(expected, true)
    val right = setNullableTrueForAllColumns(actual, true)
    assertDataFrameEquals(sortAndOrderDataFrame(left), sortAndOrderDataFrame(right))
  }

  // Rebuilds the schema with the given nullability on every column.
  private[this] def setNullableTrueForAllColumns(df: DataFrame, nullable: Boolean)(implicit spark: SparkSession): DataFrame = {
    val schema = df.schema
    val newSchema = StructType(schema.map {
      case StructField(c, t, _, m) => StructField(c, t, nullable = nullable, m)
    })
    spark.createDataFrame(df.rdd, newSchema)
  }
}
