
Commit 5c9f900

Add DataFrame filter/SQL examples. Rename reduceByKey.scala file to examples.scala
1 parent f0e9c3e commit 5c9f900

4 files changed: +35 -14 lines changed


README.md

Lines changed: 5 additions & 2 deletions
@@ -23,16 +23,19 @@ ${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:
 ```
 
 
-For example, please see [reduceByKey.scala](spark/files/reduceByKey.scala) to query from mongodb, run a simple aggregation, and finally write output back to mongodb. This file will also be available inside of the spark container in `/home/ubuntu/reduceByKey.scala`
+For example, please see [examples.scala](spark/files/examples.scala) to query from mongodb, run a simple aggregation, dataframe SQL and finally write output back to mongodb. This file will also be available inside of the spark container in `/home/ubuntu/examples.scala`
 
 Run the `spark shell` by executing:
 
 ```sh
 ${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:27017/spark.times" --conf "spark.mongodb.output.uri=mongodb://mongodb/spark.output" --packages org.mongodb.spark:mongo-spark-connector_${SCALA_VERSION}:${MONGO_SPARK_VERSION}
 ```
 
-You can also append `-i <file.scala>` to execute a scala file via the spark shell.
+You can also append `-i <file.scala>` to execute a scala file via the spark shell. For example:
 
+```sh
+${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:27017/spark.times" --conf "spark.mongodb.output.uri=mongodb://mongodb/spark.output" --packages org.mongodb.spark:mongo-spark-connector_${SCALA_VERSION}:${MONGO_SPARK_VERSION} -i ./examples.scala
+```
 
 ### More Information.

spark/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -17,11 +17,11 @@ ENV SCALA_VERSION 2.10
 
 WORKDIR ${HOME}
 
-ENV ${HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}
+ENV SPARK_HOME ${HOME}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}
 
 COPY files/times.json /home/ubuntu/times.json
 COPY files/readme.txt /home/ubuntu/readme.txt
-COPY files/reduceByKey.scala /home/ubuntu/reduceByKey.scala
+COPY files/examples.scala /home/ubuntu/examples.scala
 COPY files/initDocuments.scala /home/ubuntu/initDocuments.scala
 
 RUN chown -R ubuntu:ubuntu /home/ubuntu/*

spark/files/reduceByKey.scala renamed to spark/files/examples.scala

Lines changed: 21 additions & 2 deletions
@@ -1,5 +1,8 @@
 
 import com.mongodb.spark._
+import com.mongodb.spark.config._
+import org.apache.spark.sql.SQLContext
+
 import org.bson.Document
 
 /* Load collection as RDD */
@@ -44,15 +47,31 @@ val aggRdd = rdd.withPipeline(Seq(
 println("MongoDB aggregation pipeline reult: ")
 aggRdd.foreach(println)
 
-
 // Save result to MongoDB
 // 1) Default
-import com.mongodb.spark.config._
 aggRdd.saveToMongoDB()
 // 2) Using helper and WriteConfig to modify destination
 outputRDD.saveToMongoDB(WriteConfig(Map("uri"->"mongodb://mongodb:27017/spark.processing")))
 println("RDD is written to MongoDB")
 
+/* DataFrames examples */
+val sqlContext = SQLContext.getOrCreate(sc)
+val df = MongoSpark.load(sqlContext)
+// Print schema
+df.printSchema()
+// Filter by Integer and by String
+df.filter(df("myid") < 2).show()
+df.filter(df("doc") === "V ").show()
+
+// DataFrames SQL example
+df.registerTempTable("temporary")
+val sqlResult = sqlContext.sql("SELECT myid, doc, timestamp FROM temporary WHERE myid > 6 AND doc='V '")
+sqlResult.show()
+// Save out the filtered DataFrame result
+MongoSpark.save(sqlResult.write.option("collection", "DF_times").mode("overwrite"))
+// Read it back in
+MongoSpark.load(sqlContext, ReadConfig(Map("collection" -> "DF_times"), Some(ReadConfig(sqlContext)))).show()
+
 println("Done")
 System.exit(0);
 
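The new DataFrame code above is written to be pasted into the spark shell, which already provides `sc`. For reference, here is a minimal self-contained sketch of the same filter/SQL round trip as a standalone app; it assumes the mongo-spark-connector 1.x API and the Spark 1.6-era `SQLContext`, and the `DataFrameExample` object name is made up for illustration:

```scala
import com.mongodb.spark._
import com.mongodb.spark.config._
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Hypothetical standalone version of the spark-shell snippets above.
object DataFrameExample extends App {
  val conf = new SparkConf()
    .setAppName("DataFrameExample")
    .set("spark.mongodb.input.uri", "mongodb://mongodb:27017/spark.times")
    .set("spark.mongodb.output.uri", "mongodb://mongodb:27017/spark.output")
  val sc = new SparkContext(conf)
  val sqlContext = SQLContext.getOrCreate(sc)

  // Load spark.times as a DataFrame; the connector infers the schema by sampling.
  val df = MongoSpark.load(sqlContext)
  df.printSchema()

  // Same SQL filter as examples.scala (note the trailing space in 'V ').
  df.registerTempTable("temporary")
  val sqlResult = sqlContext.sql(
    "SELECT myid, doc, timestamp FROM temporary WHERE myid > 6 AND doc='V '")

  // Overwrite the DF_times collection with the filtered rows, then read it back.
  MongoSpark.save(sqlResult.write.option("collection", "DF_times").mode("overwrite"))
  MongoSpark.load(sqlContext,
    ReadConfig(Map("collection" -> "DF_times"), Some(ReadConfig(sqlContext)))).show()

  sc.stop()
}
```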

spark/files/readme.txt

Lines changed: 7 additions & 8 deletions
@@ -1,16 +1,15 @@
 
-# set data : You can run mongoimport on host to import into 'mongodb' docker instance
-# to find out the IP on OSX docker-machine, you can use `docker-machine ip default`
-mongoimport -h <mongodb ip> -d spark -c times ./times.json
-
-# Or you can just use initDocuments.scala to import using Spark itself
+# set data : You can run use initDocuments.scala to import using Spark itself.
 ${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:27017/spark.times" --conf "spark.mongodb.output.uri=mongodb://mongodb/spark.output" --packages org.mongodb.spark:mongo-spark-connector_2.10:1.0.0 -i ./initDocuments.scala
 
 # Run spark-shell
-${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:27017/spark.times" --conf "spark.mongodb.output.uri=mongodb://mongodb:27107/spark.output" --packages org.mongodb.spark:mongo-spark-connector_${SCALA_VERSION}:${MONGO_SPARK_VERSION}
+${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:27017/spark.times" --conf "spark.mongodb.output.uri=mongodb://mongodb/spark.output" --packages org.mongodb.spark:mongo-spark-connector_${SCALA_VERSION}:${MONGO_SPARK_VERSION}
+
+# Or you can run scala file through the shell by specifying `-i <file.scala>`. For example to run `examples.scala` example:
+${SPARK_HOME}/bin/spark-shell --conf "spark.mongodb.input.uri=mongodb://mongodb:27017/spark.times" --conf "spark.mongodb.output.uri=mongodb://mongodb/spark.output" --packages org.mongodb.spark:mongo-spark-connector_${SCALA_VERSION}:${MONGO_SPARK_VERSION} -i ./examples.scala
 
-# Or you can run scala file through the shell by specifying `-i <file.scala>`
 
 # start 1 master/worker
 ${SPARK_HOME}/sbin/start-master.sh
-${SPARK_HOME}/sbin/start-slave.sh spark://spark:7077
+${SPARK_HOME}/sbin/start-slave.sh spark://spark:7077
+
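After the import step in readme.txt, one quick sanity check from the spark shell (a hedged sketch using the same connector API as examples.scala; spark-shell provides `sc`, and the input URI comes from the `--conf` flags above) is to count the documents that landed in spark.times:

```scala
import com.mongodb.spark._

// Count the documents in the collection named by spark.mongodb.input.uri (spark.times).
val count = MongoSpark.load(sc).count()
println(s"spark.times contains $count documents")
```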
