MapR [SPARK-325] Add examples for work with the MapRDB JSON connector into the Spark project (apache#361)

ekrivokonmapr · ekrivokonmapr · commit fddc84f63e86 · 2018-11-07T10:39:09.000+02:00
diff --git a/examples/pom.xml b/examples/pom.xml
@@ -126,6 +126,12 @@
       <scope>provided</scope>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>com.mapr.db</groupId>
+      <artifactId>maprdb-spark</artifactId>
+      <scope>provided</scope>
+      <version>${project.version}</version>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/examples/src/main/resources/words.txt b/examples/src/main/resources/words.txt
@@ -0,0 +1,5 @@
+1 hello world
+2 this is the text for the wordcount example of the maprdb connector
+3 first word of each line will be saved as id column by the maprdb ojai connector
+4 word1 word2 word1 word2
+5 word1 word3 word3 word3
diff --git a/examples/src/main/scala/org/apache/spark/examples/maprdbconnector/MaprDBJsonConnectorWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/maprdbconnector/MaprDBJsonConnectorWordCount.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.maprdbconnector
+
+import org.apache.spark.sql.SparkSession
+
+import com.mapr.db.spark.sql._
+
+/**
+ * Word count example for the MapR-DB OJAI (JSON) connector: saves "id text" lines into a MapR-DB
+ * table, reads them back, counts the words and saves the counts into a second MapR-DB table.
+ */
+object MaprDBJsonConnectorWordCount {
+  /** Runs the example; `args` are the data file URL, the data table and the result table. */
+  def main(args: Array[String]): Unit = {
+    parseArgs(args)
+    val Array(pathToFileWithData, tableName, tableNameWithResult) = args
+
+    val spark = SparkSession
+      .builder()
+      .appName("OJAI MaprDB connector wordcount example")
+      .getOrCreate()
+    import spark.implicits._
+
+    importDataIntoSeq(pathToFileWithData).toDS().saveToMapRDB(tableName, createTable = true)
+
+    // Read the text column by name; its position depends on the schema inferred on load.
+    val wordCounts = spark.loadFromMapRDB(tableName)
+      .flatMap(line => line.getAs[String]("words").split(" "))
+      .groupBy("value")
+      .count()
+    println("Dataset with counted words:")
+    wordCounts.show()
+
+    // Every distinct word becomes the document id of its count in the result table.
+    wordCounts.withColumn("_id", $"value")
+      .saveToMapRDB(tableNameWithResult, createTable = true)
+    println("Dataset with counted words was saved into the MaprDB table.")
+    spark.stop()
+  }
+
+  /** Prints the usage and exits with code 1 unless exactly three arguments were passed. */
+  private def parseArgs(args: Array[String]): Unit = {
+    if (args.length != 3) {
+      printUsage()
+      System.exit(1)
+    }
+  }
+
+  /** Prints the description of the expected command line arguments. */
+  private def printUsage(): Unit = {
+    println(
+      """OJAI MaprDB connector wordcount example
+        |Usage:
+        |1) path to the file with data (words.txt can be used for the test);
+        |2) name of the MaprDB table where data from file will be saved;
+        |3) name of the MaprDB table where result will be saved;
+        |""".stripMargin)
+  }
+
+  /** Reads `filePath` (a URL); the first token of each line is the id, the rest is the text. */
+  private def importDataIntoSeq(filePath: String): Seq[Word] = {
+    val source = scala.io.Source.fromURL(filePath)
+    try {
+      // toList reads everything before close(); Iterator.toSeq may be lazy. Skip blank lines.
+      source.getLines().filter(_.trim.nonEmpty).map { line =>
+        val wordWithId = line.split(" ")
+        Word(wordWithId(0), wordWithId.drop(1).mkString(" "))
+      }.toList
+    } finally source.close()
+  }
+
+  private case class Word(_id: String, words: String)
+}