Commit 37f5f9d

Documentation and test were adjusted to describe/cover Spark Datasets API usage.
1 parent 41a2748 commit 37f5f9d

3 files changed: +84 −2 lines

connector/src/test/scala/com/basho/riak/spark/rdd/timeseries/AbstractTimeSeriesTest.scala

Lines changed: 3 additions & 2 deletions

@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2015 Basho Technologies, Inc.
+ * Copyright (c) 2015-2017 Basho Technologies, Inc.
  *
  * This file is provided to you under the Apache License,
  * Version 2.0 (the "License"); you may not use this file
@@ -101,7 +101,8 @@ abstract class AbstractTimeSeriesTest(val createTestData: Boolean = true) extend
       new Cell(f.temperature_k))
   )
 
-  final val sqlWhereClause = s"WHERE time >= $queryFromMillis AND time <= $queryToMillis AND surrogate_key = 1 AND family = 'f'"
+  final val filterExpression = s"time >= $queryFromMillis AND time <= $queryToMillis AND surrogate_key = 1 AND family = 'f'"
+  final val sqlWhereClause = s"WHERE $filterExpression"
 
   final val sqlQuery = s"SELECT surrogate_key, family, time, user_id, temperature_k FROM $bucketName $sqlWhereClause"
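Splitting `filterExpression` out of `sqlWhereClause` lets the same predicate string drive both the SQL query above and the `filter()` call in the new Dataset test below, since `Dataset.filter` also accepts a SQL-style condition expression. A minimal standalone sketch of that reuse (the session setup and sample rows here are illustrative, not part of the test):

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

// Hypothetical local session, for illustration only
val spark = SparkSession.builder()
  .appName("filter-expression-demo")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Same shape as the predicate built in AbstractTimeSeriesTest
val filterExpression = "time >= 111111 AND time <= 111555 AND user_id = 'bryce'"

// Illustrative in-memory data standing in for a Riak TS table
val df: DataFrame = Seq(
  (111111L, "bryce", 305.37),
  (999999L, "ratman", 362.121)
).toDF("time", "user_id", "temperature_k")

// filter(conditionExpr: String) parses the predicate like a SQL WHERE clause
df.filter(filterExpression).show()
```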
connector/src/test/scala/com/basho/riak/spark/rdd/timeseries/SparkDataSetTest.scala

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+/**
+ * Copyright (c) 2015-2017 Basho Technologies, Inc.
+ *
+ * This file is provided to you under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file
+ * except in compliance with the License. You may obtain
+ * a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package com.basho.riak.spark.rdd.timeseries
+
+import com.basho.riak.spark.rdd.RiakTSTests
+import org.junit.Test
+import org.junit.experimental.categories.Category
+
+/**
+ * @author Sergey Galkin <srggal at gmail dot com>
+ */
+@Category(Array(classOf[RiakTSTests]))
+class SparkDataSetTest extends AbstractTimeSeriesTest {
+
+  @Test
+  def genericLoadAsDataSet(): Unit = {
+    import sparkSession.implicits._
+
+    val ds = sparkSession.read
+      .format("org.apache.spark.sql.riak")
+      .option("spark.riakts.bindings.timestamp", "useLong")
+      .load(bucketName)
+      .filter(filterExpression)
+      .as[TimeSeriesData]
+
+    val data: Array[TimeSeriesData] = ds.collect()
+
+    // -- verification
+    assertEqualsUsingJSONIgnoreOrder(
+      """
+        |[
+        |  {time:111111, user_id:'bryce', temperature_k:305.37},
+        |  {time:111222, user_id:'bryce', temperature_k:300.12},
+        |  {time:111333, user_id:'bryce', temperature_k:295.95},
+        |  {time:111444, user_id:'ratman', temperature_k:362.121},
+        |  {time:111555, user_id:'ratman', temperature_k:3502.212}
+        |]
+      """.stripMargin, data)
+  }
+}
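For reference, the `as[TimeSeriesData]` conversion above depends on a case class that is not part of this file's diff; it matches the definition shown in the documentation change below, and `import sparkSession.implicits._` derives the product encoder for it:

```scala
// Field names and types must line up with the columns returned by the query;
// this mirrors the definition in docs/using-connector.md below.
case class TimeSeriesData(time: Long, user_id: String, temperature_k: Double)
```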

docs/using-connector.md

Lines changed: 26 additions & 0 deletions

@@ -12,6 +12,7 @@ Scroll down or click below for the desired information:
 - [Writing Data To TS Table](./using-connector.md#writing-data-to-ts-table)
 - [Spark Dataframes With KV Bucket](./using-connector.md#spark-dataframes-with-kv-bucket)
 - [Spark Dataframes With TS Table](./using-connector.md#spark-dataframes-with-ts-table)
+- [Spark Datasets With TS Table](./using-connector.md#spark-datasets-with-ts-table)
 - [Partitioning for KV Buckets](./using-connector.md#partitioning-for-kv-buckets)
 - [Working With TS Dates](./using-connector.md#working-with-ts-dates)
 - [Partitioning for Riak TS Table Queries](./using-connector.md#partitioning-for-riak-ts-table-queries)
@@ -419,6 +420,31 @@ inputDF.write \
 So far SaveMode.Append is the only mode available.
 Any of the Spark Connector options can be provided in `.option()` or `.options()`.
 
+## Spark Datasets With TS Table
+Spark Datasets (strongly typed DataFrames) are created in much the same way as DataFrames; there are only two differences:
+
+* Datasets require an Encoder. Built-in encoders for common Scala types and their product types are available in the `implicits` object; you only need to import them:
+```scala
+import spark.implicits._
+```
+
+* The target data type should be provided by calling the `as()` method.
+
+Here is an example of creating a Dataset:
+```scala
+import spark.implicits._
+
+case class TimeSeriesData(time: Long, user_id: String, temperature_k: Double)
+
+val ds = sparkSession.read
+  .format("org.apache.spark.sql.riak")
+  .option("spark.riakts.bindings.timestamp", "useLong")
+  .load(bucketName)
+  .filter(filterExpression)
+  .as[TimeSeriesData]
+```
+
+NOTE: There is no Dataset support for Python, since Spark does not support Datasets in Python.