Reflect review comments

HeartSaVioR · HeartSaVioR · commit afd6c1175bd6 · 2021-05-25T09:28:05.000+09:00
diff --git a/docs/structured-streaming-kafka-integration.md b/docs/structured-streaming-kafka-integration.md
@@ -368,7 +368,7 @@ The following configurations are optional:
   <td>none (next preference is <code>startingOffsetsByTimestamp</code>)</td>
   <td>streaming and batch</td>
   <td>The start point of timestamp when a query is started, a string specifying a starting timestamp for
-  all partitions in topics being subscribed. Please refer the details on timestamp offset options below. If the matched offset doesn't exist,
+  all partitions in topics being subscribed. Please refer to the details on timestamp offset options below. If Kafka doesn't return the matched offset,
   the query will fail immediately to prevent unintended read from such partition. (This is a kind of limitation as of now, and will be addressed in near future.)<p/>
   <p/>
   Note1: <code>startingTimestamp</code> takes precedence over <code>startingOffsetsByTimestamp</code> and <code>startingOffsets</code>.<p/>
@@ -381,10 +381,10 @@ The following configurations are optional:
   <td>json string
   """ {"topicA":{"0": 1000, "1": 1000}, "topicB": {"0": 2000, "1": 2000}} """
   </td>
-  <td>none (the value of <code>startingOffsets</code> will apply)</td>
+  <td>none (next preference is <code>startingOffsets</code>)</td>
   <td>streaming and batch</td>
   <td>The start point of timestamp when a query is started, a json string specifying a starting timestamp for
-  each TopicPartition. Please refer the details on timestamp offset options below. If the matched offset doesn't exist,
+  each TopicPartition. Please refer to the details on timestamp offset options below. If Kafka doesn't return the matched offset,
   the query will fail immediately to prevent unintended read from such partition. (This is a kind of limitation as of now, and will be addressed in near future.)<p/>
   <p/>
   Note1: <code>startingOffsetsByTimestamp</code> takes precedence over <code>startingOffsets</code>.<p/>
@@ -413,8 +413,8 @@ The following configurations are optional:
   <td>none (next preference is <code>endingOffsetsByTimestamp</code>)</td>
   <td>batch query</td>
   <td>The end point when a batch query is ended, a json string specifying an ending timestamp for
-  all partitions in topics being subscribed. Please refer the details on timestamp offset options below. If the matched offset doesn't exist, the offset will
-  be set to latest.<p/>
+  all partitions in topics being subscribed. Please refer to the details on timestamp offset options below.
+  If Kafka doesn't return the matched offset, the offset will be set to latest.<p/>
   Note: <code>endingTimestamp</code> takes precedence over <code>endingOffsetsByTimestamp</code> and <code>endingOffsets</code>.<p/>
   </td>
 </tr>
@@ -426,8 +426,8 @@ The following configurations are optional:
   <td>none (next preference is <code>endingOffsets</code>)</td>
   <td>batch query</td>
   <td>The end point when a batch query is ended, a json string specifying an ending timestamp for each TopicPartition.
-  Please refer the details on timestamp offset options below. If the matched offset doesn't exist, the offset will
-  be set to latest.<p/>
+  Please refer to the details on timestamp offset options below. If Kafka doesn't return the matched offset,
+  the offset will be set to latest.<p/>
   Note: <code>endingOffsetsByTimestamp</code> takes precedence over <code>endingOffsets</code>.
   </td>
 </tr>
@@ -529,7 +529,7 @@ The following configurations are optional:
 ### Details on timestamp offset options
 
 The returned offset for each partition is the earliest offset whose timestamp is greater than or equal to the given timestamp in the corresponding partition.
-The behavior varies across options if the matched offset doesn't exist - check the description of each option.
+The behavior varies across options if Kafka doesn't return the matched offset - check the description of each option.
 
 Spark simply passes the timestamp information to <code>KafkaConsumer.offsetsForTimes</code>, and doesn't interpret or reason about the value.
 For more details on <code>KafkaConsumer.offsetsForTimes</code>, please refer <a href="https://kafka.apache.org/21/javadoc/org/apache/kafka/clients/consumer/KafkaConsumer.html#offsetsForTimes-java.util.Map-">javadoc</a> for details.
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala
@@ -581,32 +581,30 @@ private[kafka010] object KafkaSourceProvider extends Logging {
       defaultOffsets: KafkaOffsetRangeLimit): KafkaOffsetRangeLimit = {
     // The order below represents "preferences"
 
-    // 1. global timestamp
     if (params.contains(globalOffsetTimestampOptionKey)) {
+      // 1. global timestamp
       val tsStr = params(globalOffsetTimestampOptionKey).trim
       try {
         val ts = tsStr.toLong
-        return GlobalTimestampRangeLimit(ts)
+        GlobalTimestampRangeLimit(ts)
       } catch {
         case _: NumberFormatException =>
           throw new IllegalArgumentException(s"Expected a single long value, got $tsStr")
       }
-    }
-
-    // 2. timestamp per topic partition
-    if (params.contains(offsetByTimestampOptionKey)) {
+    } else if (params.contains(offsetByTimestampOptionKey)) {
+      // 2. timestamp per topic partition
       val json = params(offsetByTimestampOptionKey).trim
-      return SpecificTimestampRangeLimit(JsonUtils.partitionTimestamps(json))
-    }
-
-    // 3. latest/earliest/offset
-    params.get(offsetOptionKey).map(_.trim) match {
-      case Some(offset) if offset.toLowerCase(Locale.ROOT) == "latest" =>
-        LatestOffsetRangeLimit
-      case Some(offset) if offset.toLowerCase(Locale.ROOT) == "earliest" =>
-        EarliestOffsetRangeLimit
-      case Some(json) => SpecificOffsetRangeLimit(JsonUtils.partitionOffsets(json))
-      case None => defaultOffsets
+      SpecificTimestampRangeLimit(JsonUtils.partitionTimestamps(json))
+    } else {
+      // 3. latest/earliest/offset
+      params.get(offsetOptionKey).map(_.trim) match {
+        case Some(offset) if offset.toLowerCase(Locale.ROOT) == "latest" =>
+          LatestOffsetRangeLimit
+        case Some(offset) if offset.toLowerCase(Locale.ROOT) == "earliest" =>
+          EarliestOffsetRangeLimit
+        case Some(json) => SpecificOffsetRangeLimit(JsonUtils.partitionOffsets(json))
+        case None => defaultOffsets
+      }
     }
   }
 
diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala
@@ -293,7 +293,7 @@ abstract class KafkaRelationSuiteBase extends QueryTest with SharedSparkSession
     TestUtils.assertExceptionMsg(e, "No offset matched from request")
   }
 
-  test("specifying both global timestamp and specific timestamp for partition") {
+  test("preferences on offset related options") {
     val (topic, timestamps) = prepareTimestampRelatedUnitTest
 
     /*
@@ -305,17 +305,30 @@ abstract class KafkaRelationSuiteBase extends QueryTest with SharedSparkSession
     * specific timestamp for partition
     starting only presented as "second", and ending not presented
 
-    Here we expect global timestamp will take effect.
+    * offsets
+    starting only presented as "earliest", and ending not presented
+
+    The preference goes to global timestamp -> timestamp for partition -> offsets
      */
-    verifyTimestampRelatedQueryResult({ df =>
-      val startTopicTimestamps = Map(
-        (0 to 2).map(new TopicPartition(topic, _) -> timestamps(1)): _*)
-      val startingTimestamps = JsonUtils.partitionTimestamps(startTopicTimestamps)
 
+    val startTopicTimestamps = Map(
+      (0 to 2).map(new TopicPartition(topic, _) -> timestamps(1)): _*)
+    val startingTimestamps = JsonUtils.partitionTimestamps(startTopicTimestamps)
+
+    // all options are specified: global timestamp
+    verifyTimestampRelatedQueryResult({ df =>
       df
         .option("startingTimestamp", timestamps(2))
         .option("startingOffsetsByTimestamp", startingTimestamps)
+        .option("startingOffsets", "earliest")
     }, topic, 20 to 29)
+
+    // timestamp for partition and offsets are specified: timestamp for partition
+    verifyTimestampRelatedQueryResult({ df =>
+      df
+        .option("startingOffsetsByTimestamp", startingTimestamps)
+        .option("startingOffsets", "earliest")
+    }, topic, 10 to 29)
   }
 
   test("no matched offset for timestamp - endingOffsets") {