@@ -82,6 +82,7 @@ private[kafka010] case class KafkaSource(
     executorKafkaParams: ju.Map[String, Object],
     sourceOptions: Map[String, String],
     metadataPath: String,
+    startFromEarliestOffset: Boolean,
     failOnDataLoss: Boolean)
   extends Source with Logging {

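The new `startFromEarliestOffset` constructor flag is presumably derived from a user-facing source option before `KafkaSource` is instantiated. A minimal sketch of that wiring, assuming a hypothetical `startingoffset` option parsed out of the `sourceOptions` map the source already receives (the option name and the latest-by-default behavior are assumptions, not shown in this diff):

```scala
// Hypothetical wiring, not part of this diff: derive the flag from the
// sourceOptions map that KafkaSource already receives.
def startFromEarliest(sourceOptions: Map[String, String]): Boolean =
  sourceOptions.map { case (k, v) => (k.toLowerCase, v.toLowerCase) }
    .get("startingoffset") match {             // assumed option name
      case Some("earliest")      => true
      case Some("latest") | None => false      // assumed default: latest
      case Some(other) =>
        throw new IllegalArgumentException(
          s"Invalid value '$other': expected 'earliest' or 'latest'")
    }
```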
@@ -109,7 +110,11 @@ private[kafka010] case class KafkaSource(
   private lazy val initialPartitionOffsets = {
     val metadataLog = new HDFSMetadataLog[KafkaSourceOffset](sqlContext.sparkSession, metadataPath)
     metadataLog.get(0).getOrElse {
-      val offsets = KafkaSourceOffset(fetchPartitionOffsets(seekToEnd = false))
+      val offsets = if (startFromEarliestOffset) {
+        KafkaSourceOffset(fetchEarliestOffsets())
+      } else {
+        KafkaSourceOffset(fetchLatestOffsets())
+      }
       metadataLog.add(0, offsets)
       logInfo(s"Initial offsets: $offsets")
       offsets
@@ -123,7 +128,7 @@ private[kafka010] case class KafkaSource(
     // Make sure initialPartitionOffsets is initialized
     initialPartitionOffsets

-    val offset = KafkaSourceOffset(fetchPartitionOffsets(seekToEnd = true))
+    val offset = KafkaSourceOffset(fetchLatestOffsets())
     logDebug(s"GetOffset: ${offset.partitionToOffsets.toSeq.map(_.toString).sorted}")
     Some(offset)
   }
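Both fetch helpers introduced below share one `KafkaConsumer` idiom: `poll(0)` to force a partition assignment, `pause` so no records are actually fetched, a seek, then reading back each partition's `position`. A self-contained sketch of that idiom against a local broker (the broker address, group id, and topic name are placeholders):

```scala
import java.util.{Collections, Properties}
import scala.collection.JavaConverters._
import org.apache.kafka.clients.consumer.KafkaConsumer

object OffsetProbe {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")  // placeholder broker
    props.put("group.id", "offset-probe")             // placeholder group id
    props.put("key.deserializer",
      "org.apache.kafka.common.serialization.ByteArrayDeserializer")
    props.put("value.deserializer",
      "org.apache.kafka.common.serialization.ByteArrayDeserializer")

    val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](props)
    consumer.subscribe(Collections.singletonList("test-topic"))  // placeholder topic

    consumer.poll(0)                // force partition assignment
    val partitions = consumer.assignment()
    consumer.pause(partitions)      // ensure no records are fetched

    consumer.seekToEnd(partitions)  // or consumer.seekToBeginning(partitions)
    val latest = partitions.asScala.map(p => p -> consumer.position(p)).toMap
    println(s"Latest offsets: $latest")

    consumer.close()
  }
}
```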
@@ -227,26 +232,34 @@ private[kafka010] case class KafkaSource(
   override def toString(): String = s"KafkaSource[$consumerStrategy]"

   /**
-   * Fetch the offset of a partition, either seek to the latest offsets or use the current offsets
-   * in the consumer.
+   * Fetch the earliest offsets of partitions.
    */
-  private def fetchPartitionOffsets(
-      seekToEnd: Boolean): Map[TopicPartition, Long] = withRetriesWithoutInterrupt {
-    // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894)
-    assert(Thread.currentThread().isInstanceOf[StreamExecutionThread])
+  private def fetchEarliestOffsets(): Map[TopicPartition, Long] = withRetriesWithoutInterrupt {
     // Poll to get the latest assigned partitions
     consumer.poll(0)
     val partitions = consumer.assignment()
     consumer.pause(partitions)
-    logDebug(s"Partitioned assigned to consumer: $partitions")
+    logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the beginning")

-    // Get the current or latest offset of each partition
-    if (seekToEnd) {
-      consumer.seekToEnd(partitions)
-      logDebug("Seeked to the end")
-    }
+    consumer.seekToBeginning(partitions)
+    val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap
+    logDebug(s"Got earliest offsets for partition: $partitionOffsets")
+    partitionOffsets
+  }
+
+  /**
+   * Fetch the latest offsets of partitions.
+   */
+  private def fetchLatestOffsets(): Map[TopicPartition, Long] = withRetriesWithoutInterrupt {
+    // Poll to get the latest assigned partitions
+    consumer.poll(0)
+    val partitions = consumer.assignment()
+    consumer.pause(partitions)
+    logDebug(s"Partitions assigned to consumer: $partitions. Seeking to the end.")
+
+    consumer.seekToEnd(partitions)
     val partitionOffsets = partitions.asScala.map(p => p -> consumer.position(p)).toMap
-    logDebug(s"Got offsets for partition: $partitionOffsets")
+    logDebug(s"Got latest offsets for partition: $partitionOffsets")
     partitionOffsets
   }

@@ -256,22 +269,21 @@ private[kafka010] case class KafkaSource(
    */
   private def fetchNewPartitionEarliestOffsets(
       newPartitions: Seq[TopicPartition]): Map[TopicPartition, Long] = withRetriesWithoutInterrupt {
-    // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894)
-    assert(Thread.currentThread().isInstanceOf[StreamExecutionThread])
     // Poll to get the latest assigned partitions
     consumer.poll(0)
     val partitions = consumer.assignment()
+    consumer.pause(partitions)
     logDebug(s"\tPartitioned assigned to consumer: $partitions")

     // Get the earliest offset of each partition
     consumer.seekToBeginning(partitions)
-    val partitionToOffsets = newPartitions.filter { p =>
+    val partitionOffsets = newPartitions.filter { p =>
       // When deleting topics happen at the same time, some partitions may not be in `partitions`.
       // So we need to ignore them
       partitions.contains(p)
     }.map(p => p -> consumer.position(p)).toMap
-    logDebug(s"Got offsets for new partitions: $partitionToOffsets")
-    partitionToOffsets
+    logDebug(s"Got earliest offsets for new partitions: $partitionOffsets")
+    partitionOffsets
   }

   /**
@@ -284,6 +296,9 @@ private[kafka010] case class KafkaSource(
    */
   private def withRetriesWithoutInterrupt(
       body: => Map[TopicPartition, Long]): Map[TopicPartition, Long] = {
+    // Make sure `KafkaConsumer.poll` won't be interrupted (KAFKA-1894)
+    assert(Thread.currentThread().isInstanceOf[StreamExecutionThread])
+
     synchronized {
       var result: Option[Map[TopicPartition, Long]] = None
       var attempt = 1
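The body of `withRetriesWithoutInterrupt` is only hinted at by the trailing context lines (a `synchronized` block, a `result` option, an `attempt` counter). A simplified sketch of that retry shape, with assumed `maxAttempts` and `retryIntervalMs` values and without the uninterruptible-thread machinery the real method relies on:

```scala
import org.apache.kafka.common.TopicPartition

object RetrySketch {
  // Simplified retry shape; maxAttempts and retryIntervalMs are assumed
  // values, and the real Spark method also shields `body` from interrupts.
  def withRetries(body: => Map[TopicPartition, Long]): Map[TopicPartition, Long] =
    synchronized {
      val maxAttempts = 3          // assumption: retry limit
      val retryIntervalMs = 1000L  // assumption: pause between attempts
      var result: Option[Map[TopicPartition, Long]] = None
      var attempt = 1
      var lastException: Throwable = null
      while (result.isEmpty && attempt <= maxAttempts) {
        try {
          result = Some(body)      // run the offset fetch once
        } catch {
          case e: Exception =>
            lastException = e
            attempt += 1
            Thread.sleep(retryIntervalMs)
        }
      }
      // Give up after the last attempt and surface the most recent failure
      result.getOrElse(throw lastException)
    }
}
```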