Commit f732b5a

fixing latency in kinesis sink (#81)
1 parent f4713a4 commit f732b5a

File tree: 3 files changed, +46 -6 lines


README.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -148,7 +148,8 @@ Refering $SPARK_HOME to the Spark installation directory.
 | awsUseInstanceProfile | true | Use Instance Profile Credentials if none of credentials provided |
 | kinesis.executor.recordMaxBufferedTime | 1000 (millis) | Specify the maximum buffered time of a record |
 | kinesis.executor.maxConnections | 1 | Specify the maximum connections to Kinesis |
-| kinesis.executor.aggregationEnabled | true | Specify if records should be aggregated before sending them to Kinesis |
+| kinesis.executor.aggregationEnabled | true | Specify if records should be aggregated before sending them to Kinesis |
+| kniesis.executor.flushwaittimemillis | 100 | Wait time while flushing records to Kinesis on Task End |

 ## Roadmap
 * We need to migrate to DataSource V2 APIs for MicroBatchExecution.
```
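
For reference, the new option is passed like any other sink option on a streaming write. Below is a minimal usage sketch, assuming the sink options documented elsewhere in this README (`streamName`, `endpointUrl`, `checkpointLocation`); the DataFrame, stream name, and paths are placeholders. Note that the option key is spelled `kniesis...`, matching the constant introduced in the code below.

```scala
// Hypothetical usage sketch, not from this commit: set a custom flush wait
// on the Kinesis sink. Stream name, endpoint, and paths are illustrative.
inputDF
  .writeStream
  .format("kinesis")
  .outputMode("append")
  .option("streamName", "example-sink-stream")
  .option("endpointUrl", "https://kinesis.us-east-1.amazonaws.com")
  .option("checkpointLocation", "/tmp/kinesis-sink-checkpoint")
  .option("kniesis.executor.flushwaittimemillis", "50") // default is 100 ms
  .start()
```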

src/main/scala/org/apache/spark/sql/kinesis/KinesisSourceProvider.scala

Lines changed: 3 additions & 0 deletions
```diff
@@ -238,6 +238,7 @@ private[kinesis] object KinesisSourceProvider extends Logging {
   private[kinesis] val SINK_RECORD_MAX_BUFFERED_TIME = "kinesis.executor.recordmaxbufferedtime"
   private[kinesis] val SINK_MAX_CONNECTIONS = "kinesis.executor.maxconnections"
   private[kinesis] val SINK_AGGREGATION_ENABLED = "kinesis.executor.aggregationenabled"
+  private[kinesis] val SINK_FLUSH_WAIT_TIME_MILLIS = "kniesis.executor.flushwaittimemillis"


   private[kinesis] def getKinesisPosition(
@@ -266,6 +267,8 @@ private[kinesis] object KinesisSourceProvider extends Logging {
   private[kinesis] val DEFAULT_SINK_MAX_CONNECTIONS: String = "1"

   private[kinesis] val DEFAULT_SINK_AGGREGATION: String = "true"
+
+  private[kinesis] val DEFAULT_FLUSH_WAIT_TIME_MILLIS: String = "100"
 }

```
src/main/scala/org/apache/spark/sql/kinesis/KinesisWriteTask.scala

Lines changed: 41 additions & 5 deletions
```diff
@@ -18,6 +18,8 @@ package org.apache.spark.sql.kinesis

 import java.nio.ByteBuffer

+import scala.util.Try
+
 import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult}
 import com.google.common.util.concurrent.{FutureCallback, Futures}

@@ -34,9 +36,19 @@ private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, Strin
   private val streamName = producerConfiguration.getOrElse(
     KinesisSourceProvider.SINK_STREAM_NAME_KEY, "")

+  private val flushWaitTimeMills = Try(producerConfiguration.getOrElse(
+    KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS,
+    KinesisSourceProvider.DEFAULT_FLUSH_WAIT_TIME_MILLIS).toLong).getOrElse {
+    throw new IllegalArgumentException(
+      s"${KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS} has to be a positive integer")
+  }
+
+  private var failedWrite: Throwable = _
+
+
   def execute(iterator: Iterator[InternalRow]): Unit = {
     producer = CachedKinesisProducer.getOrCreate(producerConfiguration)
-    while (iterator.hasNext) {
+    while (iterator.hasNext && failedWrite == null) {
       val currentRow = iterator.next()
       val projectedRow = projection(currentRow)
       val partitionKey = projectedRow.getString(0)
@@ -54,7 +66,10 @@ private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, Strin
       val kinesisCallBack = new FutureCallback[UserRecordResult]() {

         override def onFailure(t: Throwable): Unit = {
-          logError(s"Writing to $streamName failed due to ${t.getCause}")
+          if (failedWrite == null && t != null) {
+            failedWrite = t
+            logError(s"Writing to $streamName failed due to ${t.getCause}")
+          }
         }

         override def onSuccess(result: UserRecordResult): Unit = {
@@ -68,13 +83,34 @@ private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, Strin
     sentSeqNumbers
   }

-  def close(): Unit = {
+  private def flushRecordsIfNecessary(): Unit = {
     if (producer != null) {
-      producer.flush()
-      producer = null
+      while (producer.getOutstandingRecordsCount > 0) {
+        try {
+          producer.flush()
+          Thread.sleep(flushWaitTimeMills)
+          checkForErrors()
+        } catch {
+          case e: InterruptedException =>
+
+        }
+      }
     }
   }

+  def checkForErrors(): Unit = {
+    if (failedWrite != null) {
+      throw failedWrite
+    }
+  }
+
+  def close(): Unit = {
+    checkForErrors()
+    flushRecordsIfNecessary()
+    checkForErrors()
+    producer = null
+  }
+
   private def createProjection: UnsafeProjection = {

     val partitionKeyExpression = inputSchema
```
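
Taken together, `close()` no longer issues a single fire-and-forget `flush()`: it now blocks until the Kinesis Producer Library reports no outstanding records, re-checking between polls for failures recorded by the asynchronous send callbacks. The sketch below restates that drain loop in isolation purely to make the control flow easy to follow; the names are illustrative, not part of the connector's API, and `checkForErrors` stands in for the task's error holder above.

```scala
import com.amazonaws.services.kinesis.producer.KinesisProducer

// Illustrative sketch of the commit's drain pattern: flush, wait, re-check,
// until the KPL's internal buffer is empty, surfacing any failure that the
// async send callbacks recorded in the meantime.
def drainOrThrow(producer: KinesisProducer,
                 waitMillis: Long,
                 checkForErrors: () => Unit): Unit = {
  while (producer.getOutstandingRecordsCount > 0) {
    producer.flush()             // ask the KPL to push buffered records now
    try {
      Thread.sleep(waitMillis)   // give the producer's daemon threads time to send
      checkForErrors()           // rethrow an async onFailure as a task failure
    } catch {
      case _: InterruptedException => // swallowed, as in the commit: keep draining
    }
  }
}
```

The sleep interval is the trade-off the new option exposes: a smaller `kniesis.executor.flushwaittimemillis` lets a task finish sooner after its last record drains, at the cost of more frequent flush calls.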
