@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.streaming.state
19
19
20
20
import java .io .File
21
21
import java .util .Locale
22
+ import java .util .concurrent .TimeUnit
22
23
import javax .annotation .concurrent .GuardedBy
23
24
24
25
import scala .collection .{mutable , Map }
@@ -49,6 +50,7 @@ case object RollbackStore extends RocksDBOpType("rollback_store")
49
50
case object CloseStore extends RocksDBOpType (" close_store" )
50
51
case object ReportStoreMetrics extends RocksDBOpType (" report_store_metrics" )
51
52
case object StoreTaskCompletionListener extends RocksDBOpType (" store_task_completion_listener" )
53
+ case object StoreMaintenance extends RocksDBOpType (" store_maintenance" )
52
54
53
55
/**
54
56
* Class representing a RocksDB instance that checkpoints version of data to DFS.
@@ -184,19 +186,23 @@ class RocksDB(
184
186
loadedVersion = latestSnapshotVersion
185
187
186
188
// reset last snapshot version
187
- lastSnapshotVersion = 0L
189
+ if (lastSnapshotVersion > latestSnapshotVersion) {
190
+ // discard any newer snapshots
191
+ lastSnapshotVersion = 0L
192
+ latestSnapshot = None
193
+ }
188
194
openDB()
189
195
190
196
numKeysOnWritingVersion = if (! conf.trackTotalNumberOfRows) {
191
- // we don't track the total number of rows - discard the number being track
192
- - 1L
193
- } else if (metadata.numKeys < 0 ) {
194
- // we track the total number of rows, but the snapshot doesn't have tracking number
195
- // need to count keys now
196
- countKeys()
197
- } else {
198
- metadata.numKeys
199
- }
197
+ // we don't track the total number of rows - discard the number being track
198
+ - 1L
199
+ } else if (metadata.numKeys < 0 ) {
200
+ // we track the total number of rows, but the snapshot doesn't have tracking number
201
+ // need to count keys now
202
+ countKeys()
203
+ } else {
204
+ metadata.numKeys
205
+ }
200
206
if (loadedVersion != version) replayChangelog(version)
201
207
// After changelog replay the numKeysOnWritingVersion will be updated to
202
208
// the correct number of keys in the loaded version.
@@ -571,16 +577,14 @@ class RocksDB(
571
577
// background operations.
572
578
val cp = Checkpoint .create(db)
573
579
cp.createCheckpoint(checkpointDir.toString)
574
- synchronized {
575
- // if changelog checkpointing is disabled, the snapshot is uploaded synchronously
576
- // inside the uploadSnapshot() called below.
577
- // If changelog checkpointing is enabled, snapshot will be uploaded asynchronously
578
- // during state store maintenance.
579
- latestSnapshot.foreach(_.close())
580
- latestSnapshot = Some (
581
- RocksDBSnapshot (checkpointDir, newVersion, numKeysOnWritingVersion))
582
- lastSnapshotVersion = newVersion
583
- }
580
+ // if changelog checkpointing is disabled, the snapshot is uploaded synchronously
581
+ // inside the uploadSnapshot() called below.
582
+ // If changelog checkpointing is enabled, snapshot will be uploaded asynchronously
583
+ // during state store maintenance.
584
+ latestSnapshot.foreach(_.close())
585
+ latestSnapshot = Some (
586
+ RocksDBSnapshot (checkpointDir, newVersion, numKeysOnWritingVersion))
587
+ lastSnapshotVersion = newVersion
584
588
}
585
589
}
586
590
@@ -668,7 +672,20 @@ class RocksDB(
668
672
669
673
def doMaintenance (): Unit = {
670
674
if (enableChangelogCheckpointing) {
671
- uploadSnapshot()
675
+ // There is race to update latestSnapshot between load(), commit()
676
+ // and uploadSnapshot().
677
+ // The load method will reset latestSnapshot to discard any snapshots taken
678
+ // from newer versions (when a old version is reloaded).
679
+ // commit() method deletes the existing snapshot while creating a new snapshot.
680
+ // In order to ensure that the snapshot being uploaded would not be modified
681
+ // concurrently, we need to synchronize the snapshot access between task thread
682
+ // and maintenance thread.
683
+ acquire(StoreMaintenance )
684
+ try {
685
+ uploadSnapshot()
686
+ } finally {
687
+ release(StoreMaintenance )
688
+ }
672
689
}
673
690
val cleanupTime = timeTakenMs {
674
691
fileManager.deleteOldVersions(conf.minVersionsToRetain)
@@ -788,8 +805,11 @@ class RocksDB(
788
805
*/
789
806
private def acquire (opType : RocksDBOpType ): Unit = acquireLock.synchronized {
790
807
val newAcquiredThreadInfo = AcquiredThreadInfo ()
791
- val waitStartTime = System .currentTimeMillis
792
- def timeWaitedMs = System .currentTimeMillis - waitStartTime
808
+ val waitStartTime = System .nanoTime()
809
+ def timeWaitedMs = {
810
+ val elapsedNanos = System .nanoTime() - waitStartTime
811
+ TimeUnit .MILLISECONDS .convert(elapsedNanos, TimeUnit .NANOSECONDS )
812
+ }
793
813
def isAcquiredByDifferentThread = acquiredThreadInfo != null &&
794
814
acquiredThreadInfo.threadRef.get.isDefined &&
795
815
newAcquiredThreadInfo.threadRef.get.get.getId != acquiredThreadInfo.threadRef.get.get.getId
0 commit comments