Skip to content

Commit c3db5fe

Browse files
committed
storage: improve safety of merge lock
Fix a known correctness bug in the merge locking scheme. See the comment updates within for details. As an added benefit, this fix frees store.RemoveReplica of conditionals to deal with uninitialized replicas. Acquiring the merge lock now always results in an initialized right-hand replica, so store.MergeRange is always removing an initialized right-hand replica. There are no tests for this because testing this particular edge case is quite difficult and I'm primarily interested in avoiding the obviously wrong call to batch.ClearRange(nil, nil) that occurs when calling store.RemoveReplica on an unintialized replica. This nonsensical ClearRange is, somehow, not currently an error, but it is preventing landing. Release note: None
1 parent b5590dc commit c3db5fe

File tree

2 files changed

+39
-43
lines changed

2 files changed

+39
-43
lines changed

pkg/storage/replica.go

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5134,33 +5134,38 @@ func (r *Replica) acquireMergeLock(
51345134
ctx context.Context, merge *roachpb.MergeTrigger,
51355135
) (func(storagebase.ReplicatedEvalResult), error) {
51365136
// The merge lock is the right-hand replica's raftMu. If the right-hand
5137-
// replica does not exist, we create an uninitialized replica. This is
5138-
// required for correctness: without the uninitialized replica, an incoming
5139-
// snapshot could create the right-hand replica before the merge trigger has a
5140-
// chance to widen the left-hand replica's end key. The merge trigger would
5141-
// then fatal the node upon realizing the right-hand replica already exists.
5142-
// With an uninitialized and locked right-hand replica in place, any snapshots
5143-
// for the right-hand range will block on raftMu, waiting for the merge to
5144-
// complete, after which the replica will realize it has been destroyed and
5145-
// reject the snapshot.
5146-
//
5147-
// TODO(benesch): if the right-hand replica splits, we could probably see a
5148-
// snapshot for the right-hand side of the right-hand side. The uninitialized
5149-
// replica this method installs will not block this snapshot from applying as
5150-
// the snapshot's range ID will be different. Add a placeholder to the
5151-
// replicasByKey map as well to be safe.
5152-
//
5153-
// TODO(benesch): make terminology for uninitialized replicas, which protect a
5154-
// range ID, and placeholder replicas, which protect a key range, consistent.
5155-
// Several code comments refer to "uninitialized placeholders," which is
5156-
// downright confusing.
5157-
rightRng, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
5137+
// replica does not exist, we create one. This is required for correctness.
5138+
// Without a right-hand replica on this store, an incoming snapshot could
5139+
// create the right-hand replica before the merge trigger has a chance to
5140+
// widen the left-hand replica's end key. The merge trigger would then fatal
5141+
// the node upon realizing the right-hand replica already exists. With a
5142+
// right-hand replica in place, any snapshots for the right-hand range will
5143+
// block on raftMu, waiting for the merge to complete, after which the replica
5144+
// will realize it has been destroyed and reject the snapshot.
5145+
rightRepl, created, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
51585146
if err != nil {
51595147
return nil, err
51605148
}
5149+
if created {
5150+
// rightRepl does not know its start and end keys. It can only block
5151+
// incoming snapshots with the same range ID. This is an insufficient lock:
5152+
// it's possible for this store to receive a snapshot with a different range
5153+
// ID that overlaps the right-hand keyspace while the merge is in progress.
5154+
// Consider the case where this replica is behind and the merged range
5155+
// resplits before this replica processes the merge. An up-to-date member of
5156+
// the new right-hand range could send this store a snapshot that would
5157+
// incorrectly sneak past rightRepl.
5158+
//
5159+
// So, install the right-hand descriptor from the merge trigger in
5160+
// rightRepl. This adds rightRepl to the replicasByKey map so that snapshots
5161+
// for overlapping keyspaces will block on its raftMu.
5162+
if err := rightRepl.setDesc(&merge.RightDesc); err != nil {
5163+
return nil, err
5164+
}
5165+
}
51615166

51625167
return func(storagebase.ReplicatedEvalResult) {
5163-
rightRng.raftMu.Unlock()
5168+
rightRepl.raftMu.Unlock()
51645169
}, nil
51655170
}
51665171

pkg/storage/store.go

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2317,8 +2317,7 @@ func (s *Store) MergeRange(
23172317
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
23182318
// replica's raftMu.
23192319
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
2320-
DestroyData: false,
2321-
AllowPlaceholders: true,
2320+
DestroyData: false,
23222321
}); err != nil {
23232322
return errors.Errorf("cannot remove range: %s", err)
23242323
}
@@ -2426,8 +2425,7 @@ func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
24262425

24272426
// RemoveOptions bundles boolean parameters for Store.RemoveReplica.
24282427
type RemoveOptions struct {
2429-
DestroyData bool
2430-
AllowPlaceholders bool
2428+
DestroyData bool
24312429
}
24322430

24332431
// RemoveReplica removes the replica from the store's replica map and
@@ -2439,9 +2437,6 @@ type RemoveOptions struct {
24392437
// If opts.DestroyData is true, data in all of the range's keyspaces
24402438
// is deleted. Otherwise, only data in the range-ID local keyspace is
24412439
// deleted. In either case a tombstone record is written.
2442-
//
2443-
// If opts.AllowPlaceholders is true, it is not an error if the replica
2444-
// targeted for removal is an uninitialized placeholder.
24452440
func (s *Store) RemoveReplica(
24462441
ctx context.Context, rep *Replica, consistentDesc roachpb.RangeDescriptor, opts RemoveOptions,
24472442
) error {
@@ -2498,14 +2493,12 @@ func (s *Store) removeReplicaImpl(
24982493
s.mu.Unlock()
24992494
return err
25002495
}
2501-
if !opts.AllowPlaceholders {
2502-
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
2503-
// This is a fatal error because uninitialized replicas shouldn't make it
2504-
// this far. This method will need some changes when we introduce GC of
2505-
// uninitialized replicas.
2506-
s.mu.Unlock()
2507-
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
2508-
}
2496+
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
2497+
// This is a fatal error because uninitialized replicas shouldn't make it
2498+
// this far. This method will need some changes when we introduce GC of
2499+
// uninitialized replicas.
2500+
s.mu.Unlock()
2501+
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
25092502
}
25102503
// Adjust stats before calling Destroy. This can be called before or after
25112504
// Destroy, but this configuration helps avoid races in stat verification
@@ -2539,12 +2532,10 @@ func (s *Store) removeReplicaImpl(
25392532
s.mu.replicas.Delete(int64(rep.RangeID))
25402533
delete(s.mu.uninitReplicas, rep.RangeID)
25412534
s.replicaQueues.Delete(int64(rep.RangeID))
2542-
if !opts.AllowPlaceholders {
2543-
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
2544-
// We already checked that our replica was present in replicasByKey
2545-
// above. Nothing should have been able to change that.
2546-
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
2547-
}
2535+
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
2536+
// We already checked that our replica was present in replicasByKey
2537+
// above. Nothing should have been able to change that.
2538+
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
25482539
}
25492540
delete(s.mu.replicaPlaceholders, rep.RangeID)
25502541
// TODO(peter): Could release s.mu.Lock() here.

0 commit comments

Comments
 (0)