Skip to content

Commit 0712a55

Browse files
committed
storage: improve safety of merge lock
Fix a known correctness bug in the merge locking scheme. See the comment updates within for details. As an added benefit, this fix frees store.RemoveReplica of conditionals to deal with uninitialized replicas. There are no tests for this because testing this particular edge case is quite difficult and I'm primarily interested in avoiding the obviously wrong call to batch.ClearRange(nil, nil) that occurs when calling store.RemoveReplica on an unintialized replica. This nonsensical ClearRange is, somehow, not currently an error, but it is preventing landing. Release note: None
1 parent b18abeb commit 0712a55

File tree

2 files changed

+78
-52
lines changed

2 files changed

+78
-52
lines changed

pkg/storage/replica.go

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5176,33 +5176,52 @@ func (r *Replica) acquireMergeLock(
51765176
ctx context.Context, merge *roachpb.MergeTrigger,
51775177
) (func(storagebase.ReplicatedEvalResult), error) {
51785178
// The merge lock is the right-hand replica's raftMu. If the right-hand
5179-
// replica does not exist, we create an uninitialized replica. This is
5180-
// required for correctness: without the uninitialized replica, an incoming
5181-
// snapshot could create the right-hand replica before the merge trigger has a
5182-
// chance to widen the left-hand replica's end key. The merge trigger would
5183-
// then fatal the node upon realizing the right-hand replica already exists.
5184-
// With an uninitialized and locked right-hand replica in place, any snapshots
5185-
// for the right-hand range will block on raftMu, waiting for the merge to
5186-
// complete, after which the replica will realize it has been destroyed and
5187-
// reject the snapshot.
5188-
//
5189-
// TODO(benesch): if the right-hand replica splits, we could probably see a
5190-
// snapshot for the right-hand side of the right-hand side. The uninitialized
5191-
// replica this method installs will not block this snapshot from applying as
5192-
// the snapshot's range ID will be different. Add a placeholder to the
5193-
// replicasByKey map as well to be safe.
5194-
//
5195-
// TODO(benesch): make terminology for uninitialized replicas, which protect a
5196-
// range ID, and placeholder replicas, which protect a key range, consistent.
5197-
// Several code comments refer to "uninitialized placeholders," which is
5198-
// downright confusing.
5199-
rightRng, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
5179+
// replica does not exist, we create one. This is required for correctness.
5180+
// Without a right-hand replica on this store, an incoming snapshot could
5181+
// create the right-hand replica before the merge trigger has a chance to
5182+
// widen the left-hand replica's end key. The merge trigger would then fatal
5183+
// the node upon realizing the right-hand replica already exists. With a
5184+
// right-hand replica in place, any snapshots for the right-hand range will
5185+
// block on raftMu, waiting for the merge to complete, after which the replica
5186+
// will realize it has been destroyed and reject the snapshot.
5187+
rightRepl, created, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
52005188
if err != nil {
52015189
return nil, err
52025190
}
5191+
var placeholder *ReplicaPlaceholder
5192+
if created {
5193+
// rightRepl does not know its start and end keys. It can only block
5194+
// incoming snapshots with the same range ID. This is an insufficient lock:
5195+
// it's possible for this store to receive a snapshot with a different range
5196+
// ID that overlaps the right-hand keyspace while the merge is in progress.
5197+
// Consider the case where this replica is behind and the merged range
5198+
// resplits before this replica processes the merge. An up-to-date member of
5199+
// the new right-hand range could send this store a snapshot that would
5200+
// incorrectly sneak past rightRepl.
5201+
//
5202+
// So install a placeholder for the right range's keyspace in the
5203+
// replicasByKey map to block any intersecting snapshots.
5204+
placeholder = &ReplicaPlaceholder{rangeDesc: merge.RightDesc}
5205+
if err := r.store.addPlaceholder(placeholder); err != nil {
5206+
return nil, err
5207+
}
5208+
}
52035209

52045210
return func(storagebase.ReplicatedEvalResult) {
5205-
rightRng.raftMu.Unlock()
5211+
if created {
5212+
// Sanity check that the merge cleaned up the placeholder created above.
5213+
r.store.mu.Lock()
5214+
if _, ok := r.store.mu.replicasByKey.Get(placeholder).(*ReplicaPlaceholder); ok {
5215+
r.store.mu.Unlock()
5216+
log.Fatalf(ctx, "merge did not remove placeholder %+v from replicasByKey", placeholder)
5217+
}
5218+
if _, ok := r.store.mu.replicaPlaceholders[rightRepl.RangeID]; ok {
5219+
r.store.mu.Unlock()
5220+
log.Fatalf(ctx, "merge did not remove placeholder %+v from replicaPlaceholders", placeholder)
5221+
}
5222+
r.store.mu.Unlock()
5223+
}
5224+
rightRepl.raftMu.Unlock()
52065225
}, nil
52075226
}
52085227

pkg/storage/store.go

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2308,14 +2308,20 @@ func (s *Store) MergeRange(
23082308
leftRepl.raftMu.AssertHeld()
23092309
rightRepl.raftMu.AssertHeld()
23102310

2311-
// Note that we were called (indirectly) from raft processing so we must
2312-
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
2313-
// replica's raftMu.
2314-
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
2315-
DestroyData: false,
2316-
AllowPlaceholders: true,
2317-
}); err != nil {
2318-
return errors.Errorf("cannot remove range: %s", err)
2311+
if rightRepl.IsInitialized() {
2312+
// Note that we were called (indirectly) from raft processing so we must
2313+
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
2314+
// replica's raftMu.
2315+
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
2316+
DestroyData: false,
2317+
}); err != nil {
2318+
return errors.Errorf("cannot remove range: %s", err)
2319+
}
2320+
} else {
2321+
s.mu.Lock()
2322+
s.unlinkReplicaByRangeIDLocked(rightRepl.RangeID)
2323+
_ = s.removePlaceholderLocked(ctx, rightRepl.RangeID)
2324+
s.mu.Unlock()
23192325
}
23202326

23212327
if leftRepl.leaseholderStats != nil {
@@ -2365,8 +2371,16 @@ func (s *Store) addReplicaInternalLocked(repl *Replica) error {
23652371
return nil
23662372
}
23672373

2374+
// addPlaceholderLocked adds the specified placeholder. Requires that the
2375+
// raftMu of the replica whose place is being held is locked.
2376+
func (s *Store) addPlaceholder(placeholder *ReplicaPlaceholder) error {
2377+
s.mu.Lock()
2378+
defer s.mu.Unlock()
2379+
return s.addPlaceholderLocked(placeholder)
2380+
}
2381+
23682382
// addPlaceholderLocked adds the specified placeholder. Requires that Store.mu
2369-
// and Replica.raftMu are held.
2383+
// and the raftMu of the replica whose place is being held are locked.
23702384
func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
23712385
rangeID := placeholder.Desc().RangeID
23722386
if exRng := s.mu.replicasByKey.ReplaceOrInsert(placeholder); exRng != nil {
@@ -2381,15 +2395,16 @@ func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
23812395

23822396
// removePlaceholder removes a placeholder for the specified range if it
23832397
// exists, returning true if a placeholder was present and removed and false
2384-
// otherwise. Requires that Replica.raftMu is held.
2398+
// otherwise. Requires that the raftMu of the replica whose place is being held
2399+
// is locked.
23852400
func (s *Store) removePlaceholder(ctx context.Context, rngID roachpb.RangeID) bool {
23862401
s.mu.Lock()
23872402
defer s.mu.Unlock()
23882403
return s.removePlaceholderLocked(ctx, rngID)
23892404
}
23902405

23912406
// removePlaceholderLocked removes the specified placeholder. Requires that
2392-
// Store.mu and Replica.raftMu are held.
2407+
// Store.mu and the raftMu of the replica whose place is being held are locked.
23932408
func (s *Store) removePlaceholderLocked(ctx context.Context, rngID roachpb.RangeID) bool {
23942409
placeholder, ok := s.mu.replicaPlaceholders[rngID]
23952410
if !ok {
@@ -2421,8 +2436,7 @@ func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
24212436

24222437
// RemoveOptions bundles boolean parameters for Store.RemoveReplica.
24232438
type RemoveOptions struct {
2424-
DestroyData bool
2425-
AllowPlaceholders bool
2439+
DestroyData bool
24262440
}
24272441

24282442
// RemoveReplica removes the replica from the store's replica map and
@@ -2434,9 +2448,6 @@ type RemoveOptions struct {
24342448
// If opts.DestroyData is true, data in all of the range's keyspaces
24352449
// is deleted. Otherwise, only data in the range-ID local keyspace is
24362450
// deleted. In either case a tombstone record is written.
2437-
//
2438-
// If opts.AllowPlaceholders is true, it is not an error if the replica
2439-
// targeted for removal is an uninitialized placeholder.
24402451
func (s *Store) RemoveReplica(
24412452
ctx context.Context, rep *Replica, consistentDesc roachpb.RangeDescriptor, opts RemoveOptions,
24422453
) error {
@@ -2492,14 +2503,12 @@ func (s *Store) removeReplicaImpl(
24922503
}
24932504

24942505
s.mu.Lock()
2495-
if !opts.AllowPlaceholders {
2496-
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
2497-
// This is a fatal error because uninitialized replicas shouldn't make it
2498-
// this far. This method will need some changes when we introduce GC of
2499-
// uninitialized replicas.
2500-
s.mu.Unlock()
2501-
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
2502-
}
2506+
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
2507+
// This is a fatal error because uninitialized replicas shouldn't make it
2508+
// this far. This method will need some changes when we introduce GC of
2509+
// uninitialized replicas.
2510+
s.mu.Unlock()
2511+
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
25032512
}
25042513
// Adjust stats before calling Destroy. This can be called before or after
25052514
// Destroy, but this configuration helps avoid races in stat verification
@@ -2528,12 +2537,10 @@ func (s *Store) removeReplicaImpl(
25282537
s.mu.Lock()
25292538
defer s.mu.Unlock()
25302539
s.unlinkReplicaByRangeIDLocked(rep.RangeID)
2531-
if !opts.AllowPlaceholders {
2532-
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
2533-
// We already checked that our replica was present in replicasByKey
2534-
// above. Nothing should have been able to change that.
2535-
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
2536-
}
2540+
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
2541+
// We already checked that our replica was present in replicasByKey
2542+
// above. Nothing should have been able to change that.
2543+
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
25372544
}
25382545
delete(s.mu.replicaPlaceholders, rep.RangeID)
25392546
// TODO(peter): Could release s.mu.Lock() here.

0 commit comments

Comments
 (0)