Skip to content

Commit 9521635

Browse files
craig[bot]benesch
andcommitted
Merge #27061
27061: storage: improve safety of merge lock r=tschottdorf,bdarnell a=benesch Fix a known correctness bug in the merge locking scheme. See the comment updates within for details. As an added benefit, this fix frees store.RemoveReplica of conditionals to deal with uninitialized replicas. Acquiring the merge lock now always results in an initialized right-hand replica, so store.MergeRange is always removing an initialized right-hand replica. There are no tests for this because testing this particular edge case is quite difficult and I'm primarily interested in avoiding the obviously wrong call to batch.ClearRange(nil, nil) that occurs when calling store.RemoveReplica on an unintialized replica. This nonsensical ClearRange is, somehow, not currently an error, but it is preventing #26872 ("engine: require iterators to specify an upper bound") from landing. Release note: None --- P.S. The `storage.RemoveOptions` struct now bundles precisely one parameter. Let me know if you'd prefer I change back the signature of `store.removeReplicaInternal` to take the bool directly. Bool parameters are frowned upon, though, so I wasn't sure what was preferred. Co-authored-by: Nikhil Benesch <nikhil.benesch@gmail.com>
2 parents 8ae7f0a + 0712a55 commit 9521635

File tree

2 files changed

+78
-52
lines changed

2 files changed

+78
-52
lines changed

pkg/storage/replica.go

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5176,33 +5176,52 @@ func (r *Replica) acquireMergeLock(
51765176
ctx context.Context, merge *roachpb.MergeTrigger,
51775177
) (func(storagebase.ReplicatedEvalResult), error) {
51785178
// The merge lock is the right-hand replica's raftMu. If the right-hand
5179-
// replica does not exist, we create an uninitialized replica. This is
5180-
// required for correctness: without the uninitialized replica, an incoming
5181-
// snapshot could create the right-hand replica before the merge trigger has a
5182-
// chance to widen the left-hand replica's end key. The merge trigger would
5183-
// then fatal the node upon realizing the right-hand replica already exists.
5184-
// With an uninitialized and locked right-hand replica in place, any snapshots
5185-
// for the right-hand range will block on raftMu, waiting for the merge to
5186-
// complete, after which the replica will realize it has been destroyed and
5187-
// reject the snapshot.
5188-
//
5189-
// TODO(benesch): if the right-hand replica splits, we could probably see a
5190-
// snapshot for the right-hand side of the right-hand side. The uninitialized
5191-
// replica this method installs will not block this snapshot from applying as
5192-
// the snapshot's range ID will be different. Add a placeholder to the
5193-
// replicasByKey map as well to be safe.
5194-
//
5195-
// TODO(benesch): make terminology for uninitialized replicas, which protect a
5196-
// range ID, and placeholder replicas, which protect a key range, consistent.
5197-
// Several code comments refer to "uninitialized placeholders," which is
5198-
// downright confusing.
5199-
rightRng, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
5179+
// replica does not exist, we create one. This is required for correctness.
5180+
// Without a right-hand replica on this store, an incoming snapshot could
5181+
// create the right-hand replica before the merge trigger has a chance to
5182+
// widen the left-hand replica's end key. The merge trigger would then fatal
5183+
// the node upon realizing the right-hand replica already exists. With a
5184+
// right-hand replica in place, any snapshots for the right-hand range will
5185+
// block on raftMu, waiting for the merge to complete, after which the replica
5186+
// will realize it has been destroyed and reject the snapshot.
5187+
rightRepl, created, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
52005188
if err != nil {
52015189
return nil, err
52025190
}
5191+
var placeholder *ReplicaPlaceholder
5192+
if created {
5193+
// rightRepl does not know its start and end keys. It can only block
5194+
// incoming snapshots with the same range ID. This is an insufficient lock:
5195+
// it's possible for this store to receive a snapshot with a different range
5196+
// ID that overlaps the right-hand keyspace while the merge is in progress.
5197+
// Consider the case where this replica is behind and the merged range
5198+
// resplits before this replica processes the merge. An up-to-date member of
5199+
// the new right-hand range could send this store a snapshot that would
5200+
// incorrectly sneak past rightRepl.
5201+
//
5202+
// So install a placeholder for the right range's keyspace in the
5203+
// replicasByKey map to block any intersecting snapshots.
5204+
placeholder = &ReplicaPlaceholder{rangeDesc: merge.RightDesc}
5205+
if err := r.store.addPlaceholder(placeholder); err != nil {
5206+
return nil, err
5207+
}
5208+
}
52035209

52045210
return func(storagebase.ReplicatedEvalResult) {
5205-
rightRng.raftMu.Unlock()
5211+
if created {
5212+
// Sanity check that the merge cleaned up the placeholder created above.
5213+
r.store.mu.Lock()
5214+
if _, ok := r.store.mu.replicasByKey.Get(placeholder).(*ReplicaPlaceholder); ok {
5215+
r.store.mu.Unlock()
5216+
log.Fatalf(ctx, "merge did not remove placeholder %+v from replicasByKey", placeholder)
5217+
}
5218+
if _, ok := r.store.mu.replicaPlaceholders[rightRepl.RangeID]; ok {
5219+
r.store.mu.Unlock()
5220+
log.Fatalf(ctx, "merge did not remove placeholder %+v from replicaPlaceholders", placeholder)
5221+
}
5222+
r.store.mu.Unlock()
5223+
}
5224+
rightRepl.raftMu.Unlock()
52065225
}, nil
52075226
}
52085227

pkg/storage/store.go

Lines changed: 37 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2307,14 +2307,20 @@ func (s *Store) MergeRange(
23072307
leftRepl.raftMu.AssertHeld()
23082308
rightRepl.raftMu.AssertHeld()
23092309

2310-
// Note that we were called (indirectly) from raft processing so we must
2311-
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
2312-
// replica's raftMu.
2313-
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
2314-
DestroyData: false,
2315-
AllowPlaceholders: true,
2316-
}); err != nil {
2317-
return errors.Errorf("cannot remove range: %s", err)
2310+
if rightRepl.IsInitialized() {
2311+
// Note that we were called (indirectly) from raft processing so we must
2312+
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
2313+
// replica's raftMu.
2314+
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
2315+
DestroyData: false,
2316+
}); err != nil {
2317+
return errors.Errorf("cannot remove range: %s", err)
2318+
}
2319+
} else {
2320+
s.mu.Lock()
2321+
s.unlinkReplicaByRangeIDLocked(rightRepl.RangeID)
2322+
_ = s.removePlaceholderLocked(ctx, rightRepl.RangeID)
2323+
s.mu.Unlock()
23182324
}
23192325

23202326
if leftRepl.leaseholderStats != nil {
@@ -2364,8 +2370,16 @@ func (s *Store) addReplicaInternalLocked(repl *Replica) error {
23642370
return nil
23652371
}
23662372

2373+
// addPlaceholderLocked adds the specified placeholder. Requires that the
2374+
// raftMu of the replica whose place is being held is locked.
2375+
func (s *Store) addPlaceholder(placeholder *ReplicaPlaceholder) error {
2376+
s.mu.Lock()
2377+
defer s.mu.Unlock()
2378+
return s.addPlaceholderLocked(placeholder)
2379+
}
2380+
23672381
// addPlaceholderLocked adds the specified placeholder. Requires that Store.mu
2368-
// and Replica.raftMu are held.
2382+
// and the raftMu of the replica whose place is being held are locked.
23692383
func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
23702384
rangeID := placeholder.Desc().RangeID
23712385
if exRng := s.mu.replicasByKey.ReplaceOrInsert(placeholder); exRng != nil {
@@ -2380,15 +2394,16 @@ func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
23802394

23812395
// removePlaceholder removes a placeholder for the specified range if it
23822396
// exists, returning true if a placeholder was present and removed and false
2383-
// otherwise. Requires that Replica.raftMu is held.
2397+
// otherwise. Requires that the raftMu of the replica whose place is being held
2398+
// is locked.
23842399
func (s *Store) removePlaceholder(ctx context.Context, rngID roachpb.RangeID) bool {
23852400
s.mu.Lock()
23862401
defer s.mu.Unlock()
23872402
return s.removePlaceholderLocked(ctx, rngID)
23882403
}
23892404

23902405
// removePlaceholderLocked removes the specified placeholder. Requires that
2391-
// Store.mu and Replica.raftMu are held.
2406+
// Store.mu and the raftMu of the replica whose place is being held are locked.
23922407
func (s *Store) removePlaceholderLocked(ctx context.Context, rngID roachpb.RangeID) bool {
23932408
placeholder, ok := s.mu.replicaPlaceholders[rngID]
23942409
if !ok {
@@ -2420,8 +2435,7 @@ func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {
24202435

24212436
// RemoveOptions bundles boolean parameters for Store.RemoveReplica.
24222437
type RemoveOptions struct {
2423-
DestroyData bool
2424-
AllowPlaceholders bool
2438+
DestroyData bool
24252439
}
24262440

24272441
// RemoveReplica removes the replica from the store's replica map and
@@ -2433,9 +2447,6 @@ type RemoveOptions struct {
24332447
// If opts.DestroyData is true, data in all of the range's keyspaces
24342448
// is deleted. Otherwise, only data in the range-ID local keyspace is
24352449
// deleted. In either case a tombstone record is written.
2436-
//
2437-
// If opts.AllowPlaceholders is true, it is not an error if the replica
2438-
// targeted for removal is an uninitialized placeholder.
24392450
func (s *Store) RemoveReplica(
24402451
ctx context.Context, rep *Replica, consistentDesc roachpb.RangeDescriptor, opts RemoveOptions,
24412452
) error {
@@ -2474,14 +2485,12 @@ func (s *Store) removeReplicaImpl(
24742485
}
24752486

24762487
s.mu.Lock()
2477-
if !opts.AllowPlaceholders {
2478-
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
2479-
// This is a fatal error because uninitialized replicas shouldn't make it
2480-
// this far. This method will need some changes when we introduce GC of
2481-
// uninitialized replicas.
2482-
s.mu.Unlock()
2483-
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
2484-
}
2488+
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
2489+
// This is a fatal error because uninitialized replicas shouldn't make it
2490+
// this far. This method will need some changes when we introduce GC of
2491+
// uninitialized replicas.
2492+
s.mu.Unlock()
2493+
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
24852494
}
24862495
// Adjust stats before calling Destroy. This can be called before or after
24872496
// Destroy, but this configuration helps avoid races in stat verification
@@ -2510,12 +2519,10 @@ func (s *Store) removeReplicaImpl(
25102519
s.mu.Lock()
25112520
defer s.mu.Unlock()
25122521
s.unlinkReplicaByRangeIDLocked(rep.RangeID)
2513-
if !opts.AllowPlaceholders {
2514-
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
2515-
// We already checked that our replica was present in replicasByKey
2516-
// above. Nothing should have been able to change that.
2517-
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
2518-
}
2522+
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
2523+
// We already checked that our replica was present in replicasByKey
2524+
// above. Nothing should have been able to change that.
2525+
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
25192526
}
25202527
delete(s.mu.replicaPlaceholders, rep.RangeID)
25212528
// TODO(peter): Could release s.mu.Lock() here.

0 commit comments

Comments
 (0)