Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 41 additions & 22 deletions pkg/storage/replica.go
Original file line number Diff line number Diff line change
Expand Up @@ -5176,33 +5176,52 @@ func (r *Replica) acquireMergeLock(
ctx context.Context, merge *roachpb.MergeTrigger,
) (func(storagebase.ReplicatedEvalResult), error) {
// The merge lock is the right-hand replica's raftMu. If the right-hand
// replica does not exist, we create an uninitialized replica. This is
// required for correctness: without the uninitialized replica, an incoming
// snapshot could create the right-hand replica before the merge trigger has a
// chance to widen the left-hand replica's end key. The merge trigger would
// then fatal the node upon realizing the right-hand replica already exists.
// With an uninitialized and locked right-hand replica in place, any snapshots
// for the right-hand range will block on raftMu, waiting for the merge to
// complete, after which the replica will realize it has been destroyed and
// reject the snapshot.
//
// TODO(benesch): if the right-hand replica splits, we could probably see a
// snapshot for the right-hand side of the right-hand side. The uninitialized
// replica this method installs will not block this snapshot from applying as
// the snapshot's range ID will be different. Add a placeholder to the
// replicasByKey map as well to be safe.
//
// TODO(benesch): make terminology for uninitialized replicas, which protect a
// range ID, and placeholder replicas, which protect a key range, consistent.
// Several code comments refer to "uninitialized placeholders," which is
// downright confusing.
rightRng, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
// replica does not exist, we create one. This is required for correctness.
// Without a right-hand replica on this store, an incoming snapshot could
// create the right-hand replica before the merge trigger has a chance to
// widen the left-hand replica's end key. The merge trigger would then fatal
// the node upon realizing the right-hand replica already exists. With a
// right-hand replica in place, any snapshots for the right-hand range will
// block on raftMu, waiting for the merge to complete, after which the replica
// will realize it has been destroyed and reject the snapshot.
rightRepl, created, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID, 0, nil)
if err != nil {
return nil, err
}
var placeholder *ReplicaPlaceholder
if created {
// rightRepl does not know its start and end keys. It can only block
// incoming snapshots with the same range ID. This is an insufficient lock:
// it's possible for this store to receive a snapshot with a different range
// ID that overlaps the right-hand keyspace while the merge is in progress.
// Consider the case where this replica is behind and the merged range
// resplits before this replica processes the merge. An up-to-date member of
// the new right-hand range could send this store a snapshot that would
// incorrectly sneak past rightRepl.
//
// So install a placeholder for the right range's keyspace in the
// replicasByKey map to block any intersecting snapshots.
placeholder = &ReplicaPlaceholder{rangeDesc: merge.RightDesc}
if err := r.store.addPlaceholder(placeholder); err != nil {
return nil, err
}
}

return func(storagebase.ReplicatedEvalResult) {
rightRng.raftMu.Unlock()
if created {
// Sanity check that the merge cleaned up the placeholder created above.
r.store.mu.Lock()
if _, ok := r.store.mu.replicasByKey.Get(placeholder).(*ReplicaPlaceholder); ok {
r.store.mu.Unlock()
log.Fatalf(ctx, "merge did not remove placeholder %+v from replicasByKey", placeholder)
}
if _, ok := r.store.mu.replicaPlaceholders[rightRepl.RangeID]; ok {
r.store.mu.Unlock()
log.Fatalf(ctx, "merge did not remove placeholder %+v from replicaPlaceholders", placeholder)
}
r.store.mu.Unlock()
}
rightRepl.raftMu.Unlock()
}, nil
}

Expand Down
67 changes: 37 additions & 30 deletions pkg/storage/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2308,14 +2308,20 @@ func (s *Store) MergeRange(
leftRepl.raftMu.AssertHeld()
rightRepl.raftMu.AssertHeld()

// Note that we were called (indirectly) from raft processing so we must
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
// replica's raftMu.
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
DestroyData: false,
AllowPlaceholders: true,
}); err != nil {
return errors.Errorf("cannot remove range: %s", err)
if rightRepl.IsInitialized() {
// Note that we were called (indirectly) from raft processing so we must
// call removeReplicaImpl directly to avoid deadlocking on the right-hand
// replica's raftMu.
if err := s.removeReplicaImpl(ctx, rightRepl, rightDesc, RemoveOptions{
DestroyData: false,
}); err != nil {
return errors.Errorf("cannot remove range: %s", err)
}
} else {
s.mu.Lock()
s.unlinkReplicaByRangeIDLocked(rightRepl.RangeID)
_ = s.removePlaceholderLocked(ctx, rightRepl.RangeID)
s.mu.Unlock()
}

if leftRepl.leaseholderStats != nil {
Expand Down Expand Up @@ -2365,8 +2371,16 @@ func (s *Store) addReplicaInternalLocked(repl *Replica) error {
return nil
}

// addPlaceholderLocked adds the specified placeholder. Requires that the
// raftMu of the replica whose place is being held is locked.
func (s *Store) addPlaceholder(placeholder *ReplicaPlaceholder) error {
s.mu.Lock()
defer s.mu.Unlock()
return s.addPlaceholderLocked(placeholder)
}

// addPlaceholderLocked adds the specified placeholder. Requires that Store.mu
// and Replica.raftMu are held.
// and the raftMu of the replica whose place is being held are locked.
func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {
rangeID := placeholder.Desc().RangeID
if exRng := s.mu.replicasByKey.ReplaceOrInsert(placeholder); exRng != nil {
Expand All @@ -2381,15 +2395,16 @@ func (s *Store) addPlaceholderLocked(placeholder *ReplicaPlaceholder) error {

// removePlaceholder removes a placeholder for the specified range if it
// exists, returning true if a placeholder was present and removed and false
// otherwise. Requires that Replica.raftMu is held.
// otherwise. Requires that the raftMu of the replica whose place is being held
// is locked.
func (s *Store) removePlaceholder(ctx context.Context, rngID roachpb.RangeID) bool {
s.mu.Lock()
defer s.mu.Unlock()
return s.removePlaceholderLocked(ctx, rngID)
}

// removePlaceholderLocked removes the specified placeholder. Requires that
// Store.mu and Replica.raftMu are held.
// Store.mu and the raftMu of the replica whose place is being held are locked.
func (s *Store) removePlaceholderLocked(ctx context.Context, rngID roachpb.RangeID) bool {
placeholder, ok := s.mu.replicaPlaceholders[rngID]
if !ok {
Expand Down Expand Up @@ -2421,8 +2436,7 @@ func (s *Store) addReplicaToRangeMapLocked(repl *Replica) error {

// RemoveOptions bundles boolean parameters for Store.RemoveReplica.
type RemoveOptions struct {
DestroyData bool
AllowPlaceholders bool
DestroyData bool
}

// RemoveReplica removes the replica from the store's replica map and
Expand All @@ -2434,9 +2448,6 @@ type RemoveOptions struct {
// If opts.DestroyData is true, data in all of the range's keyspaces
// is deleted. Otherwise, only data in the range-ID local keyspace is
// deleted. In either case a tombstone record is written.
//
// If opts.AllowPlaceholders is true, it is not an error if the replica
// targeted for removal is an uninitialized placeholder.
func (s *Store) RemoveReplica(
ctx context.Context, rep *Replica, consistentDesc roachpb.RangeDescriptor, opts RemoveOptions,
) error {
Expand Down Expand Up @@ -2492,14 +2503,12 @@ func (s *Store) removeReplicaImpl(
}

s.mu.Lock()
if !opts.AllowPlaceholders {
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
// This is a fatal error because uninitialized replicas shouldn't make it
// this far. This method will need some changes when we introduce GC of
// uninitialized replicas.
s.mu.Unlock()
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
}
if placeholder := s.getOverlappingKeyRangeLocked(desc); placeholder != rep {
// This is a fatal error because uninitialized replicas shouldn't make it
// this far. This method will need some changes when we introduce GC of
// uninitialized replicas.
s.mu.Unlock()
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
}
// Adjust stats before calling Destroy. This can be called before or after
// Destroy, but this configuration helps avoid races in stat verification
Expand Down Expand Up @@ -2528,12 +2537,10 @@ func (s *Store) removeReplicaImpl(
s.mu.Lock()
defer s.mu.Unlock()
s.unlinkReplicaByRangeIDLocked(rep.RangeID)
if !opts.AllowPlaceholders {
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
// We already checked that our replica was present in replicasByKey
// above. Nothing should have been able to change that.
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
}
if placeholder := s.mu.replicasByKey.Delete(rep); placeholder != rep {
// We already checked that our replica was present in replicasByKey
// above. Nothing should have been able to change that.
log.Fatalf(ctx, "replica %+v unexpectedly overlapped by %+v", rep, placeholder)
}
delete(s.mu.replicaPlaceholders, rep.RangeID)
// TODO(peter): Could release s.mu.Lock() here.
Expand Down