Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

checker: reduces the probability of deleting normal peers when the store becomes unavailable (#7249) #7337

Merged
merged 6 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions server/cluster/coordinator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,11 +508,11 @@ func TestReplica(t *testing.T) {
re.NoError(dispatchHeartbeat(co, region, stream))
waitNoResponse(re, stream)

// Remove peer from store 4.
// Remove peer from store 3.
re.NoError(tc.addLeaderRegion(2, 1, 2, 3, 4))
region = tc.GetRegion(2)
re.NoError(dispatchHeartbeat(co, region, stream))
region = waitRemovePeer(re, stream, region, 4)
region = waitRemovePeer(re, stream, region, 3) // store3 is down, we should remove it firstly.
re.NoError(dispatchHeartbeat(co, region, stream))
waitNoResponse(re, stream)

Expand Down
69 changes: 54 additions & 15 deletions server/schedule/checker/rule_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@
if len(fit.OrphanPeers) == 0 {
return nil, nil
}
var pinDownPeer *metapb.Peer

isUnhealthyPeer := func(id uint64) bool {
for _, downPeer := range region.GetDownPeers() {
if downPeer.Peer.GetId() == id {
Expand All @@ -404,24 +404,45 @@
}
return false
}

isDisconnectedPeer := func(p *metapb.Peer) bool {
// avoid to meet down store when fix orphan peers,
// Isdisconnected is more strictly than IsUnhealthy.
store := c.cluster.GetStore(p.GetStoreId())
if store == nil {
return true
}
return store.IsDisconnected()
}

checkDownPeer := func(peers []*metapb.Peer) (*metapb.Peer, bool) {
for _, p := range peers {
if isUnhealthyPeer(p.GetId()) {
// make sure is down peer.
if region.GetDownPeer(p.GetId()) != nil {
Comment on lines +420 to +422
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it duplicated?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isUnhealthyPeer contains isDownPeer and isPendingPeer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it will do in #7342

return p, true
}
return nil, true
}
if isDisconnectedPeer(p) {
return p, true
}
}
return nil, false
}

// remove orphan peers only when all rules are satisfied (count+role) and all peers selected
// by RuleFits is not pending or down.
var pinDownPeer *metapb.Peer
hasUnhealthyFit := false
loopFits:
for _, rf := range fit.RuleFits {
if !rf.IsSatisfied() {
hasUnhealthyFit = true
break
}
for _, p := range rf.Peers {
if isUnhealthyPeer(p.GetId()) {
// make sure is down peer.
if region.GetDownPeer(p.GetId()) != nil {
pinDownPeer = p
}
hasUnhealthyFit = true
break loopFits
}
pinDownPeer, hasUnhealthyFit = checkDownPeer(rf.Peers)
if hasUnhealthyFit {
break
}
}

Expand All @@ -434,16 +455,19 @@
// try to use orphan peers to replace unhealthy down peers.
for _, orphanPeer := range fit.OrphanPeers {
if pinDownPeer != nil {
if pinDownPeer.GetId() == orphanPeer.GetId() {
continue

Check warning on line 459 in server/schedule/checker/rule_checker.go

View check run for this annotation

Codecov / codecov/patch

server/schedule/checker/rule_checker.go#L459

Added line #L459 was not covered by tests
}
// make sure the orphan peer is healthy.
if isUnhealthyPeer(orphanPeer.GetId()) {
if isUnhealthyPeer(orphanPeer.GetId()) || isDisconnectedPeer(orphanPeer) {
continue
}
// no consider witness in this path.
if pinDownPeer.GetIsWitness() || orphanPeer.GetIsWitness() {
continue
}
// down peer's store should be down.
if !c.isStoreDownTimeHitMaxDownTime(pinDownPeer.GetStoreId()) {
// pinDownPeer's store should be disconnected, because we use more strict judge before.
if !isDisconnectedPeer(pinDownPeer) {
continue
}
// check if down peer can replace with orphan peer.
Expand All @@ -457,10 +481,14 @@
return operator.CreatePromoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer)
case orphanPeerRole == metapb.PeerRole_Voter && destRole == metapb.PeerRole_Learner:
return operator.CreateDemoteLearnerOperatorAndRemovePeer("replace-down-peer-with-orphan-peer", c.cluster, region, orphanPeer, pinDownPeer)
case orphanPeerRole == destRole && isDisconnectedPeer(pinDownPeer) && !dstStore.IsDisconnected():
return operator.CreateRemovePeerOperator("remove-replaced-orphan-peer", c.cluster, 0, region, pinDownPeer.GetStoreId())
default:
// destRole should not same with orphanPeerRole. if role is same, it fit with orphanPeer should be better than now.
// destRole never be leader, so we not consider it.
}
} else {
checkerCounter.WithLabelValues("rule_checker", "replace-orphan-peer-no-fit").Inc()

Check warning on line 491 in server/schedule/checker/rule_checker.go

View check run for this annotation

Codecov / codecov/patch

server/schedule/checker/rule_checker.go#L490-L491

Added lines #L490 - L491 were not covered by tests
}
}
}
Expand All @@ -469,14 +497,25 @@
// Ref https://github.com/tikv/pd/issues/4045
if len(fit.OrphanPeers) >= 2 {
hasHealthPeer := false
var disconnectedPeer *metapb.Peer
for _, orphanPeer := range fit.OrphanPeers {
if isDisconnectedPeer(orphanPeer) {
disconnectedPeer = orphanPeer
break
}
}
for _, orphanPeer := range fit.OrphanPeers {
if isUnhealthyPeer(orphanPeer.GetId()) {
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
}
if hasHealthPeer {
// there already exists a healthy orphan peer, so we can remove other orphan Peers.
checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc()
// if there exists a disconnected orphan peer, we will pick it to remove firstly.
if disconnectedPeer != nil {
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, disconnectedPeer.StoreId)
}
return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
}
hasHealthPeer = true
Expand Down
Loading
Loading