Skip to content

Commit

Permalink
This is an automated cherry-pick of tikv#6498
Browse files Browse the repository at this point in the history
ref tikv#6493

Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io>
  • Loading branch information
JmPotato authored and ti-chi-bot committed Jun 6, 2023
1 parent 8e9d0c4 commit 3ccbba6
Show file tree
Hide file tree
Showing 14 changed files with 397 additions and 8 deletions.
5 changes: 5 additions & 0 deletions errors.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ error = '''
TiKV cluster not bootstrapped, please start TiKV first
'''

["PD:cluster:ErrSchedulingIsHalted"]
error = '''
scheduling is halted
'''

["PD:cluster:ErrStoreIsUp"]
error = '''
store is still up, please remove store gracefully
Expand Down
109 changes: 108 additions & 1 deletion metrics/grafana/pd.json
Original file line number Diff line number Diff line change
Expand Up @@ -2332,6 +2332,113 @@
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "The allowance status of the scheduling.",
"fieldConfig": {
"defaults": {},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 41
},
"hiddenSeries": false,
"id": 1464,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": true,
"hideZero": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"paceLength": 10,
"percentage": false,
"pluginVersion": "7.5.10",
"pointradius": 1,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"exemplar": true,
"expr": "pd_scheduling_allowance_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\"}",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{kind}}",
"metric": "pd_scheduling_allowance_status",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Scheduling Allowance Status",
"tooltip": {
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:533",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"$$hashKey": "object:534",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"cacheTimeout": null,
"colorBackground": false,
Expand Down Expand Up @@ -2959,7 +3066,7 @@
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{event}}",
"metric": "pd_scheduler_status",
"metric": "pd_schedule_operators_count",
"refId": "A",
"step": 4
}
Expand Down
7 changes: 4 additions & 3 deletions pkg/errs/errno.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,10 @@ var (

// cluster errors
var (
ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
ErrStoreIsUp = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
ErrInvalidStoreID = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
ErrStoreIsUp = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
ErrInvalidStoreID = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
ErrSchedulingIsHalted = errors.Normalize("scheduling is halted", errors.RFCCodeText("PD:cluster:ErrSchedulingIsHalted"))
)

// versioninfo errors
Expand Down
20 changes: 20 additions & 0 deletions pkg/mock/mockcluster/mockcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,26 @@ func (mc *Cluster) GetAllocator() id.Allocator {
return mc.IDAllocator
}

<<<<<<< HEAD
=======
// GetPersistOptions returns the persist options.
func (mc *Cluster) GetPersistOptions() *config.PersistOptions {
return mc.PersistOptions
}

// UpdateRegionsLabelLevelStats updates the label level stats for the regions.
func (mc *Cluster) UpdateRegionsLabelLevelStats(regions []*core.RegionInfo) {}

// IsSchedulerExisted checks if the scheduler with name is existed or not.
func (mc *Cluster) IsSchedulerExisted(name string) (bool, error) { return false, nil }

// IsSchedulerDisabled checks if the scheduler with name is disabled or not.
func (mc *Cluster) IsSchedulerDisabled(name string) (bool, error) { return false, nil }

// CheckSchedulingAllowance checks if the cluster allows scheduling currently.
func (mc *Cluster) CheckSchedulingAllowance() (bool, error) { return true, nil }

>>>>>>> 99e241955 (config, cluster: add an option to halt the cluster scheduling (#6498))
// ScanRegions scans region with start key, until number greater than limit.
func (mc *Cluster) ScanRegions(startKey, endKey []byte, limit int) []*core.RegionInfo {
return mc.ScanRange(startKey, endKey, limit)
Expand Down
104 changes: 104 additions & 0 deletions pkg/schedule/config/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package config

import (
"sync"
"time"

"github.com/coreos/go-semver/semver"
"github.com/pingcap/kvproto/pkg/metapb"
"github.com/tikv/pd/pkg/core/constant"
"github.com/tikv/pd/pkg/core/storelimit"
)

// RejectLeader is the label property type that suggests a store should not
// have any region leaders.
const RejectLeader = "reject-leader"

var schedulerMap sync.Map

// RegisterScheduler registers the scheduler type.
func RegisterScheduler(typ string) {
schedulerMap.Store(typ, struct{}{})
}

// IsSchedulerRegistered checks if the named scheduler type is registered.
func IsSchedulerRegistered(name string) bool {
_, ok := schedulerMap.Load(name)
return ok
}

// Config is the interface that wraps the Config related methods.
type Config interface {
GetReplicaScheduleLimit() uint64
GetRegionScheduleLimit() uint64
GetMergeScheduleLimit() uint64
GetLeaderScheduleLimit() uint64
GetHotRegionScheduleLimit() uint64
GetWitnessScheduleLimit() uint64

GetHotRegionCacheHitsThreshold() int
GetMaxMovableHotPeerSize() int64
IsTraceRegionFlow() bool

GetSplitMergeInterval() time.Duration
GetMaxMergeRegionSize() uint64
GetMaxMergeRegionKeys() uint64
GetKeyType() constant.KeyType
IsOneWayMergeEnabled() bool
IsCrossTableMergeEnabled() bool

IsPlacementRulesEnabled() bool
IsPlacementRulesCacheEnabled() bool

GetMaxReplicas() int
GetPatrolRegionInterval() time.Duration
GetMaxStoreDownTime() time.Duration
GetLocationLabels() []string
GetIsolationLevel() string
IsReplaceOfflineReplicaEnabled() bool
IsMakeUpReplicaEnabled() bool
IsRemoveExtraReplicaEnabled() bool
IsLocationReplacementEnabled() bool
IsRemoveDownReplicaEnabled() bool

GetSwitchWitnessInterval() time.Duration
IsWitnessAllowed() bool

GetLowSpaceRatio() float64
GetHighSpaceRatio() float64
GetTolerantSizeRatio() float64
GetLeaderSchedulePolicy() constant.SchedulePolicy
GetRegionScoreFormulaVersion() string

GetMaxSnapshotCount() uint64
GetMaxPendingPeerCount() uint64
GetSchedulerMaxWaitingOperator() uint64
GetStoreLimitByType(uint64, storelimit.Type) float64
SetAllStoresLimit(storelimit.Type, float64)
GetSlowStoreEvictingAffectedStoreRatioThreshold() float64
IsUseJointConsensus() bool
CheckLabelProperty(string, []*metapb.StoreLabel) bool
IsDebugMetricsEnabled() bool
GetClusterVersion() *semver.Version
GetStoreLimitVersion() string
IsDiagnosticAllowed() bool
// for test purpose
SetPlacementRuleEnabled(bool)
SetSplitMergeInterval(time.Duration)
SetMaxReplicas(int)
SetPlacementRulesCacheEnabled(bool)
SetEnableWitness(bool)
// only for store configuration
UseRaftV2()
}

// StoreConfig is the interface that wraps the StoreConfig related methods.
type StoreConfig interface {
GetRegionMaxSize() uint64
CheckRegionSize(uint64, uint64) error
CheckRegionKeys(uint64, uint64) error
IsEnableRegionBucket() bool
IsRaftKV2() bool
// for test purpose
SetRegionBucketEnabled(bool)
}
59 changes: 59 additions & 0 deletions pkg/schedule/core/cluster_informer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright 2017 TiKV Project Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package core

import (
"github.com/tikv/pd/pkg/core"
"github.com/tikv/pd/pkg/id"
sc "github.com/tikv/pd/pkg/schedule/config"
"github.com/tikv/pd/pkg/schedule/labeler"
"github.com/tikv/pd/pkg/schedule/placement"
"github.com/tikv/pd/pkg/statistics"
"github.com/tikv/pd/pkg/statistics/buckets"
"github.com/tikv/pd/pkg/storage"
"github.com/tikv/pd/server/config"
)

// ClusterInformer provides the necessary information of a cluster.
type ClusterInformer interface {
RegionHealthCluster
statistics.RegionStatInformer
statistics.StoreStatInformer
buckets.BucketStatInformer

GetBasicCluster() *core.BasicCluster
GetStoreConfig() sc.StoreConfig
GetAllocator() id.Allocator
GetRegionLabeler() *labeler.RegionLabeler
GetStorage() storage.Storage
RemoveScheduler(name string) error
AddSuspectRegions(ids ...uint64)
RecordOpStepWithTTL(regionID uint64)
UpdateRegionsLabelLevelStats(regions []*core.RegionInfo)
IsSchedulerExisted(name string) (bool, error)
IsSchedulerDisabled(name string) (bool, error)
CheckSchedulingAllowance() (bool, error)
GetPersistOptions() *config.PersistOptions
}

// RegionHealthCluster is an aggregate interface that wraps multiple interfaces
type RegionHealthCluster interface {
core.StoreSetInformer
core.StoreSetController
core.RegionSetInformer

GetOpts() sc.Config
GetRuleManager() *placement.RuleManager
}
22 changes: 22 additions & 0 deletions server/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2531,3 +2531,25 @@ func (c *RaftCluster) GetPausedSchedulerDelayAt(name string) (int64, error) {
func (c *RaftCluster) GetPausedSchedulerDelayUntil(name string) (int64, error) {
return c.coordinator.getPausedSchedulerDelayUntil(name)
}

var (
onlineUnsafeRecoveryStatus = schedulingAllowanceStatusGauge.WithLabelValues("online-unsafe-recovery")
haltSchedulingStatus = schedulingAllowanceStatusGauge.WithLabelValues("halt-scheduling")
)

// CheckSchedulingAllowance checks if the cluster allows scheduling currently.
func (c *RaftCluster) CheckSchedulingAllowance() (bool, error) {
// If the cluster is in the process of online unsafe recovery, it should not allow scheduling.
if c.GetUnsafeRecoveryController().IsRunning() {
onlineUnsafeRecoveryStatus.Set(1)
return false, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
}
onlineUnsafeRecoveryStatus.Set(0)
// If the halt-scheduling is set, it should not allow scheduling.
if c.opt.IsSchedulingHalted() {
haltSchedulingStatus.Set(1)
return false, errs.ErrSchedulingIsHalted.FastGenByArgs()
}
haltSchedulingStatus.Set(0)
return true, nil
}
8 changes: 4 additions & 4 deletions server/cluster/cluster_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error {

// HandleAskSplit handles the split request.
func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) {
if c.GetUnsafeRecoveryController().IsRunning() {
return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
if allowed, err := c.CheckSchedulingAllowance(); !allowed {
return nil, err
}
if !c.opt.IsTikvRegionSplitEnabled() {
return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
Expand Down Expand Up @@ -105,8 +105,8 @@ func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error {

// HandleAskBatchSplit handles the batch split request.
func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (*pdpb.AskBatchSplitResponse, error) {
if c.GetUnsafeRecoveryController().IsRunning() {
return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
if allowed, err := c.CheckSchedulingAllowance(); !allowed {
return nil, err
}
if !c.opt.IsTikvRegionSplitEnabled() {
return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
Expand Down
Loading

0 comments on commit 3ccbba6

Please sign in to comment.