config, cluster: add an option to halt the cluster scheduling #6498

Merged: 4 commits, May 25, 2023
Changes from 1 commit
5 changes: 5 additions & 0 deletions errors.toml
@@ -91,6 +91,11 @@ error = '''
 TiKV cluster not bootstrapped, please start TiKV first
 '''
 
+["PD:cluster:ErrSchedulingIsHalted"]
+error = '''
+scheduling is halted
+'''
+
 ["PD:cluster:ErrStoreIsUp"]
 error = '''
 store is still up, please remove store gracefully
7 changes: 4 additions & 3 deletions pkg/errs/errno.go
@@ -139,9 +139,10 @@ var (
 
 // cluster errors
 var (
-	ErrNotBootstrapped = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
-	ErrStoreIsUp       = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
-	ErrInvalidStoreID  = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
+	ErrNotBootstrapped    = errors.Normalize("TiKV cluster not bootstrapped, please start TiKV first", errors.RFCCodeText("PD:cluster:ErrNotBootstrapped"))
+	ErrStoreIsUp          = errors.Normalize("store is still up, please remove store gracefully", errors.RFCCodeText("PD:cluster:ErrStoreIsUp"))
+	ErrInvalidStoreID     = errors.Normalize("invalid store id %d, not found", errors.RFCCodeText("PD:cluster:ErrInvalidStoreID"))
+	ErrSchedulingIsHalted = errors.Normalize("scheduling is halted", errors.RFCCodeText("PD:cluster:ErrSchedulingIsHalted"))
 )
 
 // versioninfo errors
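
Note on the pattern above: errors.Normalize binds the RFC code to the error definition, and call sites instantiate it with FastGenByArgs (as the cluster changes below do). A minimal sketch of how a caller could test for the new error, assuming pingcap/errors' terror-style (*Error).Equal helper, which matches by error code:

import "github.com/tikv/pd/pkg/errs"

// isSchedulingHalted reports whether err was produced from
// errs.ErrSchedulingIsHalted (for example via FastGenByArgs).
func isSchedulingHalted(err error) bool {
	return errs.ErrSchedulingIsHalted.Equal(err)
}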
2 changes: 1 addition & 1 deletion pkg/schedule/config/config.go
@@ -86,7 +86,7 @@ type Config interface {
 	SetSplitMergeInterval(time.Duration)
 	SetMaxReplicas(int)
 	SetPlacementRulesCacheEnabled(bool)
-	SetWitnessEnabled(bool)
+	SetEnableWitness(bool)
 	// only for store configuration
 	UseRaftV2()
 }
2 changes: 1 addition & 1 deletion pkg/schedule/placement/rule_manager_test.go
@@ -33,7 +33,7 @@ func newTestManager(t *testing.T, enableWitness bool) (endpoint.RuleStorage, *RuleManager) {
 	store := endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil)
 	var err error
 	manager := NewRuleManager(store, nil, mockconfig.NewTestOptions())
-	manager.conf.SetWitnessEnabled(enableWitness)
+	manager.conf.SetEnableWitness(enableWitness)
 	err = manager.Initialize(3, []string{"zone", "rack", "host"})
 	re.NoError(err)
 	return store, manager
13 changes: 13 additions & 0 deletions server/cluster/cluster.go
@@ -2733,3 +2733,16 @@ func (c *RaftCluster) GetPausedSchedulerDelayAt(name string) (int64, error) {
 func (c *RaftCluster) GetPausedSchedulerDelayUntil(name string) (int64, error) {
 	return c.coordinator.getPausedSchedulerDelayUntil(name)
 }
+
+// checkSchedulingAllowance checks if the cluster allows scheduling.
+func (c *RaftCluster) checkSchedulingAllowance() (bool, error) {
+	// If the cluster is in the process of online unsafe recovery, it should not allow scheduling.
+	if c.GetUnsafeRecoveryController().IsRunning() {
+		return false, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	}
+	// If the halt-scheduling is set, it should not allow scheduling.
+	if c.opt.IsSchedulingHalted() {
+		return false, errs.ErrSchedulingIsHalted.FastGenByArgs()
+	}
+	return true, nil
+}
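
The boolean/error pair lets each call site decide how much context to surface. A hypothetical caller sketch (the handler name is illustrative, not part of this PR):

// handleSchedulingRequest gates an arbitrary scheduling entry point on
// the shared allowance check before doing any real work.
func (c *RaftCluster) handleSchedulingRequest() error {
	if allowed, err := c.checkSchedulingAllowance(); !allowed {
		// err is ErrUnsafeRecoveryIsRunning or ErrSchedulingIsHalted,
		// so the client learns why scheduling was refused.
		return err
	}
	// ... proceed with the actual scheduling work ...
	return nil
}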
8 changes: 4 additions & 4 deletions server/cluster/cluster_worker.go
@@ -43,8 +43,8 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error {
 
 // HandleAskSplit handles the split request.
 func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) {
-	if c.GetUnsafeRecoveryController().IsRunning() {
-		return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	if allowed, err := c.checkSchedulingAllowance(); !allowed {
+		return nil, err
 	}
 	if !c.opt.IsTikvRegionSplitEnabled() {
 		return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
@@ -105,8 +105,8 @@ func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error {
 
 // HandleAskBatchSplit handles the batch split request.
 func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (*pdpb.AskBatchSplitResponse, error) {
-	if c.GetUnsafeRecoveryController().IsRunning() {
-		return nil, errs.ErrUnsafeRecoveryIsRunning.FastGenByArgs()
+	if allowed, err := c.checkSchedulingAllowance(); !allowed {
+		return nil, err
 	}
 	if !c.opt.IsTikvRegionSplitEnabled() {
 		return nil, errs.ErrSchedulerTiKVSplitDisabled.FastGenByArgs()
7 changes: 3 additions & 4 deletions server/cluster/coordinator.go
@@ -135,8 +135,7 @@ func (c *coordinator) patrolRegions() {
 			log.Info("patrol regions has been stopped")
 			return
 		}
-		if c.cluster.GetUnsafeRecoveryController().IsRunning() {
-			// Skip patrolling regions during unsafe recovery.
+		if allowed, _ := c.cluster.checkSchedulingAllowance(); !allowed {
 			continue
 		}
 
@@ -540,7 +539,7 @@ func (c *coordinator) collectSchedulerMetrics() {
 		var allowScheduler float64
 		// If the scheduler is not allowed to schedule, it will disappear in Grafana panel.
 		// See issue #1341.
-		if !s.IsPaused() && !s.cluster.GetUnsafeRecoveryController().IsRunning() {
+		if allowed, _ := s.cluster.checkSchedulingAllowance(); !s.IsPaused() && allowed {
 			allowScheduler = 1
 		}
 		schedulerStatusGauge.WithLabelValues(s.GetName(), "allow").Set(allowScheduler)
@@ -947,7 +946,7 @@ func (s *scheduleController) AllowSchedule(diagnosable bool) bool {
 		}
 		return false
 	}
-	if s.IsPaused() || s.cluster.GetUnsafeRecoveryController().IsRunning() {
+	if allowed, _ := s.cluster.checkSchedulingAllowance(); s.IsPaused() || !allowed {
 		if diagnosable {
 			s.diagnosticRecorder.setResultFromStatus(paused)
 		}
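
The coordinator call sites discard the error with _ because these loops and metrics paths only need the boolean; the request handlers above are the ones that report the reason to clients. A sketch of the resulting patrol behavior, assuming the loop shape visible in the first hunk:

// Sketch: the patrol loop keeps ticking while scheduling is disallowed
// and re-checks the allowance on every iteration, so region patrol
// resumes automatically once halt-scheduling is turned off or unsafe
// recovery finishes.
for {
	select {
	case <-timer.C:
	case <-c.ctx.Done():
		log.Info("patrol regions has been stopped")
		return
	}
	if allowed, _ := c.cluster.checkSchedulingAllowance(); !allowed {
		continue
	}
	// ... scan regions and dispatch the checkers ...
}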
9 changes: 9 additions & 0 deletions server/config/config.go
@@ -213,6 +213,7 @@ const (
 	defaultEnableGRPCGateway   = true
 	defaultDisableErrorVerbose = true
 	defaultEnableWitness       = false
+	defaultHaltScheduling      = false
 
 	defaultDashboardAddress = "auto"
 
@@ -684,6 +685,10 @@ type ScheduleConfig struct {
 	// v1: which is based on the region count by rate limit.
 	// v2: which is based on region size by window size.
 	StoreLimitVersion string `toml:"store-limit-version" json:"store-limit-version,omitempty"`
+
+	// HaltScheduling is the option to halt the scheduling. Once it's on, PD will halt the scheduling,
+	// and any other scheduling configs will be ignored.
+	HaltScheduling bool `toml:"halt-scheduling" json:"halt-scheduling,string,omitempty"`
 }
 
 // Clone returns a cloned scheduling configuration.

Review conversation on the new option:

Member: Previously, I was trying to introduce a scheduling mode to cover this case. For me, it's fine to use an individual config to control it. Maybe we could name it enable-scheduling or something else.

JmPotato (Member, Author), May 22, 2023: I think it's best to use a configuration name with a default value of false to control the global scheduling switch, to avoid unexpected behavior in scenarios that require compatibility considerations, such as upgrades. From that perspective, descriptions like "disable" or "halt" are more appropriate. Also, this global scheduling shutdown should not be a long-term state, and we already have the concept of "pause" for individual schedulers. So I ultimately chose the word "halt". WDYT?

Member: While working on #6553, I found that it may be better to use one config for both unsafe recovery and halting, so that we can decouple the dependencies between cluster and coordinator.
@@ -820,6 +825,10 @@ func (c *ScheduleConfig) adjust(meta *configutil.ConfigMetaData, reloading bool)
 		configutil.AdjustString(&c.RegionScoreFormulaVersion, defaultRegionScoreFormulaVersion)
 	}
 
+	if !meta.IsDefined("halt-scheduling") {
+		c.HaltScheduling = defaultHaltScheduling
+	}
+
 	adjustSchedulers(&c.Schedulers, DefaultSchedulers)
 
 	for k, b := range c.migrateConfigurationMap() {
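
For reference, a small round-trip sketch (not part of the PR) showing how the new field is read from a [schedule] section through its toml:"halt-scheduling" tag; config.Config with its Schedule field is PD's existing top-level configuration, and BurntSushi/toml is the decoder PD already uses:

package main

import (
	"fmt"

	"github.com/BurntSushi/toml"
	"github.com/tikv/pd/server/config"
)

const sample = `
[schedule]
halt-scheduling = true
`

func main() {
	var cfg config.Config
	// Fields absent from the fragment keep their zero values until
	// adjust() applies the defaults, as shown in the hunk above.
	if _, err := toml.Decode(sample, &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.Schedule.HaltScheduling) // true
}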
19 changes: 12 additions & 7 deletions server/config/persist_options.go
@@ -184,13 +184,6 @@ func (o *PersistOptions) SetPlacementRulesCacheEnabled(enabled bool) {
 	o.SetReplicationConfig(v)
 }
 
-// SetWitnessEnabled set EanbleWitness
-func (o *PersistOptions) SetWitnessEnabled(enabled bool) {
-	v := o.GetScheduleConfig().Clone()
-	v.EnableWitness = enabled
-	o.SetScheduleConfig(v)
-}
-
 // GetStrictlyMatchLabel returns whether check label strict.
 func (o *PersistOptions) GetStrictlyMatchLabel() bool {
 	return o.GetReplicationConfig().StrictlyMatchLabel
@@ -926,3 +919,15 @@ func (o *PersistOptions) SetAllStoresLimitTTL(ctx context.Context, client *clientv3.Client
 	}
 	return err
 }
+
+// SetHaltScheduling sets HaltScheduling.
+func (o *PersistOptions) SetHaltScheduling(halt bool) {
+	v := o.GetScheduleConfig().Clone()
+	v.HaltScheduling = halt
+	o.SetScheduleConfig(v)
+}
+
+// IsSchedulingHalted returns if PD scheduling is halted.
+func (o *PersistOptions) IsSchedulingHalted() bool {
+	return o.GetScheduleConfig().HaltScheduling
+}
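
A usage sketch for the new accessors (hypothetical operator flow, not from the PR). SetHaltScheduling clones the schedule config before mutating it, so concurrent readers of GetScheduleConfig never observe a partial update:

// Sketch: halt cluster-wide scheduling around a maintenance window.
opt.SetHaltScheduling(true)
defer opt.SetHaltScheduling(false)

if opt.IsSchedulingHalted() {
	log.Info("cluster-wide scheduling is halted")
}
// ... perform the maintenance operation ...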