Commit
ddl: fix panic tidb-server doesn't release table lock (#19586) (#20020)
Signed-off-by: ti-srebot <ti-srebot@pingcap.com>
Signed-off-by: crazycs520 <crazycs520@gmail.com>
ti-srebot authored Sep 21, 2020
1 parent 30cfb6a commit c2dc0c3
Showing 3 changed files with 177 additions and 0 deletions.
44 changes: 44 additions & 0 deletions ddl/ddl.go
@@ -31,6 +31,7 @@ import (
"github.com/pingcap/parser/model"
"github.com/pingcap/parser/mysql"
pumpcli "github.com/pingcap/tidb-tools/tidb-binlog/pump_client"
"github.com/pingcap/tidb/config"
"github.com/pingcap/tidb/ddl/util"
"github.com/pingcap/tidb/infoschema"
"github.com/pingcap/tidb/kv"
@@ -41,6 +42,7 @@ import (
"github.com/pingcap/tidb/sessionctx/binloginfo"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/table"
goutil "github.com/pingcap/tidb/util"
"github.com/pingcap/tidb/util/logutil"
"go.uber.org/zap"
)
@@ -195,6 +197,7 @@ type ddlCtx struct {
lease time.Duration // lease is schema lease.
binlogCli *pumpcli.PumpsClient // binlogCli is used for Binlog.
infoHandle *infoschema.Handle
tableLockCkr util.DeadTableLockChecker

// hook may be modified.
mu struct {
@@ -258,6 +261,7 @@ func newDDL(ctx context.Context, options ...Option) *ddl {
id := uuid.New().String()
var manager owner.Manager
var syncer util.SchemaSyncer
var deadLockCkr util.DeadTableLockChecker
if etcdCli := opt.EtcdCli; etcdCli == nil {
// The etcdCli is nil if the store is localstore which is only used for testing.
// So we use mockOwnerManager and MockSchemaSyncer.
@@ -266,6 +270,7 @@ func newDDL(ctx context.Context, options ...Option) *ddl {
} else {
manager = owner.NewOwnerManager(ctx, etcdCli, ddlPrompt, id, DDLOwnerKey)
syncer = util.NewSchemaSyncer(etcdCli, id, manager)
deadLockCkr = util.NewDeadTableLockChecker(etcdCli)
}

ddlCtx := &ddlCtx{
@@ -277,6 +282,7 @@ func newDDL(ctx context.Context, options ...Option) *ddl {
schemaSyncer: syncer,
binlogCli: binloginfo.GetPumpsClient(),
infoHandle: opt.InfoHandle,
tableLockCkr: deadLockCkr,
}
ddlCtx.mu.hook = opt.Hook
ddlCtx.mu.interceptor = &BaseInterceptor{}
@@ -345,6 +351,10 @@ func (d *ddl) Start(ctxPool *pools.ResourcePool) error {
}

go d.schemaSyncer.StartCleanWork()
if config.TableLockEnabled() {
d.wg.Add(1)
go d.startCleanDeadTableLock()
}
metrics.DDLCounter.WithLabelValues(metrics.StartCleanWork).Inc()
}

@@ -538,6 +548,40 @@ func (d *ddl) GetHook() Callback {
return d.mu.hook
}

func (d *ddl) startCleanDeadTableLock() {
defer func() {
goutil.Recover(metrics.LabelDDL, "startCleanDeadTableLock", nil, false)
d.wg.Done()
}()

ticker := time.NewTicker(time.Second * 10)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if !d.ownerManager.IsOwner() {
continue
}
if d.infoHandle == nil || !d.infoHandle.IsValid() {
continue
}
deadLockTables, err := d.tableLockCkr.GetDeadLockedTables(d.quitCh, d.infoHandle.Get().AllSchemas())
if err != nil {
logutil.BgLogger().Info("[ddl] get dead table lock failed.", zap.Error(err))
continue
}
for se, tables := range deadLockTables {
err := d.CleanDeadTableLock(tables, se)
if err != nil {
logutil.BgLogger().Info("[ddl] clean dead table lock failed.", zap.Error(err))
}
}
case <-d.quitCh:
return
}
}
}

// RecoverInfo contains information needed by DDL.RecoverTable.
type RecoverInfo struct {
SchemaID int64
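
Note: the cleanup goroutine added in the Start hunk above only runs when table locking is enabled in the server configuration, as checked by config.TableLockEnabled(). A minimal sketch of enabling it programmatically follows; the EnableTableLock field and GetGlobalConfig helper are assumed names for this TiDB version's config package and are not part of this commit.

package main

import (
	"fmt"

	"github.com/pingcap/tidb/config"
)

func main() {
	// Sketch only: flip the (assumed) EnableTableLock field on the global config
	// so that config.TableLockEnabled() reports true and ddl.Start spawns
	// startCleanDeadTableLock.
	cfg := config.GetGlobalConfig()
	cfg.EnableTableLock = true

	fmt.Println("table lock enabled:", config.TableLockEnabled())
}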
27 changes: 27 additions & 0 deletions ddl/ddl_api.go
@@ -4438,6 +4438,33 @@ func (d *ddl) UnlockTables(ctx sessionctx.Context, unlockTables []model.TableLoc
return errors.Trace(err)
}

// CleanDeadTableLock is used to clean up dead table locks.
func (d *ddl) CleanDeadTableLock(unlockTables []model.TableLockTpInfo, se model.SessionInfo) error {
if len(unlockTables) == 0 {
return nil
}
arg := &lockTablesArg{
UnlockTables: unlockTables,
SessionInfo: se,
}
job := &model.Job{
SchemaID: unlockTables[0].SchemaID,
TableID: unlockTables[0].TableID,
Type: model.ActionUnlockTable,
BinlogInfo: &model.HistoryInfo{},
Args: []interface{}{arg},
}

ctx, err := d.sessPool.get()
if err != nil {
return err
}
defer d.sessPool.put(ctx)
err = d.doDDLJob(ctx, job)
err = d.callHookOnChanged(err)
return errors.Trace(err)
}

func throwErrIfInMemOrSysDB(ctx sessionctx.Context, dbLowerName string) error {
if util.IsMemOrSysDB(dbLowerName) {
if ctx.GetSessionVars().User != nil {
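
For illustration, the pair of values CleanDeadTableLock receives is the per-session grouping produced by GetDeadLockedTables in the new checker file below. A small, hedged sketch of those shapes, with made-up IDs and only the fields that appear in this diff:

package main

import (
	"fmt"

	"github.com/pingcap/parser/model"
)

func main() {
	// A session whose tidb-server is no longer alive; ServerID is the field
	// the checker compares against the alive-server list from etcd.
	deadSession := model.SessionInfo{ServerID: "dead-server-uuid"}

	// The table locks that session still holds, as recorded in the table info.
	locks := []model.TableLockTpInfo{
		{SchemaID: 1, TableID: 42, Tp: model.TableLockWrite},
	}

	// startCleanDeadTableLock would call d.CleanDeadTableLock(locks, deadSession)
	// with exactly these two values.
	fmt.Printf("would unlock %d table(s) held by %v\n", len(locks), deadSession)
}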
106 changes: 106 additions & 0 deletions ddl/util/dead_table_lock_checker.go
@@ -0,0 +1,106 @@
// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package util

import (
"context"
"strings"
"time"

"github.com/pingcap/errors"
"github.com/pingcap/parser/model"
"github.com/pingcap/tidb/util/logutil"
"go.etcd.io/etcd/clientv3"
"go.uber.org/zap"
)

const (
defaultRetryCnt = 5
defaultRetryInterval = time.Millisecond * 200
defaultTimeout = time.Second
)

// DeadTableLockChecker is used to check for dead table locks.
// If a tidb-server panics or is killed, the table locks it held may not be released.
type DeadTableLockChecker struct {
etcdCli *clientv3.Client
}

// NewDeadTableLockChecker creates a new DeadTableLockChecker.
func NewDeadTableLockChecker(etcdCli *clientv3.Client) DeadTableLockChecker {
return DeadTableLockChecker{
etcdCli: etcdCli,
}
}

func (d *DeadTableLockChecker) getAliveServers(quitCh chan struct{}) (map[string]struct{}, error) {
var err error
var resp *clientv3.GetResponse
allInfos := make(map[string]struct{})
for i := 0; i < defaultRetryCnt; i++ {
select {
case <-quitCh:
return nil, nil
default:
}
childCtx, cancel := context.WithTimeout(context.Background(), defaultTimeout)
resp, err = d.etcdCli.Get(childCtx, DDLAllSchemaVersions, clientv3.WithPrefix())
cancel()
if err != nil {
logutil.BgLogger().Info("[ddl] clean dead table lock get alive servers failed.", zap.Error(err))
time.Sleep(defaultRetryInterval)
continue
}
for _, kv := range resp.Kvs {
serverID := strings.TrimPrefix(string(kv.Key), DDLAllSchemaVersions+"/")
allInfos[serverID] = struct{}{}
}
return allInfos, nil
}
return nil, errors.Trace(err)
}

// GetDeadLockedTables returns the dead-locked tables, grouped by the session that held them on a tidb-server which is no longer alive.
func (d *DeadTableLockChecker) GetDeadLockedTables(quitCh chan struct{}, schemas []*model.DBInfo) (map[model.SessionInfo][]model.TableLockTpInfo, error) {
if d.etcdCli == nil {
return nil, nil
}
aliveServers, err := d.getAliveServers(quitCh)
if err != nil || len(aliveServers) == 0 {
return nil, err
}
deadLockTables := make(map[model.SessionInfo][]model.TableLockTpInfo)
for _, schema := range schemas {
select {
case <-quitCh:
return nil, nil
default:
}
for _, tbl := range schema.Tables {
if tbl.Lock == nil {
continue
}
for _, se := range tbl.Lock.Sessions {
if _, ok := aliveServers[se.ServerID]; !ok {
deadLockTables[se] = append(deadLockTables[se], model.TableLockTpInfo{
SchemaID: schema.ID,
TableID: tbl.ID,
Tp: tbl.Lock.Tp,
})
}
}
}
}
return deadLockTables, nil
}
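
A hedged usage sketch of the new checker outside the DDL owner loop, assuming an etcd client is already available and a schema snapshot has been obtained (endpoints and variable names are illustrative; the signatures are those defined above):

package main

import (
	"log"
	"time"

	"github.com/pingcap/parser/model"
	"github.com/pingcap/tidb/ddl/util"
	"go.etcd.io/etcd/clientv3"
)

func main() {
	// Placeholder etcd endpoint; in a real cluster this is the PD/etcd address.
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	checker := util.NewDeadTableLockChecker(cli)

	quit := make(chan struct{})
	var schemas []*model.DBInfo // in the DDL owner this comes from infoHandle.Get().AllSchemas()

	deadLocks, err := checker.GetDeadLockedTables(quit, schemas)
	if err != nil {
		log.Fatal(err)
	}
	for se, tables := range deadLocks {
		// In the DDL owner these entries are passed to d.CleanDeadTableLock(tables, se).
		log.Printf("session %v still holds %d dead table lock(s)", se, len(tables))
	}
}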
