Skip to content

reduce unnecessary election after etcd upgrade to v3.5 #8552

Open
@lhy1024

Description

Enhancement Task

Run test

start.sh

#!/bin/bash
bash close.sh
make pd-server
rm -rf /tmp/pd* /tmp/tikv
echo "Starting pd1..."
./bin/pd-server --data-dir /tmp/pd1 --name pd1 --client-urls "http://127.0.0.1:2379" &>pd1.log &
sleep 10
echo "Starting tikv..."
~/.tiup/components/tikv/v8.3.0-alpha-nightly/tikv-server --pd "127.0.0.1:2379" --data-dir "/tmp/tikv" &>tikv.log &
sleep 10
echo "Starting pd2..."
./bin/pd-server --data-dir /tmp/pd2 --join "http://127.0.0.1:2379" --client-urls "http://127.0.0.1:3379" --peer-urls "http://127.0.0.1:3380" --name pd2 &>"pd2.log" &
sleep 10
./bin/pd-ctl -u "http://127.0.0.1:2379" member show

echo "All processes are started."

close.sh

#!/bin/bash

pd1_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd1' | awk '{print $2}')
pd2_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd2' | awk '{print $2}')
tikv_pid=$(ps aux | grep '[t]ikv-server --pd' | awk '{print $2}')

if [ ! -z "$pd1_pid" ]; then
    echo "Ending pd1..."
    kill $pd1_pid
fi

if [ ! -z "$pd2_pid" ]; then
    echo "Ending pd2..."
    kill $pd2_pid
fi

if [ ! -z "$tikv_pid" ]; then
    echo "Ending tikv..."
    kill $tikv_pid
fi

while [[ ! -z "$pd1_pid" || ! -z "$pd2_pid" || ! -z "$tikv_pid" ]]; do
    pd1_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd1' | awk '{print $2}')
    pd2_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd2' | awk '{print $2}')
    tikv_pid=$(ps aux | grep '[t]ikv-server --pd' | awk '{print $2}')
    sleep 1
done

echo "All processes are ended."

Result

Before etcd upgrade, the git hash of pd is 3b37572. There is no additional election and pd leader is always pd1.

After etcd upgrade, the git hash of pd is 10cbdcf. There are some elections.

pd1 log

[2024/08/21 16:00:33.144 +08:00] [INFO] [cluster.go:421] ["added member"] [cluster-id=d985448f425c49d8] [local-member-id=3037d9ad26fb2231] [added-peer-id=8ab18be212274ed3] [added-peer-peer-urls="[http://127.0.0.1:3380]"]
[2024/08/21 16:00:33.144 +08:00] [INFO] [peer.go:133] ["starting remote peer"] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [pipeline.go:72] ["started HTTP pipelining with remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [peer.go:137] ["started remote peer"] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [transport.go:317] ["added remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3] [remote-peer-urls="[http://127.0.0.1:3380]"]
[2024/08/21 16:00:33.144 +08:00] [INFO] [server.go:1996] ["applied a configuration change through raft"] [local-member-id=3037d9ad26fb2231] [raft-conf-change=ConfChangeAddNode] [raft-conf-change-node-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:169] ["started stream writer with remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:169] ["started stream writer with remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:395] ["started stream reader with remote peer"] [stream-reader-type="stream MsgApp v2"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:395] ["started stream reader with remote peer"] [stream-reader-type="stream Message"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.668 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:34.010 +08:00] [WARN] [manager.go:121] ["failed to get member client urls"]
[2024/08/21 16:00:34.171 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:34.674 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:35.010 +08:00] [WARN] [manager.go:121] ["failed to get member client urls"]
[2024/08/21 16:00:35.176 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:35.677 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:36.008 +08:00] [WARN] [manager.go:121] ["failed to get member client urls"]
[2024/08/21 16:00:36.015 +08:00] [INFO] [lease.go:150] ["keep alive lease too slow"] [timeout-duration=3s] [actual-expire=2024/08/21 16:00:38.014 +08:00] [purpose="leader election"]
[2024/08/21 16:00:36.015 +08:00] [INFO] [lease.go:151] ["lease keep alive stopped"] [purpose="leader election"]
[2024/08/21 16:00:36.015 +08:00] [WARN] [lease.go:185] ["lease keep alive failed"] [purpose="leader election"] [start=2024/08/21 16:00:35.015 +08:00] [error="context canceled"]
[2024/08/21 16:00:36.015 +08:00] [WARN] [lease.go:185] ["lease keep alive failed"] [purpose="leader election"] [start=2024/08/21 16:00:36.015 +08:00] [error="context canceled"]
[2024/08/21 16:00:36.015 +08:00] [WARN] [lease.go:185] ["lease keep alive failed"] [purpose="leader election"] [start=2024/08/21 16:00:34.017 +08:00] [error="context canceled"]
[2024/08/21 16:00:36.015 +08:00] [INFO] [lease.go:202] ["stop lease keep alive worker"] [purpose="leader election"]
[2024/08/21 16:00:36.178 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:36.187 +08:00] [WARN] [util.go:170] ["apply request took too long"] [took=2.001309535s] [expected-duration=100ms] [prefix="read-only range "] [request="key:\"/topology/tidb/\" range_end:\"/topology/tidb0\" "] [response=] [error="context deadline exceeded"]
[2024/08/21 16:00:36.187 +08:00] [INFO] [trace.go:171] ["trace[2123652000] range"] [detail="{range_begin:/topology/tidb/; range_end:/topology/tidb0; }"] [duration=2.00138954s] [start=2024/08/21 16:00:34.185 +08:00] [end=2024/08/21 16:00:36.187 +08:00] [steps="[\"trace[2123652000] 'agreement among raft nodes before linearized reading'  (duration: 2.001307966s)\"]"] [step_count=1]
[2024/08/21 16:00:36.187 +08:00] [WARN] [interceptor.go:197] ["request stats"] ["start time"=2024/08/21 16:00:34.185 +08:00] ["time spent"=2.001460417s] [remote=127.0.0.1:51080] ["response type"=/etcdserverpb.KV/Range] ["request count"=0] ["request size"=34] ["response count"=0] ["response size"=0] ["request content"="key:\"/topology/tidb/\" range_end:\"/topology/tidb0\" "]
[2024/08/21 16:00:36.187 +08:00] [WARN] [retry_interceptor.go:63] ["retrying of unary invoker failed"] [target=etcd-endpoints://0xc001b7e000/127.0.0.1:2379] [attempt=0] [error="rpc error: code = DeadlineExceeded desc = context deadline exceeded"]
[2024/08/21 16:00:36.497 +08:00] [WARN] [raft] [zap_raft.go:85] ["3037d9ad26fb2231 stepped down to follower since quorum is not active"]
[2024/08/21 16:00:36.497 +08:00] [INFO] [raft] [zap_raft.go:77] ["3037d9ad26fb2231 became follower at term 2"]
[2024/08/21 16:00:36.497 +08:00] [INFO] [raft] [zap_raft.go:77] ["raft.node: 3037d9ad26fb2231 lost leader 3037d9ad26fb2231 at term 2"]
[2024/08/21 16:00:36.519 +08:00] [INFO] [server.go:1814] ["etcd leader changed, resigns pd leadership"] [old-pd-leader-name=pd1]

Activity

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Assignees

No one assigned

    Labels

    type/enhancementThe issue or PR belongs to an enhancement.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions