reduce unnecessary election after etcd upgrade to v3.5 #8552
Open
Description
Enhancement Task
Run test
start.sh
#!/bin/bash
bash close.sh
make pd-server
rm -rf /tmp/pd* /tmp/tikv
echo "Starting pd1..."
./bin/pd-server --data-dir /tmp/pd1 --name pd1 --client-urls "http://127.0.0.1:2379" &>pd1.log &
sleep 10
echo "Starting tikv..."
~/.tiup/components/tikv/v8.3.0-alpha-nightly/tikv-server --pd "127.0.0.1:2379" --data-dir "/tmp/tikv" &>tikv.log &
sleep 10
echo "Starting pd2..."
./bin/pd-server --data-dir /tmp/pd2 --join "http://127.0.0.1:2379" --client-urls "http://127.0.0.1:3379" --peer-urls "http://127.0.0.1:3380" --name pd2 &>"pd2.log" &
sleep 10
./bin/pd-ctl -u "http://127.0.0.1:2379" member show
echo "All processes are started."
close.sh
#!/bin/bash
pd1_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd1' | awk '{print $2}')
pd2_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd2' | awk '{print $2}')
tikv_pid=$(ps aux | grep '[t]ikv-server --pd' | awk '{print $2}')
if [ ! -z "$pd1_pid" ]; then
echo "Ending pd1..."
kill $pd1_pid
fi
if [ ! -z "$pd2_pid" ]; then
echo "Ending pd2..."
kill $pd2_pid
fi
if [ ! -z "$tikv_pid" ]; then
echo "Ending tikv..."
kill $tikv_pid
fi
while [[ ! -z "$pd1_pid" || ! -z "$pd2_pid" || ! -z "$tikv_pid" ]]; do
pd1_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd1' | awk '{print $2}')
pd2_pid=$(ps aux | grep '[p]d-server --data-dir /tmp/pd2' | awk '{print $2}')
tikv_pid=$(ps aux | grep '[t]ikv-server --pd' | awk '{print $2}')
sleep 1
done
echo "All processes are ended."
Result
Before etcd upgrade, the git hash of pd is 3b37572. There is no additional election and pd leader is always pd1.
After etcd upgrade, the git hash of pd is 10cbdcf. There are some elections.
pd1 log
[2024/08/21 16:00:33.144 +08:00] [INFO] [cluster.go:421] ["added member"] [cluster-id=d985448f425c49d8] [local-member-id=3037d9ad26fb2231] [added-peer-id=8ab18be212274ed3] [added-peer-peer-urls="[http://127.0.0.1:3380]"]
[2024/08/21 16:00:33.144 +08:00] [INFO] [peer.go:133] ["starting remote peer"] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [pipeline.go:72] ["started HTTP pipelining with remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [peer.go:137] ["started remote peer"] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [transport.go:317] ["added remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3] [remote-peer-urls="[http://127.0.0.1:3380]"]
[2024/08/21 16:00:33.144 +08:00] [INFO] [server.go:1996] ["applied a configuration change through raft"] [local-member-id=3037d9ad26fb2231] [raft-conf-change=ConfChangeAddNode] [raft-conf-change-node-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:169] ["started stream writer with remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:169] ["started stream writer with remote peer"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:395] ["started stream reader with remote peer"] [stream-reader-type="stream MsgApp v2"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.144 +08:00] [INFO] [stream.go:395] ["started stream reader with remote peer"] [stream-reader-type="stream Message"] [local-member-id=3037d9ad26fb2231] [remote-peer-id=8ab18be212274ed3]
[2024/08/21 16:00:33.668 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:34.010 +08:00] [WARN] [manager.go:121] ["failed to get member client urls"]
[2024/08/21 16:00:34.171 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:34.674 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:35.010 +08:00] [WARN] [manager.go:121] ["failed to get member client urls"]
[2024/08/21 16:00:35.176 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:35.677 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:36.008 +08:00] [WARN] [manager.go:121] ["failed to get member client urls"]
[2024/08/21 16:00:36.015 +08:00] [INFO] [lease.go:150] ["keep alive lease too slow"] [timeout-duration=3s] [actual-expire=2024/08/21 16:00:38.014 +08:00] [purpose="leader election"]
[2024/08/21 16:00:36.015 +08:00] [INFO] [lease.go:151] ["lease keep alive stopped"] [purpose="leader election"]
[2024/08/21 16:00:36.015 +08:00] [WARN] [lease.go:185] ["lease keep alive failed"] [purpose="leader election"] [start=2024/08/21 16:00:35.015 +08:00] [error="context canceled"]
[2024/08/21 16:00:36.015 +08:00] [WARN] [lease.go:185] ["lease keep alive failed"] [purpose="leader election"] [start=2024/08/21 16:00:36.015 +08:00] [error="context canceled"]
[2024/08/21 16:00:36.015 +08:00] [WARN] [lease.go:185] ["lease keep alive failed"] [purpose="leader election"] [start=2024/08/21 16:00:34.017 +08:00] [error="context canceled"]
[2024/08/21 16:00:36.015 +08:00] [INFO] [lease.go:202] ["stop lease keep alive worker"] [purpose="leader election"]
[2024/08/21 16:00:36.178 +08:00] [WARN] [v3_server.go:920] ["waiting for ReadIndex response took too long, retrying"] [sent-request-id=2463910398279564631] [retry-timeout=500ms]
[2024/08/21 16:00:36.187 +08:00] [WARN] [util.go:170] ["apply request took too long"] [took=2.001309535s] [expected-duration=100ms] [prefix="read-only range "] [request="key:\"/topology/tidb/\" range_end:\"/topology/tidb0\" "] [response=] [error="context deadline exceeded"]
[2024/08/21 16:00:36.187 +08:00] [INFO] [trace.go:171] ["trace[2123652000] range"] [detail="{range_begin:/topology/tidb/; range_end:/topology/tidb0; }"] [duration=2.00138954s] [start=2024/08/21 16:00:34.185 +08:00] [end=2024/08/21 16:00:36.187 +08:00] [steps="[\"trace[2123652000] 'agreement among raft nodes before linearized reading' (duration: 2.001307966s)\"]"] [step_count=1]
[2024/08/21 16:00:36.187 +08:00] [WARN] [interceptor.go:197] ["request stats"] ["start time"=2024/08/21 16:00:34.185 +08:00] ["time spent"=2.001460417s] [remote=127.0.0.1:51080] ["response type"=/etcdserverpb.KV/Range] ["request count"=0] ["request size"=34] ["response count"=0] ["response size"=0] ["request content"="key:\"/topology/tidb/\" range_end:\"/topology/tidb0\" "]
[2024/08/21 16:00:36.187 +08:00] [WARN] [retry_interceptor.go:63] ["retrying of unary invoker failed"] [target=etcd-endpoints://0xc001b7e000/127.0.0.1:2379] [attempt=0] [error="rpc error: code = DeadlineExceeded desc = context deadline exceeded"]
[2024/08/21 16:00:36.497 +08:00] [WARN] [raft] [zap_raft.go:85] ["3037d9ad26fb2231 stepped down to follower since quorum is not active"]
[2024/08/21 16:00:36.497 +08:00] [INFO] [raft] [zap_raft.go:77] ["3037d9ad26fb2231 became follower at term 2"]
[2024/08/21 16:00:36.497 +08:00] [INFO] [raft] [zap_raft.go:77] ["raft.node: 3037d9ad26fb2231 lost leader 3037d9ad26fb2231 at term 2"]
[2024/08/21 16:00:36.519 +08:00] [INFO] [server.go:1814] ["etcd leader changed, resigns pd leadership"] [old-pd-leader-name=pd1]
Activity