1
1
use std:: cmp:: Ordering ;
2
2
3
3
/// Ballot Leader Election algorithm for electing new leaders
4
- use crate :: util:: { defaults:: * , ConfigurationId , FlexibleQuorum , Quorum } ;
4
+ use crate :: {
5
+ sequence_paxos:: { Phase , Role } ,
6
+ util:: { defaults:: * , ConfigurationId , FlexibleQuorum , Quorum } ,
7
+ } ;
5
8
6
9
#[ cfg( feature = "logging" ) ]
7
10
use crate :: utils:: logger:: create_logger;
@@ -15,7 +18,7 @@ use crate::{
15
18
#[ cfg( feature = "serde" ) ]
16
19
use serde:: { Deserialize , Serialize } ;
17
20
#[ cfg( feature = "logging" ) ]
18
- use slog:: { debug , info, trace, warn, Logger } ;
21
+ use slog:: { info, trace, warn, Logger } ;
19
22
20
23
/// Used to define a Sequence Paxos epoch
21
24
#[ derive( Clone , Copy , Eq , Debug , Default , PartialEq ) ]
@@ -59,8 +62,8 @@ impl PartialOrd for Ballot {
59
62
}
60
63
}
61
64
62
- /// The connectivity of an OmniPaxos node
63
- pub ( crate ) type Connectivity = u8 ;
65
+ const INITIAL_ROUND : u32 = 1 ;
66
+ const RECOVERY_ROUND : u32 = 0 ;
64
67
65
68
/// A Ballot Leader Election component. Used in conjunction with OmniPaxos to handle the election of a leader for a cluster of OmniPaxos servers,
66
69
/// incoming messages and produces outgoing messages that the user has to fetch periodically and send using a network implementation.
@@ -74,18 +77,18 @@ pub(crate) struct BallotLeaderElection {
74
77
peers : Vec < NodeId > ,
75
78
/// The current round of the heartbeat cycle.
76
79
hb_round : u32 ,
77
- /// Vector which temporarily holds all the received heartbeats from one heartbeat round, including the current node .
78
- ballots : Vec < ( Ballot , Connectivity ) > ,
79
- /// Vector that holds all the received heartbeats from the previous heartbeat round, including the current node.
80
+ /// The heartbeat replies this instance received during the current round .
81
+ heartbeat_replies : Vec < HeartbeatReply > ,
82
+ /// Vector that holds all the received heartbeats from the previous heartbeat round, including the current node. Only used to display the connectivity of this node in the UI.
80
83
/// Represents nodes that are currently alive from the view of the current node.
81
- prev_round_ballots : Vec < ( Ballot , Connectivity ) > ,
84
+ prev_replies : Vec < HeartbeatReply > ,
82
85
/// Holds the current ballot of this instance.
83
86
current_ballot : Ballot ,
84
- /// The number of replicas inside the cluster that this instance is
85
- /// connected to (based on heartbeats received) including itself.
86
- connectivity : Connectivity ,
87
- /// Current elected leader.
88
- leader : Option < Ballot > ,
87
+ /// The current leader of this instance.
88
+ leader : Ballot ,
89
+ /// A happy node either sees that it is, is connected to, or sees evidence of a potential leader
90
+ /// for the cluster. If a node is unhappy then it is seeking a new leader.
91
+ happy : bool ,
89
92
/// The number of replicas inside the cluster whose heartbeats are needed to become and remain the leader.
90
93
quorum : Quorum ,
91
94
/// Vector which holds all the outgoing messages of the BLE instance.
@@ -97,23 +100,31 @@ pub(crate) struct BallotLeaderElection {
97
100
98
101
impl BallotLeaderElection {
99
102
/// Construct a new BallotLeaderElection node
100
- pub ( crate ) fn with ( config : BLEConfig , initial_leader : Option < Ballot > ) -> Self {
103
+ pub ( crate ) fn with ( config : BLEConfig , recovered_leader : Option < Ballot > ) -> Self {
101
104
let config_id = config. configuration_id ;
102
105
let pid = config. pid ;
103
106
let peers = config. peers ;
104
107
let num_nodes = & peers. len ( ) + 1 ;
105
108
let quorum = Quorum :: with ( config. flexible_quorum , num_nodes) ;
106
- let initial_ballot = Ballot :: with ( config_id, 0 , config. priority , pid) ;
109
+ let mut initial_ballot = Ballot :: with ( config_id, INITIAL_ROUND , config. priority , pid) ;
110
+ let initial_leader = match recovered_leader {
111
+ Some ( b) if b != Ballot :: default ( ) => {
112
+ // Prevents a recovered server from retaining BLE leadership with the same ballot.
113
+ initial_ballot. n = RECOVERY_ROUND ;
114
+ b
115
+ }
116
+ _ => initial_ballot,
117
+ } ;
107
118
let mut ble = BallotLeaderElection {
108
119
configuration_id : config_id,
109
120
pid,
110
121
peers,
111
122
hb_round : 0 ,
112
- ballots : Vec :: with_capacity ( num_nodes) ,
113
- prev_round_ballots : Vec :: with_capacity ( num_nodes) ,
123
+ heartbeat_replies : Vec :: with_capacity ( num_nodes) ,
124
+ prev_replies : Vec :: with_capacity ( num_nodes) ,
114
125
current_ballot : initial_ballot,
115
- connectivity : num_nodes as Connectivity ,
116
126
leader : initial_leader,
127
+ happy : true ,
117
128
quorum,
118
129
outgoing : Vec :: with_capacity ( config. buffer_size ) ,
119
130
#[ cfg( feature = "logging" ) ]
@@ -160,70 +171,20 @@ impl BallotLeaderElection {
160
171
}
161
172
}
162
173
163
- fn check_leader ( & mut self ) -> Option < Ballot > {
164
- let ballots = std:: mem:: take ( & mut self . ballots ) ;
165
- let top_accept_ballot = ballots
166
- . iter ( )
167
- . filter_map ( |& ( ballot, connectivity) | {
168
- if self . quorum . is_accept_quorum ( connectivity as usize ) {
169
- Some ( ballot)
170
- } else {
171
- None
172
- }
173
- } )
174
- . max ( )
175
- . unwrap_or_default ( ) ;
176
- let leader_ballot = self . leader . unwrap_or_default ( ) ;
177
- if top_accept_ballot == leader_ballot {
178
- // leader is still alive and has accept quorum
179
- None
180
- } else {
181
- // leader is dead || changed priority || doesn't have an accept quorum
182
- let top_prepare_ballot = ballots
183
- . iter ( )
184
- . filter_map ( |& ( ballot, connectivity) | {
185
- if self . quorum . is_prepare_quorum ( connectivity as usize ) {
186
- Some ( ballot)
187
- } else {
188
- None
189
- }
190
- } )
191
- . max ( )
192
- . unwrap_or_default ( ) ;
193
- if top_prepare_ballot > leader_ballot {
194
- // new leader with prepare quorum
195
- let new_leader = top_prepare_ballot;
196
- self . leader = Some ( new_leader) ;
197
- #[ cfg( feature = "logging" ) ]
198
- debug ! (
199
- self . logger,
200
- "BLE {}, New Leader elected: {:?}" , self . pid, new_leader
201
- ) ;
202
- Some ( new_leader)
203
- } else {
204
- // nobody has taken over leadership, let's try to ourselves
205
- self . current_ballot . n = leader_ballot. n + 1 ;
206
- self . leader = None ;
207
- None
208
- }
209
- }
210
- }
211
-
212
174
/// Initiates a new heartbeat round.
213
175
pub ( crate ) fn new_hb_round ( & mut self ) {
176
+ self . prev_replies = std:: mem:: take ( & mut self . heartbeat_replies ) ;
214
177
self . hb_round += 1 ;
215
178
#[ cfg( feature = "logging" ) ]
216
179
trace ! (
217
180
self . logger,
218
181
"Initiate new heartbeat round: {}" ,
219
182
self . hb_round
220
183
) ;
221
-
222
184
for peer in & self . peers {
223
185
let hb_request = HeartbeatRequest {
224
186
round : self . hb_round ,
225
187
} ;
226
-
227
188
self . outgoing . push ( BLEMessage {
228
189
from : self . pid ,
229
190
to : * peer,
@@ -232,41 +193,89 @@ impl BallotLeaderElection {
232
193
}
233
194
}
234
195
235
- pub ( crate ) fn hb_timeout ( & mut self ) -> Option < Ballot > {
236
- let my_connectivity = self . ballots . len ( ) + 1 ;
237
- self . connectivity = my_connectivity as Connectivity ;
238
- // Add our own ballot to the list of received ballots of current hb round
239
- self . ballots . push ( ( self . current_ballot , self . connectivity ) ) ;
240
- self . prev_round_ballots = self . ballots . clone ( ) ;
241
- let result: Option < Ballot > = if self . quorum . is_prepare_quorum ( my_connectivity) {
242
- #[ cfg( feature = "logging" ) ]
243
- debug ! (
244
- self . logger,
245
- "Received a majority of heartbeats, round: {}, {:?}" , self . hb_round, self . ballots
246
- ) ;
247
- self . check_leader ( )
196
+ /// End of a heartbeat round. Returns current leader and election status.
197
+ pub ( crate ) fn hb_timeout (
198
+ & mut self ,
199
+ seq_paxos_state : & ( Role , Phase ) ,
200
+ seq_paxos_promise : Ballot ,
201
+ ) -> Option < Ballot > {
202
+ self . update_leader ( ) ;
203
+ self . update_happiness ( seq_paxos_state) ;
204
+ self . check_takeover ( ) ;
205
+ self . new_hb_round ( ) ;
206
+ if seq_paxos_promise > self . leader {
207
+ // Sync leader with Paxos promise in case ballot didn't make it to BLE followers
208
+ self . leader = seq_paxos_promise;
209
+ self . happy = true ;
210
+ }
211
+ if self . leader == self . current_ballot {
212
+ Some ( self . current_ballot )
248
213
} else {
249
- #[ cfg( feature = "logging" ) ]
250
- warn ! (
251
- self . logger,
252
- "Did not receive a majority of heartbeats, round: {}, {:?}" ,
253
- self . hb_round,
254
- self . ballots
255
- ) ;
256
- self . ballots . clear ( ) ;
257
214
None
215
+ }
216
+ }
217
+
218
+ fn update_leader ( & mut self ) {
219
+ let max_reply_ballot = self . heartbeat_replies . iter ( ) . map ( |r| r. ballot ) . max ( ) ;
220
+ if let Some ( max) = max_reply_ballot {
221
+ if max > self . leader {
222
+ self . leader = max;
223
+ }
224
+ }
225
+ }
226
+
227
+ fn update_happiness ( & mut self , seq_paxos_state : & ( Role , Phase ) ) {
228
+ self . happy = if self . leader == self . current_ballot {
229
+ let potential_followers = self
230
+ . heartbeat_replies
231
+ . iter ( )
232
+ . filter ( |hb_reply| hb_reply. leader <= self . current_ballot )
233
+ . count ( ) ;
234
+ let can_form_quorum = match seq_paxos_state {
235
+ ( Role :: Leader , Phase :: Accept ) => {
236
+ self . quorum . is_accept_quorum ( potential_followers + 1 )
237
+ }
238
+ _ => self . quorum . is_prepare_quorum ( potential_followers + 1 ) ,
239
+ } ;
240
+ if can_form_quorum {
241
+ true
242
+ } else {
243
+ let see_larger_happy_leader = self
244
+ . heartbeat_replies
245
+ . iter ( )
246
+ . any ( |r| r. leader > self . current_ballot && r. happy ) ;
247
+ see_larger_happy_leader
248
+ }
249
+ } else {
250
+ self . heartbeat_replies
251
+ . iter ( )
252
+ . any ( |r| r. ballot == self . leader && r. happy )
258
253
} ;
259
- self . new_hb_round ( ) ;
260
- result
254
+ }
255
+
256
+ fn check_takeover ( & mut self ) {
257
+ if !self . happy {
258
+ let all_neighbors_unhappy = self . heartbeat_replies . iter ( ) . all ( |r| !r. happy ) ;
259
+ let im_quorum_connected = self
260
+ . quorum
261
+ . is_prepare_quorum ( self . heartbeat_replies . len ( ) + 1 ) ;
262
+ if all_neighbors_unhappy && im_quorum_connected {
263
+ // We increment past our leader instead of max of unhappy ballots because we
264
+ // assume we have already checked leader for this round so they should be equal
265
+ self . current_ballot . n = self . leader . n + 1 ;
266
+ self . leader = self . current_ballot ;
267
+ self . happy = true ;
268
+ }
269
+ }
261
270
}
262
271
263
272
fn handle_request ( & mut self , from : NodeId , req : HeartbeatRequest ) {
264
273
let hb_reply = HeartbeatReply {
265
274
round : req. round ,
266
275
ballot : self . current_ballot ,
267
- connectivity : self . connectivity ,
276
+ leader : self . leader ,
277
+ happy : self . happy ,
268
278
} ;
269
-
270
279
self . outgoing . push ( BLEMessage {
271
280
from : self . pid ,
272
281
to : from,
@@ -276,7 +285,7 @@ impl BallotLeaderElection {
276
285
277
286
fn handle_reply ( & mut self , rep : HeartbeatReply ) {
278
287
if rep. round == self . hb_round && rep. ballot . config_id == self . configuration_id {
279
- self . ballots . push ( ( rep. ballot , rep . connectivity ) ) ;
288
+ self . heartbeat_replies . push ( rep) ;
280
289
} else {
281
290
#[ cfg( feature = "logging" ) ]
282
291
warn ! (
@@ -290,8 +299,8 @@ impl BallotLeaderElection {
290
299
self . current_ballot
291
300
}
292
301
293
- pub ( crate ) fn get_ballots ( & self ) -> Vec < ( Ballot , Connectivity ) > {
294
- self . prev_round_ballots . clone ( )
302
+ pub ( crate ) fn get_ballots ( & self ) -> Vec < HeartbeatReply > {
303
+ self . prev_replies . clone ( )
295
304
}
296
305
}
297
306
0 commit comments