Skip to content

Allow election of nodes outside voting config #43243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -364,9 +364,8 @@ private void startElection() {
// The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
// to check our mode again here.
if (mode == Mode.CANDIDATE) {
if (electionQuorumContainsLocalNode(getLastAcceptedState()) == false) {
logger.trace("skip election as local node is not part of election quorum: {}",
getLastAcceptedState().coordinationMetaData());
if (localNodeMayWinElection(getLastAcceptedState()) == false) {
logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetaData());
return;
}

Expand All @@ -391,16 +390,17 @@ private void abdicateTo(DiscoveryNode newMaster) {
becomeCandidate("after abdicating to " + newMaster);
}

private static boolean electionQuorumContainsLocalNode(ClusterState lastAcceptedState) {
private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) {
final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode();
assert localNode != null;
return electionQuorumContains(lastAcceptedState, localNode);
return nodeMayWinElection(lastAcceptedState, localNode);
}

private static boolean electionQuorumContains(ClusterState lastAcceptedState, DiscoveryNode node) {
private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) {
final String nodeId = node.getId();
return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId)
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId);
|| lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId)
|| lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId));
}

private Optional<Join> ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
Expand Down Expand Up @@ -837,8 +837,8 @@ public boolean setInitialConfiguration(final VotingConfiguration votingConfigura
metaDataBuilder.coordinationMetaData(coordinationMetaData);

coordinationState.get().setInitialState(ClusterState.builder(currentState).metaData(metaDataBuilder).build());
assert electionQuorumContainsLocalNode(getLastAcceptedState()) :
"initial state does not have local node in its election quorum: " + getLastAcceptedState().coordinationMetaData();
assert localNodeMayWinElection(getLastAcceptedState()) :
"initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetaData();
preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version
startElectionScheduler();
return true;
Expand Down Expand Up @@ -1134,8 +1134,8 @@ public void run() {
if (mode == Mode.CANDIDATE) {
final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState();

if (electionQuorumContainsLocalNode(lastAcceptedState) == false) {
logger.trace("skip prevoting as local node is not part of election quorum: {}",
if (localNodeMayWinElection(lastAcceptedState) == false) {
logger.trace("skip prevoting as local node may not win election: {}",
lastAcceptedState.coordinationMetaData());
return;
}
Expand Down Expand Up @@ -1296,16 +1296,20 @@ public void onSuccess(String source) {
updateMaxTermSeen(getCurrentTerm());

if (mode == Mode.LEADER) {
// if necessary, abdicate to another node or improve the voting configuration
boolean attemptReconfiguration = true;
final ClusterState state = getLastAcceptedState(); // committed state
if (electionQuorumContainsLocalNode(state) == false) {
if (localNodeMayWinElection(state) == false) {
final List<DiscoveryNode> masterCandidates = completedNodes().stream()
.filter(DiscoveryNode::isMasterNode)
.filter(node -> electionQuorumContains(state, node))
.filter(node -> nodeMayWinElection(state, node))
.collect(Collectors.toList());
if (masterCandidates.isEmpty() == false) {
abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size())));
attemptReconfiguration = false;
}
} else {
}
if (attemptReconfiguration) {
scheduleReconfigurationIfNeeded();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ static class VotingConfigNode implements Comparable<VotingConfigNode> {

@Override
public int compareTo(VotingConfigNode other) {
// prefer current master
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
if (currentMasterComp != 0) {
return currentMasterComp;
}
// prefer nodes that are live
final int liveComp = Boolean.compare(other.live, live);
if (liveComp != 0) {
Expand All @@ -160,11 +165,6 @@ public int compareTo(VotingConfigNode other) {
if (inCurrentConfigComp != 0) {
return inCurrentConfigComp;
}
// prefer current master
final int currentMasterComp = Boolean.compare(other.currentMaster, currentMaster);
if (currentMasterComp != 0) {
return currentMasterComp;
}
// tiebreak by node id to have stable ordering
return id.compareTo(other.id);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public void testReconfigurationExamples() {

check(nodes("a"), conf("a"), true, conf("a"));
check(nodes("a", "b"), conf("a"), true, conf("a"));
check(nodes("a", "b"), conf("b"), true, conf("b"));
check(nodes("a", "b"), conf("b"), true, conf("a"));
check(nodes("a", "b"), conf("a", "c"), true, conf("a"));
check(nodes("a", "b"), conf("a", "b"), true, conf("a"));
check(nodes("a", "b"), conf("a", "b", "e"), true, conf("a", "b", "e"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,38 @@
*/
package org.elasticsearch.cluster.coordination;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction;
import org.elasticsearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Priority;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.TransportService;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;

@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.nullValue;

@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, autoManageMasterNodes = false)
public class VotingConfigurationIT extends ESIntegTestCase {

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Collections.singletonList(MockTransportService.TestPlugin.class);
}

public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionException, InterruptedException {
internalCluster().setBootstrapMasterNodeIndex(0);
internalCluster().startNodes(2);
final String originalMaster = internalCluster().getMasterName();

Expand All @@ -38,4 +59,56 @@ public void testAbdicateAfterVotingConfigExclusionAdded() throws ExecutionExcept
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
assertNotEquals(originalMaster, internalCluster().getMasterName());
}

public void testElectsNodeNotInVotingConfiguration() throws Exception {
internalCluster().setBootstrapMasterNodeIndex(0);
final List<String> nodeNames = internalCluster().startNodes(4);

// a 4-node cluster settles on a 3-node configuration; we then prevent the nodes in the configuration from winning an election
// by failing at the pre-voting stage, so that the extra node must be elected instead when the master shuts down. This extra node
// should then add itself into the voting configuration.

assertFalse(internalCluster().client().admin().cluster().prepareHealth()
.setWaitForNodes("4").setWaitForEvents(Priority.LANGUID).get().isTimedOut());

String excludedNodeName = null;
final ClusterState clusterState
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
final Set<String> votingConfiguration = clusterState.getLastCommittedConfiguration().getNodeIds();
assertThat(votingConfiguration, hasSize(3));
assertThat(clusterState.nodes().getSize(), equalTo(4));
assertThat(votingConfiguration, hasItem(clusterState.nodes().getMasterNodeId()));
for (DiscoveryNode discoveryNode : clusterState.nodes()) {
if (votingConfiguration.contains(discoveryNode.getId()) == false) {
assertThat(excludedNodeName, nullValue());
excludedNodeName = discoveryNode.getName();
}
}

for (final String sender : nodeNames) {
if (sender.equals(excludedNodeName)) {
continue;
}
final MockTransportService senderTransportService
= (MockTransportService) internalCluster().getInstance(TransportService.class, sender);
for (final String receiver : nodeNames) {
senderTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, receiver),
(connection, requestId, action, request, options) -> {
if (action.equals(PreVoteCollector.REQUEST_PRE_VOTE_ACTION_NAME)) {
throw new ElasticsearchException("rejected");
}
connection.sendRequest(requestId, action, request, options);
});
}
}

internalCluster().stopCurrentMasterNode();
assertFalse(internalCluster().client().admin().cluster().prepareHealth()
.setWaitForNodes("3").setWaitForEvents(Priority.LANGUID).get().isTimedOut());

final ClusterState newClusterState
= internalCluster().client().admin().cluster().prepareState().clear().setNodes(true).setMetaData(true).get().getState();
assertThat(newClusterState.nodes().getMasterNode().getName(), equalTo(excludedNodeName));
assertThat(newClusterState.getLastCommittedConfiguration().getNodeIds(), hasItem(newClusterState.nodes().getMasterNodeId()));
}
}