Skip to content

[Zen2] Add lag detector #35685

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Nov 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
private long maxTermSeen;
private final Reconfigurator reconfigurator;
private final ClusterBootstrapService clusterBootstrapService;
private final LagDetector lagDetector;

private Mode mode;
private Optional<DiscoveryNode> lastKnownLeader;
Expand Down Expand Up @@ -156,6 +157,8 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
masterService.setClusterStateSupplier(this::getStateForMasterService);
this.reconfigurator = new Reconfigurator(settings, clusterSettings);
this.clusterBootstrapService = new ClusterBootstrapService(settings, transportService);
this.lagDetector = new LagDetector(settings, transportService.getThreadPool(), n -> removeNode(n, "lagging"),
transportService::getLocalNode);
}

private Runnable getOnLeaderFailure() {
Expand Down Expand Up @@ -373,6 +376,7 @@ void becomeCandidate(String method) {

followersChecker.clearCurrentNodes();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
lagDetector.clearTrackedNodes();

if (applierState.nodes().getMasterNodeId() != null) {
applierState = clusterStateWithNoMasterBlock(applierState);
Expand Down Expand Up @@ -427,6 +431,7 @@ void becomeFollower(String method, DiscoveryNode leaderNode) {

followersChecker.clearCurrentNodes();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
lagDetector.clearTrackedNodes();
}

private PreVoteResponse getPreVoteResponse() {
Expand Down Expand Up @@ -511,6 +516,11 @@ public void invariant() {
assert (applierState.nodes().getMasterNodeId() == null) == applierState.blocks().hasGlobalBlock(NO_MASTER_BLOCK_WRITES.id());
assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse())
: preVoteCollector + " vs " + getPreVoteResponse();

assert lagDetector.getTrackedNodes().contains(getLocalNode()) == false : lagDetector.getTrackedNodes();
assert followersChecker.getKnownFollowers().equals(lagDetector.getTrackedNodes())
: followersChecker.getKnownFollowers() + " vs " + lagDetector.getTrackedNodes();

if (mode == Mode.LEADER) {
final boolean becomingMaster = getStateForMasterService().term() != getCurrentTerm();

Expand Down Expand Up @@ -830,8 +840,10 @@ public String toString() {
}
});

leaderChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
followersChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes();
leaderChecker.setCurrentNodes(publishNodes);
followersChecker.setCurrentNodes(publishNodes);
lagDetector.setTrackedNodes(publishNodes);
publication.start(followersChecker.getFaultyNodes());
}
} catch (Exception e) {
Expand Down Expand Up @@ -984,6 +996,9 @@ public void onNodeAck(DiscoveryNode node, Exception e) {
}
} else {
ackListener.onNodeAck(node, e);
if (e == null) {
lagDetector.setAppliedVersion(node, publishRequest.getAcceptedState().version());
}
}
}
},
Expand Down Expand Up @@ -1050,6 +1065,7 @@ public void onSuccess(String source) {
if (mode == Mode.LEADER) {
scheduleReconfigurationIfNeeded();
}
lagDetector.startLagDetector(publishRequest.getAcceptedState().version());
}
ackListener.onNodeAck(getLocalNode(), null);
publishListener.onResponse(null);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.coordination;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;

import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;

/**
* A publication can succeed and complete before all nodes have applied the published state and acknowledged it; however we need every node
* eventually either to apply the published state (or a later state) or be removed from the cluster. This component achieves this by
* removing any lagging nodes from the cluster after a timeout.
*/
public class LagDetector {

private static final Logger logger = LogManager.getLogger(LagDetector.class);

// the timeout for each node to apply a cluster state update after the leader has applied it, before being removed from the cluster
public static final Setting<TimeValue> CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING =
Setting.timeSetting("cluster.follower_lag.timeout",
TimeValue.timeValueMillis(90000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);

private final TimeValue clusterStateApplicationTimeout;
private final Consumer<DiscoveryNode> onLagDetected;
private final Supplier<DiscoveryNode> localNodeSupplier;
private final ThreadPool threadPool;
private final Map<DiscoveryNode, NodeAppliedStateTracker> appliedStateTrackersByNode = newConcurrentMap();

public LagDetector(final Settings settings, final ThreadPool threadPool, final Consumer<DiscoveryNode> onLagDetected,
final Supplier<DiscoveryNode> localNodeSupplier) {
this.threadPool = threadPool;
this.clusterStateApplicationTimeout = CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING.get(settings);
this.onLagDetected = onLagDetected;
this.localNodeSupplier = localNodeSupplier;
}

public void setTrackedNodes(final Iterable<DiscoveryNode> discoveryNodes) {
final Set<DiscoveryNode> discoveryNodeSet = new HashSet<>();
discoveryNodes.forEach(discoveryNodeSet::add);
discoveryNodeSet.remove(localNodeSupplier.get());
appliedStateTrackersByNode.keySet().retainAll(discoveryNodeSet);
discoveryNodeSet.forEach(node -> appliedStateTrackersByNode.putIfAbsent(node, new NodeAppliedStateTracker(node)));
}

public void clearTrackedNodes() {
appliedStateTrackersByNode.clear();
}

public void setAppliedVersion(final DiscoveryNode discoveryNode, final long appliedVersion) {
final NodeAppliedStateTracker nodeAppliedStateTracker = appliedStateTrackersByNode.get(discoveryNode);
if (nodeAppliedStateTracker == null) {
// Received an ack from a node that a later publication has removed (or we are no longer master). No big deal.
logger.trace("node {} applied version {} but this node's version is not being tracked", discoveryNode, appliedVersion);
} else {
nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
}
}

public void startLagDetector(final long version) {
final List<NodeAppliedStateTracker> laggingTrackers
= appliedStateTrackersByNode.values().stream().filter(t -> t.appliedVersionLessThan(version)).collect(Collectors.toList());

if (laggingTrackers.isEmpty()) {
logger.trace("lag detection for version {} is unnecessary: {}", version, appliedStateTrackersByNode.values());
} else {
logger.debug("starting lag detector for version {}: {}", version, laggingTrackers);

threadPool.scheduleUnlessShuttingDown(clusterStateApplicationTimeout, Names.GENERIC, new Runnable() {
@Override
public void run() {
laggingTrackers.forEach(t -> t.checkForLag(version));
}

@Override
public String toString() {
return "lag detector for version " + version + " on " + laggingTrackers;
}
});
}
}

@Override
public String toString() {
return "LagDetector{" +
"clusterStateApplicationTimeout=" + clusterStateApplicationTimeout +
", appliedStateTrackersByNode=" + appliedStateTrackersByNode.values() +
'}';
}

// for assertions
Set<DiscoveryNode> getTrackedNodes() {
return Collections.unmodifiableSet(appliedStateTrackersByNode.keySet());
}

private class NodeAppliedStateTracker {
private final DiscoveryNode discoveryNode;
private final AtomicLong appliedVersion = new AtomicLong();

NodeAppliedStateTracker(final DiscoveryNode discoveryNode) {
this.discoveryNode = discoveryNode;
}

void increaseAppliedVersion(long appliedVersion) {
long maxAppliedVersion = this.appliedVersion.updateAndGet(v -> Math.max(v, appliedVersion));
logger.trace("{} applied version {}, max now {}", this, appliedVersion, maxAppliedVersion);
}

boolean appliedVersionLessThan(final long version) {
return appliedVersion.get() < version;
}

@Override
public String toString() {
return "NodeAppliedStateTracker{" +
"discoveryNode=" + discoveryNode +
", appliedVersion=" + appliedVersion +
'}';
}

void checkForLag(final long version) {
if (appliedStateTrackersByNode.get(discoveryNode) != this) {
logger.trace("{} no longer active when checking version {}", this, version);
return;
}

long appliedVersion = this.appliedVersion.get();
if (version <= appliedVersion) {
logger.trace("{} satisfied when checking version {}, node applied version {}", this, version, appliedVersion);
return;
}

logger.debug("{}, detected lag at version {}, node has only applied version {}", this, version, appliedVersion);
onLagDetected.accept(discoveryNode);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.elasticsearch.cluster.NodeConnectionsService;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
import org.elasticsearch.cluster.coordination.LagDetector;
import org.elasticsearch.cluster.coordination.Coordinator;
import org.elasticsearch.cluster.coordination.ElectionSchedulerFactory;
import org.elasticsearch.cluster.coordination.FollowersChecker;
Expand Down Expand Up @@ -469,7 +470,8 @@ public void apply(Settings value, Settings current, Settings previous) {
LeaderChecker.LEADER_CHECK_RETRY_COUNT_SETTING,
Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION,
TransportAddVotingTombstonesAction.MAXIMUM_VOTING_TOMBSTONES_SETTING,
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING,
LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING
)));

public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ public void testAckListenerReceivesNoAckFromHangingFollower() {

assertTrue("expected immediate ack from " + follower1, ackCollector.hasAckedSuccessfully(follower1));
assertFalse("expected no ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
cluster.stabilise();
cluster.stabilise(defaultMillis(PUBLISH_TIMEOUT_SETTING));
assertTrue("expected eventual ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
assertFalse("expected no ack from " + follower0, ackCollector.hasAcked(follower0));
}
Expand Down Expand Up @@ -1101,7 +1101,6 @@ void stabilise(long stabilisationDurationMillis) {
}

runFor(stabilisationDurationMillis, "stabilising");
fixLag();

final ClusterNode leader = getAnyLeader();
final long leaderTerm = leader.coordinator.getCurrentTerm();
Expand Down Expand Up @@ -1158,35 +1157,6 @@ void stabilise(long stabilisationDurationMillis) {
leader.improveConfiguration(lastAcceptedState), sameInstance(lastAcceptedState));
}

// TODO remove this when lag detection is implemented
void fixLag() {
final ClusterNode leader = getAnyLeader();
final long leaderVersion = leader.getLastAppliedClusterState().version();
final long minVersion = clusterNodes.stream()
.filter(n -> isConnectedPair(n, leader))
.map(n -> n.getLastAppliedClusterState().version()).min(Long::compare).orElse(Long.MIN_VALUE);
assert minVersion >= 0;
if (minVersion < leaderVersion) {
logger.info("--> fixLag publishing a value to fix lag, leaderVersion={}, minVersion={}", leaderVersion, minVersion);
onNode(leader.getLocalNode(), () -> {
synchronized (leader.coordinator.mutex) {
leader.submitValue(randomLong());
}
}).run();

runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY
// may need to bump terms too
+ DEFAULT_ELECTION_DELAY,
"re-stabilising after lag-fixing publication");

if (clusterNodes.stream().anyMatch(n -> n.getClusterStateApplyResponse().equals(ClusterStateApplyResponse.HANG))) {
runFor(defaultMillis(PUBLISH_TIMEOUT_SETTING), "allowing lag-fixing publication to time out");
}
} else {
logger.info("--> fixLag found no lag, leader={}, leaderVersion={}, minVersion={}", leader, leaderVersion, minVersion);
}
}

void runFor(long runDurationMillis, String description) {
final long endTime = deterministicTaskQueue.getCurrentTimeMillis() + runDurationMillis;
logger.info("--> runFor({}ms) running until [{}ms]: {}", runDurationMillis, endTime, description);
Expand Down
Loading