Skip to content

Commit 10807a0

Browse files
authored
Avoid NPE when disassociateDeadNodes is executed for a node present in the desired balance (#91659) (#91686)
1 parent 5a5b360 commit 10807a0

File tree

4 files changed

+89
-37
lines changed

4 files changed

+89
-37
lines changed

docs/changelog/91659.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 91659
2+
summary: Avoid NPE when disassociateDeadNodes is executed for a node present in the
3+
desired balance
4+
area: Allocation
5+
type: bug
6+
issues: []

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -415,24 +415,30 @@ private DiscoveryNode findRelocationTarget(
415415
) {
416416
for (final var nodeId : desiredNodeIds) {
417417
// TODO consider ignored nodes here too?
418-
if (nodeId.equals(shardRouting.currentNodeId()) == false) {
419-
final var currentNode = routingNodes.node(nodeId);
420-
final var decision = canAllocateDecider.apply(shardRouting, currentNode);
421-
logger.trace("relocate {} to {}: {}", shardRouting, nodeId, decision);
422-
if (decision.type() == Decision.Type.YES) {
423-
return currentNode.node();
424-
}
418+
if (nodeId.equals(shardRouting.currentNodeId())) {
419+
continue;
420+
}
421+
final var node = routingNodes.node(nodeId);
422+
if (node == null) { // node left the cluster while reconciliation is still in progress
423+
continue;
424+
}
425+
final var decision = canAllocateDecider.apply(shardRouting, node);
426+
logger.trace("relocate {} to {}: {}", shardRouting, nodeId, decision);
427+
if (decision.type() == Decision.Type.YES) {
428+
return node.node();
425429
}
426430
}
427431

428432
return null;
429433
}
430434

431435
private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) {
436+
assert target != null : "Target node is not found";
432437
return allocation.deciders().canAllocate(shardRouting, target, allocation);
433438
}
434439

435440
private Decision decideCanForceAllocateForVacate(ShardRouting shardRouting, RoutingNode target) {
441+
assert target != null : "Target node is not found";
436442
return allocation.deciders().canForceAllocateDuringReplace(shardRouting, target, allocation);
437443
}
438444
}

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java

Lines changed: 69 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.cluster.node.DiscoveryNode;
2222
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
2323
import org.elasticsearch.cluster.node.DiscoveryNodes;
24+
import org.elasticsearch.cluster.routing.IndexRoutingTable;
2425
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
2526
import org.elasticsearch.cluster.routing.RecoverySource;
2627
import org.elasticsearch.cluster.routing.RoutingChangesObserver;
@@ -78,25 +79,22 @@
7879
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_INDEX_VERSION_CREATED;
7980
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS;
8081
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS;
82+
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
83+
import static org.elasticsearch.cluster.routing.TestShardRouting.newShardRouting;
8184
import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING;
8285
import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING;
8386
import static org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING;
8487
import static org.hamcrest.Matchers.equalTo;
8588
import static org.hamcrest.Matchers.hasSize;
89+
import static org.hamcrest.Matchers.notNullValue;
90+
import static org.hamcrest.Matchers.nullValue;
8691
import static org.hamcrest.Matchers.oneOf;
8792

8893
public class DesiredBalanceReconcilerTests extends ESTestCase {
8994

9095
public void testNoChangesOnEmptyDesiredBalance() {
9196
final var clusterState = DesiredBalanceComputerTests.createInitialClusterState(3);
92-
final var routingAllocation = new RoutingAllocation(
93-
new AllocationDeciders(List.of()),
94-
clusterState.mutableRoutingNodes(),
95-
clusterState,
96-
ClusterInfo.EMPTY,
97-
SnapshotShardSizeInfo.EMPTY,
98-
0L
99-
);
97+
final var routingAllocation = createRoutingAllocationFrom(clusterState);
10098

10199
reconcile(routingAllocation, new DesiredBalance(1, Map.of()));
102100
assertFalse(routingAllocation.routingNodesChanged());
@@ -319,10 +317,6 @@ public void testUnassignedShardsInterleaving() {
319317

320318
final var stateWithInitializingPrimaries = startInitializingShardsAndReroute(allocationService, clusterState);
321319
for (final var indexRoutingTable : stateWithInitializingPrimaries.routingTable()) {
322-
for (int i = 0; i < indexRoutingTable.size(); i++) {
323-
final var indexShardRoutingTable = indexRoutingTable.shard(i);
324-
}
325-
326320
for (int i = 0; i < indexRoutingTable.size(); i++) {
327321
final var indexShardRoutingTable = indexRoutingTable.shard(i);
328322
assertTrue(indexShardRoutingTable.primaryShard().initializing());
@@ -1021,6 +1015,49 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocat
10211015
assertThat(reroutedState.getRoutingNodes().node("node-1").shardsWithState(ShardRoutingState.RELOCATING), hasSize(1));
10221016
}
10231017

1018+
public void testDoNotRebalanceToTheNodeThatNoLongerExists() {
1019+
1020+
var indexMetadata = IndexMetadata.builder("index-1")
1021+
.settings(
1022+
Settings.builder()
1023+
.put(SETTING_NUMBER_OF_SHARDS, 1)
1024+
.put(SETTING_NUMBER_OF_REPLICAS, 0)
1025+
.put(SETTING_INDEX_VERSION_CREATED.getKey(), Version.CURRENT)
1026+
)
1027+
.system(randomBoolean())
1028+
.build();
1029+
final var index = indexMetadata.getIndex();
1030+
final var shardId = new ShardId(index, 0);
1031+
1032+
final var clusterState = ClusterState.builder(ClusterName.DEFAULT)
1033+
.nodes(
1034+
DiscoveryNodes.builder()
1035+
// data-node-1 left the cluster
1036+
.localNodeId("data-node-2")
1037+
.masterNodeId("data-node-2")
1038+
.add(new DiscoveryNode("data-node-2", buildNewFakeTransportAddress(), Version.CURRENT))
1039+
)
1040+
.metadata(Metadata.builder().put(indexMetadata, true))
1041+
.routingTable(
1042+
RoutingTable.builder()
1043+
.add(IndexRoutingTable.builder(index).addShard(newShardRouting(shardId, "data-node-2", true, STARTED)))
1044+
)
1045+
.build();
1046+
1047+
final var allocation = createRoutingAllocationFrom(clusterState);
1048+
final var balance = new DesiredBalance(
1049+
1,
1050+
Map.of(shardId, new ShardAssignment(Set.of("data-node-1"), 1, 0, 0)) // shard is assigned to the node that has left
1051+
);
1052+
1053+
reconcile(allocation, balance);
1054+
1055+
assertThat(allocation.routingNodes().node("data-node-1"), nullValue());
1056+
assertThat(allocation.routingNodes().node("data-node-2"), notNullValue());
1057+
// shard is kept wherever until balance is recalculated
1058+
assertThat(allocation.routingNodes().node("data-node-2").getByShardId(shardId), notNullValue());
1059+
}
1060+
10241061
private static void reconcile(RoutingAllocation routingAllocation, DesiredBalance desiredBalance) {
10251062
new DesiredBalanceReconciler(desiredBalance, routingAllocation, new NodeAllocationOrdering()).run();
10261063
}
@@ -1037,6 +1074,17 @@ private static AllocationService createTestAllocationService(
10371074
);
10381075
}
10391076

1077+
private static RoutingAllocation createRoutingAllocationFrom(ClusterState clusterState) {
1078+
return new RoutingAllocation(
1079+
new AllocationDeciders(List.of()),
1080+
clusterState.mutableRoutingNodes(),
1081+
clusterState,
1082+
ClusterInfo.EMPTY,
1083+
SnapshotShardSizeInfo.EMPTY,
1084+
0L
1085+
);
1086+
}
1087+
10401088
private static AllocationService createTestAllocationService(
10411089
Consumer<RoutingAllocation> allocationConsumer,
10421090
ClusterInfoService clusterInfoService,
@@ -1116,19 +1164,16 @@ private static DesiredBalance desiredBalance(ClusterState clusterState, BiPredic
11161164
private static DiscoveryNodes discoveryNodes(int nodeCount) {
11171165
final var discoveryNodes = DiscoveryNodes.builder();
11181166
for (var i = 0; i < nodeCount; i++) {
1119-
final var transportAddress = buildNewFakeTransportAddress();
1120-
final var discoveryNode = new DiscoveryNode(
1121-
"node-" + i,
1122-
"node-" + i,
1123-
UUIDs.randomBase64UUID(random()),
1124-
transportAddress.address().getHostString(),
1125-
transportAddress.getAddress(),
1126-
transportAddress,
1127-
Map.of(),
1128-
Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE),
1129-
Version.CURRENT
1167+
discoveryNodes.add(
1168+
new DiscoveryNode(
1169+
"node-" + i,
1170+
"node-" + i,
1171+
buildNewFakeTransportAddress(),
1172+
Map.of(),
1173+
Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE),
1174+
Version.CURRENT
1175+
)
11301176
);
1131-
discoveryNodes.add(discoveryNode);
11321177
}
11331178
discoveryNodes.masterNodeId("node-0").localNodeId("node-0");
11341179
return discoveryNodes.build();

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceShardsAllocatorTests.java

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
import org.elasticsearch.cluster.service.ClusterApplierService;
3434
import org.elasticsearch.cluster.service.ClusterService;
3535
import org.elasticsearch.cluster.service.FakeThreadPoolMasterService;
36-
import org.elasticsearch.common.UUIDs;
3736
import org.elasticsearch.common.settings.ClusterSettings;
3837
import org.elasticsearch.common.settings.Settings;
3938
import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue;
@@ -377,14 +376,10 @@ public void onFailure(Exception e) {
377376
}
378377

379378
private static DiscoveryNode createDiscoveryNode(String nodeId) {
380-
var transportAddress = buildNewFakeTransportAddress();
381379
return new DiscoveryNode(
382380
nodeId,
383381
nodeId,
384-
UUIDs.randomBase64UUID(random()),
385-
transportAddress.address().getHostString(),
386-
transportAddress.getAddress(),
387-
transportAddress,
382+
buildNewFakeTransportAddress(),
388383
Map.of(),
389384
Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE),
390385
Version.CURRENT

0 commit comments

Comments
 (0)