diff --git a/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java b/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java index f6b20b7dfb581..4dc9396751fc9 100644 --- a/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java +++ b/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java @@ -313,8 +313,9 @@ private static ShardStoreInfo shardStoreInfo(NodeGatewayStartedShards nodeShardS NodeGatewayStartedShards::primary ).reversed(); - private static final Comparator HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR = Comparator.nullsLast( - Comparator.comparing(NodeGatewayStartedShards::replicationCheckpoint) + private static final Comparator HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR = Comparator.comparing( + NodeGatewayStartedShards::replicationCheckpoint, + Comparator.nullsLast(Comparator.naturalOrder()) ); /** @@ -387,9 +388,9 @@ protected static NodeShardsResult buildNodeShardsResult( /** * Orders the active shards copies based on below comparators - * 1. No store exception - * 2. Shard copies previously primary shard - * 3. Shard copies with highest replication checkpoint. This comparator is NO-OP for doc rep enabled indices. + * 1. No store exception i.e. shard copy is readable + * 2. Prefer previous primary shard + * 3. Prefer shard copy with the highest replication checkpoint. It is NO-OP for doc rep enabled indices. */ final Comparator comparator; // allocation preference if (matchAnyShard) { diff --git a/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java b/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java index 9982bb0d71f4f..3c39ec9f03b2a 100644 --- a/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java +++ b/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java @@ -272,6 +272,39 @@ public void testPreferReplicaWithNullReplicationCheckpoint() { assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW); } + /** + * Tests that null ReplicationCheckpoint are ignored + */ + public void testPreferReplicaWithAllNullReplicationCheckpoint() { + String allocId1 = randomAlphaOfLength(10); + String allocId2 = randomAlphaOfLength(10); + String allocId3 = randomAlphaOfLength(10); + final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas( + yesAllocationDeciders(), + CLUSTER_RECOVERED, + allocId1, + allocId2, + allocId3 + ); + testAllocator.addData(node1, allocId1, false, null, null); + testAllocator.addData(node2, allocId2, false, null, null); + testAllocator.addData(node3, allocId3, true, null, null); + allocateAllUnassigned(allocation); + assertThat(allocation.routingNodesChanged(), equalTo(true)); + assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true)); + assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1)); + assertThat( + allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), + equalTo(node3.getId()) + ); + // Assert node3's allocation id should be used as it was previous primary + assertThat( + allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(), + equalTo(allocId3) + ); + assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW); + } + /** * Tests that replica with highest segment info version will be selected as target on equal primary terms */