Only log warning when actually failing shards (#28558)

dnhatn · dnhatn · commit 6b869adee6b6 · 2018-02-08T18:07:33.000-05:00
Currently the master node logs a warning message whenever it receives a failed shard request. However, this can be noisy because - Multiple failed shard requests can be issued for a single shard - Failed shard requests can be still issued for an already failed shard This commit moves the log-warn to AllocationService in which the failing shard action actually happens. This is another prerequisite step in order to not ignore the shard not-available exceptions in the replication. Relates #28534
diff --git a/server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java b/server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java
@@ -205,7 +205,7 @@ private static class ShardFailedTransportHandler implements TransportRequestHand
 
         @Override
         public void messageReceived(FailedShardEntry request, TransportChannel channel) throws Exception {
-            logger.warn((Supplier<?>) () -> new ParameterizedMessage("{} received shard failed for {}", request.shardId, request), request.failure);
+            logger.debug((Supplier<?>) () -> new ParameterizedMessage("{} received shard failed for {}", request.shardId, request), request.failure);
             clusterService.submitStateUpdateTask(
                 "shard-failed",
                 request,
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.cluster.routing.allocation;
 
+import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.RestoreInProgress;
@@ -160,7 +161,7 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
         if (staleShards.isEmpty() && failedShards.isEmpty()) {
             return clusterState;
         }
-        ClusterState tmpState = IndexMetaDataUpdater.removeStaleIdsWithoutRoutings(clusterState, staleShards);
+        ClusterState tmpState = IndexMetaDataUpdater.removeStaleIdsWithoutRoutings(clusterState, staleShards, logger);
 
         RoutingNodes routingNodes = getMutableRoutingNodes(tmpState);
         // shuffle the unassigned nodes, just so we won't have things like poison failed shards
@@ -188,6 +189,7 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
                 if (failedShardEntry.markAsStale()) {
                     allocation.removeAllocationId(failedShard);
                 }
+                logger.warn(new ParameterizedMessage("failing shard [{}]", failedShardEntry), failedShardEntry.getFailure());
                 routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetaData, allocation.changes());
             } else {
                 logger.trace("{} shard routing failed in an earlier iteration (routing: {})", shardToFail.shardId(), shardToFail);
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/IndexMetaDataUpdater.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.cluster.routing.allocation;
 
+import org.apache.logging.log4j.Logger;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.metadata.MetaData;
@@ -210,7 +211,7 @@ private IndexMetaData.Builder updateInSyncAllocations(RoutingTable newRoutingTab
      * Removes allocation ids from the in-sync set for shard copies for which there is no routing entries in the routing table.
      * This method is called in AllocationService before any changes to the routing table are made.
      */
-    public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterState, List<StaleShard> staleShards) {
+    public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterState, List<StaleShard> staleShards, Logger logger) {
         MetaData oldMetaData = clusterState.metaData();
         RoutingTable oldRoutingTable = clusterState.routingTable();
         MetaData.Builder metaDataBuilder = null;
@@ -238,6 +239,7 @@ public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterSta
                     }
                     indexMetaDataBuilder.putInSyncAllocationIds(shardNumber, remainingInSyncAllocations);
                 }
+                logger.warn("{} marking unavailable shards as stale: {}", shardEntry.getKey(), idsToRemove);
             }
 
             if (indexMetaDataBuilder != null) {