diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java index 71e265be1b9c8..ab0e1b345e9f9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java @@ -208,6 +208,7 @@ public class FederationClientInterceptor private final Clock clock = new MonotonicClock(); private boolean returnPartialReport; private long submitIntervalTime; + private boolean allowPartialResult; @Override public void init(String userName) { @@ -263,6 +264,10 @@ public void init(String userName) { returnPartialReport = conf.getBoolean( YarnConfiguration.ROUTER_CLIENTRM_PARTIAL_RESULTS_ENABLED, YarnConfiguration.DEFAULT_ROUTER_CLIENTRM_PARTIAL_RESULTS_ENABLED); + + allowPartialResult = conf.getBoolean( + YarnConfiguration.ROUTER_INTERCEPTOR_ALLOW_PARTIAL_RESULT_ENABLED, + YarnConfiguration.DEFAULT_ROUTER_INTERCEPTOR_ALLOW_PARTIAL_RESULT_ENABLED); } @Override @@ -895,8 +900,10 @@ Collection invokeConcurrent(ClientMethod request, Class clazz) // All sub-clusters return results to be considered successful, // otherwise an exception will be thrown. if (exceptions != null && !exceptions.isEmpty()) { - throw new YarnException("invokeConcurrent Failed = " + - StringUtils.join(exceptions.values(), ",")); + if (!allowPartialResult || exceptions.keySet().size() == subClusterIds.size()) { + throw new YarnException("invokeConcurrent Failed = " + + StringUtils.join(exceptions.values(), ",")); + } } // return result @@ -2350,4 +2357,9 @@ protected int getNumMaxThreads(Configuration conf) { public void setNumSubmitRetries(int numSubmitRetries) { this.numSubmitRetries = numSubmitRetries; } + + @VisibleForTesting + public void setAllowPartialResult(boolean allowPartialResult) { + this.allowPartialResult = allowPartialResult; + } } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestFederationClientInterceptorRetry.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestFederationClientInterceptorRetry.java index bf7ef7d17913d..f0ecf8367cc87 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestFederationClientInterceptorRetry.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/clientrm/TestFederationClientInterceptorRetry.java @@ -35,6 +35,7 @@ import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationRequest; import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationResponse; import org.apache.hadoop.yarn.api.protocolrecords.GetClusterMetricsRequest; +import org.apache.hadoop.yarn.api.protocolrecords.GetClusterMetricsResponse; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; @@ -410,4 +411,21 @@ public void testGetClusterMetricsOneBadNodeWithRealError() throws Exception { "subClusterId 1 exec getClusterMetrics error RM is stopped.", () -> interceptor.getClusterMetrics(request)); } + + @Test + public void testGetClusterMetricsOneBadOneGoodNodeWithRealError() throws Exception { + LOG.info("Test getClusterMetrics with one bad and one good SubCluster."); + setupCluster(Arrays.asList(bad1, good)); + GetClusterMetricsRequest request = GetClusterMetricsRequest.newInstance(); + + GetClusterMetricsResponse clusterMetrics = interceptor.getClusterMetrics(request); + Assert.assertNotNull(clusterMetrics); + + // If partial results are not allowed to be returned, an exception will be thrown. + interceptor.setAllowPartialResult(false); + LambdaTestUtils.intercept(YarnException.class, + "subClusterId 1 exec getClusterMetrics error RM is stopped.", + () -> interceptor.getClusterMetrics(request)); + interceptor.setAllowPartialResult(true); + } }