Skip to content

Commit 7cb22eb

Browse files
authored
YARN-11371. [Federation] Refactor FederationInterceptorREST#createNewApplication\submitApplication Use FederationActionRetry. (#5130)
1 parent 3c37a01 commit 7cb22eb

File tree

5 files changed

+334
-318
lines changed

5 files changed

+334
-318
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/utils/FederationStateStoreFacade.java

Lines changed: 169 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
import java.nio.ByteBuffer;
2323
import java.util.HashMap;
2424
import java.util.List;
25+
import java.util.ArrayList;
2526
import java.util.Map;
2627
import java.util.concurrent.TimeUnit;
28+
import java.util.Random;
2729

2830
import javax.cache.Cache;
2931
import javax.cache.CacheManager;
@@ -38,6 +40,8 @@
3840
import javax.cache.integration.CacheLoaderException;
3941
import javax.cache.spi.CachingProvider;
4042

43+
import org.apache.commons.collections.CollectionUtils;
44+
import org.apache.commons.collections.MapUtils;
4145
import org.apache.commons.lang3.NotImplementedException;
4246
import org.apache.hadoop.conf.Configuration;
4347
import org.apache.hadoop.io.retry.RetryPolicies;
@@ -50,6 +54,8 @@
5054
import org.apache.hadoop.yarn.conf.YarnConfiguration;
5155
import org.apache.hadoop.yarn.exceptions.YarnException;
5256
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
57+
import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils;
58+
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException;
5359
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
5460
import org.apache.hadoop.yarn.server.federation.store.FederationStateStore;
5561
import org.apache.hadoop.yarn.server.federation.store.exception.FederationStateStoreRetriableException;
@@ -110,6 +116,8 @@ public final class FederationStateStoreFacade {
110116
private static final FederationStateStoreFacade FACADE =
111117
new FederationStateStoreFacade();
112118

119+
private static Random rand = new Random(System.currentTimeMillis());
120+
113121
private FederationStateStore stateStore;
114122
private int cacheTimeToLive;
115123
private Configuration conf;
@@ -496,6 +504,7 @@ public void deleteReservationHomeSubCluster(ReservationId reservationId) throws
496504
* @param defaultValue the default implementation for fallback
497505
* @param type the class for which a retry proxy is required
498506
* @param retryPolicy the policy for retrying method call failures
507+
* @param <T> The type of the instance
499508
* @return a retry proxy for the specified interface
500509
*/
501510
public static <T> Object createRetryInstance(Configuration conf,
@@ -731,7 +740,7 @@ public FederationStateStore getStateStore() {
731740
return stateStore;
732741
}
733742

734-
/*
743+
/**
735744
* The Router Supports Store NewMasterKey (RouterMasterKey{@link RouterMasterKey}).
736745
*
737746
* @param newKey Key used for generating and verifying delegation tokens
@@ -849,4 +858,163 @@ public RouterRMTokenResponse getTokenByRouterStoreToken(RMDelegationTokenIdentif
849858
RouterRMTokenRequest request = RouterRMTokenRequest.newInstance(storeToken);
850859
return stateStore.getTokenByRouterStoreToken(request);
851860
}
861+
862+
/**
863+
* Get the number of active cluster nodes.
864+
*
865+
* @return number of active cluster nodes.
866+
* @throws YarnException if the call to the state store is unsuccessful.
867+
*/
868+
public int getActiveSubClustersCount() throws YarnException {
869+
Map<SubClusterId, SubClusterInfo> activeSubClusters = getSubClusters(true);
870+
if (activeSubClusters == null || activeSubClusters.isEmpty()) {
871+
return 0;
872+
} else {
873+
return activeSubClusters.size();
874+
}
875+
}
876+
877+
/**
878+
* Randomly pick ActiveSubCluster.
879+
* During the selection process, we will exclude SubClusters from the blacklist.
880+
*
881+
* @param activeSubClusters List of active subClusters.
882+
* @param blackList blacklist.
883+
* @return Active SubClusterId.
884+
* @throws YarnException When there is no Active SubCluster,
885+
* an exception will be thrown (No active SubCluster available to submit the request.)
886+
*/
887+
public static SubClusterId getRandomActiveSubCluster(
888+
Map<SubClusterId, SubClusterInfo> activeSubClusters, List<SubClusterId> blackList)
889+
throws YarnException {
890+
891+
// Check if activeSubClusters is empty, if it is empty, we need to throw an exception
892+
if (MapUtils.isEmpty(activeSubClusters)) {
893+
throw new FederationPolicyException(
894+
FederationPolicyUtils.NO_ACTIVE_SUBCLUSTER_AVAILABLE);
895+
}
896+
897+
// Change activeSubClusters to List
898+
List<SubClusterId> subClusterIds = new ArrayList<>(activeSubClusters.keySet());
899+
900+
// If the blacklist is not empty, we need to remove all the subClusters in the blacklist
901+
if (CollectionUtils.isNotEmpty(blackList)) {
902+
subClusterIds.removeAll(blackList);
903+
}
904+
905+
// Check there are still active subcluster after removing the blacklist
906+
if (CollectionUtils.isEmpty(subClusterIds)) {
907+
throw new FederationPolicyException(
908+
FederationPolicyUtils.NO_ACTIVE_SUBCLUSTER_AVAILABLE);
909+
}
910+
911+
// Randomly choose a SubCluster
912+
return subClusterIds.get(rand.nextInt(subClusterIds.size()));
913+
}
914+
915+
/**
916+
* Get the number of retries.
917+
*
918+
* @param configRetries User-configured number of retries.
919+
* @return number of retries.
920+
* @throws YarnException yarn exception.
921+
*/
922+
public int getRetryNumbers(int configRetries) throws YarnException {
923+
int activeSubClustersCount = getActiveSubClustersCount();
924+
int actualRetryNums = Math.min(activeSubClustersCount, configRetries);
925+
// Normally, we don't set a negative number for the number of retries,
926+
// but if the user sets a negative number for the number of retries,
927+
// we will return 0
928+
if (actualRetryNums < 0) {
929+
return 0;
930+
}
931+
return actualRetryNums;
932+
}
933+
934+
/**
935+
* Query SubClusterId By applicationId.
936+
*
937+
* If SubClusterId is not empty, it means it exists and returns true;
938+
* if SubClusterId is empty, it means it does not exist and returns false.
939+
*
940+
* @param applicationId applicationId
941+
* @return true, SubClusterId exists; false, SubClusterId not exists.
942+
*/
943+
public boolean existsApplicationHomeSubCluster(ApplicationId applicationId) {
944+
try {
945+
SubClusterId subClusterId = getApplicationHomeSubCluster(applicationId);
946+
if (subClusterId != null) {
947+
return true;
948+
}
949+
} catch (YarnException e) {
950+
LOG.warn("get homeSubCluster by applicationId = {} error.", applicationId, e);
951+
}
952+
return false;
953+
}
954+
955+
/**
956+
* Add ApplicationHomeSubCluster to FederationStateStore.
957+
*
958+
* @param applicationId applicationId.
959+
* @param homeSubCluster homeSubCluster, homeSubCluster selected according to policy.
960+
* @throws YarnException yarn exception.
961+
*/
962+
public void addApplicationHomeSubCluster(ApplicationId applicationId,
963+
ApplicationHomeSubCluster homeSubCluster) throws YarnException {
964+
try {
965+
addApplicationHomeSubCluster(homeSubCluster);
966+
} catch (YarnException e) {
967+
String msg = String.format(
968+
"Unable to insert the ApplicationId %s into the FederationStateStore.", applicationId);
969+
throw new YarnException(msg, e);
970+
}
971+
}
972+
973+
/**
974+
* Update ApplicationHomeSubCluster to FederationStateStore.
975+
*
976+
* @param subClusterId homeSubClusterId
977+
* @param applicationId applicationId.
978+
* @param homeSubCluster homeSubCluster, homeSubCluster selected according to policy.
979+
* @throws YarnException yarn exception.
980+
*/
981+
public void updateApplicationHomeSubCluster(SubClusterId subClusterId,
982+
ApplicationId applicationId, ApplicationHomeSubCluster homeSubCluster) throws YarnException {
983+
try {
984+
updateApplicationHomeSubCluster(homeSubCluster);
985+
} catch (YarnException e) {
986+
SubClusterId subClusterIdInStateStore = getApplicationHomeSubCluster(applicationId);
987+
if (subClusterId == subClusterIdInStateStore) {
988+
LOG.info("Application {} already submitted on SubCluster {}.", applicationId, subClusterId);
989+
} else {
990+
String msg = String.format(
991+
"Unable to update the ApplicationId %s into the FederationStateStore.", applicationId);
992+
throw new YarnException(msg, e);
993+
}
994+
}
995+
}
996+
997+
/**
998+
* Add or Update ApplicationHomeSubCluster.
999+
*
1000+
* @param applicationId applicationId, is the id of the application.
1001+
* @param subClusterId homeSubClusterId, this is selected by strategy.
1002+
* @param retryCount number of retries.
1003+
* @throws YarnException yarn exception.
1004+
*/
1005+
public void addOrUpdateApplicationHomeSubCluster(ApplicationId applicationId,
1006+
SubClusterId subClusterId, int retryCount) throws YarnException {
1007+
Boolean exists = existsApplicationHomeSubCluster(applicationId);
1008+
ApplicationHomeSubCluster appHomeSubCluster =
1009+
ApplicationHomeSubCluster.newInstance(applicationId, subClusterId);
1010+
if (!exists || retryCount == 0) {
1011+
// persist the mapping of applicationId and the subClusterId which has
1012+
// been selected as its home.
1013+
addApplicationHomeSubCluster(applicationId, appHomeSubCluster);
1014+
} else {
1015+
// update the mapping of applicationId and the home subClusterId to
1016+
// the new subClusterId we have selected.
1017+
updateApplicationHomeSubCluster(subClusterId, applicationId, appHomeSubCluster);
1018+
}
1019+
}
8521020
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterServerUtil.java

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818

1919
package org.apache.hadoop.yarn.server.router;
2020

21-
import org.apache.commons.collections.CollectionUtils;
22-
import org.apache.commons.collections.MapUtils;
2321
import org.apache.commons.lang3.math.NumberUtils;
2422
import org.apache.hadoop.classification.InterfaceAudience.Private;
2523
import org.apache.hadoop.classification.InterfaceAudience.Public;
@@ -30,9 +28,6 @@
3028
import org.apache.hadoop.util.StringUtils;
3129
import org.apache.hadoop.yarn.exceptions.YarnException;
3230
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
33-
import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils;
34-
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
35-
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
3631
import org.slf4j.Logger;
3732
import org.slf4j.LoggerFactory;
3833

@@ -41,8 +36,6 @@
4136
import java.util.ArrayList;
4237
import java.util.Collection;
4338
import java.util.List;
44-
import java.util.Map;
45-
import java.util.Random;
4639
import java.io.IOException;
4740

4841
/**
@@ -61,8 +54,6 @@ public final class RouterServerUtil {
6154

6255
private static final String EPOCH_PREFIX = "e";
6356

64-
private static Random rand = new Random(System.currentTimeMillis());
65-
6657
/** Disable constructor. */
6758
private RouterServerUtil() {
6859
}
@@ -479,42 +470,6 @@ public static void validateContainerId(String containerId)
479470
}
480471
}
481472

482-
/**
483-
* Randomly pick ActiveSubCluster.
484-
* During the selection process, we will exclude SubClusters from the blacklist.
485-
*
486-
* @param activeSubClusters List of active subClusters.
487-
* @param blackList blacklist.
488-
* @return Active SubClusterId.
489-
* @throws YarnException When there is no Active SubCluster,
490-
* an exception will be thrown (No active SubCluster available to submit the request.)
491-
*/
492-
public static SubClusterId getRandomActiveSubCluster(
493-
Map<SubClusterId, SubClusterInfo> activeSubClusters, List<SubClusterId> blackList)
494-
throws YarnException {
495-
496-
// Check if activeSubClusters is empty, if it is empty, we need to throw an exception
497-
if (MapUtils.isEmpty(activeSubClusters)) {
498-
logAndThrowException(FederationPolicyUtils.NO_ACTIVE_SUBCLUSTER_AVAILABLE, null);
499-
}
500-
501-
// Change activeSubClusters to List
502-
List<SubClusterId> subClusterIds = new ArrayList<>(activeSubClusters.keySet());
503-
504-
// If the blacklist is not empty, we need to remove all the subClusters in the blacklist
505-
if (CollectionUtils.isNotEmpty(blackList)) {
506-
subClusterIds.removeAll(blackList);
507-
}
508-
509-
// Check there are still active subcluster after removing the blacklist
510-
if (CollectionUtils.isEmpty(subClusterIds)) {
511-
logAndThrowException(FederationPolicyUtils.NO_ACTIVE_SUBCLUSTER_AVAILABLE, null);
512-
}
513-
514-
// Randomly choose a SubCluster
515-
return subClusterIds.get(rand.nextInt(subClusterIds.size()));
516-
}
517-
518473
public static UserGroupInformation setupUser(final String userName) {
519474
UserGroupInformation user = null;
520475
try {

0 commit comments

Comments
 (0)