|
22 | 22 | import java.nio.ByteBuffer;
|
23 | 23 | import java.util.HashMap;
|
24 | 24 | import java.util.List;
|
| 25 | +import java.util.ArrayList; |
25 | 26 | import java.util.Map;
|
26 | 27 | import java.util.concurrent.TimeUnit;
|
| 28 | +import java.util.Random; |
27 | 29 |
|
28 | 30 | import javax.cache.Cache;
|
29 | 31 | import javax.cache.CacheManager;
|
|
38 | 40 | import javax.cache.integration.CacheLoaderException;
|
39 | 41 | import javax.cache.spi.CachingProvider;
|
40 | 42 |
|
| 43 | +import org.apache.commons.collections.CollectionUtils; |
| 44 | +import org.apache.commons.collections.MapUtils; |
41 | 45 | import org.apache.commons.lang3.NotImplementedException;
|
42 | 46 | import org.apache.hadoop.conf.Configuration;
|
43 | 47 | import org.apache.hadoop.io.retry.RetryPolicies;
|
|
50 | 54 | import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
51 | 55 | import org.apache.hadoop.yarn.exceptions.YarnException;
|
52 | 56 | import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
| 57 | +import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils; |
| 58 | +import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException; |
53 | 59 | import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
|
54 | 60 | import org.apache.hadoop.yarn.server.federation.store.FederationStateStore;
|
55 | 61 | import org.apache.hadoop.yarn.server.federation.store.exception.FederationStateStoreRetriableException;
|
@@ -110,6 +116,8 @@ public final class FederationStateStoreFacade {
|
110 | 116 | private static final FederationStateStoreFacade FACADE =
|
111 | 117 | new FederationStateStoreFacade();
|
112 | 118 |
|
| 119 | + private static Random rand = new Random(System.currentTimeMillis()); |
| 120 | + |
113 | 121 | private FederationStateStore stateStore;
|
114 | 122 | private int cacheTimeToLive;
|
115 | 123 | private Configuration conf;
|
@@ -496,6 +504,7 @@ public void deleteReservationHomeSubCluster(ReservationId reservationId) throws
|
496 | 504 | * @param defaultValue the default implementation for fallback
|
497 | 505 | * @param type the class for which a retry proxy is required
|
498 | 506 | * @param retryPolicy the policy for retrying method call failures
|
| 507 | + * @param <T> The type of the instance |
499 | 508 | * @return a retry proxy for the specified interface
|
500 | 509 | */
|
501 | 510 | public static <T> Object createRetryInstance(Configuration conf,
|
@@ -731,7 +740,7 @@ public FederationStateStore getStateStore() {
|
731 | 740 | return stateStore;
|
732 | 741 | }
|
733 | 742 |
|
734 |
| - /* |
| 743 | + /** |
735 | 744 | * The Router Supports Store NewMasterKey (RouterMasterKey{@link RouterMasterKey}).
|
736 | 745 | *
|
737 | 746 | * @param newKey Key used for generating and verifying delegation tokens
|
@@ -849,4 +858,163 @@ public RouterRMTokenResponse getTokenByRouterStoreToken(RMDelegationTokenIdentif
|
849 | 858 | RouterRMTokenRequest request = RouterRMTokenRequest.newInstance(storeToken);
|
850 | 859 | return stateStore.getTokenByRouterStoreToken(request);
|
851 | 860 | }
|
| 861 | + |
| 862 | + /** |
| 863 | + * Get the number of active cluster nodes. |
| 864 | + * |
| 865 | + * @return number of active cluster nodes. |
| 866 | + * @throws YarnException if the call to the state store is unsuccessful. |
| 867 | + */ |
| 868 | + public int getActiveSubClustersCount() throws YarnException { |
| 869 | + Map<SubClusterId, SubClusterInfo> activeSubClusters = getSubClusters(true); |
| 870 | + if (activeSubClusters == null || activeSubClusters.isEmpty()) { |
| 871 | + return 0; |
| 872 | + } else { |
| 873 | + return activeSubClusters.size(); |
| 874 | + } |
| 875 | + } |
| 876 | + |
| 877 | + /** |
| 878 | + * Randomly pick ActiveSubCluster. |
| 879 | + * During the selection process, we will exclude SubClusters from the blacklist. |
| 880 | + * |
| 881 | + * @param activeSubClusters List of active subClusters. |
| 882 | + * @param blackList blacklist. |
| 883 | + * @return Active SubClusterId. |
| 884 | + * @throws YarnException When there is no Active SubCluster, |
| 885 | + * an exception will be thrown (No active SubCluster available to submit the request.) |
| 886 | + */ |
| 887 | + public static SubClusterId getRandomActiveSubCluster( |
| 888 | + Map<SubClusterId, SubClusterInfo> activeSubClusters, List<SubClusterId> blackList) |
| 889 | + throws YarnException { |
| 890 | + |
| 891 | + // Check if activeSubClusters is empty, if it is empty, we need to throw an exception |
| 892 | + if (MapUtils.isEmpty(activeSubClusters)) { |
| 893 | + throw new FederationPolicyException( |
| 894 | + FederationPolicyUtils.NO_ACTIVE_SUBCLUSTER_AVAILABLE); |
| 895 | + } |
| 896 | + |
| 897 | + // Change activeSubClusters to List |
| 898 | + List<SubClusterId> subClusterIds = new ArrayList<>(activeSubClusters.keySet()); |
| 899 | + |
| 900 | + // If the blacklist is not empty, we need to remove all the subClusters in the blacklist |
| 901 | + if (CollectionUtils.isNotEmpty(blackList)) { |
| 902 | + subClusterIds.removeAll(blackList); |
| 903 | + } |
| 904 | + |
| 905 | + // Check there are still active subcluster after removing the blacklist |
| 906 | + if (CollectionUtils.isEmpty(subClusterIds)) { |
| 907 | + throw new FederationPolicyException( |
| 908 | + FederationPolicyUtils.NO_ACTIVE_SUBCLUSTER_AVAILABLE); |
| 909 | + } |
| 910 | + |
| 911 | + // Randomly choose a SubCluster |
| 912 | + return subClusterIds.get(rand.nextInt(subClusterIds.size())); |
| 913 | + } |
| 914 | + |
| 915 | + /** |
| 916 | + * Get the number of retries. |
| 917 | + * |
| 918 | + * @param configRetries User-configured number of retries. |
| 919 | + * @return number of retries. |
| 920 | + * @throws YarnException yarn exception. |
| 921 | + */ |
| 922 | + public int getRetryNumbers(int configRetries) throws YarnException { |
| 923 | + int activeSubClustersCount = getActiveSubClustersCount(); |
| 924 | + int actualRetryNums = Math.min(activeSubClustersCount, configRetries); |
| 925 | + // Normally, we don't set a negative number for the number of retries, |
| 926 | + // but if the user sets a negative number for the number of retries, |
| 927 | + // we will return 0 |
| 928 | + if (actualRetryNums < 0) { |
| 929 | + return 0; |
| 930 | + } |
| 931 | + return actualRetryNums; |
| 932 | + } |
| 933 | + |
| 934 | + /** |
| 935 | + * Query SubClusterId By applicationId. |
| 936 | + * |
| 937 | + * If SubClusterId is not empty, it means it exists and returns true; |
| 938 | + * if SubClusterId is empty, it means it does not exist and returns false. |
| 939 | + * |
| 940 | + * @param applicationId applicationId |
| 941 | + * @return true, SubClusterId exists; false, SubClusterId not exists. |
| 942 | + */ |
| 943 | + public boolean existsApplicationHomeSubCluster(ApplicationId applicationId) { |
| 944 | + try { |
| 945 | + SubClusterId subClusterId = getApplicationHomeSubCluster(applicationId); |
| 946 | + if (subClusterId != null) { |
| 947 | + return true; |
| 948 | + } |
| 949 | + } catch (YarnException e) { |
| 950 | + LOG.warn("get homeSubCluster by applicationId = {} error.", applicationId, e); |
| 951 | + } |
| 952 | + return false; |
| 953 | + } |
| 954 | + |
| 955 | + /** |
| 956 | + * Add ApplicationHomeSubCluster to FederationStateStore. |
| 957 | + * |
| 958 | + * @param applicationId applicationId. |
| 959 | + * @param homeSubCluster homeSubCluster, homeSubCluster selected according to policy. |
| 960 | + * @throws YarnException yarn exception. |
| 961 | + */ |
| 962 | + public void addApplicationHomeSubCluster(ApplicationId applicationId, |
| 963 | + ApplicationHomeSubCluster homeSubCluster) throws YarnException { |
| 964 | + try { |
| 965 | + addApplicationHomeSubCluster(homeSubCluster); |
| 966 | + } catch (YarnException e) { |
| 967 | + String msg = String.format( |
| 968 | + "Unable to insert the ApplicationId %s into the FederationStateStore.", applicationId); |
| 969 | + throw new YarnException(msg, e); |
| 970 | + } |
| 971 | + } |
| 972 | + |
| 973 | + /** |
| 974 | + * Update ApplicationHomeSubCluster to FederationStateStore. |
| 975 | + * |
| 976 | + * @param subClusterId homeSubClusterId |
| 977 | + * @param applicationId applicationId. |
| 978 | + * @param homeSubCluster homeSubCluster, homeSubCluster selected according to policy. |
| 979 | + * @throws YarnException yarn exception. |
| 980 | + */ |
| 981 | + public void updateApplicationHomeSubCluster(SubClusterId subClusterId, |
| 982 | + ApplicationId applicationId, ApplicationHomeSubCluster homeSubCluster) throws YarnException { |
| 983 | + try { |
| 984 | + updateApplicationHomeSubCluster(homeSubCluster); |
| 985 | + } catch (YarnException e) { |
| 986 | + SubClusterId subClusterIdInStateStore = getApplicationHomeSubCluster(applicationId); |
| 987 | + if (subClusterId == subClusterIdInStateStore) { |
| 988 | + LOG.info("Application {} already submitted on SubCluster {}.", applicationId, subClusterId); |
| 989 | + } else { |
| 990 | + String msg = String.format( |
| 991 | + "Unable to update the ApplicationId %s into the FederationStateStore.", applicationId); |
| 992 | + throw new YarnException(msg, e); |
| 993 | + } |
| 994 | + } |
| 995 | + } |
| 996 | + |
| 997 | + /** |
| 998 | + * Add or Update ApplicationHomeSubCluster. |
| 999 | + * |
| 1000 | + * @param applicationId applicationId, is the id of the application. |
| 1001 | + * @param subClusterId homeSubClusterId, this is selected by strategy. |
| 1002 | + * @param retryCount number of retries. |
| 1003 | + * @throws YarnException yarn exception. |
| 1004 | + */ |
| 1005 | + public void addOrUpdateApplicationHomeSubCluster(ApplicationId applicationId, |
| 1006 | + SubClusterId subClusterId, int retryCount) throws YarnException { |
| 1007 | + Boolean exists = existsApplicationHomeSubCluster(applicationId); |
| 1008 | + ApplicationHomeSubCluster appHomeSubCluster = |
| 1009 | + ApplicationHomeSubCluster.newInstance(applicationId, subClusterId); |
| 1010 | + if (!exists || retryCount == 0) { |
| 1011 | + // persist the mapping of applicationId and the subClusterId which has |
| 1012 | + // been selected as its home. |
| 1013 | + addApplicationHomeSubCluster(applicationId, appHomeSubCluster); |
| 1014 | + } else { |
| 1015 | + // update the mapping of applicationId and the home subClusterId to |
| 1016 | + // the new subClusterId we have selected. |
| 1017 | + updateApplicationHomeSubCluster(subClusterId, applicationId, appHomeSubCluster); |
| 1018 | + } |
| 1019 | + } |
852 | 1020 | }
|
0 commit comments