Skip to content

Commit beebaeb

Browse files
slfan1989HarshitGupta11
authored andcommitted
YARN-11235. Refactor Policy Code and Define getReservationHomeSubcluster (apache#4656)
1 parent 47b48e5 commit beebaeb

24 files changed

+454
-344
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/FederationPolicyUtils.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import java.nio.ByteBuffer;
2121
import java.nio.charset.StandardCharsets;
2222
import java.util.ArrayList;
23-
import java.util.List;
23+
import java.util.Collection;
2424
import java.util.Random;
2525

2626
import org.apache.hadoop.classification.InterfaceAudience.Private;
@@ -188,8 +188,8 @@ public static FederationAMRMProxyPolicy loadAMRMPolicy(String queue,
188188
* @throws FederationPolicyException if there are no usable subclusters.
189189
*/
190190
public static void validateSubClusterAvailability(
191-
List<SubClusterId> activeSubClusters,
192-
List<SubClusterId> blackListSubClusters)
191+
Collection<SubClusterId> activeSubClusters,
192+
Collection<SubClusterId> blackListSubClusters)
193193
throws FederationPolicyException {
194194
if (activeSubClusters != null && !activeSubClusters.isEmpty()) {
195195
if (blackListSubClusters == null) {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/RouterPolicyFacade.java

Lines changed: 91 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import org.apache.hadoop.conf.Configuration;
2727
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
28+
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest;
2829
import org.apache.hadoop.yarn.conf.YarnConfiguration;
2930
import org.apache.hadoop.yarn.exceptions.YarnException;
3031
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException;
@@ -136,7 +137,7 @@ public SubClusterId getHomeSubcluster(
136137

137138
if (appSubmissionContext == null) {
138139
throw new FederationPolicyException(
139-
"The ApplicationSubmissionContext " + "cannot be null.");
140+
"The ApplicationSubmissionContext cannot be null.");
140141
}
141142

142143
String queue = appSubmissionContext.getQueue();
@@ -148,51 +149,7 @@ public SubClusterId getHomeSubcluster(
148149
queue = YarnConfiguration.DEFAULT_QUEUE_NAME;
149150
}
150151

151-
// the facade might cache this request, based on its parameterization
152-
SubClusterPolicyConfiguration configuration = null;
153-
154-
try {
155-
configuration = federationFacade.getPolicyConfiguration(queue);
156-
} catch (YarnException e) {
157-
String errMsg = "There is no policy configured for the queue: " + queue
158-
+ ", falling back to defaults.";
159-
LOG.warn(errMsg, e);
160-
}
161-
162-
// If there is no policy configured for this queue, fallback to the baseline
163-
// policy that is configured either in the store or via XML config (and
164-
// cached)
165-
if (configuration == null) {
166-
LOG.warn("There is no policies configured for queue: " + queue + " we"
167-
+ " fallback to default policy for: "
168-
+ YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY);
169-
170-
queue = YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY;
171-
try {
172-
configuration = federationFacade.getPolicyConfiguration(queue);
173-
} catch (YarnException e) {
174-
String errMsg = "Cannot retrieve policy configured for the queue: "
175-
+ queue + ", falling back to defaults.";
176-
LOG.warn(errMsg, e);
177-
178-
}
179-
}
180-
181-
// the fallback is not configure via store, but via XML, using
182-
// previously loaded configuration.
183-
if (configuration == null) {
184-
configuration =
185-
cachedConfs.get(YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY);
186-
}
187-
188-
// if the configuration has changed since last loaded, reinit the policy
189-
// based on current configuration
190-
if (!cachedConfs.containsKey(queue)
191-
|| !cachedConfs.get(queue).equals(configuration)) {
192-
singlePolicyReinit(policyMap, cachedConfs, queue, configuration);
193-
}
194-
195-
FederationRouterPolicy policy = policyMap.get(queue);
152+
FederationRouterPolicy policy = getFederationRouterPolicy(cachedConfs, policyMap, queue);
196153
if (policy == null) {
197154
// this should never happen, as the to maps are updated together
198155
throw new FederationPolicyException("No FederationRouterPolicy found "
@@ -262,4 +219,92 @@ public synchronized void reset() {
262219

263220
}
264221

222+
/**
223+
* This method provides a wrapper of all policy functionalities for routing a
224+
* reservation. Internally it manages configuration changes, and policy
225+
* init/reinit.
226+
*
227+
* @param request the reservation to route.
228+
*
229+
* @return the id of the subcluster that will be the "home" for this
230+
* reservation.
231+
*
232+
* @throws YarnException if there are issues initializing policies, or no
233+
* valid sub-cluster id could be found for this reservation.
234+
*/
235+
public SubClusterId getReservationHomeSubCluster(
236+
ReservationSubmissionRequest request) throws YarnException {
237+
238+
// the maps are concurrent, but we need to protect from reset()
239+
// reinitialization mid-execution by creating a new reference local to this
240+
// method.
241+
Map<String, SubClusterPolicyConfiguration> cachedConfs = globalConfMap;
242+
Map<String, FederationRouterPolicy> policyMap = globalPolicyMap;
243+
244+
if (request == null) {
245+
throw new FederationPolicyException(
246+
"The ReservationSubmissionRequest cannot be null.");
247+
}
248+
249+
String queue = request.getQueue();
250+
FederationRouterPolicy policy = getFederationRouterPolicy(cachedConfs, policyMap, queue);
251+
252+
if (policy == null) {
253+
// this should never happen, as the to maps are updated together
254+
throw new FederationPolicyException("No FederationRouterPolicy found "
255+
+ "for queue: " + request.getQueue() + " (while routing "
256+
+ "reservation: " + request.getReservationId() + ") "
257+
+ "and no default specified.");
258+
}
259+
260+
return policy.getReservationHomeSubcluster(request);
261+
}
262+
263+
private FederationRouterPolicy getFederationRouterPolicy(
264+
Map<String, SubClusterPolicyConfiguration> cachedConfiguration,
265+
Map<String, FederationRouterPolicy> policyMap, String queue)
266+
throws FederationPolicyInitializationException {
267+
268+
// the facade might cache this request, based on its parameterization
269+
SubClusterPolicyConfiguration configuration = null;
270+
String copyQueue = queue;
271+
272+
try {
273+
configuration = federationFacade.getPolicyConfiguration(copyQueue);
274+
} catch (YarnException e) {
275+
LOG.warn("There is no policy configured for the queue: {}, falling back to defaults.",
276+
copyQueue, e);
277+
}
278+
279+
// If there is no policy configured for this queue, fallback to the baseline
280+
// policy that is configured either in the store or via XML config (and cached)
281+
if (configuration == null) {
282+
final String policyKey = YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY;
283+
LOG.warn("There is no policies configured for queue: {} " +
284+
"we fallback to default policy for: {}. ", copyQueue, policyKey);
285+
copyQueue = YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY;
286+
try {
287+
configuration = federationFacade.getPolicyConfiguration(copyQueue);
288+
} catch (YarnException e) {
289+
LOG.warn("Cannot retrieve policy configured for the queue: {}, falling back to defaults.",
290+
copyQueue, e);
291+
}
292+
}
293+
294+
// the fallback is not configure via store, but via XML, using
295+
// previously loaded configuration.
296+
if (configuration == null) {
297+
configuration = cachedConfiguration.get(YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY);
298+
}
299+
300+
// if the configuration has changed since last loaded, reinit the policy
301+
// based on current configuration
302+
SubClusterPolicyConfiguration policyConfiguration =
303+
cachedConfiguration.getOrDefault(copyQueue, null);
304+
if (policyConfiguration == null || !policyConfiguration.equals(configuration)) {
305+
singlePolicyReinit(policyMap, cachedConfiguration, copyQueue, configuration);
306+
}
307+
308+
return policyMap.get(copyQueue);
309+
}
265310
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/router/AbstractRouterPolicy.java

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,22 @@
1818

1919
package org.apache.hadoop.yarn.server.federation.policies.router;
2020

21+
import java.util.List;
2122
import java.util.Map;
2223

24+
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest;
2325
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
26+
import org.apache.hadoop.yarn.api.records.ReservationId;
2427
import org.apache.hadoop.yarn.conf.YarnConfiguration;
28+
import org.apache.hadoop.yarn.exceptions.YarnException;
2529
import org.apache.hadoop.yarn.server.federation.policies.AbstractConfigurableFederationPolicy;
30+
import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils;
2631
import org.apache.hadoop.yarn.server.federation.policies.dao.WeightedPolicyInfo;
2732
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException;
2833
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
34+
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
2935
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterIdInfo;
36+
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
3037

3138
/**
3239
* Base abstract class for {@link FederationRouterPolicy} implementations, that
@@ -63,4 +70,107 @@ public void validate(ApplicationSubmissionContext appSubmissionContext)
6370
}
6471
}
6572

73+
/**
74+
* This method is implemented by the specific policy, and it is used to route
75+
* both reservations, and applications among a given set of
76+
* sub-clusters.
77+
*
78+
* @param queue the queue for this application/reservation
79+
* @param preSelectSubClusters a pre-filter set of sub-clusters
80+
* @return the chosen sub-cluster
81+
*
82+
* @throws YarnException if the policy fails to choose a sub-cluster
83+
*/
84+
protected abstract SubClusterId chooseSubCluster(String queue,
85+
Map<SubClusterId, SubClusterInfo> preSelectSubClusters) throws YarnException;
86+
87+
/**
88+
* Filter chosen SubCluster based on reservationId.
89+
*
90+
* @param reservationId the globally unique identifier for a reservation.
91+
* @param activeSubClusters the map of ids to info for all active subclusters.
92+
* @return the chosen sub-cluster
93+
* @throws YarnException if the policy fails to choose a sub-cluster
94+
*/
95+
protected Map<SubClusterId, SubClusterInfo> prefilterSubClusters(
96+
ReservationId reservationId, Map<SubClusterId, SubClusterInfo> activeSubClusters)
97+
throws YarnException {
98+
99+
// if a reservation exists limit scope to the sub-cluster this
100+
// reservation is mapped to
101+
// TODO: Implemented in YARN-11236
102+
return activeSubClusters;
103+
}
104+
105+
/**
106+
* Simply picks from alphabetically-sorted active subclusters based on the
107+
* hash of quey name. Jobs of the same queue will all be routed to the same
108+
* sub-cluster, as far as the number of active sub-cluster and their names
109+
* remain the same.
110+
*
111+
* @param appContext the {@link ApplicationSubmissionContext} that
112+
* has to be routed to an appropriate subCluster for execution.
113+
*
114+
* @param blackLists the list of subClusters as identified by
115+
* {@link SubClusterId} to blackList from the selection of the home
116+
* subCluster.
117+
*
118+
* @return a hash-based chosen {@link SubClusterId} that will be the "home"
119+
* for this application.
120+
*
121+
* @throws YarnException if there are no active subclusters.
122+
*/
123+
@Override
124+
public SubClusterId getHomeSubcluster(ApplicationSubmissionContext appContext,
125+
List<SubClusterId> blackLists) throws YarnException {
126+
127+
// null checks and default-queue behavior
128+
validate(appContext);
129+
130+
// apply filtering based on reservation location and active sub-clusters
131+
Map<SubClusterId, SubClusterInfo> filteredSubClusters = prefilterSubClusters(
132+
appContext.getReservationID(), getActiveSubclusters());
133+
134+
FederationPolicyUtils.validateSubClusterAvailability(filteredSubClusters.keySet(), blackLists);
135+
136+
// remove black SubCluster
137+
if (blackLists != null) {
138+
blackLists.forEach(filteredSubClusters::remove);
139+
}
140+
141+
// pick the chosen subCluster from the active ones
142+
return chooseSubCluster(appContext.getQueue(), filteredSubClusters);
143+
}
144+
145+
/**
146+
* This method provides a wrapper of all policy functionalities for routing a
147+
* reservation. Internally it manages configuration changes, and policy
148+
* init/reinit.
149+
*
150+
* @param request the reservation to route.
151+
*
152+
* @return the id of the subcluster that will be the "home" for this
153+
* reservation.
154+
*
155+
* @throws YarnException if there are issues initializing policies, or no
156+
* valid sub-cluster id could be found for this reservation.
157+
*/
158+
@Override
159+
public SubClusterId getReservationHomeSubcluster(ReservationSubmissionRequest request)
160+
throws YarnException {
161+
if (request == null) {
162+
throw new FederationPolicyException("The ReservationSubmissionRequest cannot be null.");
163+
}
164+
165+
if (request.getQueue() == null) {
166+
request.setQueue(YarnConfiguration.DEFAULT_QUEUE_NAME);
167+
}
168+
169+
// apply filtering based on reservation location and active sub-clusters
170+
Map<SubClusterId, SubClusterInfo> filteredSubClusters = prefilterSubClusters(
171+
request.getReservationId(), getActiveSubclusters());
172+
173+
// pick the chosen subCluster from the active ones
174+
return chooseSubCluster(request.getQueue(), filteredSubClusters);
175+
}
66176
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-common/src/main/java/org/apache/hadoop/yarn/server/federation/policies/router/FederationRouterPolicy.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import java.util.List;
2121

22+
import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest;
2223
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
2324
import org.apache.hadoop.yarn.exceptions.YarnException;
2425
import org.apache.hadoop.yarn.server.federation.policies.ConfigurableFederationPolicy;
@@ -49,4 +50,16 @@ public interface FederationRouterPolicy extends ConfigurableFederationPolicy {
4950
SubClusterId getHomeSubcluster(
5051
ApplicationSubmissionContext appSubmissionContext,
5152
List<SubClusterId> blackListSubClusters) throws YarnException;
53+
54+
/**
55+
* Determines the sub-cluster where a ReservationSubmissionRequest should be
56+
* sent to.
57+
*
58+
* @param request the original request
59+
* @return a mapping of sub-clusters and the requests
60+
*
61+
* @throws YarnException if the policy fails to choose a sub-cluster
62+
*/
63+
SubClusterId getReservationHomeSubcluster(
64+
ReservationSubmissionRequest request) throws YarnException;
5265
}

0 commit comments

Comments
 (0)