Skip to content

YARN-11509. The FederationInterceptor#launchUAM Added retry logic. #5727

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4058,6 +4058,20 @@ public static boolean isAclEnabled(Configuration conf) {
public static final long DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT =
60000; // one minute

// Configuration key: retry count used by AMRMProxy when registering a UAM.
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
FEDERATION_PREFIX + "amrmproxy.register.uam.retry-count";
// Default retry count: registering a UAM is retried a maximum of 3 times.
public static final int DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT =
3;

// Configuration key: interval between retries when AMRMProxy registers a UAM.
public static final String FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
FEDERATION_PREFIX + "amrmproxy.register.uam.interval";
// Default retry interval, expressed in milliseconds: 100 ms.
public static final long DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL =
TimeUnit.MILLISECONDS.toMillis(100);

public static final String DEFAULT_FEDERATION_POLICY_KEY = "*";
public static final String FEDERATION_POLICY_MANAGER = FEDERATION_PREFIX
+ "policy-manager";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5408,4 +5408,22 @@
</description>
</property>

<property>
<description>
The number of retries when registering a UAM.
The default value is 3.
</description>
<name>yarn.federation.amrmproxy.register.uam.retry-count</name>
<value>3</value>
</property>

<property>
<description>
The interval between retries when registering a UAM.
The default value is 100ms.
</description>
<name>yarn.federation.amrmproxy.register.uam.interval</name>
<value>100ms</value>
</property>

</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
Expand Down Expand Up @@ -87,6 +88,7 @@
import org.apache.hadoop.yarn.server.federation.policies.amrmproxy.FederationAMRMProxyPolicy;
import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException;
import org.apache.hadoop.yarn.server.federation.resolver.SubClusterResolver;
import org.apache.hadoop.yarn.server.federation.retry.FederationActionRetry;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId;
import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo;
import org.apache.hadoop.yarn.server.federation.utils.FederationRegistryClient;
Expand Down Expand Up @@ -251,6 +253,10 @@ public class FederationInterceptor extends AbstractRequestInterceptor {
// the maximum wait time for the first async heart beat response
private long heartbeatMaxWaitTimeMs;

// the retry count passed to the retry helper when launching and registering a UAM
private int registerUamRetryNum;

// the interval between UAM launch/register retries, read from the configuration in milliseconds
private long registerUamRetryInterval;

private boolean waitUamRegisterDone;

private MonotonicClock clock = new MonotonicClock();
Expand Down Expand Up @@ -355,6 +361,24 @@ public void init(AMRMProxyApplicationContext appContext) {
this.subClusterTimeOut =
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_SUBCLUSTER_TIMEOUT;
}

// Read the retry count for launching/registering a UAM; a non-positive
// value is rejected and replaced by the default.
this.registerUamRetryNum = conf.getInt(
    YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
    YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
if (this.registerUamRetryNum <= 0) {
  // Log the invalid configured retry count before it is overwritten.
  LOG.info("{} configured to be {}, should be positive. Using default of {}.",
      YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT,
      this.registerUamRetryNum,
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT);
  this.registerUamRetryNum =
      YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_COUNT;
}

this.registerUamRetryInterval = conf.getTimeDuration(
YarnConfiguration.FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
YarnConfiguration.DEFAULT_FEDERATION_AMRMPROXY_REGISTER_UAM_RETRY_INTERVAL,
TimeUnit.MILLISECONDS);

this.waitUamRegisterDone = conf.getBoolean(YarnConfiguration.AMRM_PROXY_WAIT_UAM_REGISTER_DONE,
YarnConfiguration.DEFAULT_AMRM_PROXY_WAIT_UAM_REGISTER_DONE);
}
Expand Down Expand Up @@ -701,7 +725,7 @@ public AllocateResponse allocate(AllocateRequest request)

if (this.finishAMCalled) {
LOG.warn("FinishApplicationMaster already called by {}, skip heartbeat "
+ "processing and return dummy response" + this.attemptId);
+ "processing and return dummy response.", this.attemptId);
return RECORD_FACTORY.newRecordInstance(AllocateResponse.class);
}

Expand Down Expand Up @@ -1255,85 +1279,77 @@ private List<SubClusterId> registerAndAllocateWithNewSubClusters(
// Check to see if there are any new sub-clusters in this request
// list and create and register Unmanaged AM instance for the new ones
List<SubClusterId> newSubClusters = new ArrayList<>();
for (SubClusterId subClusterId : requests.keySet()) {
if (!subClusterId.equals(this.homeSubClusterId)
&& !this.uamPool.hasUAMId(subClusterId.getId())) {
newSubClusters.add(subClusterId);

requests.keySet().stream().forEach(subClusterId -> {
String id = subClusterId.getId();
if (!subClusterId.equals(this.homeSubClusterId) && !this.uamPool.hasUAMId(id)) {
newSubClusters.add(subClusterId);
// Set sub-cluster to be timed out initially
lastSCResponseTime.put(subClusterId,
clock.getTime() - subClusterTimeOut);
lastSCResponseTime.put(subClusterId, clock.getTime() - subClusterTimeOut);
}
}
});

this.uamRegisterFutures.clear();

for (final SubClusterId scId : newSubClusters) {
Future<?> future = this.threadpool.submit(new Runnable() {
@Override
public void run() {
String subClusterId = scId.getId();

// Create a config loaded with federation on and subclusterId
// for each UAM
YarnConfiguration config = new YarnConfiguration(getConf());
FederationProxyProviderUtil.updateConfForFederation(config,
subClusterId);

RegisterApplicationMasterResponse uamResponse = null;
Token<AMRMTokenIdentifier> token = null;
try {
ApplicationId applicationId = attemptId.getApplicationId();
ApplicationSubmissionContext originalSubmissionContext =
federationFacade.getApplicationSubmissionContext(applicationId);

// For appNameSuffix, use subClusterId of the home sub-cluster
token = uamPool.launchUAM(subClusterId, config,
applicationId, amRegistrationResponse.getQueue(),
getApplicationContext().getUser(), homeSubClusterId.toString(),
true, subClusterId, originalSubmissionContext);

secondaryRelayers.put(subClusterId,
uamPool.getAMRMClientRelayer(subClusterId));

uamResponse = uamPool.registerApplicationMaster(subClusterId,
amRegistrationRequest);
} catch (Throwable e) {
LOG.error("Failed to register application master: " + subClusterId
+ " Application: " + attemptId, e);
// TODO: UAM registration for this sub-cluster RM
// failed. For now, we ignore the resource requests and continue
// but we need to fix this and handle this situation. One way would
// be to send the request to another RM by consulting the policy.
return;
}
uamRegistrations.put(scId, uamResponse);
LOG.info("Successfully registered unmanaged application master: "
+ subClusterId + " ApplicationId: " + attemptId);

try {
uamPool.allocateAsync(subClusterId, requests.get(scId),
new HeartbeatCallBack(scId, true));
} catch (Throwable e) {
LOG.error("Failed to allocate async to " + subClusterId
+ " Application: " + attemptId, e);
}
Future<?> future = this.threadpool.submit(() -> {

// Save the UAM token in registry or NMSS
try {
if (registryClient != null) {
registryClient.writeAMRMTokenForUAM(attemptId.getApplicationId(),
subClusterId, token);
} else if (getNMStateStore() != null) {
getNMStateStore().storeAMRMProxyAppContextEntry(attemptId,
NMSS_SECONDARY_SC_PREFIX + subClusterId,
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT));
}
} catch (Throwable e) {
LOG.error("Failed to persist UAM token from " + subClusterId
+ " Application: " + attemptId, e);
String subClusterId = scId.getId();

// Create a config loaded with federation on and subclusterId
// for each UAM
YarnConfiguration config = new YarnConfiguration(getConf());
FederationProxyProviderUtil.updateConfForFederation(config, subClusterId);
ApplicationId applicationId = attemptId.getApplicationId();

RegisterApplicationMasterResponse uamResponse;
Token<AMRMTokenIdentifier> token;

// LaunchUAM And RegisterApplicationMaster
try {
TokenAndRegisterResponse result =
((FederationActionRetry<TokenAndRegisterResponse>) (retryCount) ->
launchUAMAndRegisterApplicationMaster(config, subClusterId, applicationId)).
runWithRetries(registerUamRetryNum, registerUamRetryInterval);

token = result.getToken();
uamResponse = result.getResponse();
} catch (Throwable e) {
LOG.error("Failed to register application master: {} Application: {}.",
subClusterId, attemptId, e);
return;
}

uamRegistrations.put(scId, uamResponse);

LOG.info("Successfully registered unmanaged application master: {} " +
"ApplicationId: {}.", subClusterId, attemptId);

// Allocate Request
try {
uamPool.allocateAsync(subClusterId, requests.get(scId),
new HeartbeatCallBack(scId, true));
} catch (Throwable e) {
LOG.error("Failed to allocate async to {} Application: {}.",
subClusterId, attemptId, e);
}

// Save the UAM token in registry or NMSS
try {
if (registryClient != null) {
registryClient.writeAMRMTokenForUAM(applicationId, subClusterId, token);
} else if (getNMStateStore() != null) {
getNMStateStore().storeAMRMProxyAppContextEntry(attemptId,
NMSS_SECONDARY_SC_PREFIX + subClusterId,
token.encodeToUrlString().getBytes(STRING_TO_BYTE_FORMAT));
}
} catch (Throwable e) {
LOG.error("Failed to persist UAM token from {} Application {}",
subClusterId, attemptId, e);
}
});

this.uamRegisterFutures.put(scId, future);
}

Expand All @@ -1347,10 +1363,34 @@ public void run() {
}
}


return newSubClusters;
}

/**
 * Launches the UAM for one sub-cluster, records its AMRMClientRelayer and
 * registers the application master with it.
 *
 * @param config configuration handed to the UAM pool when launching the UAM.
 * @param subClusterId id of the sub-cluster; also used as the UAM id in the pool.
 * @param applicationId application whose submission context is looked up
 *          and for which the UAM is launched.
 * @return the AMRM token returned by the launch together with the
 *         register response.
 * @throws IOException propagated from the underlying calls.
 * @throws YarnException propagated from the underlying calls.
 */
protected TokenAndRegisterResponse launchUAMAndRegisterApplicationMaster(
    YarnConfiguration config, String subClusterId, ApplicationId applicationId)
    throws IOException, YarnException {

  // Collect the launch arguments up front, in the same order as before.
  final ApplicationSubmissionContext submissionContext =
      federationFacade.getApplicationSubmissionContext(applicationId);
  final String user = getApplicationContext().getUser();
  final String homeSubCluster = homeSubClusterId.toString();
  final String queueName = amRegistrationResponse.getQueue();

  // The home sub-cluster id is passed as the appNameSuffix.
  final Token<AMRMTokenIdentifier> uamToken = uamPool.launchUAM(
      subClusterId, config, applicationId, queueName, user, homeSubCluster,
      true, subClusterId, submissionContext);

  // Remember which AMRMClientRelayer serves this sub-cluster.
  secondaryRelayers.put(subClusterId, uamPool.getAMRMClientRelayer(subClusterId));

  // Register the application master and pair the response with the token.
  return new TokenAndRegisterResponse(uamToken,
      uamPool.registerApplicationMaster(subClusterId, amRegistrationRequest));
}

/**
* Prepare the base allocation response. Use lastSCResponse and
* lastHeartbeatTimeStamp to assemble entries about cluster-wide info, e.g.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.yarn.server.nodemanager.amrmproxy;

import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;

/**
 * Immutable holder pairing the AMRM token with the
 * RegisterApplicationMasterResponse, so that both can be returned from a
 * single call.
 */
public class TokenAndRegisterResponse {
  // Both fields are assigned exactly once in the constructor.
  private final Token<AMRMTokenIdentifier> token;
  private final RegisterApplicationMasterResponse response;

  /**
   * Creates a holder for the given token and register response.
   *
   * @param pToken the AMRM token to hold.
   * @param pResponse the register response to hold.
   */
  public TokenAndRegisterResponse(Token<AMRMTokenIdentifier> pToken,
      RegisterApplicationMasterResponse pResponse) {
    this.token = pToken;
    this.response = pResponse;
  }

  /**
   * @return the AMRM token given at construction.
   */
  public Token<AMRMTokenIdentifier> getToken() {
    return token;
  }

  /**
   * @return the register response given at construction.
   */
  public RegisterApplicationMasterResponse getResponse() {
    return response;
  }
}
Loading