Skip to content

HADOOP-18606. ABFS: Add reason in x-ms-client-request-id on a retried API call. (#5299) #5461

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,17 @@ public final class AbfsHttpConstants {
public static final char CHAR_EQUALS = '=';
public static final char CHAR_STAR = '*';
public static final char CHAR_PLUS = '+';
/**
 * Divisor used to derive the category of an HTTP status code: integer
 * division of the status code by this value yields the leading digit of the
 * ranges listed below (for example, 4 for the 400 - 499 range).
 * <pre>
 * 100 - 199 : Informational responses
 * 200 - 299 : Successful responses
 * 300 - 399 : Redirection messages
 * 400 - 499 : Client error responses
 * 500 - 599 : Server error responses
 * </pre>
 */
public static final Integer HTTP_STATUS_CATEGORY_QUOTIENT = 100;

private AbfsHttpConstants() {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ public String getErrorCode() {
return this.errorCode;
}

/**
 * Accessor for the error message held by this object.
 *
 * @return the value of the {@code errorMessage} field.
 */
public String getErrorMessage() {
  return errorMessage;
}

public static List<AzureServiceErrorCode> getAzureServiceCode(int httpStatusCode) {
List<AzureServiceErrorCode> errorCodes = new ArrayList<>();
if (httpStatusCode == UNKNOWN.httpStatusCode) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.fs.azurebfs.AbfsStatistic;
import org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants;
import org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
Expand Down Expand Up @@ -73,6 +74,12 @@ public class AbfsRestOperation {
private AbfsHttpOperation result;
private AbfsCounters abfsCounters;

/**
* This variable contains the reason of last API call within the same
* AbfsRestOperation object.
*/
private String failureReason;

/**
* Checks if there is non-null HTTP response.
* @return true if there is a non-null HTTP response from the ABFS call.
Expand Down Expand Up @@ -208,7 +215,7 @@ public void execute(TracingContext tracingContext)
private void completeExecute(TracingContext tracingContext)
throws AzureBlobFileSystemException {
// see if we have latency reports from the previous requests
String latencyHeader = this.client.getAbfsPerfTracker().getClientLatency();
String latencyHeader = getClientLatency();
if (latencyHeader != null && !latencyHeader.isEmpty()) {
AbfsHttpHeader httpHeader =
new AbfsHttpHeader(HttpHeaderConfigurations.X_MS_ABFS_CLIENT_LATENCY, latencyHeader);
Expand Down Expand Up @@ -237,6 +244,11 @@ private void completeExecute(TracingContext tracingContext)
LOG.trace("{} REST operation complete", operationType);
}

/**
 * Reads the client latency value from the perf tracker of the client.
 * Marked as visible for testing.
 *
 * @return the latency string reported by the client's perf tracker; the
 * caller in this class checks it for null and empty before using it.
 */
@VisibleForTesting
String getClientLatency() {
  final String clientLatency
      = this.client.getAbfsPerfTracker().getClientLatency();
  return clientLatency;
}

/**
* Executes a single HTTP operation to complete the REST operation. If it
* fails, there may be a retry. The retryCount is incremented with each
Expand All @@ -248,9 +260,9 @@ private boolean executeHttpOperation(final int retryCount,

try {
// initialize the HTTP request and open the connection
httpOperation = new AbfsHttpOperation(url, method, requestHeaders);
httpOperation = createHttpOperation();
incrementCounter(AbfsStatistic.CONNECTIONS_MADE, 1);
tracingContext.constructHeader(httpOperation);
tracingContext.constructHeader(httpOperation, failureReason);

switch(client.getAuthType()) {
case Custom:
Expand Down Expand Up @@ -303,6 +315,7 @@ private boolean executeHttpOperation(final int retryCount,
} catch (UnknownHostException ex) {
String hostname = null;
hostname = httpOperation.getHost();
failureReason = RetryReason.getAbbreviation(ex, null, null);
LOG.warn("Unknown host name: {}. Retrying to resolve the host name...",
hostname);
if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) {
Expand All @@ -314,6 +327,8 @@ private boolean executeHttpOperation(final int retryCount,
LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex);
}

failureReason = RetryReason.getAbbreviation(ex, -1, "");

if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) {
throw new InvalidAbfsRestOperationException(ex);
}
Expand All @@ -326,6 +341,8 @@ private boolean executeHttpOperation(final int retryCount,
LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);

if (client.getRetryPolicy().shouldRetry(retryCount, httpOperation.getStatusCode())) {
int status = httpOperation.getStatusCode();
failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
return false;
}

Expand All @@ -334,6 +351,15 @@ private boolean executeHttpOperation(final int retryCount,
return true;
}

/**
 * Builds a fresh {@link AbfsHttpOperation} from the url, method, and
 * requestHeaders fields held by this AbfsRestOperation object. Marked as
 * visible for testing.
 *
 * @return the newly constructed {@link AbfsHttpOperation}.
 * @throws IOException propagated from the {@link AbfsHttpOperation}
 * constructor, if it throws one.
 */
@VisibleForTesting
AbfsHttpOperation createHttpOperation() throws IOException {
  final AbfsHttpOperation operation
      = new AbfsHttpOperation(url, method, requestHeaders);
  return operation;
}

/**
* Incrementing Abfs counters with a long value.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services;

import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ClientErrorRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ConnectionResetRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ConnectionTimeoutRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ReadTimeoutRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.RetryReasonCategory;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ServerErrorRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.UnknownHostRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.UnknownIOExceptionRetryReason;
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.UnknownSocketExceptionRetryReason;


/**
 * This utility class exposes methods to convert a server response-error to a
 * category of error.
 */
final class RetryReason {

  /**
   * Ranked list of the implementations of RetryReasonCategory. The list is
   * ordered from the least specific to the most specific category:
   * {@link #getAbbreviation(Exception, Integer, String)} walks the whole list
   * and keeps the abbreviation of the LAST category that captures the
   * failure, so an entry placed later overrides an entry placed earlier.
   * <ul>
   *   <li>ServerError (statusCode==5XX), ClientError (statusCode==4XX) are
   *   independent of other retryReason categories.</li>
   *   <li>Since {@link java.net.SocketException} is subclass of
   *   {@link java.io.IOException},
   *   {@link UnknownIOExceptionRetryReason} is placed before
   *   {@link UnknownSocketExceptionRetryReason}, so that the socket-specific
   *   abbreviation is the one kept.</li>
   *   <li>{@link ConnectionTimeoutRetryReason}, {@link ReadTimeoutRetryReason}
   *   and {@link ConnectionResetRetryReason} are told apart by the
   *   exception-message, hence there is no order required between the three.
   *   They are placed after {@link UnknownIOExceptionRetryReason} and
   *   {@link UnknownSocketExceptionRetryReason} so that the more specific
   *   abbreviation is the one kept.</li>
   *   <li>Since {@link java.net.UnknownHostException} is subclass of
   *   {@link java.io.IOException}, {@link UnknownHostRetryReason} is placed
   *   after {@link UnknownIOExceptionRetryReason}.</li>
   * </ul>
   * The list is built once and wrapped as unmodifiable, as it is shared
   * static state that must never change after class initialization.
   */
  private static final List<RetryReasonCategory> RANKED_REASON_CATEGORIES
      = buildRankedReasonCategories();

  private RetryReason() {

  }

  /**
   * Creates the ranked list of categories. A plain method is used instead of
   * double-brace initialization so that no anonymous LinkedList subclass is
   * created.
   *
   * @return unmodifiable list of categories, least specific first.
   */
  private static List<RetryReasonCategory> buildRankedReasonCategories() {
    final List<RetryReasonCategory> categories = new LinkedList<>();
    categories.add(new ServerErrorRetryReason());
    categories.add(new ClientErrorRetryReason());
    categories.add(new UnknownIOExceptionRetryReason());
    categories.add(new UnknownSocketExceptionRetryReason());
    categories.add(new ConnectionTimeoutRetryReason());
    categories.add(new ReadTimeoutRetryReason());
    categories.add(new UnknownHostRetryReason());
    categories.add(new ConnectionResetRetryReason());
    return java.util.Collections.unmodifiableList(categories);
  }

  /**
   * Method to get correct abbreviation for a given set of exception, statusCode,
   * storageStatusCode.
   *
   * @param ex exception caught during server communication.
   * @param statusCode statusCode in the server response.
   * @param storageErrorMessage storageErrorMessage in the server response.
   *
   * @return abbreviation for the given set of exception, statusCode,
   * storageStatusCode; {@code null} if no category returns an abbreviation.
   */
  static String getAbbreviation(Exception ex,
      Integer statusCode,
      String storageErrorMessage) {
    String result = null;
    // Deliberately no early exit: a later (more specific) category must be
    // able to override the abbreviation of an earlier one.
    for (RetryReasonCategory retryReasonCategory : RANKED_REASON_CATEGORIES) {
      final String abbreviation
          = retryReasonCategory.captureAndGetAbbreviation(ex,
          statusCode, storageErrorMessage);
      if (abbreviation != null) {
        result = abbreviation;
      }
    }
    return result;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services;

/**
 * Constants used while deriving the reason of a retried request: the failure
 * messages and the abbreviations that stand for each reason.
 */
public final class RetryReasonConstants {

  // Failure messages.
  public static final String CONNECTION_TIMEOUT_JDK_MESSAGE = "connect timed out";
  public static final String READ_TIMEOUT_JDK_MESSAGE = "Read timed out";
  public static final String CONNECTION_RESET_MESSAGE = "Connection reset";
  public static final String OPERATION_BREACH_MESSAGE
      = "Operations per second is over the account limit.";

  // Abbreviations for connection-level and exception-level failures.
  public static final String CONNECTION_RESET_ABBREVIATION = "CR";
  public static final String CONNECTION_TIMEOUT_ABBREVIATION = "CT";
  public static final String READ_TIMEOUT_ABBREVIATION = "RT";
  public static final String UNKNOWN_HOST_EXCEPTION_ABBREVIATION = "UH";
  public static final String IO_EXCEPTION_ABBREVIATION = "IOE";
  public static final String SOCKET_EXCEPTION_ABBREVIATION = "SE";

  // Abbreviations for limit breaches.
  public static final String INGRESS_LIMIT_BREACH_ABBREVIATION = "ING";
  public static final String EGRESS_LIMIT_BREACH_ABBREVIATION = "EGR";
  public static final String OPERATION_LIMIT_BREACH_ABBREVIATION = "OPR";

  /** Constants holder; not meant to be instantiated. */
  private RetryReasonConstants() {
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services.retryReasonCategories;

import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_STATUS_CATEGORY_QUOTIENT;

/**
 * Retry-reason category for server responses that carry a 4XX status-code.
 */
public class ClientErrorRetryReason extends RetryReasonCategory {

  /**
   * Captures the failure only when a status-code is present and dividing it
   * by {@code HTTP_STATUS_CATEGORY_QUOTIENT} gives 4. The exception and the
   * server error message are not consulted.
   *
   * @param ex exception caught during server communication; unused here.
   * @param statusCode statusCode in the server response; may be null.
   * @param serverErrorMessage error message in the server response; unused here.
   * @return true if the status-code falls in the 4XX category, else false.
   */
  @Override
  Boolean canCapture(final Exception ex,
      final Integer statusCode,
      final String serverErrorMessage) {
    return statusCode != null
        && statusCode / HTTP_STATUS_CATEGORY_QUOTIENT == 4;
  }

  /**
   * The abbreviation of this category is the status-code itself, rendered
   * as a string.
   *
   * @param statusCode statusCode in the server response.
   * @param serverErrorMessage error message in the server response; unused here.
   * @return string form of the given status-code.
   */
  @Override
  String getAbbreviation(final Integer statusCode,
      final String serverErrorMessage) {
    return String.valueOf(statusCode);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services.retryReasonCategories;

import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_RESET_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_RESET_MESSAGE;

/**
 * Retry-reason category for failures whose exception matches the
 * connection-reset message.
 */
public class ConnectionResetRetryReason extends RetryReasonCategory {

  /**
   * @param statusCode statusCode in the server response; unused here.
   * @param serverErrorMessage error message in the server response; unused here.
   * @return the fixed abbreviation {@code CONNECTION_RESET_ABBREVIATION}.
   */
  @Override
  String getAbbreviation(final Integer statusCode,
      final String serverErrorMessage) {
    return CONNECTION_RESET_ABBREVIATION;
  }

  /**
   * Delegates the decision to {@code checkExceptionMessage}, giving it the
   * exception and {@code CONNECTION_RESET_MESSAGE}. The status-code and the
   * server error message are not consulted.
   *
   * @param ex exception caught during server communication.
   * @param statusCode statusCode in the server response; unused here.
   * @param serverErrorMessage error message in the server response; unused here.
   * @return the result of the message check on the given exception.
   */
  @Override
  Boolean canCapture(final Exception ex,
      final Integer statusCode,
      final String serverErrorMessage) {
    return checkExceptionMessage(ex, CONNECTION_RESET_MESSAGE);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.fs.azurebfs.services.retryReasonCategories;


import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_ABBREVIATION;
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_JDK_MESSAGE;

/**
 * Retry-reason category for failures whose exception matches the
 * connection-timeout message.
 */
public class ConnectionTimeoutRetryReason extends RetryReasonCategory {

  /**
   * Delegates the decision to {@code checkExceptionMessage}, giving it the
   * exception and {@code CONNECTION_TIMEOUT_JDK_MESSAGE}. The status-code and
   * the server error message are not consulted.
   *
   * @param ex exception caught during server communication.
   * @param statusCode statusCode in the server response; unused here.
   * @param serverErrorMessage error message in the server response; unused here.
   * @return the result of the message check on the given exception.
   */
  @Override
  Boolean canCapture(final Exception ex,
      final Integer statusCode,
      final String serverErrorMessage) {
    return checkExceptionMessage(ex, CONNECTION_TIMEOUT_JDK_MESSAGE);
  }

  /**
   * @param statusCode statusCode in the server response; unused here.
   * @param serverErrorMessage error message in the server response; unused here.
   * @return the fixed abbreviation {@code CONNECTION_TIMEOUT_ABBREVIATION}.
   */
  @Override
  String getAbbreviation(final Integer statusCode,
      final String serverErrorMessage) {
    return CONNECTION_TIMEOUT_ABBREVIATION;
  }
}
Loading