Skip to content

Commit 358bf80

Browse files
authored
HADOOP-18606. ABFS: Add reason in x-ms-client-request-id on a retried API call. (#5299)
Contributed by Pranav Saxena
1 parent 9274018 commit 358bf80

18 files changed

+1124
-4
lines changed

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/AbfsHttpConstants.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,17 @@ public final class AbfsHttpConstants {
111111
public static final char CHAR_EQUALS = '=';
112112
public static final char CHAR_STAR = '*';
113113
public static final char CHAR_PLUS = '+';
114+
/**
115+
* Divisor applied to an HTTP status code to obtain its category (the hundreds digit).
116+
* <pre>
117+
* 100 - 199 : Informational responses
118+
* 200 - 299 : Successful responses
119+
* 300 - 399 : Redirection messages
120+
* 400 - 499 : Client error responses
121+
* 500 - 599 : Server error responses
122+
* </pre>
123+
*/
124+
public static final Integer HTTP_STATUS_CATEGORY_QUOTIENT = 100;
114125

115126
private AbfsHttpConstants() {}
116127
}

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/contracts/services/AzureServiceErrorCode.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ public String getErrorCode() {
6666
return this.errorCode;
6767
}
6868

69+
public String getErrorMessage() {
70+
return this.errorMessage;
71+
}
72+
6973
public static List<AzureServiceErrorCode> getAzureServiceCode(int httpStatusCode) {
7074
List<AzureServiceErrorCode> errorCodes = new ArrayList<>();
7175
if (httpStatusCode == UNKNOWN.httpStatusCode) {

hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsRestOperation.java

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.slf4j.Logger;
2929
import org.slf4j.LoggerFactory;
3030

31+
import org.apache.hadoop.classification.VisibleForTesting;
3132
import org.apache.hadoop.fs.azurebfs.AbfsStatistic;
3233
import org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants;
3334
import org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
@@ -73,6 +74,12 @@ public class AbfsRestOperation {
7374
private AbfsHttpOperation result;
7475
private AbfsCounters abfsCounters;
7576

77+
/**
78+
* Abbreviated reason for the failure of the last API call made within the same
79+
* AbfsRestOperation object; it is handed to the tracing header of the next attempt.
80+
*/
81+
private String failureReason;
82+
7683
/**
7784
* Checks if there is non-null HTTP response.
7885
* @return true if there is a non-null HTTP response from the ABFS call.
@@ -208,7 +215,7 @@ public void execute(TracingContext tracingContext)
208215
private void completeExecute(TracingContext tracingContext)
209216
throws AzureBlobFileSystemException {
210217
// see if we have latency reports from the previous requests
211-
String latencyHeader = this.client.getAbfsPerfTracker().getClientLatency();
218+
String latencyHeader = getClientLatency();
212219
if (latencyHeader != null && !latencyHeader.isEmpty()) {
213220
AbfsHttpHeader httpHeader =
214221
new AbfsHttpHeader(HttpHeaderConfigurations.X_MS_ABFS_CLIENT_LATENCY, latencyHeader);
@@ -237,6 +244,11 @@ private void completeExecute(TracingContext tracingContext)
237244
LOG.trace("{} REST operation complete", operationType);
238245
}
239246

247+
@VisibleForTesting
248+
String getClientLatency() {
249+
return client.getAbfsPerfTracker().getClientLatency();
250+
}
251+
240252
/**
241253
* Executes a single HTTP operation to complete the REST operation. If it
242254
* fails, there may be a retry. The retryCount is incremented with each
@@ -248,9 +260,9 @@ private boolean executeHttpOperation(final int retryCount,
248260

249261
try {
250262
// initialize the HTTP request and open the connection
251-
httpOperation = new AbfsHttpOperation(url, method, requestHeaders);
263+
httpOperation = createHttpOperation();
252264
incrementCounter(AbfsStatistic.CONNECTIONS_MADE, 1);
253-
tracingContext.constructHeader(httpOperation);
265+
tracingContext.constructHeader(httpOperation, failureReason);
254266

255267
switch(client.getAuthType()) {
256268
case Custom:
@@ -303,6 +315,7 @@ private boolean executeHttpOperation(final int retryCount,
303315
} catch (UnknownHostException ex) {
304316
String hostname = null;
305317
hostname = httpOperation.getHost();
318+
failureReason = RetryReason.getAbbreviation(ex, null, null);
306319
LOG.warn("Unknown host name: {}. Retrying to resolve the host name...",
307320
hostname);
308321
if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) {
@@ -314,6 +327,8 @@ private boolean executeHttpOperation(final int retryCount,
314327
LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex);
315328
}
316329

330+
failureReason = RetryReason.getAbbreviation(ex, -1, "");
331+
317332
if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) {
318333
throw new InvalidAbfsRestOperationException(ex);
319334
}
@@ -326,6 +341,8 @@ private boolean executeHttpOperation(final int retryCount,
326341
LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);
327342

328343
if (client.getRetryPolicy().shouldRetry(retryCount, httpOperation.getStatusCode())) {
344+
int status = httpOperation.getStatusCode();
345+
failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
329346
return false;
330347
}
331348

@@ -334,6 +351,15 @@ private boolean executeHttpOperation(final int retryCount,
334351
return true;
335352
}
336353

354+
/**
355+
* Creates new object of {@link AbfsHttpOperation} with the url, method, and
356+
* requestHeaders fields of the AbfsRestOperation object.
357+
*/
358+
@VisibleForTesting
359+
AbfsHttpOperation createHttpOperation() throws IOException {
360+
return new AbfsHttpOperation(url, method, requestHeaders);
361+
}
362+
337363
/**
338364
* Incrementing Abfs counters with a long value.
339365
*
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.azurebfs.services;
20+
21+
import java.util.LinkedList;
22+
import java.util.List;
23+
24+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ClientErrorRetryReason;
25+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ConnectionResetRetryReason;
26+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ConnectionTimeoutRetryReason;
27+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ReadTimeoutRetryReason;
28+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.RetryReasonCategory;
29+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.ServerErrorRetryReason;
30+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.UnknownHostRetryReason;
31+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.UnknownIOExceptionRetryReason;
32+
import org.apache.hadoop.fs.azurebfs.services.retryReasonCategories.UnknownSocketExceptionRetryReason;
33+
34+
35+
/**
36+
* This utility class exposes methods to convert a server response-error to a
37+
* category of error.
38+
*/
39+
final class RetryReason {
40+
41+
/**
42+
* Linked-list of the implementations of RetryReasonCategory. The objects in the
43+
* list are arranged by the rank of their significance.
44+
* <ul>
45+
* <li>ServerError (statusCode==5XX), ClientError (statusCode==4XX) are
46+
* independent of other retryReason categories.</li>
47+
* <li>Since {@link java.net.SocketException} is subclass of
48+
* {@link java.io.IOException},
49+
* hence, {@link UnknownIOExceptionRetryReason} is placed before
50+
* {@link UnknownSocketExceptionRetryReason}</li>
51+
* <li>Since, connectionTimeout, readTimeout, and connectionReset are
52+
* {@link java.net.SocketTimeoutException} exceptions with different messages,
53+
* hence, {@link ConnectionTimeoutRetryReason}, {@link ReadTimeoutRetryReason},
54+
* {@link ConnectionResetRetryReason} are above {@link UnknownIOExceptionRetryReason}.
55+
* There is no order between the three reasons as they are differentiated
56+
* by exception-message.</li>
57+
* <li>Since, {@link java.net.UnknownHostException} is subclass of
58+
* {@link java.io.IOException}, {@link UnknownHostRetryReason} is placed
59+
* over {@link UnknownIOExceptionRetryReason}</li>
60+
* </ul>
61+
*/
62+
private static List<RetryReasonCategory> rankedReasonCategories
63+
= new LinkedList<RetryReasonCategory>() {{
64+
add(new ServerErrorRetryReason());
65+
add(new ClientErrorRetryReason());
66+
add(new UnknownIOExceptionRetryReason());
67+
add(new UnknownSocketExceptionRetryReason());
68+
add(new ConnectionTimeoutRetryReason());
69+
add(new ReadTimeoutRetryReason());
70+
add(new UnknownHostRetryReason());
71+
add(new ConnectionResetRetryReason());
72+
}};
73+
74+
private RetryReason() {
75+
76+
}
77+
78+
/**
79+
* Method to get correct abbreviation for a given set of exception, statusCode,
80+
* storageStatusCode.
81+
*
82+
* @param ex exception caught during server communication.
83+
* @param statusCode statusCode in the server response.
84+
* @param storageErrorMessage storageErrorMessage in the server response.
85+
*
86+
* @return abbreviation for the the given set of exception, statusCode, storageStatusCode.
87+
*/
88+
static String getAbbreviation(Exception ex,
89+
Integer statusCode,
90+
String storageErrorMessage) {
91+
String result = null;
92+
for (RetryReasonCategory retryReasonCategory : rankedReasonCategories) {
93+
final String abbreviation
94+
= retryReasonCategory.captureAndGetAbbreviation(ex,
95+
statusCode, storageErrorMessage);
96+
if (abbreviation != null) {
97+
result = abbreviation;
98+
}
99+
}
100+
return result;
101+
}
102+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.azurebfs.services;
20+
21+
/**
 * Holder of the string constants used while deriving a retry reason:
 * the messages that are matched and the abbreviations that are reported.
 */
public final class RetryReasonConstants {

  // Messages that are matched against exception / server error messages.
  public static final String CONNECTION_TIMEOUT_JDK_MESSAGE = "connect timed out";
  public static final String READ_TIMEOUT_JDK_MESSAGE = "Read timed out";
  public static final String CONNECTION_RESET_MESSAGE = "Connection reset";
  public static final String OPERATION_BREACH_MESSAGE = "Operations per second is over the account limit.";

  // Abbreviations for exception-based retry reasons.
  public static final String CONNECTION_RESET_ABBREVIATION = "CR";
  public static final String CONNECTION_TIMEOUT_ABBREVIATION = "CT";
  public static final String READ_TIMEOUT_ABBREVIATION = "RT";
  public static final String UNKNOWN_HOST_EXCEPTION_ABBREVIATION = "UH";
  public static final String IO_EXCEPTION_ABBREVIATION = "IOE";
  public static final String SOCKET_EXCEPTION_ABBREVIATION = "SE";

  // Abbreviations for limit-breach retry reasons.
  public static final String INGRESS_LIMIT_BREACH_ABBREVIATION = "ING";
  public static final String EGRESS_LIMIT_BREACH_ABBREVIATION = "EGR";
  public static final String OPERATION_LIMIT_BREACH_ABBREVIATION = "OPR";

  private RetryReasonConstants() {
    // constants holder; not meant to be instantiated.
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.azurebfs.services.retryReasonCategories;
20+
21+
import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_STATUS_CATEGORY_QUOTIENT;
22+
23+
/**
24+
* Category that can capture server-response errors for 4XX status-code.
25+
*/
26+
public class ClientErrorRetryReason extends RetryReasonCategory {
27+
28+
@Override
29+
Boolean canCapture(final Exception ex,
30+
final Integer statusCode,
31+
final String serverErrorMessage) {
32+
if (statusCode == null || statusCode / HTTP_STATUS_CATEGORY_QUOTIENT != 4) {
33+
return false;
34+
}
35+
return true;
36+
}
37+
38+
@Override
39+
String getAbbreviation(final Integer statusCode,
40+
final String serverErrorMessage) {
41+
return statusCode + "";
42+
}
43+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.azurebfs.services.retryReasonCategories;
20+
21+
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_RESET_ABBREVIATION;
22+
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_RESET_MESSAGE;
23+
24+
/**
25+
* Category that can capture server-response errors for connection-reset exception.
26+
*/
27+
public class ConnectionResetRetryReason extends
28+
RetryReasonCategory {
29+
30+
@Override
31+
Boolean canCapture(final Exception ex,
32+
final Integer statusCode,
33+
final String serverErrorMessage) {
34+
return checkExceptionMessage(ex, CONNECTION_RESET_MESSAGE);
35+
}
36+
37+
@Override
38+
String getAbbreviation(final Integer statusCode,
39+
final String serverErrorMessage) {
40+
return CONNECTION_RESET_ABBREVIATION;
41+
}
42+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
* <p>
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
* <p>
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.fs.azurebfs.services.retryReasonCategories;
20+
21+
22+
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_ABBREVIATION;
23+
import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_JDK_MESSAGE;
24+
25+
/**
26+
* Category that can capture server-response errors for connection-timeout.
27+
*/
28+
public class ConnectionTimeoutRetryReason extends
29+
RetryReasonCategory {
30+
31+
@Override
32+
String getAbbreviation(final Integer statusCode,
33+
final String serverErrorMessage) {
34+
return CONNECTION_TIMEOUT_ABBREVIATION;
35+
}
36+
37+
@Override
38+
Boolean canCapture(final Exception ex,
39+
final Integer statusCode,
40+
final String serverErrorMessage) {
41+
return checkExceptionMessage(ex, CONNECTION_TIMEOUT_JDK_MESSAGE);
42+
}
43+
}

0 commit comments

Comments
 (0)