Skip to content

Commit e0563fe

Browse files
HADOOP-18908. Improve S3A region handling. (#6187)
S3A region logic improved for better inference and to be compatible with previous releases. 1. If you are using an AWS S3 AccessPoint, its region is determined from the ARN itself. 2. If fs.s3a.endpoint.region is set and non-empty, it is used. 3. If fs.s3a.endpoint is an s3.*.amazonaws.com url, the region is determined by parsing the URL. Note: vpce endpoints are not handled by this. 4. If fs.s3a.endpoint.region==null, and none could be determined from the endpoint, use us-east-2 as default. 5. If fs.s3a.endpoint.region=="" then it is handed off to the default AWS SDK resolution process. Consult the AWS SDK documentation for the details on its resolution process, knowing that it is complicated and may use environment variables, entries in ~/.aws/config, IAM instance information within EC2 deployments and possibly even JSON resources on the classpath. Put differently: it is somewhat brittle across deployments. Contributed by Ahmar Suhail
1 parent e5eb404 commit e0563fe

File tree

11 files changed

+269
-179
lines changed

11 files changed

+269
-179
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/statistics/StoreStatisticNames.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -407,10 +407,6 @@ public final class StoreStatisticNames {
407407
public static final String MULTIPART_UPLOAD_LIST
408408
= "multipart_upload_list";
409409

410-
/** Probe for store region: {@value}. */
411-
public static final String STORE_REGION_PROBE
412-
= "store_region_probe";
413-
414410
private StoreStatisticNames() {
415411
}
416412

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,12 @@ private Constants() {
11791179
*/
11801180
public static final String AWS_S3_CENTRAL_REGION = "us-east-1";
11811181

1182+
/**
1183+
* The default S3 region when using cross region client.
1184+
* Value {@value}.
1185+
*/
1186+
public static final String AWS_S3_DEFAULT_REGION = "us-east-2";
1187+
11821188
/**
11831189
* Require that all S3 access is made through Access Points.
11841190
*/

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/DefaultS3ClientFactory.java

Lines changed: 105 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.slf4j.Logger;
2727
import org.slf4j.LoggerFactory;
2828

29+
import software.amazon.awssdk.awscore.util.AwsHostNameUtils;
2930
import software.amazon.awssdk.core.client.config.ClientOverrideConfiguration;
3031
import software.amazon.awssdk.core.client.config.SdkAdvancedClientOption;
3132
import software.amazon.awssdk.core.interceptor.ExecutionInterceptor;
@@ -48,6 +49,9 @@
4849
import org.apache.hadoop.fs.s3a.statistics.impl.AwsStatisticsCollector;
4950
import org.apache.hadoop.fs.store.LogExactlyOnce;
5051

52+
import static org.apache.hadoop.fs.s3a.Constants.AWS_REGION;
53+
import static org.apache.hadoop.fs.s3a.Constants.AWS_S3_DEFAULT_REGION;
54+
import static org.apache.hadoop.fs.s3a.Constants.CENTRAL_ENDPOINT;
5155
import static org.apache.hadoop.fs.s3a.impl.AWSHeaders.REQUESTER_PAYS_HEADER;
5256
import static org.apache.hadoop.fs.s3a.Constants.DEFAULT_SECURE_CONNECTIONS;
5357
import static org.apache.hadoop.fs.s3a.Constants.SECURE_CONNECTIONS;
@@ -66,12 +70,27 @@ public class DefaultS3ClientFactory extends Configured
6670

6771
private static final String REQUESTER_PAYS_HEADER_VALUE = "requester";
6872

73+
private static final String S3_SERVICE_NAME = "s3";
74+
6975
/**
7076
* Subclasses refer to this.
7177
*/
7278
protected static final Logger LOG =
7379
LoggerFactory.getLogger(DefaultS3ClientFactory.class);
7480

81+
/**
82+
* A one-off warning of default region chains in use.
83+
*/
84+
private static final LogExactlyOnce WARN_OF_DEFAULT_REGION_CHAIN =
85+
new LogExactlyOnce(LOG);
86+
87+
/**
88+
* Warning message printed when the SDK Region chain is in use.
89+
*/
90+
private static final String SDK_REGION_CHAIN_IN_USE =
91+
"S3A filesystem client is using"
92+
+ " the SDK region resolution chain.";
93+
7594

7695
/** Exactly once log to inform about ignoring the AWS-SDK Warnings for CSE. */
7796
private static final LogExactlyOnce IGNORE_CSE_WARN = new LogExactlyOnce(LOG);
@@ -138,15 +157,7 @@ private <BuilderT extends S3BaseClientBuilder<BuilderT, ClientT>, ClientT> Build
138157
BuilderT builder, S3ClientCreationParameters parameters, Configuration conf, String bucket)
139158
throws IOException {
140159

141-
Region region = parameters.getRegion();
142-
LOG.debug("Using region {}", region);
143-
144-
URI endpoint = getS3Endpoint(parameters.getEndpoint(), conf);
145-
146-
if (endpoint != null) {
147-
builder.endpointOverride(endpoint);
148-
LOG.debug("Using endpoint {}", endpoint);
149-
}
160+
configureEndpointAndRegion(builder, parameters, conf);
150161

151162
S3Configuration serviceConfiguration = S3Configuration.builder()
152163
.pathStyleAccessEnabled(parameters.isPathStyleAccess())
@@ -155,7 +166,6 @@ private <BuilderT extends S3BaseClientBuilder<BuilderT, ClientT>, ClientT> Build
155166
return builder
156167
.overrideConfiguration(createClientOverrideConfiguration(parameters, conf))
157168
.credentialsProvider(parameters.getCredentialSet())
158-
.region(region)
159169
.serviceConfiguration(serviceConfiguration);
160170
}
161171

@@ -201,6 +211,72 @@ protected ClientOverrideConfiguration createClientOverrideConfiguration(
201211
return clientOverrideConfigBuilder.build();
202212
}
203213

214+
/**
215+
* This method configures the endpoint and region for a S3 client.
216+
* The order of configuration is:
217+
*
218+
* <ol>
219+
* <li>If region is configured via fs.s3a.endpoint.region, use it.</li>
220+
* <li>If endpoint is configured via via fs.s3a.endpoint, set it.
221+
* If no region is configured, try to parse region from endpoint. </li>
222+
* <li> If no region is configured, and it could not be parsed from the endpoint,
223+
* set the default region as US_EAST_2 and enable cross region access. </li>
224+
* <li> If configured region is empty, fallback to SDK resolution chain. </li>
225+
* </ol>
226+
*
227+
* @param builder S3 client builder.
228+
* @param parameters parameter object
229+
* @param conf conf configuration object
230+
* @param <BuilderT> S3 client builder type
231+
* @param <ClientT> S3 client type
232+
*/
233+
private <BuilderT extends S3BaseClientBuilder<BuilderT, ClientT>, ClientT> void configureEndpointAndRegion(
234+
BuilderT builder, S3ClientCreationParameters parameters, Configuration conf) {
235+
URI endpoint = getS3Endpoint(parameters.getEndpoint(), conf);
236+
237+
String configuredRegion = parameters.getRegion();
238+
Region region = null;
239+
String origin = "";
240+
241+
// If the region was configured, set it.
242+
if (configuredRegion != null && !configuredRegion.isEmpty()) {
243+
origin = AWS_REGION;
244+
region = Region.of(configuredRegion);
245+
}
246+
247+
if (endpoint != null) {
248+
builder.endpointOverride(endpoint);
249+
// No region was configured, try to determine it from the endpoint.
250+
if (region == null) {
251+
region = getS3RegionFromEndpoint(parameters.getEndpoint());
252+
if (region != null) {
253+
origin = "endpoint";
254+
}
255+
}
256+
LOG.debug("Setting endpoint to {}", endpoint);
257+
}
258+
259+
if (region != null) {
260+
builder.region(region);
261+
} else if (configuredRegion == null) {
262+
// no region is configured, and none could be determined from the endpoint.
263+
// Use US_EAST_2 as default.
264+
region = Region.of(AWS_S3_DEFAULT_REGION);
265+
builder.crossRegionAccessEnabled(true);
266+
builder.region(region);
267+
origin = "cross region access fallback";
268+
} else if (configuredRegion.isEmpty()) {
269+
// region configuration was set to empty string.
270+
// allow this if people really want it; it is OK to rely on this
271+
// when deployed in EC2.
272+
WARN_OF_DEFAULT_REGION_CHAIN.warn(SDK_REGION_CHAIN_IN_USE);
273+
LOG.debug(SDK_REGION_CHAIN_IN_USE);
274+
origin = "SDK region chain";
275+
}
276+
277+
LOG.debug("Setting region to {} from {}", region, origin);
278+
}
279+
204280
/**
205281
* Given a endpoint string, create the endpoint URI.
206282
*
@@ -229,4 +305,23 @@ private static URI getS3Endpoint(String endpoint, final Configuration conf) {
229305
throw new IllegalArgumentException(e);
230306
}
231307
}
308+
309+
/**
310+
* Parses the endpoint to get the region.
311+
* If endpoint is the central one, use US_EAST_1.
312+
*
313+
* @param endpoint the configure endpoint.
314+
* @return the S3 region, null if unable to resolve from endpoint.
315+
*/
316+
private static Region getS3RegionFromEndpoint(String endpoint) {
317+
318+
if(!endpoint.endsWith(CENTRAL_ENDPOINT)) {
319+
LOG.debug("Endpoint {} is not the default; parsing", endpoint);
320+
return AwsHostNameUtils.parseSigningRegion(endpoint, S3_SERVICE_NAME).orElse(null);
321+
}
322+
323+
// endpoint is for US_EAST_1;
324+
return Region.US_EAST_1;
325+
}
326+
232327
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

Lines changed: 1 addition & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import java.util.Collections;
3535
import java.util.Date;
3636
import java.util.EnumSet;
37-
import java.util.HashMap;
3837
import java.util.Iterator;
3938
import java.util.List;
4039
import java.util.Locale;
@@ -54,7 +53,6 @@
5453

5554
import software.amazon.awssdk.core.ResponseInputStream;
5655
import software.amazon.awssdk.core.exception.SdkException;
57-
import software.amazon.awssdk.regions.Region;
5856
import software.amazon.awssdk.services.s3.S3AsyncClient;
5957
import software.amazon.awssdk.services.s3.S3Client;
6058
import software.amazon.awssdk.services.s3.model.CompleteMultipartUploadRequest;
@@ -83,7 +81,6 @@
8381
import software.amazon.awssdk.services.s3.model.PutObjectRequest;
8482
import software.amazon.awssdk.services.s3.model.PutObjectResponse;
8583
import software.amazon.awssdk.services.s3.model.S3Error;
86-
import software.amazon.awssdk.services.s3.model.S3Exception;
8784
import software.amazon.awssdk.services.s3.model.S3Object;
8885
import software.amazon.awssdk.services.s3.model.SelectObjectContentRequest;
8986
import software.amazon.awssdk.services.s3.model.SelectObjectContentResponseHandler;
@@ -98,7 +95,6 @@
9895
import software.amazon.awssdk.transfer.s3.model.FileUpload;
9996
import software.amazon.awssdk.transfer.s3.model.UploadFileRequest;
10097

101-
import org.apache.commons.lang3.StringUtils;
10298
import org.apache.hadoop.fs.impl.prefetch.ExecutorServiceFuturePool;
10399
import org.slf4j.Logger;
104100
import org.slf4j.LoggerFactory;
@@ -246,7 +242,6 @@
246242
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.CSE_PADDING_LENGTH;
247243
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.DEFAULT_UPLOAD_PART_COUNT_LIMIT;
248244
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.DELETE_CONSIDERED_IDEMPOTENT;
249-
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_301_MOVED_PERMANENTLY;
250245
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_403_FORBIDDEN;
251246
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_404_NOT_FOUND;
252247
import static org.apache.hadoop.fs.s3a.impl.InternalConstants.UPLOAD_PART_COUNT_LIMIT;
@@ -332,8 +327,6 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
332327
private int executorCapacity;
333328
private long multiPartThreshold;
334329
public static final Logger LOG = LoggerFactory.getLogger(S3AFileSystem.class);
335-
/** Exactly once log to warn about setting the region in config to avoid probe. */
336-
private static final LogExactlyOnce SET_REGION_WARNING = new LogExactlyOnce(LOG);
337330

338331
/** Log to warn of storage class configuration problems. */
339332
private static final LogExactlyOnce STORAGE_CLASS_WARNING = new LogExactlyOnce(LOG);
@@ -461,8 +454,6 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
461454
*/
462455
private String scheme = FS_S3A;
463456

464-
private final static Map<String, Region> BUCKET_REGIONS = new HashMap<>();
465-
466457
/** Add any deprecated keys. */
467458
@SuppressWarnings("deprecation")
468459
private static void addDeprecatedKeys() {
@@ -870,9 +861,6 @@ protected void verifyBucketExists() throws UnknownStoreException, IOException {
870861
STORE_EXISTS_PROBE, bucket, null, () ->
871862
invoker.retry("doesBucketExist", bucket, true, () -> {
872863
try {
873-
if (BUCKET_REGIONS.containsKey(bucket)) {
874-
return true;
875-
}
876864
s3Client.headBucket(HeadBucketRequest.builder().bucket(bucket).build());
877865
return true;
878866
} catch (AwsServiceException ex) {
@@ -982,8 +970,6 @@ private void bindAWSClient(URI name, boolean dtEnabled) throws IOException {
982970
? conf.getTrimmed(AWS_REGION)
983971
: accessPoint.getRegion();
984972

985-
Region region = getS3Region(configuredRegion);
986-
987973
S3ClientFactory.S3ClientCreationParameters parameters =
988974
new S3ClientFactory.S3ClientCreationParameters()
989975
.withCredentialSet(credentials)
@@ -998,7 +984,7 @@ private void bindAWSClient(URI name, boolean dtEnabled) throws IOException {
998984
.withMultipartCopyEnabled(isMultipartCopyEnabled)
999985
.withMultipartThreshold(multiPartThreshold)
1000986
.withTransferManagerExecutor(unboundedThreadPool)
1001-
.withRegion(region);
987+
.withRegion(configuredRegion);
1002988

1003989
S3ClientFactory clientFactory = ReflectionUtils.newInstance(s3ClientFactoryClass, conf);
1004990
s3Client = clientFactory.createS3Client(getUri(), parameters);
@@ -1019,75 +1005,6 @@ private void createS3AsyncClient(S3ClientFactory clientFactory,
10191005
s3AsyncClient = clientFactory.createS3AsyncClient(getUri(), parameters);
10201006
}
10211007

1022-
/**
1023-
* Get the bucket region.
1024-
*
1025-
* @param region AWS S3 Region set in the config. This property may not be set, in which case
1026-
* ask S3 for the region.
1027-
* @return region of the bucket.
1028-
*/
1029-
private Region getS3Region(String region) throws IOException {
1030-
1031-
if (!StringUtils.isBlank(region)) {
1032-
return Region.of(region);
1033-
}
1034-
1035-
Region cachedRegion = BUCKET_REGIONS.get(bucket);
1036-
1037-
if (cachedRegion != null) {
1038-
LOG.debug("Got region {} for bucket {} from cache", cachedRegion, bucket);
1039-
return cachedRegion;
1040-
}
1041-
1042-
Region s3Region = trackDurationAndSpan(STORE_REGION_PROBE, bucket, null,
1043-
() -> invoker.retry("getS3Region", bucket, true, () -> {
1044-
try {
1045-
1046-
SET_REGION_WARNING.warn(
1047-
"Getting region for bucket {} from S3, this will slow down FS initialisation. "
1048-
+ "To avoid this, set the region using property {}", bucket,
1049-
FS_S3A_BUCKET_PREFIX + bucket + ".endpoint.region");
1050-
1051-
// build a s3 client with region eu-west-1 that can be used to get the region of the
1052-
// bucket. Using eu-west-1, as headBucket() doesn't work with us-east-1. This is because
1053-
// us-east-1 uses the endpoint s3.amazonaws.com, which resolves bucket.s3.amazonaws.com
1054-
// to the actual region the bucket is in. As the request is signed with us-east-1 and
1055-
// not the bucket's region, it fails.
1056-
S3Client getRegionS3Client =
1057-
S3Client.builder().region(Region.EU_WEST_1).credentialsProvider(credentials)
1058-
.build();
1059-
1060-
HeadBucketResponse headBucketResponse =
1061-
getRegionS3Client.headBucket(HeadBucketRequest.builder().bucket(bucket).build());
1062-
1063-
Region bucketRegion = Region.of(
1064-
headBucketResponse.sdkHttpResponse().headers().get(BUCKET_REGION_HEADER).get(0));
1065-
BUCKET_REGIONS.put(bucket, bucketRegion);
1066-
1067-
return bucketRegion;
1068-
} catch (S3Exception exception) {
1069-
if (exception.statusCode() == SC_301_MOVED_PERMANENTLY) {
1070-
Region bucketRegion = Region.of(
1071-
exception.awsErrorDetails().sdkHttpResponse().headers().get(BUCKET_REGION_HEADER)
1072-
.get(0));
1073-
BUCKET_REGIONS.put(bucket, bucketRegion);
1074-
1075-
return bucketRegion;
1076-
}
1077-
1078-
if (exception.statusCode() == SC_404_NOT_FOUND) {
1079-
throw new UnknownStoreException("s3a://" + bucket + "/",
1080-
" Bucket does not exist: " + exception,
1081-
exception);
1082-
}
1083-
1084-
throw exception;
1085-
}
1086-
}));
1087-
1088-
return s3Region;
1089-
}
1090-
10911008
/**
10921009
* Initialize and launch the audit manager and service.
10931010
* As this takes the FS IOStatistics store, it must be invoked

0 commit comments

Comments
 (0)