apache · steveloughran · Sep 29, 2021 · Oct 4, 2021 · Feb 4, 2022
diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
@@ -1590,6 +1590,14 @@
     implementations can still be used</description>
 </property>
 
+<property>
+  <name>fs.s3a.accesspoint.required</name>
+  <value>false</value>
+  <description>Require that all S3 access is made through Access Points and not through
+  buckets directly. If enabled, use per-bucket overrides to allow bucket access to a specific set
+  of buckets.</description>
+</property>
+
 <property>
   <name>fs.s3a.block.size</name>
   <value>32M</value>

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/ArnResource.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/ArnResource.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a;
+
+import javax.annotation.Nonnull;
+
+import com.amazonaws.arn.Arn;
+
+/**
+ * Represents an Arn Resource, this can be an accesspoint or bucket.
+ */
+public final class ArnResource {
+  private final static String ACCESSPOINT_ENDPOINT_FORMAT = "s3-accesspoint.%s.amazonaws.com";
+
+  /**
+   * Resource name.
+   */
+  private final String name;
+
+  /**
+   * Resource owner account id.
+   */
+  private final String ownerAccountId;
+
+  /**
+   * Resource region.
+   */
+  private final String region;
+
+  /**
+   * Full Arn for the resource.
+   */
+  private final String fullArn;
+
+  /**
+   * Partition for the resource. Allowed partitions: aws, aws-cn, aws-us-gov
+   */
+  private final String partition;
+
+  /**
+   * Because of the different ways an endpoint can be constructed depending on partition we're
+   * relying on the AWS SDK to produce the endpoint. In this case we need a region key of the form
+   * {@code String.format("accesspoint-%s", awsRegion)}
+   */
+  private final String accessPointRegionKey;
+
+  private ArnResource(String name, String owner, String region, String partition, String fullArn) {
+    this.name = name;
+    this.ownerAccountId = owner;
+    this.region = region;
+    this.partition = partition;
+    this.fullArn = fullArn;
+    this.accessPointRegionKey = String.format("accesspoint-%s", region);
+  }
+
+  /**
+   * Resource name.
+   * @return resource name.
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * Return owner's account id.
+   * @return owner account id
+   */
+  public String getOwnerAccountId() {
+    return ownerAccountId;
+  }
+
+  /**
+   * Resource region.
+   * @return resource region.
+   */
+  public String getRegion() {
+    return region;
+  }
+
+  /**
+   * Full arn for resource.
+   * @return arn for resource.
+   */
+  public String getFullArn() {
+    return fullArn;
+  }
+
+  /**
+   * Formatted endpoint for the resource.
+   * @return resource endpoint.
+   */
+  public String getEndpoint() {
+    return String.format(ACCESSPOINT_ENDPOINT_FORMAT, region);
+  }
+
+  /**
+   * Parses the passed `arn` string into a full ArnResource.
+   * @param arn - string representing an Arn resource.
+   * @return new ArnResource instance.
+   * @throws IllegalArgumentException - if the Arn is malformed or any of the region, accountId and
+   * resource name properties are empty.
+   */
+  @Nonnull
+  public static ArnResource accessPointFromArn(String arn) throws IllegalArgumentException {
+    Arn parsed = Arn.fromString(arn);
+
+    if (parsed.getRegion().isEmpty() || parsed.getAccountId().isEmpty() ||
+        parsed.getResourceAsString().isEmpty()) {
+      throw new IllegalArgumentException(
+          String.format("Access Point Arn %s has an invalid format or missing properties", arn));
+    }
+
+    String resourceName = parsed.getResource().getResource();
+    return new ArnResource(resourceName, parsed.getAccountId(), parsed.getRegion(),
+        parsed.getPartition(), arn);
+  }
+}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java
@@ -1056,4 +1056,8 @@ private Constants() {
    */
   public static final String AWS_S3_CENTRAL_REGION = "us-east-1";
 
+  /**
+   * Require that all S3 access is made through Access Points.
+   */
+  public static final String AWS_S3_ACCESSPOINT_REQUIRED = "fs.s3a.accesspoint.required";
 }
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -204,9 +204,13 @@
 import static org.apache.hadoop.fs.s3a.impl.CallableSupplier.waitForCompletionIgnoringExceptions;
 import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.isObjectNotFound;
 import static org.apache.hadoop.fs.s3a.impl.ErrorTranslation.isUnknownBucket;
+import static org.apache.hadoop.fs.s3a.impl.InternalConstants.AP_INACCESSIBLE;
+import static org.apache.hadoop.fs.s3a.impl.InternalConstants.AP_REQUIRED_EXCEPTION;
+import static org.apache.hadoop.fs.s3a.impl.InternalConstants.ARN_BUCKET_OPTION;
 import static org.apache.hadoop.fs.s3a.impl.InternalConstants.CSE_PADDING_LENGTH;
 import static org.apache.hadoop.fs.s3a.impl.InternalConstants.DEFAULT_UPLOAD_PART_COUNT_LIMIT;
 import static org.apache.hadoop.fs.s3a.impl.InternalConstants.DELETE_CONSIDERED_IDEMPOTENT;
+import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_403;
 import static org.apache.hadoop.fs.s3a.impl.InternalConstants.SC_404;
 import static org.apache.hadoop.fs.s3a.impl.InternalConstants.UPLOAD_PART_COUNT_LIMIT;
 import static org.apache.hadoop.fs.s3a.impl.NetworkBinding.fixBucketRegion;
@@ -258,6 +262,12 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
       Invoker.LOG_EVENT);
 
   private final Retried onRetry = this::operationRetried;
+
+  /**
+   * Represents bucket name for all S3 operations. If per bucket override for
+   * {@link InternalConstants#ARN_BUCKET_OPTION} property  is set, then the bucket is updated to
+   * point to the configured Arn.
+   */
   private String bucket;
   private int maxKeys;
   private Listing listing;
@@ -345,6 +355,11 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities,
    */
   private boolean isCSEEnabled;
 
+  /**
+   * Bucket AccessPoint.
+   */
+  private ArnResource accessPoint;
+
   /** Add any deprecated keys. */
   @SuppressWarnings("deprecation")
   private static void addDeprecatedKeys() {
@@ -386,10 +401,20 @@ public void initialize(URI name, Configuration originalConf)
       LOG.debug("Initializing S3AFileSystem for {}", bucket);
       // clone the configuration into one with propagated bucket options
       Configuration conf = propagateBucketOptions(originalConf, bucket);
-
       // HADOOP-17894. remove references to s3a stores in JCEKS credentials.
       conf = ProviderUtils.excludeIncompatibleCredentialProviders(
           conf, S3AFileSystem.class);
+      String arn = String.format(ARN_BUCKET_OPTION, bucket);
+      String configuredArn = conf.getTrimmed(arn, "");
+      if (!configuredArn.isEmpty()) {
+        accessPoint = ArnResource.accessPointFromArn(configuredArn);
+        LOG.info("Using AccessPoint ARN \"{}\" for bucket {}", configuredArn, bucket);
+        bucket = accessPoint.getFullArn();
+      } else if (conf.getBoolean(AWS_S3_ACCESSPOINT_REQUIRED, false)) {
+        LOG.warn("Access Point usage is required because \"{}\" is enabled," +
+            " but not configured for the bucket: {}", AWS_S3_ACCESSPOINT_REQUIRED, bucket);
+        throw new PathIOException(bucket, AP_REQUIRED_EXCEPTION);
+      }
 
       // fix up the classloader of the configuration to be whatever
       // classloader loaded this filesystem.
@@ -449,6 +474,11 @@ public void initialize(URI name, Configuration originalConf)
             "version 2", listVersion);
       }
       useListV1 = (listVersion == 1);
+      if (accessPoint != null && useListV1) {
+        LOG.warn("V1 list configured in fs.s3a.list.version. This is not supported in by" +
+            " access points. Upgrading to V2");
+        useListV1 = false;
+      }
 
       signerManager = new SignerManager(bucket, this, conf, owner);
       signerManager.initCustomSigners();
@@ -690,11 +720,25 @@ protected void verifyBucketExists()
    */
   @Retries.RetryTranslated
   protected void verifyBucketExistsV2()
-          throws UnknownStoreException, IOException {
+      throws UnknownStoreException, IOException {
     if (!invoker.retry("doesBucketExistV2", bucket, true,
         trackDurationOfOperation(getDurationTrackerFactory(),
             STORE_EXISTS_PROBE.getSymbol(),
-            () -> s3.doesBucketExistV2(bucket)))) {
+            () -> {
+              // Bug in SDK always returns `true` for AccessPoint ARNs with `doesBucketExistV2()`
+              // expanding implementation to use ARNs and buckets correctly
+              try {
+                s3.getBucketAcl(bucket);
+              } catch (AmazonServiceException ex) {
+                int statusCode = ex.getStatusCode();
+                if (statusCode == SC_404 ||
+                    (statusCode == SC_403 && ex.getMessage().contains(AP_INACCESSIBLE))) {
+                  return false;
+                }
+              }
+
+              return true;
+            }))) {
       throw new UnknownStoreException("s3a://" + bucket + "/", " Bucket does "
           + "not exist");
     }
@@ -782,10 +826,14 @@ private void bindAWSClient(URI name, boolean dtEnabled) throws IOException {
         S3_CLIENT_FACTORY_IMPL, DEFAULT_S3_CLIENT_FACTORY_IMPL,
         S3ClientFactory.class);
 
+    String endpoint = accessPoint == null
+        ? conf.getTrimmed(ENDPOINT, DEFAULT_ENDPOINT)
+        : accessPoint.getEndpoint();
+
     S3ClientFactory.S3ClientCreationParameters parameters = null;
     parameters = new S3ClientFactory.S3ClientCreationParameters()
         .withCredentialSet(credentials)
-        .withEndpoint(conf.getTrimmed(ENDPOINT, DEFAULT_ENDPOINT))
+        .withEndpoint(endpoint)
         .withMetrics(statisticsContext.newStatisticsFromAwsSdk())
         .withPathStyleAccess(conf.getBoolean(PATH_STYLE_ACCESS, false))
         .withUserAgentSuffix(uaSuffix)
@@ -1114,7 +1162,10 @@ public String getBucketLocation(String bucketName) throws IOException {
     final String region = trackDurationAndSpan(
         STORE_EXISTS_PROBE, bucketName, null, () ->
             invoker.retry("getBucketLocation()", bucketName, true, () ->
-                s3.getBucketLocation(bucketName)));
+                // If accessPoint then region is known from Arn
+                accessPoint != null
+                    ? accessPoint.getRegion()
+                    : s3.getBucketLocation(bucketName)));
     return fixBucketRegion(region);
   }
 
@@ -4106,6 +4157,10 @@ public String toString() {
           .append("}");
     }
     sb.append(", ClientSideEncryption=").append(isCSEEnabled);
+
+    if (accessPoint != null) {
+      sb.append(", arnForBucket=").append(accessPoint.getFullArn());
+    }
     sb.append('}');
     return sb.toString();
   }

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java
@@ -461,6 +461,7 @@ public DurationTracker trackDuration(final String key, final long count) {
   /**
    * Create an IOStatistics store which updates FS metrics
    * as well as IOStatistics.
+   * @return instance of the store.
    */
   public IOStatisticsStore createMetricsUpdatingStore() {
     return new MetricsUpdatingIOStatisticsStore();

diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/InternalConstants.java
@@ -95,6 +95,9 @@ private InternalConstants() {
               Arrays.asList(Constants.INPUT_FADVISE,
                   Constants.READAHEAD_RANGE)));
 
+  /** 403 error code. */
+  public static final int SC_403 = 403;
+
   /** 404 error code. */
   public static final int SC_404 = 404;
 
@@ -134,4 +137,20 @@ private InternalConstants() {
    */
   public static final int CSE_PADDING_LENGTH = 16;
 
+  /**
+   * Error message to indicate Access Points are required to be used for S3 access.
+   */
+  public static final String AP_REQUIRED_EXCEPTION = "Access Points usage is required" +
+      " but not configured for the bucket.";
+
+  /**
+   * Error message to indicate Access Points are not accessible or don't exist.
+   */
+  public static final String AP_INACCESSIBLE = "Could not access through this access point";
+
+  /**
+   * AccessPoint ARN for the bucket. When set as a bucket override the requests for that bucket
+   * will go through the AccessPoint.
+   */
+  public static final String ARN_BUCKET_OPTION = "fs.s3a.bucket.%s.accesspoint.arn";
 }
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -1563,6 +1563,62 @@ Why explicitly declare a bucket bound to the central endpoint? It ensures
 that if the default endpoint is changed to a new region, data store in
 US-east is still reachable.
 
+## <a name="accesspoints"></a>Configuring S3 AccessPoints usage with S3A
+S3a now supports [S3 Access Point](https://aws.amazon.com/s3/features/access-points/) usage which
+improves VPC integration with S3 and simplifies your data's permission model because different
+policies can be applied now on the Access Point level. For more information about why to use and
+how to create them make sure to read the official documentation.
+
+Accessing data through an access point, is done by using its ARN, as opposed to just the bucket name.
+You can set the Access Point ARN property using the following per bucket configuration property:
+```xml
+<property>
+    <name>fs.s3a.sample-bucket.accesspoint.arn</name>
+    <value> {ACCESSPOINT_ARN_HERE} </value>
+    <description>Configure S3a traffic to use this AccessPoint</description>
+</property>
+```
+
+This configures access to the `sample-bucket` bucket for S3A, to go through the
+new Access Point ARN. So, for example `s3a://sample-bucket/key` will now use your
+configured ARN when getting data from S3 instead of your bucket.
+
+You can also use an Access Point name as a path URI such as `s3a://finance-team-access/key`, by
+configuring the `.accesspoint.arn` property as a per-bucket override:
+```xml
+<property>
+    <name>fs.s3a.finance-team-access.accesspoint.arn</name>
+    <value> {ACCESSPOINT_ARN_HERE} </value>
+    <description>Configure S3a traffic to use this AccessPoint</description>
+</property>
+```
+
+The `fs.s3a.accesspoint.required` property can also require all access to S3 to go through Access
+Points. This has the advantage of increasing security inside a VPN / VPC as you only allow access
+to known sources of data defined through Access Points. In case there is a need to access a bucket
+directly (without Access Points) then you can use per bucket overrides to disable this setting on a
+bucket by bucket basis i.e. `fs.s3a.{YOUR-BUCKET}.accesspoint.required`.
+
+```xml
+<!-- Require access point only access -->
+<property>
+    <name>fs.s3a.accesspoint.required</name>
+    <value>true</value>
+</property>
+<!-- Disable it on a per-bucket basis if needed -->
+<property>
+    <name>fs.s3a.example-bucket.accesspoint.required</name>
+    <value>false</value>
+</property>
+```
+
+Before using Access Points make sure you're not impacted by the following:
+- `ListObjectsV1` is not supported, this is also deprecated on AWS S3 for performance reasons;
+- The endpoint for S3 requests will automatically change from `s3.amazonaws.com` to use
+`s3-accesspoint.REGION.amazonaws.{com | com.cn}` depending on the Access Point ARN. While
+considering endpoints, if you have any custom signers that use the host endpoint property make
+sure to update them if needed;
+
 ## <a name="upload"></a>How S3A writes data to S3
 
 The original S3A client implemented file writes by