From 91e20bc730c98eec4af4f2647fe8dcc38519e939 Mon Sep 17 00:00:00 2001 From: Marc Cenac Date: Wed, 9 Oct 2024 23:04:47 -0500 Subject: [PATCH] Azure: Accept wasb[s] paths in ADLSLocation --- .../iceberg/azure/adlsv2/ADLSLocation.java | 42 ++++++++++------- .../azure/adlsv2/ADLSLocationTest.java | 47 ++++++++++++++----- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java index e73093512b82..4a49bcea50c4 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSLocation.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.azure.adlsv2; +import java.net.URI; +import java.net.URISyntaxException; import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -25,17 +27,25 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** - * This class represents a fully qualified location in Azure expressed as a URI. + * This class represents a fully qualified location in Azure Data Lake Storage, expressed as a URI. * *

Locations follow the conventions used by Hadoop's Azure support, i.e. * - *

{@code abfs[s]://[<container>@]<storage account host>/<file path>}
+ *
{@code abfs[s]://[<container>@]<storageAccount>.dfs.core.windows.net/<path>}
* - *

See Hadoop Azure - * Support + * or + * + *

{@code wasb[s]://<container>@<storageAccount>.blob.core.windows.net/<path>}
+ * + * For compatibility, paths using the wasb scheme are also accepted but will be processed via the + * Azure Data Lake Storage Gen2 APIs and not the Blob Storage APIs. + * + *

See Hadoop + * Azure Support */ class ADLSLocation { - private static final Pattern URI_PATTERN = Pattern.compile("^abfss?://([^/?#]+)(.*)?$"); + private static final Pattern URI_PATTERN = Pattern.compile("^(abfss?|wasbs?)://([^/?#]+)(.*)?$"); private final String storageAccount; private final String container; @@ -53,19 +63,17 @@ class ADLSLocation { ValidationException.check(matcher.matches(), "Invalid ADLS URI: %s", location); - String authority = matcher.group(1); - String[] parts = authority.split("@", -1); - if (parts.length > 1) { - this.container = parts[0]; - this.storageAccount = parts[1]; - } else { - this.container = null; - this.storageAccount = authority; + try { + URI uri = new URI(location); + this.container = uri.getUserInfo(); + // storage account name is the first part of the host + int accountSplit = uri.getHost().indexOf('.'); + String storageAccountName = uri.getHost().substring(0, accountSplit); + this.storageAccount = String.format("%s.dfs.core.windows.net", storageAccountName); + this.path = uri.getPath().length() > 1 ? uri.getPath().substring(1) : ""; + } catch (URISyntaxException e) { + throw new ValidationException("Invalid URI: %s", location); } - - String uriPath = matcher.group(2); - uriPath = uriPath == null ? "" : uriPath.startsWith("/") ? uriPath.substring(1) : uriPath; - this.path = uriPath.split("\\?", -1)[0].split("#", -1)[0]; } /** Returns Azure storage account. 
*/ diff --git a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java index 867b54b4c7e3..34dd8e792e6d 100644 --- a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java +++ b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSLocationTest.java @@ -38,6 +38,17 @@ public void testLocationParsing(String scheme) { assertThat(location.path()).isEqualTo("path/to/file"); } + @ParameterizedTest + @ValueSource(strings = {"wasb", "wasbs"}) + public void testWasbLocationParsing(String scheme) { + String p1 = scheme + "://container@account.blob.core.windows.net/path/to/file"; + ADLSLocation location = new ADLSLocation(p1); + + assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); + assertThat(location.container().get()).isEqualTo("container"); + assertThat(location.path()).isEqualTo("path/to/file"); + } + @Test public void testEncodedString() { String p1 = "abfs://container@account.dfs.core.windows.net/path%20to%20file"; @@ -62,30 +73,42 @@ public void testInvalidScheme() { .hasMessage("Invalid ADLS URI: s3://bucket/path/to/file"); } - @Test - public void testNoContainer() { - String p1 = "abfs://account.dfs.core.windows.net/path/to/file"; - ADLSLocation location = new ADLSLocation(p1); + @ParameterizedTest + @ValueSource( + strings = { + "abfs://account.dfs.core.windows.net/path/to/file", + "wasb://account.blob.core.windows.net/path/to/file" + }) + public void testNoContainer(String path) { + ADLSLocation location = new ADLSLocation(path); assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); assertThat(location.container().isPresent()).isFalse(); assertThat(location.path()).isEqualTo("path/to/file"); } - @Test - public void testNoPath() { - String p1 = "abfs://container@account.dfs.core.windows.net"; - ADLSLocation location = new ADLSLocation(p1); + @ParameterizedTest + @ValueSource( + strings = { + 
"abfs://container@account.dfs.core.windows.net", + "wasb://container@account.blob.core.windows.net", + }) + public void testNoPath(String path) { + ADLSLocation location = new ADLSLocation(path); assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); assertThat(location.container().get()).isEqualTo("container"); assertThat(location.path()).isEqualTo(""); } - @Test - public void testQueryAndFragment() { - String p1 = "abfs://container@account.dfs.core.windows.net/path/to/file?query=foo#123"; - ADLSLocation location = new ADLSLocation(p1); + @ParameterizedTest + @ValueSource( + strings = { + "abfs://container@account.dfs.core.windows.net/path/to/file?query=foo#123", + "wasb://container@account.blob.core.windows.net/path/to/file?query=foo#123" + }) + public void testQueryAndFragment(String path) { + ADLSLocation location = new ADLSLocation(path); assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net"); assertThat(location.container().get()).isEqualTo("container");