Skip to content

Commit

Permalink
Azure: Accept wasb[s] paths in ADLSLocation
Browse files Browse the repository at this point in the history
  • Loading branch information
mrcnc committed Oct 10, 2024
1 parent 208ab20 commit 91e20bc
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,34 @@
*/
package org.apache.iceberg.azure.adlsv2;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;

/**
* This class represents a fully qualified location in Azure expressed as a URI.
* This class represents a fully qualified location in Azure Data Lake Storage, expressed as a URI.
*
* <p>Locations follow the conventions used by Hadoop's Azure support, i.e.
*
* <pre>{@code abfs[s]://[<container>@]<storage account host>/<file path>}</pre>
* <pre>{@code abfs[s]://[<container>@]<storageAccount>.dfs.core.windows.net/<path>}</pre>
*
* <p>See <a href="https://hadoop.apache.org/docs/stable/hadoop-azure/abfs.html">Hadoop Azure
* Support</a>
* or
*
* <pre>{@code wasb[s]://<container>@<storageAccount>.blob.core.windows.net/<path>}</pre>
*
* <p>For compatibility, locations using the wasb[s] scheme are also accepted, but they are
* processed via the Azure Data Lake Storage Gen2 APIs rather than the Blob Storage APIs.
*
* <p>See <a
* href="https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri#uri-syntax">Azure
* Data Lake Storage URI syntax</a>
*/
class ADLSLocation {
private static final Pattern URI_PATTERN = Pattern.compile("^abfss?://([^/?#]+)(.*)?$");
private static final Pattern URI_PATTERN = Pattern.compile("^(abfss?|wasbs?)://([^/?#]+)(.*)?$");

private final String storageAccount;
private final String container;
Expand All @@ -53,19 +63,17 @@ class ADLSLocation {

ValidationException.check(matcher.matches(), "Invalid ADLS URI: %s", location);

String authority = matcher.group(1);
String[] parts = authority.split("@", -1);
if (parts.length > 1) {
this.container = parts[0];
this.storageAccount = parts[1];
} else {
this.container = null;
this.storageAccount = authority;
try {
URI uri = new URI(location);
this.container = uri.getUserInfo();
// storage account name is the first part of the host
int accountSplit = uri.getHost().indexOf('.');
String storageAccountName = uri.getHost().substring(0, accountSplit);
this.storageAccount = String.format("%s.dfs.core.windows.net", storageAccountName);
this.path = uri.getPath().length() > 1 ? uri.getPath().substring(1) : "";
} catch (URISyntaxException e) {
throw new ValidationException("Invalid URI: %s", location);
}

String uriPath = matcher.group(2);
uriPath = uriPath == null ? "" : uriPath.startsWith("/") ? uriPath.substring(1) : uriPath;
this.path = uriPath.split("\\?", -1)[0].split("#", -1)[0];
}

/** Returns Azure storage account. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,17 @@ public void testLocationParsing(String scheme) {
assertThat(location.path()).isEqualTo("path/to/file");
}

/**
 * Checks that a location given with the wasb or wasbs scheme is split into storage account,
 * container, and path, and that the blob endpoint host is reported with the dfs endpoint suffix.
 */
@ParameterizedTest
@ValueSource(strings = {"wasb", "wasbs"})
public void testWasbLocationParsing(String scheme) {
  // Build the URI for the scheme under test; everything after the scheme is fixed.
  ADLSLocation parsed =
      new ADLSLocation(scheme + "://container@account.blob.core.windows.net/path/to/file");

  // The input host is the blob endpoint, but the expected account host is the dfs endpoint.
  assertThat(parsed.storageAccount()).isEqualTo("account.dfs.core.windows.net");
  assertThat(parsed.container().get()).isEqualTo("container");
  // The expected path has no leading slash.
  assertThat(parsed.path()).isEqualTo("path/to/file");
}

@Test
public void testEncodedString() {
String p1 = "abfs://container@account.dfs.core.windows.net/path%20to%20file";
Expand All @@ -62,30 +73,42 @@ public void testInvalidScheme() {
.hasMessage("Invalid ADLS URI: s3://bucket/path/to/file");
}

@Test
public void testNoContainer() {
String p1 = "abfs://account.dfs.core.windows.net/path/to/file";
ADLSLocation location = new ADLSLocation(p1);
@ParameterizedTest
@ValueSource(
strings = {
"abfs://account.dfs.core.windows.net/path/to/file",
"wasb://account.blob.core.windows.net/path/to/file"
})
public void testNoContainer(String path) {
ADLSLocation location = new ADLSLocation(path);

assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net");
assertThat(location.container().isPresent()).isFalse();
assertThat(location.path()).isEqualTo("path/to/file");
}

@Test
public void testNoPath() {
String p1 = "abfs://container@account.dfs.core.windows.net";
ADLSLocation location = new ADLSLocation(p1);
@ParameterizedTest
@ValueSource(
strings = {
"abfs://container@account.dfs.core.windows.net",
"wasb://container@account.blob.core.windows.net",
})
public void testNoPath(String path) {
ADLSLocation location = new ADLSLocation(path);

assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net");
assertThat(location.container().get()).isEqualTo("container");
assertThat(location.path()).isEqualTo("");
}

@Test
public void testQueryAndFragment() {
String p1 = "abfs://container@account.dfs.core.windows.net/path/to/file?query=foo#123";
ADLSLocation location = new ADLSLocation(p1);
@ParameterizedTest
@ValueSource(
strings = {
"abfs://container@account.dfs.core.windows.net/path/to/file?query=foo#123",
"wasb://container@account.blob.core.windows.net/path/to/file?query=foo#123"
})
public void testQueryAndFragment(String path) {
ADLSLocation location = new ADLSLocation(path);

assertThat(location.storageAccount()).isEqualTo("account.dfs.core.windows.net");
assertThat(location.container().get()).isEqualTo("container");
Expand Down

0 comments on commit 91e20bc

Please sign in to comment.