Skip to content

Fix incorrect S3 uri parsing when key is not specified on path style #700

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions src/IO/S3/URI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
/// Case when bucket name and key represented in the path of S3 URL.
/// E.g. (https://s3.region.amazonaws.com/bucket-name/key)
/// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access
static const RE2 path_style_pattern("^/([^/]*)/(.*)");
static const RE2 path_style_pattern("^/([^/]*)(?:/?(.*))");

if (allow_archive_path_syntax)
std::tie(uri_str, archive_pattern) = getURIAndArchivePattern(uri_);
Expand Down Expand Up @@ -124,7 +124,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
{
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
}
validateBucket(bucket, uri);

if (!uri.getPath().empty())
{
Expand All @@ -142,7 +141,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
{
is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();
validateBucket(bucket, uri);
}
else
{
Expand All @@ -155,6 +153,9 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
if (!uri.getPath().empty())
key = uri.getPath().substr(1);
}

validateBucket(bucket, uri);
validateKey(key, uri);
}

void URI::addRegionToURI(const std::string &region)
Expand All @@ -175,6 +176,37 @@ void URI::validateBucket(const String & bucket, const Poco::URI & uri)
!uri.empty() ? " (" + uri.toString() + ")" : "");
}

void URI::validateKey(const String & key, const Poco::URI & uri)
{
auto onError = [&]()
{
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Invalid S3 key: {}{}",
quoteString(key),
!uri.empty() ? " (" + uri.toString() + ")" : "");
};


// this shouldn't happen ever because the regex should not catch this
if (key.size() == 1 && key[0] == '/')
{
onError();
}

// the current regex impl allows something like "bucket-name/////".
// bucket: bucket-name
// key: ////
// throw exception in case such thing is found
for (size_t i = 1; i < key.size(); i++)
{
if (key[i - 1] == '/' && key[i] == '/')
{
onError();
}
}
}

std::pair<std::string, std::optional<std::string>> URI::getURIAndArchivePattern(const std::string & source)
{
size_t pos = source.find("::");
Expand Down
1 change: 1 addition & 0 deletions src/IO/S3/URI.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ struct URI
void addRegionToURI(const std::string & region);

static void validateBucket(const std::string & bucket, const Poco::URI & uri);
static void validateKey(const std::string & key, const Poco::URI & uri);

private:
std::pair<std::string, std::optional<std::string>> getURIAndArchivePattern(const std::string & source);
Expand Down
41 changes: 41 additions & 0 deletions src/IO/S3/tests/gtest_s3_uri.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include <gtest/gtest.h>

#include <IO/S3/URI.h>
#include "config.h"


#if USE_AWS_S3

TEST(IOTestS3URI, PathStyleNoKey)
{
using namespace DB;

auto uri_with_no_key_and_no_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name");

ASSERT_EQ(uri_with_no_key_and_no_slash.bucket, "bucket-name");
ASSERT_EQ(uri_with_no_key_and_no_slash.key, "");

auto uri_with_no_key_and_with_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/");

ASSERT_EQ(uri_with_no_key_and_with_slash.bucket, "bucket-name");
ASSERT_EQ(uri_with_no_key_and_with_slash.key, "");

ASSERT_ANY_THROW(S3::URI("https://s3.region.amazonaws.com/bucket-name//"));
}

TEST(IOTestS3URI, PathStyleWithKey)
{
using namespace DB;

auto uri_with_no_key_and_no_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/key");

ASSERT_EQ(uri_with_no_key_and_no_slash.bucket, "bucket-name");
ASSERT_EQ(uri_with_no_key_and_no_slash.key, "key");

auto uri_with_no_key_and_with_slash = S3::URI("https://s3.region.amazonaws.com/bucket-name/key/key/key/key");

ASSERT_EQ(uri_with_no_key_and_with_slash.bucket, "bucket-name");
ASSERT_EQ(uri_with_no_key_and_with_slash.key, "key/key/key/key");
}

#endif
Loading