diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index c842f0e922d5..237284ca44fe 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -109,6 +109,10 @@ static class ObjectStoreLocationProvider implements LocationProvider { private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); // the starting index of the lower 20-bits of a 32-bit binary string private static final int HASH_BINARY_STRING_START_INDEX = 12; + // Entropy generated will be divided into dirs of this length + private static final int ENTROPY_DIR_LENGTH = 4; + // Will create DEPTH many dirs from the entropy + private static final int ENTROPY_DIR_DEPTH = 3; private final String storageLocation; private final String context; private final boolean includePartitionPaths; @@ -160,7 +164,13 @@ public String newDataLocation(String filename) { if (context != null) { return String.format("%s/%s/%s/%s", storageLocation, hash, context, filename); } else { - return String.format("%s/%s/%s", storageLocation, hash, filename); + // if partition paths are included, add last part of entropy as dir before partition names + if (includePartitionPaths) { + return String.format("%s/%s/%s", storageLocation, hash, filename); + } else { + // if partition paths are not included, append last part of entropy with `-` to file name + return String.format("%s/%s-%s", storageLocation, hash, filename); + } } } @@ -187,7 +197,39 @@ private String computeHash(String fileName) { // {@link Integer#toBinaryString} excludes leading zeros, which we want to preserve. // force the first bit to be set to get around that. 
String hashAsBinaryString = Integer.toBinaryString(hashCode.asInt() | Integer.MIN_VALUE); - return hashAsBinaryString.substring(HASH_BINARY_STRING_START_INDEX); + return dirsFromHash(hashAsBinaryString.substring(HASH_BINARY_STRING_START_INDEX)); + } + + /** + * Divides hash into directories for optimized orphan removal operation using ENTROPY_DIR_DEPTH + * and ENTROPY_DIR_LENGTH + * + * @param hash 10011001100110011001 + * @return 1001/1001/1001/10011001 with depth 3 and length 4 + */ + private String dirsFromHash(String hash) { + if (hash.length() < ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH) { + throw new IllegalArgumentException( + String.format( + "Generated hash length needs to be at least %d chars long.", + ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH)); + } + StringBuilder hashWithDirs = new StringBuilder(); + + for (int i = 0; i < ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH; i += ENTROPY_DIR_LENGTH) { + if (i > 0) { + hashWithDirs.append("/"); + } + hashWithDirs.append(hash, i, Math.min(i + ENTROPY_DIR_LENGTH, hash.length())); + } + + if (hash.length() > ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH) { + hashWithDirs + .append("/") + .append(hash, ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH, hash.length()); + } + + return hashWithDirs.toString(); } } } diff --git a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java index e10ee8e231d7..7edba51c3d85 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java +++ b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java @@ -278,12 +278,12 @@ public void testObjectStorageWithinTableLocation() { String fileLocation = table.locationProvider().newDataLocation("test.parquet"); String relativeLocation = fileLocation.replaceFirst(table.location(), ""); List parts = Splitter.on("/").splitToList(relativeLocation); - - assertThat(parts).hasSize(4); + assertThat(parts).hasSize(7); assertThat(parts).first().asString().isEmpty(); 
assertThat(parts).element(1).asString().isEqualTo("data"); - assertThat(parts).element(2).asString().isNotEmpty(); - assertThat(parts).element(3).asString().isEqualTo("test.parquet"); + // entropy dirs in the middle + assertThat(parts).elements(2, 3, 4, 5).asString().isNotEmpty(); + assertThat(parts).element(6).asString().isEqualTo("test.parquet"); } @TestTemplate @@ -318,20 +318,20 @@ public void testExcludePartitionInPath() { String fileLocation = table.locationProvider().newDataLocation(table.spec(), partitionData, "test.parquet"); - // no partition values included in the path - assertThat(fileLocation).endsWith("/data/01101010001111101000/test.parquet"); + // no partition values included in the path and last part of entropy is separated with "-" + assertThat(fileLocation).endsWith("/data/0110/1010/0011/11101000-test.parquet"); } @TestTemplate public void testHashInjection() { table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "true").commit(); assertThat(table.locationProvider().newDataLocation("a")) - .endsWith("/data/01010110100110110010/a"); + .endsWith("/data/0101/0110/1001/10110010/a"); assertThat(table.locationProvider().newDataLocation("b")) - .endsWith("/data/11100111111000000011/b"); + .endsWith("/data/1110/0111/1110/00000011/b"); assertThat(table.locationProvider().newDataLocation("c")) - .endsWith("/data/00101101011001011111/c"); + .endsWith("/data/0010/1101/0110/01011111/c"); assertThat(table.locationProvider().newDataLocation("d")) - .endsWith("/data/10010001010001110011/d"); + .endsWith("/data/1001/0001/0100/01110011/d"); } }