Skip to content

Commit

Permalink
Divide LocationProvider entropy into directories for optimized orphan…
Browse files Browse the repository at this point in the history
… removal
  • Loading branch information
ookumuso committed Oct 11, 2024
1 parent 39b1a29 commit 977ee08
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 12 deletions.
46 changes: 44 additions & 2 deletions core/src/main/java/org/apache/iceberg/LocationProviders.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ static class ObjectStoreLocationProvider implements LocationProvider {
private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed();
// the starting index of the lower 20-bits of a 32-bit binary string
private static final int HASH_BINARY_STRING_START_INDEX = 12;
// The generated entropy is divided into directories of this length (in characters)
private static final int ENTROPY_DIR_LENGTH = 4;
// Number of fixed-length directories created from the entropy; any remainder forms one last dir
private static final int ENTROPY_DIR_DEPTH = 3;
private final String storageLocation;
private final String context;
private final boolean includePartitionPaths;
Expand Down Expand Up @@ -160,7 +164,13 @@ public String newDataLocation(String filename) {
if (context != null) {
return String.format("%s/%s/%s/%s", storageLocation, hash, context, filename);
} else {
return String.format("%s/%s/%s", storageLocation, hash, filename);
// if partition paths are included, add last part of entropy as dir before partition names
if (includePartitionPaths) {
return String.format("%s/%s/%s", storageLocation, hash, filename);
} else {
// if partition paths are not included, append last part of entropy with `-` to file name
return String.format("%s/%s-%s", storageLocation, hash, filename);
}
}
}

Expand All @@ -187,7 +197,39 @@ private String computeHash(String fileName) {
// {@link Integer#toBinaryString} excludes leading zeros, which we want to preserve.
// force the first bit to be set to get around that.
String hashAsBinaryString = Integer.toBinaryString(hashCode.asInt() | Integer.MIN_VALUE);
return hashAsBinaryString.substring(HASH_BINARY_STRING_START_INDEX);
return dirsFromHash(hashAsBinaryString.substring(HASH_BINARY_STRING_START_INDEX));
}

/**
 * Breaks the entropy string into nested directory segments for optimized orphan removal.
 *
 * <p>The leading {@code ENTROPY_DIR_DEPTH} segments each hold {@code ENTROPY_DIR_LENGTH}
 * characters; any characters left over after those are emitted as one final segment.
 *
 * @param hash entropy string, e.g. 10011001100110011001
 * @return segments joined with "/", e.g. 1001/1001/1001/10011001 with depth 3 and length 4
 * @throws IllegalArgumentException if {@code hash} has fewer than {@code ENTROPY_DIR_DEPTH *
 *     ENTROPY_DIR_LENGTH} characters
 */
private String dirsFromHash(String hash) {
  // number of characters consumed by the fixed-length directory segments
  int prefixLength = ENTROPY_DIR_DEPTH * ENTROPY_DIR_LENGTH;
  int hashLength = hash.length();

  if (hashLength < prefixLength) {
    String message =
        String.format("Generated hash length needs to be at least %d chars long.", prefixLength);
    throw new IllegalArgumentException(message);
  }

  StringBuilder path = new StringBuilder();
  for (int dir = 0; dir < ENTROPY_DIR_DEPTH; dir++) {
    int start = dir * ENTROPY_DIR_LENGTH;
    if (dir > 0) {
      path.append("/");
    }
    // the length check above guarantees start + ENTROPY_DIR_LENGTH never exceeds the hash
    path.append(hash, start, start + ENTROPY_DIR_LENGTH);
  }

  // whatever remains after the fixed-length segments becomes the last directory
  if (hashLength > prefixLength) {
    path.append("/");
    path.append(hash, prefixLength, hashLength);
  }

  return path.toString();
}
}
}
20 changes: 10 additions & 10 deletions core/src/test/java/org/apache/iceberg/TestLocationProvider.java
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,12 @@ public void testObjectStorageWithinTableLocation() {
String fileLocation = table.locationProvider().newDataLocation("test.parquet");
String relativeLocation = fileLocation.replaceFirst(table.location(), "");
List<String> parts = Splitter.on("/").splitToList(relativeLocation);

assertThat(parts).hasSize(4);
assertThat(parts).hasSize(7);
assertThat(parts).first().asString().isEmpty();
assertThat(parts).element(1).asString().isEqualTo("data");
assertThat(parts).element(2).asString().isNotEmpty();
assertThat(parts).element(3).asString().isEqualTo("test.parquet");
// entropy dirs in the middle
assertThat(parts).elements(2, 3, 4, 5).asString().isNotEmpty();
assertThat(parts).element(6).asString().isEqualTo("test.parquet");
}

@TestTemplate
Expand Down Expand Up @@ -318,20 +318,20 @@ public void testExcludePartitionInPath() {
String fileLocation =
table.locationProvider().newDataLocation(table.spec(), partitionData, "test.parquet");

// no partition values included in the path
assertThat(fileLocation).endsWith("/data/01101010001111101000/test.parquet");
// no partition values included in the path and last part of entropy is separated with "-"
assertThat(fileLocation).endsWith("/data/0110/1010/0011/11101000-test.parquet");
}

@TestTemplate
public void testHashInjection() {
table.updateProperties().set(TableProperties.OBJECT_STORE_ENABLED, "true").commit();
assertThat(table.locationProvider().newDataLocation("a"))
.endsWith("/data/01010110100110110010/a");
.endsWith("/data/0101/0110/1001/10110010/a");
assertThat(table.locationProvider().newDataLocation("b"))
.endsWith("/data/11100111111000000011/b");
.endsWith("/data/1110/0111/1110/00000011/b");
assertThat(table.locationProvider().newDataLocation("c"))
.endsWith("/data/00101101011001011111/c");
.endsWith("/data/0010/1101/0110/01011111/c");
assertThat(table.locationProvider().newDataLocation("d"))
.endsWith("/data/10010001010001110011/d");
.endsWith("/data/1001/0001/0100/01110011/d");
}
}

0 comments on commit 977ee08

Please sign in to comment.