Skip to content

Commit 38a8b82

Browse files
Don't Upload Redundant Shard Files (elastic#51729)
Segment(s) info blobs are already stored with their full content in the "hash" field in the shard snapshot metadata as long as they are smaller than 1MB. We can make use of this fact and never upload them physically to the repo. This saves a non-trivial number of uploads and downloads when restoring and might also lower the latency of searchable snapshots since they can avoid physically loading this information as well.
1 parent 864e9d8 commit 38a8b82

File tree

4 files changed

+102
-46
lines changed

4 files changed

+102
-46
lines changed

server/src/main/java/org/elasticsearch/index/store/StoreFileMetaData.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,13 @@
1919

2020
package org.elasticsearch.index.store;
2121

22+
import org.apache.lucene.codecs.CodecUtil;
2223
import org.apache.lucene.util.BytesRef;
2324
import org.apache.lucene.util.Version;
2425
import org.elasticsearch.common.io.stream.StreamInput;
2526
import org.elasticsearch.common.io.stream.StreamOutput;
2627
import org.elasticsearch.common.io.stream.Writeable;
28+
import org.elasticsearch.common.lucene.store.ByteArrayIndexInput;
2729

2830
import java.io.IOException;
2931
import java.text.ParseException;
@@ -100,6 +102,29 @@ public String checksum() {
100102
return this.checksum;
101103
}
102104

105+
/**
106+
* Checks if the bytes returned by {@link #hash()} are the contents of the file that this instance refers to.
107+
*
108+
* @return {@code true} iff {@link #hash()} will return the actual file contents
109+
*/
110+
public boolean hashEqualsContents() {
111+
if (hash.length == length) {
112+
try {
113+
final boolean checksumsMatch = Store.digestToString(CodecUtil.retrieveChecksum(
114+
new ByteArrayIndexInput("store_file", hash.bytes, hash.offset, hash.length))).equals(checksum);
115+
assert checksumsMatch : "Checksums did not match for [" + this + "] which has a hash of [" + hash + "]";
116+
return checksumsMatch;
117+
} catch (Exception e) {
118+
// Hash didn't contain any bytes that Lucene could extract a checksum from so we can't verify against the checksum of the
119+
// original file. We should never see an exception here because lucene files are assumed to always contain the checksum
120+
// footer.
121+
assert false : new AssertionError("Saw exception for hash [" + hash + "] but expected it to be Lucene file", e);
122+
return false;
123+
}
124+
}
125+
return false;
126+
}
127+
103128
/**
104129
* Returns <code>true</code> iff the length and the checksums are the same. otherwise <code>false</code>
105130
*/

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 67 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.lucene.store.IndexInput;
3131
import org.apache.lucene.store.IndexOutput;
3232
import org.apache.lucene.store.RateLimiter;
33+
import org.apache.lucene.util.BytesRef;
3334
import org.apache.lucene.util.SetOnce;
3435
import org.elasticsearch.ExceptionsHelper;
3536
import org.elasticsearch.Version;
@@ -182,7 +183,15 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
182183

183184
private static final String SNAPSHOT_INDEX_CODEC = "snapshots";
184185

185-
private static final String DATA_BLOB_PREFIX = "__";
186+
private static final String UPLOADED_DATA_BLOB_PREFIX = "__";
187+
188+
/**
189+
* Prefix used for the identifiers of data blobs that were not actually written to the repository physically because their contents are
190+
* already stored in the metadata referencing them, i.e. in {@link BlobStoreIndexShardSnapshot} and
191+
* {@link BlobStoreIndexShardSnapshots}. This is the case for files for which {@link StoreFileMetaData#hashEqualsContents()} is
192+
* {@code true}.
193+
*/
194+
private static final String VIRTUAL_DATA_BLOB_PREFIX = "v__";
186195

187196
/**
188197
* When set to true metadata files are stored in compressed format. This setting doesn’t affect index
@@ -1529,6 +1538,9 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
15291538
}
15301539
}
15311540

1541+
// We can skip writing blobs where the metadata hash is equal to the blob's contents because we store the hash/contents
1542+
// directly in the shard level metadata in this case
1543+
final boolean needsWrite = md.hashEqualsContents() == false;
15321544
indexTotalFileCount += md.length();
15331545
indexTotalNumberOfFiles++;
15341546

@@ -1537,9 +1549,14 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
15371549
indexIncrementalSize += md.length();
15381550
// create a new FileInfo
15391551
BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo =
1540-
new BlobStoreIndexShardSnapshot.FileInfo(DATA_BLOB_PREFIX + UUIDs.randomBase64UUID(), md, chunkSize());
1552+
new BlobStoreIndexShardSnapshot.FileInfo(
1553+
(needsWrite ? UPLOADED_DATA_BLOB_PREFIX : VIRTUAL_DATA_BLOB_PREFIX) + UUIDs.randomBase64UUID(),
1554+
md, chunkSize());
15411555
indexCommitPointFiles.add(snapshotFileInfo);
1542-
filesToSnapshot.add(snapshotFileInfo);
1556+
if (needsWrite) {
1557+
filesToSnapshot.add(snapshotFileInfo);
1558+
}
1559+
assert needsWrite || assertFileContentsMatchHash(snapshotFileInfo, store);
15431560
} else {
15441561
indexCommitPointFiles.add(existingFileInfo);
15451562
}
@@ -1548,8 +1565,6 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
15481565
snapshotStatus.moveToStarted(startTime, indexIncrementalFileCount,
15491566
indexTotalNumberOfFiles, indexIncrementalSize, indexTotalFileCount);
15501567

1551-
assert indexIncrementalFileCount == filesToSnapshot.size();
1552-
15531568
final StepListener<Collection<Void>> allFilesUploadedListener = new StepListener<>();
15541569
allFilesUploadedListener.whenComplete(v -> {
15551570
final IndexShardSnapshotStatus.Copy lastSnapshotStatus =
@@ -1638,6 +1653,17 @@ public void snapshotShard(Store store, MapperService mapperService, SnapshotId s
16381653
}
16391654
}
16401655

1656+
private static boolean assertFileContentsMatchHash(BlobStoreIndexShardSnapshot.FileInfo fileInfo, Store store) {
1657+
try (IndexInput indexInput = store.openVerifyingInput(fileInfo.physicalName(), IOContext.READONCE, fileInfo.metadata())) {
1658+
final byte[] tmp = new byte[Math.toIntExact(fileInfo.metadata().length())];
1659+
indexInput.readBytes(tmp, 0, tmp.length);
1660+
assert fileInfo.metadata().hash().bytesEquals(new BytesRef(tmp));
1661+
} catch (IOException e) {
1662+
throw new AssertionError(e);
1663+
}
1664+
return true;
1665+
}
1666+
16411667
@Override
16421668
public void restoreShard(Store store, SnapshotId snapshotId, IndexId indexId, ShardId snapshotShardId,
16431669
RecoveryState recoveryState, ActionListener<Void> listener) {
@@ -1681,38 +1707,42 @@ protected void restoreFiles(List<BlobStoreIndexShardSnapshot.FileInfo> filesToRe
16811707

16821708
private void restoreFile(BlobStoreIndexShardSnapshot.FileInfo fileInfo, Store store) throws IOException {
16831709
boolean success = false;
1684-
1685-
try (InputStream stream = maybeRateLimit(new SlicedInputStream(fileInfo.numberOfParts()) {
1686-
@Override
1687-
protected InputStream openSlice(long slice) throws IOException {
1688-
return container.readBlob(fileInfo.partName(slice));
1689-
}
1690-
},
1691-
restoreRateLimiter, restoreRateLimitingTimeInNanos)) {
1692-
try (IndexOutput indexOutput =
1693-
store.createVerifyingOutput(fileInfo.physicalName(), fileInfo.metadata(), IOContext.DEFAULT)) {
1694-
final byte[] buffer = new byte[BUFFER_SIZE];
1695-
int length;
1696-
while ((length = stream.read(buffer)) > 0) {
1697-
indexOutput.writeBytes(buffer, 0, length);
1698-
recoveryState.getIndex().addRecoveredBytesToFile(fileInfo.physicalName(), length);
1699-
}
1700-
Store.verify(indexOutput);
1701-
indexOutput.close();
1702-
store.directory().sync(Collections.singleton(fileInfo.physicalName()));
1703-
success = true;
1704-
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
1705-
try {
1706-
store.markStoreCorrupted(ex);
1707-
} catch (IOException e) {
1708-
logger.warn("store cannot be marked as corrupted", e);
1709-
}
1710-
throw ex;
1711-
} finally {
1712-
if (success == false) {
1713-
store.deleteQuiet(fileInfo.physicalName());
1710+
try (IndexOutput indexOutput =
1711+
store.createVerifyingOutput(fileInfo.physicalName(), fileInfo.metadata(), IOContext.DEFAULT)) {
1712+
if (fileInfo.name().startsWith(VIRTUAL_DATA_BLOB_PREFIX)) {
1713+
final BytesRef hash = fileInfo.metadata().hash();
1714+
indexOutput.writeBytes(hash.bytes, hash.offset, hash.length);
1715+
recoveryState.getIndex().addRecoveredBytesToFile(fileInfo.physicalName(), hash.length);
1716+
} else {
1717+
try (InputStream stream = maybeRateLimit(new SlicedInputStream(fileInfo.numberOfParts()) {
1718+
@Override
1719+
protected InputStream openSlice(long slice) throws IOException {
1720+
return container.readBlob(fileInfo.partName(slice));
1721+
}
1722+
}, restoreRateLimiter, restoreRateLimitingTimeInNanos)) {
1723+
final byte[] buffer = new byte[BUFFER_SIZE];
1724+
int length;
1725+
while ((length = stream.read(buffer)) > 0) {
1726+
indexOutput.writeBytes(buffer, 0, length);
1727+
recoveryState.getIndex().addRecoveredBytesToFile(fileInfo.physicalName(), length);
1728+
}
17141729
}
17151730
}
1731+
Store.verify(indexOutput);
1732+
indexOutput.close();
1733+
store.directory().sync(Collections.singleton(fileInfo.physicalName()));
1734+
success = true;
1735+
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
1736+
try {
1737+
store.markStoreCorrupted(ex);
1738+
} catch (IOException e) {
1739+
logger.warn("store cannot be marked as corrupted", e);
1740+
}
1741+
throw ex;
1742+
} finally {
1743+
if (success == false) {
1744+
store.deleteQuiet(fileInfo.physicalName());
1745+
}
17161746
}
17171747
}
17181748
}.restore(snapshotFiles, store, l);
@@ -1843,7 +1873,7 @@ private static List<String> unusedBlobs(Set<String> blobs, Set<String> surviving
18431873
|| (blob.startsWith(SNAPSHOT_PREFIX) && blob.endsWith(".dat")
18441874
&& survivingSnapshotUUIDs.contains(
18451875
blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length())) == false)
1846-
|| (blob.startsWith(DATA_BLOB_PREFIX) && updatedSnapshots.findNameFile(canonicalName(blob)) == null)
1876+
|| (blob.startsWith(UPLOADED_DATA_BLOB_PREFIX) && updatedSnapshots.findNameFile(canonicalName(blob)) == null)
18471877
|| FsBlobContainer.isTempBlobName(blob)).collect(Collectors.toList());
18481878
}
18491879

@@ -1897,7 +1927,7 @@ private Tuple<BlobStoreIndexShardSnapshots, Long> buildBlobStoreIndexShardSnapsh
18971927
final BlobStoreIndexShardSnapshots shardSnapshots = indexShardSnapshotsFormat.read(shardContainer, Long.toString(latest));
18981928
return new Tuple<>(shardSnapshots, latest);
18991929
} else if (blobs.stream().anyMatch(b -> b.startsWith(SNAPSHOT_PREFIX) || b.startsWith(INDEX_FILE_PREFIX)
1900-
|| b.startsWith(DATA_BLOB_PREFIX))) {
1930+
|| b.startsWith(UPLOADED_DATA_BLOB_PREFIX))) {
19011931
throw new IllegalStateException(
19021932
"Could not find a readable index-N file in a non-empty shard snapshot directory [" + shardContainer.path() + "]");
19031933
}

server/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreIT.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,11 +1132,11 @@ public void testSnapshotTotalAndIncrementalSizes() throws IOException {
11321132

11331133
SnapshotStats stats = snapshots.get(0).getStats();
11341134

1135-
assertThat(stats.getTotalFileCount(), is(snapshot0FileCount));
1136-
assertThat(stats.getTotalSize(), is(snapshot0FileSize));
1135+
assertThat(stats.getTotalFileCount(), greaterThanOrEqualTo(snapshot0FileCount));
1136+
assertThat(stats.getTotalSize(), greaterThanOrEqualTo(snapshot0FileSize));
11371137

1138-
assertThat(stats.getIncrementalFileCount(), equalTo(snapshot0FileCount));
1139-
assertThat(stats.getIncrementalSize(), equalTo(snapshot0FileSize));
1138+
assertThat(stats.getIncrementalFileCount(), equalTo(stats.getTotalFileCount()));
1139+
assertThat(stats.getIncrementalSize(), equalTo(stats.getTotalSize()));
11401140

11411141
assertThat(stats.getIncrementalFileCount(), equalTo(stats.getProcessedFileCount()));
11421142
assertThat(stats.getIncrementalSize(), equalTo(stats.getProcessedSize()));
@@ -1175,17 +1175,17 @@ public void testSnapshotTotalAndIncrementalSizes() throws IOException {
11751175
ArrayList<Path> snapshotFilesDiff = new ArrayList<>(snapshot1Files);
11761176
snapshotFilesDiff.removeAll(snapshot0Files);
11771177

1178-
assertThat(anotherStats.getIncrementalFileCount(), equalTo(snapshotFilesDiff.size()));
1179-
assertThat(anotherStats.getIncrementalSize(), equalTo(calculateTotalFilesSize(snapshotFilesDiff)));
1178+
assertThat(anotherStats.getIncrementalFileCount(), greaterThanOrEqualTo(snapshotFilesDiff.size()));
1179+
assertThat(anotherStats.getIncrementalSize(), greaterThanOrEqualTo(calculateTotalFilesSize(snapshotFilesDiff)));
11801180

11811181
assertThat(anotherStats.getIncrementalFileCount(), equalTo(anotherStats.getProcessedFileCount()));
11821182
assertThat(anotherStats.getIncrementalSize(), equalTo(anotherStats.getProcessedSize()));
11831183

11841184
assertThat(stats.getTotalSize(), lessThan(anotherStats.getTotalSize()));
11851185
assertThat(stats.getTotalFileCount(), lessThan(anotherStats.getTotalFileCount()));
11861186

1187-
assertThat(anotherStats.getTotalFileCount(), is(snapshot1FileCount));
1188-
assertThat(anotherStats.getTotalSize(), is(snapshot1FileSize));
1187+
assertThat(anotherStats.getTotalFileCount(), greaterThanOrEqualTo(snapshot1FileCount));
1188+
assertThat(anotherStats.getTotalSize(), greaterThanOrEqualTo(snapshot1FileSize));
11891189
}
11901190

11911191
public void testDataNodeRestartWithBusyMasterDuringSnapshot() throws Exception {

server/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreIT.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1036,7 +1036,8 @@ public void testUnrestorableFilesDuringRestore() throws Exception {
10361036
final String indexName = "unrestorable-files";
10371037
final int maxRetries = randomIntBetween(1, 10);
10381038

1039-
Settings createIndexSettings = Settings.builder().put(SETTING_ALLOCATION_MAX_RETRY.getKey(), maxRetries).build();
1039+
Settings createIndexSettings = Settings.builder().put(SETTING_ALLOCATION_MAX_RETRY.getKey(), maxRetries)
1040+
.put(IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1).build();
10401041

10411042
Settings repositorySettings = Settings.builder()
10421043
.put("random", randomAlphaOfLength(10))

0 commit comments

Comments
 (0)