Skip to content

Commit 4b951ef

Browse files
[CAS] Add OnDiskCAS
Add an OnDiskCAS abstraction that implements the ObjectStore and ActionCache interfaces using OnDiskGraphDB and OnDiskKeyValueDB. Reviewers: Pull Request: #114103
1 parent f2b20d3 commit 4b951ef

20 files changed

+1967
-29
lines changed

llvm/include/llvm/CAS/ActionCache.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ class ActionCache {
7575
CanBeDistributed);
7676
}
7777

78+
/// Validate the ActionCache contents.
79+
virtual Error validate() const = 0;
80+
7881
virtual ~ActionCache() = default;
7982

8083
protected:
@@ -97,6 +100,13 @@ class ActionCache {
97100
/// Create an action cache in memory.
98101
std::unique_ptr<ActionCache> createInMemoryActionCache();
99102

103+
/// Get a reasonable default on-disk path for a persistent ActionCache for the
104+
/// current user.
105+
std::string getDefaultOnDiskActionCachePath();
106+
107+
/// Create an action cache on disk.
108+
Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path);
109+
100110
} // end namespace llvm::cas
101111

102112
#endif // LLVM_CAS_ACTIONCACHE_H
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
10+
#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H
11+
12+
#include "llvm/Support/Error.h"
13+
14+
namespace llvm::cas {
15+
16+
class ActionCache;
17+
class ObjectStore;
18+
19+
/// Create on-disk \c ObjectStore and \c ActionCache instances based on
20+
/// \c ondisk::UnifiedOnDiskCache, with built-in hashing.
21+
Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>>
22+
createOnDiskUnifiedCASDatabases(StringRef Path);
23+
24+
/// Represents the result of validating the contents using
25+
/// \c validateOnDiskUnifiedCASDatabasesIfNeeded.
26+
///
27+
/// Note: invalid results are handled as an \c Error.
28+
enum class ValidationResult {
29+
/// The data is already valid.
30+
Valid,
31+
/// The data was invalid, but was recovered.
32+
Recovered,
33+
/// Validation was skipped, as it was not needed.
34+
Skipped,
35+
};
36+
37+
/// Validate the data in \p Path, if needed to ensure correctness.
38+
///
39+
/// \param Path directory for the on-disk database.
40+
/// \param CheckHash Whether to validate hashes match the data.
41+
/// \param AllowRecovery Whether to automatically recover from invalid data by
42+
/// marking the files for garbage collection.
43+
/// \param ForceValidation Whether to force validation to occur even if it
44+
/// should not be necessary.
45+
/// \param LLVMCasBinary If provided, validation is performed out-of-process
46+
/// using the given \c llvm-cas executable which protects against crashes
47+
/// during validation. Otherwise validation is performed in-process.
48+
///
49+
/// \returns \c Valid if the data is already valid, \c Recovered if data
50+
/// was invalid but has been cleared, \c Skipped if validation is not needed,
51+
/// or an \c Error if validation cannot be performed or if the data is left
52+
/// in an invalid state because \p AllowRecovery is false.
53+
Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded(
54+
StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation,
55+
std::optional<StringRef> LLVMCasBinary);
56+
57+
} // namespace llvm::cas
58+
59+
#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H

llvm/include/llvm/CAS/ObjectStore.h

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,10 @@ class ObjectStore {
111111
virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0;
112112

113113
/// Validate the underlying object referred to by CASID.
114-
virtual Error validate(const CASID &ID) = 0;
114+
virtual Error validateObject(const CASID &ID) = 0;
115+
116+
/// Validate the entire ObjectStore.
117+
virtual Error validate(bool CheckHash) const = 0;
115118

116119
protected:
117120
/// Load the object referenced by \p Ref.
@@ -215,9 +218,39 @@ class ObjectStore {
215218
return Data.size();
216219
}
217220

221+
/// Set the size for limiting growth of on-disk storage. This has an effect
222+
/// for when the instance is closed.
223+
///
224+
/// Implementations may not have this implemented.
225+
virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) {
226+
return Error::success();
227+
}
228+
229+
/// \returns the storage size of the on-disk CAS data.
230+
///
231+
/// Implementations that don't have an implementation for this should return
232+
/// \p std::nullopt.
233+
virtual Expected<std::optional<uint64_t>> getStorageSize() const {
234+
return std::nullopt;
235+
}
236+
237+
/// Prune local storage to reduce its size according to the desired size
238+
/// limit. Pruning can happen concurrently with other operations.
239+
///
240+
/// Implementations may not have this implemented.
241+
virtual Error pruneStorageData() { return Error::success(); }
242+
218243
/// Validate the whole node tree.
219244
Error validateTree(ObjectRef Ref);
220245

246+
/// Import object from another CAS. This will import the full tree from the
247+
/// other CAS.
248+
Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other);
249+
250+
/// Print the ObjectStore internals for debugging purposes.
251+
virtual void print(raw_ostream &) const {}
252+
void dump() const;
253+
221254
/// Get CASContext
222255
const CASContext &getContext() const { return Context; }
223256

@@ -292,6 +325,20 @@ class ObjectProxy {
292325

293326
std::unique_ptr<ObjectStore> createInMemoryCAS();
294327

328+
/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled.
329+
bool isOnDiskCASEnabled();
330+
331+
/// Gets or creates a persistent on-disk CAS at \p Path.
332+
Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
333+
334+
/// Set \p Path to a reasonable default on-disk path for a persistent CAS for
335+
/// the current user.
336+
Error getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path);
337+
338+
/// Get a reasonable default on-disk path for a persistent CAS for the current
339+
/// user.
340+
llvm::Expected<std::string> getDefaultOnDiskCASPath();
341+
295342
} // namespace cas
296343
} // namespace llvm
297344

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H
10+
#define LLVM_CAS_UNIFIEDONDISKCACHE_H
11+
12+
#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
13+
#include "llvm/CAS/OnDiskGraphDB.h"
14+
#include <atomic>
15+
16+
namespace llvm::cas::ondisk {
17+
18+
class OnDiskKeyValueDB;
19+
20+
/// A unified CAS nodes and key-value database, using on-disk storage for both.
21+
/// It manages storage growth and provides APIs for garbage collection.
22+
///
23+
/// High-level properties:
24+
/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
25+
/// storage size in that directory will keep growing unrestricted. For data to
26+
/// become eligible for garbage-collection there should be no open instances
27+
/// of \p UnifiedOnDiskCache for that directory, by any process.
28+
/// * Garbage-collection needs to be triggered explicitly by the client. It can
29+
/// be triggered on a directory concurrently, at any time and by any process,
30+
/// without affecting any active readers/writers, in the same process or other
31+
/// processes.
32+
///
33+
/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
34+
/// for a limited period of time, e.g. for the duration of a build operation.
35+
/// For long-living processes that need periodic access to a
36+
/// \p UnifiedOnDiskCache, the client should devise a scheme where access is
37+
/// performed within some defined period. For example, if a service is designed
38+
/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
39+
/// could keep the instance alive while new requests are coming in but close it
40+
/// after a time period in which there are no new requests.
41+
class UnifiedOnDiskCache {
42+
public:
43+
/// The \p OnDiskGraphDB instance for the open directory.
44+
OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
45+
46+
/// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key.
47+
///
48+
/// \param Key the hash bytes for the key.
49+
/// \param Value the \p ObjectID value.
50+
///
51+
/// \returns the \p ObjectID associated with the \p Key. It may be different
52+
/// than \p Value if another value was already associated with this key.
53+
Expected<ObjectID> KVPut(ArrayRef<uint8_t> Key, ObjectID Value);
54+
55+
/// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key.
56+
/// An \p ObjectID as a key is equivalent to its digest bytes.
57+
///
58+
/// \param Key the \p ObjectID for the key.
59+
/// \param Value the \p ObjectID value.
60+
///
61+
/// \returns the \p ObjectID associated with the \p Key. It may be different
62+
/// than \p Value if another value was already associated with this key.
63+
Expected<ObjectID> KVPut(ObjectID Key, ObjectID Value);
64+
65+
/// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated
66+
/// with the \p Key, or \p std::nullopt if the key does not exist.
67+
Expected<std::optional<ObjectID>> KVGet(ArrayRef<uint8_t> Key);
68+
69+
/// Open a \p UnifiedOnDiskCache instance for a directory.
70+
///
71+
/// \param Path directory for the on-disk database. The directory will be
72+
/// created if it doesn't exist.
73+
/// \param SizeLimit Optional size for limiting growth. This has an effect for
74+
/// when the instance is closed.
75+
/// \param HashName Identifier name for the hashing algorithm that is going to
76+
/// be used.
77+
/// \param HashByteSize Size for the object digest hash bytes.
78+
/// \param FaultInPolicy Controls how nodes are copied to primary store. This
79+
/// is recorded at creation time and subsequent opens need to pass the same
80+
/// policy otherwise the \p open will fail.
81+
static Expected<std::unique_ptr<UnifiedOnDiskCache>>
82+
open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
83+
unsigned HashByteSize,
84+
OnDiskGraphDB::FaultInPolicy FaultInPolicy =
85+
OnDiskGraphDB::FaultInPolicy::FullTree);
86+
87+
/// Validate the data in \p Path, if needed to ensure correctness.
88+
///
89+
/// Note: if invalid data is detected and \p AllowRecovery is true, then
90+
/// recovery requires exclusive access to the CAS and it is an error to
91+
/// attempt recovery if there is concurrent use of the CAS.
92+
///
93+
/// \param Path directory for the on-disk database.
94+
/// \param HashName Identifier name for the hashing algorithm that is going to
95+
/// be used.
96+
/// \param HashByteSize Size for the object digest hash bytes.
97+
/// \param CheckHash Whether to validate hashes match the data.
98+
/// \param AllowRecovery Whether to automatically recover from invalid data by
99+
/// marking the files for garbage collection.
100+
/// \param ForceValidation Whether to force validation to occur even if it
101+
/// should not be necessary.
102+
/// \param LLVMCasBinary If provided, validation is performed out-of-process
103+
/// using the given \c llvm-cas executable which protects against crashes
104+
/// during validation. Otherwise validation is performed in-process.
105+
///
106+
/// \returns \c Valid if the data is already valid, \c Recovered if data
107+
/// was invalid but has been cleared, \c Skipped if validation is not needed,
108+
/// or an \c Error if validation cannot be performed or if the data is left
109+
/// in an invalid state because \p AllowRecovery is false.
110+
static Expected<ValidationResult>
111+
validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize,
112+
bool CheckHash, bool AllowRecovery, bool ForceValidation,
113+
std::optional<StringRef> LLVMCasBinary);
114+
115+
/// This is called implicitly at destruction time, so it is not required for a
116+
/// client to call this. After calling \p close the only method that is valid
117+
/// to call is \p needsGarbageCollection.
118+
///
119+
/// \param CheckSizeLimit if true it will check whether the primary store has
120+
/// exceeded its intended size limit. If false the check is skipped even if a
121+
/// \p SizeLimit was passed to the \p open call.
122+
Error close(bool CheckSizeLimit = true);
123+
124+
/// Set the size for limiting growth. This has an effect for when the instance
125+
/// is closed.
126+
void setSizeLimit(std::optional<uint64_t> SizeLimit);
127+
128+
/// \returns the storage size of the cache data.
129+
uint64_t getStorageSize() const;
130+
131+
/// \returns whether the primary store has exceeded the intended size limit.
132+
/// This can return false even if the overall size of the opened directory is
133+
/// over the \p SizeLimit passed to \p open. To know whether garbage
134+
/// collection needs to be triggered or not, call \p needsGarbageCollection.
135+
bool hasExceededSizeLimit() const;
136+
137+
/// \returns whether there are unused data that can be deleted using a
138+
/// \p collectGarbage call.
139+
bool needsGarbageCollection() const { return NeedsGarbageCollection; }
140+
141+
/// Remove any unused data from the directory at \p Path. If there are no such
142+
/// data the operation is a no-op.
143+
///
144+
/// This can be called concurrently, regardless of whether there is an open
145+
/// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers
146+
/// in the same process or other processes.
147+
///
148+
/// It is recommended that garbage-collection is triggered concurrently in the
149+
/// background, so that it has minimal effect on the workload of the process.
150+
static Error collectGarbage(StringRef Path);
151+
152+
/// Remove unused data from the current UnifiedOnDiskCache.
153+
Error collectGarbage();
154+
155+
/// Validate the key value databases.
156+
Error validateActionCache();
157+
158+
/// Get the upstream OnDiskGraphDB if it exists.
159+
///
160+
/// \returns upstream database or nullptr if upstream database doesn't exist.
161+
OnDiskGraphDB *getUpstreamGraphDB() const { return UpstreamGraphDB; }
162+
163+
~UnifiedOnDiskCache();
164+
165+
private:
166+
UnifiedOnDiskCache();
167+
168+
Expected<std::optional<ObjectID>>
169+
faultInFromUpstreamKV(ArrayRef<uint8_t> Key);
170+
171+
/// \returns the storage size of the primary directory.
172+
uint64_t getPrimaryStorageSize() const;
173+
174+
std::string RootPath;
175+
std::atomic<uint64_t> SizeLimit;
176+
177+
int LockFD = -1;
178+
179+
std::atomic<bool> NeedsGarbageCollection;
180+
std::string PrimaryDBDir;
181+
182+
OnDiskGraphDB *UpstreamGraphDB = nullptr;
183+
std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
184+
185+
std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
186+
std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
187+
};
188+
189+
} // namespace llvm::cas::ondisk
190+
191+
#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H

0 commit comments

Comments
 (0)