|
| 1 | +//===----------------------------------------------------------------------===// |
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | + |
| 9 | +#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H |
| 10 | +#define LLVM_CAS_UNIFIEDONDISKCACHE_H |
| 11 | + |
| 12 | +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" |
| 13 | +#include "llvm/CAS/OnDiskGraphDB.h" |
| 14 | +#include <atomic> |
| 15 | + |
| 16 | +namespace llvm::cas::ondisk { |
| 17 | + |
| 18 | +class OnDiskKeyValueDB; |
| 19 | + |
| 20 | +/// A unified CAS nodes and key-value database, using on-disk storage for both. |
| 21 | +/// It manages storage growth and provides APIs for garbage collection. |
| 22 | +/// |
| 23 | +/// High-level properties: |
| 24 | +/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the |
| 25 | +/// storage size in that directory will keep growing unrestricted. For data to |
| 26 | +/// become eligible for garbage-collection there should be no open instances |
| 27 | +/// of \p UnifiedOnDiskCache for that directory, by any process. |
| 28 | +/// * Garbage-collection needs to be triggered explicitly by the client. It can |
| 29 | +/// be triggered on a directory concurrently, at any time and by any process, |
| 30 | +/// without affecting any active readers/writers, in the same process or other |
| 31 | +/// processes. |
| 32 | +/// |
| 33 | +/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open |
| 34 | +/// for a limited period of time, e.g. for the duration of a build operation. |
| 35 | +/// For long-living processes that need periodic access to a |
| 36 | +/// \p UnifiedOnDiskCache, the client should devise a scheme where access is |
| 37 | +/// performed within some defined period. For example, if a service is designed |
| 38 | +/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it |
| 39 | +/// could keep the instance alive while new requests are coming in but close it |
| 40 | +/// after a time period in which there are no new requests. |
| 41 | +class UnifiedOnDiskCache { |
| 42 | +public: |
| 43 | + /// The \p OnDiskGraphDB instance for the open directory. |
| 44 | + OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; } |
| 45 | + |
| 46 | + /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. |
| 47 | + /// |
| 48 | + /// \param Key the hash bytes for the key. |
| 49 | + /// \param Value the \p ObjectID value. |
| 50 | + /// |
| 51 | + /// \returns the \p ObjectID associated with the \p Key. It may differ |
| 52 | + /// from \p Value if another value was already associated with this key. |
| 53 | + Expected<ObjectID> KVPut(ArrayRef<uint8_t> Key, ObjectID Value); |
| 54 | + |
| 55 | + /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. |
| 56 | + /// An \p ObjectID as a key is equivalent to its digest bytes. |
| 57 | + /// |
| 58 | + /// \param Key the \p ObjectID for the key. |
| 59 | + /// \param Value the \p ObjectID value. |
| 60 | + /// |
| 61 | + /// \returns the \p ObjectID associated with the \p Key. It may differ |
| 62 | + /// from \p Value if another value was already associated with this key. |
| 63 | + Expected<ObjectID> KVPut(ObjectID Key, ObjectID Value); |
| 64 | + |
| 65 | + /// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated |
| 66 | + /// with the \p Key, or \p std::nullopt if the key does not exist. |
| 67 | + Expected<std::optional<ObjectID>> KVGet(ArrayRef<uint8_t> Key); |
| 68 | + |
| 69 | + /// Open a \p UnifiedOnDiskCache instance for a directory. |
| 70 | + /// |
| 71 | + /// \param Path directory for the on-disk database. The directory will be |
| 72 | + /// created if it doesn't exist. |
| 73 | + /// \param SizeLimit Optional size for limiting growth. This takes effect |
| 74 | + /// when the instance is closed. |
| 75 | + /// \param HashName Identifier name for the hashing algorithm that is going to |
| 76 | + /// be used. |
| 77 | + /// \param HashByteSize Size for the object digest hash bytes. |
| 78 | + /// \param FaultInPolicy Controls how nodes are copied to primary store. This |
| 79 | + /// is recorded at creation time and subsequent opens need to pass the same |
| 80 | + /// policy, otherwise \p open will fail. |
| 81 | + static Expected<std::unique_ptr<UnifiedOnDiskCache>> |
| 82 | + open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName, |
| 83 | + unsigned HashByteSize, |
| 84 | + OnDiskGraphDB::FaultInPolicy FaultInPolicy = |
| 85 | + OnDiskGraphDB::FaultInPolicy::FullTree); |
| 86 | + |
| 87 | + /// Validate the data in \p Path, if needed to ensure correctness. |
| 88 | + /// |
| 89 | + /// Note: if invalid data is detected and \p AllowRecovery is true, then |
| 90 | + /// recovery requires exclusive access to the CAS and it is an error to |
| 91 | + /// attempt recovery if there is concurrent use of the CAS. |
| 92 | + /// |
| 93 | + /// \param Path directory for the on-disk database. |
| 94 | + /// \param HashName Identifier name for the hashing algorithm that is going to |
| 95 | + /// be used. |
| 96 | + /// \param HashByteSize Size for the object digest hash bytes. |
| 97 | + /// \param CheckHash Whether to validate hashes match the data. |
| 98 | + /// \param AllowRecovery Whether to automatically recover from invalid data by |
| 99 | + /// marking the files for garbage collection. |
| 100 | + /// \param ForceValidation Whether to force validation to occur even if it |
| 101 | + /// should not be necessary. |
| 102 | + /// \param LLVMCasBinary If provided, validation is performed out-of-process |
| 103 | + /// using the given \c llvm-cas executable which protects against crashes |
| 104 | + /// during validation. Otherwise validation is performed in-process. |
| 105 | + /// |
| 106 | + /// \returns \c Valid if the data is already valid, \c Recovered if data |
| 107 | + /// was invalid but has been cleared, \c Skipped if validation is not needed, |
| 108 | + /// or an \c Error if validation cannot be performed or if the data is left |
| 109 | + /// in an invalid state because \p AllowRecovery is false. |
| 110 | + static Expected<ValidationResult> |
| 111 | + validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize, |
| 112 | + bool CheckHash, bool AllowRecovery, bool ForceValidation, |
| 113 | + std::optional<StringRef> LLVMCasBinary); |
| 114 | + |
| 115 | + /// This is called implicitly at destruction time, so it is not required for a |
| 116 | + /// client to call this. After calling \p close the only method that is valid |
| 117 | + /// to call is \p needsGarbageCollection. |
| 118 | + /// |
| 119 | + /// \param CheckSizeLimit if true it will check whether the primary store has |
| 120 | + /// exceeded its intended size limit. If false the check is skipped even if a |
| 121 | + /// \p SizeLimit was passed to the \p open call. |
| 122 | + Error close(bool CheckSizeLimit = true); |
| 123 | + |
| 124 | + /// Set the size for limiting growth. This takes effect when the instance |
| 125 | + /// is closed. |
| 126 | + void setSizeLimit(std::optional<uint64_t> SizeLimit); |
| 127 | + |
| 128 | + /// \returns the storage size of the cache data. |
| 129 | + uint64_t getStorageSize() const; |
| 130 | + |
| 131 | + /// \returns whether the primary store has exceeded the intended size limit. |
| 132 | + /// This can return false even if the overall size of the opened directory is |
| 133 | + /// over the \p SizeLimit passed to \p open. To know whether garbage |
| 134 | + /// collection needs to be triggered or not, call \p needsGarbageCollection. |
| 135 | + bool hasExceededSizeLimit() const; |
| 136 | + |
| 137 | + /// \returns whether there is unused data that can be deleted using a |
| 138 | + /// \p collectGarbage call. |
| 139 | + bool needsGarbageCollection() const { return NeedsGarbageCollection; } |
| 140 | + |
| 141 | + /// Remove any unused data from the directory at \p Path. If there is no such |
| 142 | + /// data the operation is a no-op. |
| 143 | + /// |
| 144 | + /// This can be called concurrently, regardless of whether there is an open |
| 145 | + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers |
| 146 | + /// in the same process or other processes. |
| 147 | + /// |
| 148 | + /// It is recommended that garbage-collection is triggered concurrently in the |
| 149 | + /// background, so that it has minimal effect on the workload of the process. |
| 150 | + static Error collectGarbage(StringRef Path); |
| 151 | + |
| 152 | + /// Remove unused data from the current UnifiedOnDiskCache. |
| 153 | + Error collectGarbage(); |
| 154 | + |
| 155 | + /// Validate the key value databases. |
| 156 | + Error validateActionCache(); |
| 157 | + |
| 158 | + /// Get the upstream OnDiskGraphDB if one exists. |
| 159 | + /// |
| 160 | + /// \returns the upstream database, or nullptr if there is none. |
| 161 | + OnDiskGraphDB *getUpstreamGraphDB() const { return UpstreamGraphDB; } |
| 162 | + |
| 163 | + ~UnifiedOnDiskCache(); |
| 164 | + |
| 165 | +private: |
| 166 | + UnifiedOnDiskCache(); |
| 167 | + |
| 168 | + Expected<std::optional<ObjectID>> |
| 169 | + faultInFromUpstreamKV(ArrayRef<uint8_t> Key); |
| 170 | + |
| 171 | + /// \returns the storage size of the primary directory. |
| 172 | + uint64_t getPrimaryStorageSize() const; |
| 173 | + |
| 174 | + std::string RootPath; |
| 175 | + std::atomic<uint64_t> SizeLimit; |
| 176 | + |
| 177 | + int LockFD = -1; |
| 178 | + |
| 179 | + std::atomic<bool> NeedsGarbageCollection; |
| 180 | + std::string PrimaryDBDir; |
| 181 | + |
| 182 | + OnDiskGraphDB *UpstreamGraphDB = nullptr; |
| 183 | + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; |
| 184 | + |
| 185 | + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; |
| 186 | + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; |
| 187 | +}; |
| 188 | + |
| 189 | +} // namespace llvm::cas::ondisk |
| 190 | + |
| 191 | +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H |
0 commit comments