diff --git a/db/db_impl.cc b/db/db_impl.cc index 2667945b..b1dc6f19 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1673,7 +1673,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { uint64_t new_log_number = versions_->NewFileNumber(); WritableFile* lfile = NULL; gPerfCounters->Inc(ePerfWriteNewMem); - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); + s = env_->NewWriteOnlyFile(LogFileName(dbname_, new_log_number), &lfile); if (!s.ok()) { // Avoid chewing through file number space in a tight loop. versions_->ReuseFileNumber(new_log_number); @@ -1820,7 +1820,7 @@ Status DB::Open(const Options& options, const std::string& dbname, if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), + s = options.env->NewWriteOnlyFile(LogFileName(dbname, new_log_number), &lfile); if (s.ok()) { edit.SetLogNumber(new_log_number); diff --git a/db/table_cache.cc b/db/table_cache.cc index 8aeafbc1..1c63b414 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -88,6 +88,7 @@ Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, int level } else { + // (later, call SetForCompaction here too for files already in cache) gPerfCounters->Inc(ePerfTableCached); } // else return s; @@ -103,7 +104,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, } Cache::Handle* handle = NULL; - Status s = FindTable(file_number, file_size, level, &handle); + Status s = FindTable(file_number, file_size, level, &handle, options.IsCompaction()); if (!s.ok()) { return NewErrorIterator(s); } diff --git a/include/leveldb/env.h b/include/leveldb/env.h index ba8374e5..95f59108 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -71,6 +71,7 @@ class Env { virtual Status NewWritableFile(const std::string& fname, WritableFile** result) = 0; + // Riak specific: // Derived from NewWritableFile. One change: if the file exists, // move to the end of the file and continue writing. // new file. On success, stores a pointer to the open file in @@ -81,6 +82,16 @@ class Env { virtual Status NewAppendableFile(const std::string& fname, WritableFile** result) = 0; + // Riak specific: + // Allows for virtualized version of NewWritableFile that enables write + // and close operations to execute on background threads + // (where platform supported). + // + // The returned file will only be accessed by one thread at a time. + virtual Status NewWriteOnlyFile(const std::string& fname, + WritableFile** result) + {return(NewWritableFile(fname, result));}; + // Returns true iff the named file exists. virtual bool FileExists(const std::string& fname) = 0; @@ -233,6 +244,11 @@ class WritableFile { virtual Status Flush() = 0; virtual Status Sync() = 0; + // Riak specific: + // Provide hint where key/value data ends and metadata starts + // in an .sst table file. + virtual void SetMetadataOffset(uint64_t) {}; + private: // No copying allowed WritableFile(const WritableFile&); @@ -318,6 +334,9 @@ class EnvWrapper : public Env { Status NewAppendableFile(const std::string& f, WritableFile** r) { return target_->NewAppendableFile(f, r); } + Status NewWriteOnlyFile(const std::string& f, WritableFile** r) { + return target_->NewWriteOnlyFile(f, r); + } bool FileExists(const std::string& f) { return target_->FileExists(f); } Status GetChildren(const std::string& dir, std::vector* r) { return target_->GetChildren(dir, r); diff --git a/include/leveldb/perf_count.h b/include/leveldb/perf_count.h index c05a76ec..3d37eb31 100644 --- a/include/leveldb/perf_count.h +++ b/include/leveldb/perf_count.h @@ -189,6 +189,7 @@ enum PerformanceCountersEnum ePerfThrottleBacklog1=64,//!< backlog at time of posting (level1+) ePerfThrottleCompacts1=65,//!< number of level 1+ compactions + ePerfBGWriteError=66, //!< error in write/close, see syslog // must follow last index name to represent size of array // (ASSUMES previous enum is highest value) diff --git a/table/table_builder.cc b/table/table_builder.cc index e84c6ab9..a5c3a529 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -228,6 +228,9 @@ Status TableBuilder::Finish() { BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle, sst_stats_block_handle; + // pass hint to Linux fadvise management + r->file->SetMetadataOffset(r->offset); + // Write filter block if (ok() && r->filter_block != NULL) { WriteRawBlock(r->filter_block->Finish(), kNoCompression, diff --git a/util/cache.cc b/util/cache.cc index 2c1459ed..0e5f5d4c 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -116,7 +116,7 @@ class HandleTable { LRUHandle* h = list_[i]; while (h != NULL) { LRUHandle* next = h->next_hash; - Slice key = h->key(); + /*Slice key =*/ h->key(); // eliminate unused var warning, but allow for side-effects uint32_t hash = h->hash; LRUHandle** ptr = &new_list[hash & (new_length - 1)]; h->next_hash = *ptr; diff --git a/util/env_posix.cc b/util/env_posix.cc index d4803d08..59c9d063 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -60,9 +61,13 @@ struct BGCloseInfo size_t offset_; size_t length_; size_t unused_; + uint64_t metadata_; - BGCloseInfo(int fd, void * base, size_t offset, size_t length, size_t unused) - : fd_(fd), base_(base), offset_(offset), length_(length), unused_(unused) {}; + BGCloseInfo(int fd, void * base, size_t offset, size_t length, + size_t unused, uint64_t metadata) + : fd_(fd), base_(base), offset_(offset), length_(length), + unused_(unused), metadata_(metadata) + {}; }; class PosixSequentialFile: public SequentialFile { @@ -108,7 +113,15 @@ class PosixRandomAccessFile: public RandomAccessFile { public: PosixRandomAccessFile(const std::string& fname, int fd) - : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) { } + : filename_(fname), fd_(fd), is_compaction_(false), file_size_(0) + { +#if defined(HAVE_FADVISE) + // Currently hurts performance instead of helps. Likely + // requires better interaction with tables already in cache + // that start compaction. See comment in table_cache.cc. + // posix_fadvise(fd_, 0, file_size_, POSIX_FADV_RANDOM); +#endif + } virtual ~PosixRandomAccessFile() { if (is_compaction_) @@ -137,6 +150,9 @@ class PosixRandomAccessFile: public RandomAccessFile { { is_compaction_=true; file_size_=file_size; +#if defined(HAVE_FADVISE) + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_SEQUENTIAL); +#endif }; @@ -163,7 +179,7 @@ class PosixMmapReadableFile: public RandomAccessFile { } virtual ~PosixMmapReadableFile() { - BGCloseInfo * ptr=new BGCloseInfo(fd_, mmapped_region_, 0, length_, 0); + BGCloseInfo * ptr=new BGCloseInfo(fd_, mmapped_region_, 0, length_, 0, 0); Env::Default()->Schedule(&BGFileCloser, ptr, 4); }; @@ -195,9 +211,9 @@ class PosixMmapFile : public WritableFile { char* dst_; // Where to write next (in range [base_,limit_]) char* last_sync_; // Where have we synced up to uint64_t file_offset_; // Offset of base_ in file - - // Have we done an munmap of unsynced data? - bool pending_sync_; + uint64_t metadata_offset_; // Offset where sst metadata starts, or zero + bool pending_sync_; // Have we done an munmap of unsynced data? + bool is_write_only_; // can this file process in background // Roundup x to a multiple of y static size_t Roundup(size_t x, size_t y) { @@ -218,19 +234,32 @@ class PosixMmapFile : public WritableFile { pending_sync_ = true; } - BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, limit_-dst_); - if (and_close) + BGCloseInfo * ptr=new BGCloseInfo(fd_, base_, file_offset_, limit_-base_, + limit_-dst_, metadata_offset_); + + // write only files can perform operations async, but not + // files that might re-open and read again soon + if (!is_write_only_) { - // do this in foreground unfortunately, bug where file not - // closed fast enough for reopen - BGFileCloser2(ptr); - fd_=-1; + if (and_close) + BGFileCloser2(ptr); + else + BGFileUnmapper2(ptr); } // if + + // called from user thread, move these operations to background + // queue else { - Env::Default()->Schedule(&BGFileUnmapper2, ptr, 4); + if (and_close) + Env::Default()->Schedule(&BGFileCloser2, ptr, 4); + else + Env::Default()->Schedule(&BGFileUnmapper2, ptr, 4); } // else + if (and_close) + fd_=-1; + file_offset_ += limit_ - base_; base_ = NULL; limit_ = NULL; @@ -277,7 +306,8 @@ class PosixMmapFile : public WritableFile { public: PosixMmapFile(const std::string& fname, int fd, - size_t page_size, size_t file_offset=0L) + size_t page_size, size_t file_offset=0L, + bool is_write_only=false) : filename_(fname), fd_(fd), page_size_(page_size), @@ -287,13 +317,14 @@ class PosixMmapFile : public WritableFile { dst_(NULL), last_sync_(NULL), file_offset_(file_offset), - pending_sync_(false) { + metadata_offset_(0), + pending_sync_(false), + is_write_only_(is_write_only) { assert((page_size & (page_size - 1)) == 0); gPerfCounters->Inc(ePerfRWFileOpen); } - ~PosixMmapFile() { if (fd_ >= 0) { PosixMmapFile::Close(); @@ -364,6 +395,11 @@ class PosixMmapFile : public WritableFile { return s; } + + virtual void SetMetadataOffset(uint64_t Metadata) + { + metadata_offset_=Metadata; + } // SetMetadataOffset }; @@ -480,7 +516,7 @@ class PosixEnv : public Env { *result = NULL; s = IOError(fname, errno); } else { - *result = new PosixMmapFile(fname, fd, page_size_); + *result = new PosixMmapFile(fname, fd, page_size_, 0, false); } return s; } @@ -509,6 +545,18 @@ class PosixEnv : public Env { return s; } + virtual Status NewWriteOnlyFile(const std::string& fname, + WritableFile** result) { + Status s; + const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { + *result = NULL; + s = IOError(fname, errno); + } else { + *result = new PosixMmapFile(fname, fd, page_size_, 0, true); + } + return s; + } virtual bool FileExists(const std::string& fname) { @@ -1005,22 +1053,46 @@ void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { void BGFileCloser(void * arg) { BGCloseInfo * file_ptr; + bool err_flag; + int ret_val; + err_flag=false; file_ptr=(BGCloseInfo *)arg; - munmap(file_ptr->base_, file_ptr->length_); + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser munmap failed [%d, %m]", errno); + err_flag=true; + } // if #if defined(HAVE_FADVISE) - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if + #endif if (0 != file_ptr->unused_) - ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); + { + ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser ftruncate failed [%d, %m]", errno); + err_flag=true; + } // if + } close(file_ptr->fd_); delete file_ptr; gPerfCounters->Inc(ePerfROFileClose); + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + return; } // BGFileCloser @@ -1029,23 +1101,68 @@ void BGFileCloser(void * arg) void BGFileCloser2(void * arg) { BGCloseInfo * file_ptr; + bool err_flag; + int ret_val; + err_flag=false; file_ptr=(BGCloseInfo *)arg; - munmap(file_ptr->base_, file_ptr->length_); + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 munmap failed [%d, %m]", errno); + err_flag=true; + } // if + #if defined(HAVE_FADVISE) - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + // release newly written data from Linux page cache if possible + if (0==file_ptr->metadata_ + || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) + { + // must fdatasync for DONTNEED to work + ret_val=fdatasync(file_ptr->fd_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 fdatasync failed [%d, %m]", errno); + err_flag=true; + } // if + + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if + } // if + else + { + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 posix_fadvise WILLNEED failed [%d, %m]", errno); + err_flag=true; + } // if + } // else #endif if (0 != file_ptr->unused_) - ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); - + { + ret_val=ftruncate(file_ptr->fd_, file_ptr->offset_ + file_ptr->length_ - file_ptr->unused_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileCloser2 ftruncate failed [%d, %m]", errno); + err_flag=true; + } // if + } close(file_ptr->fd_); delete file_ptr; gPerfCounters->Inc(ePerfRWFileClose); + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + return; } // BGFileCloser2 @@ -1076,18 +1193,55 @@ void BGFileUnmapper(void * arg) void BGFileUnmapper2(void * arg) { BGCloseInfo * file_ptr; + bool err_flag; + int ret_val; + err_flag=false; file_ptr=(BGCloseInfo *)arg; - munmap(file_ptr->base_, file_ptr->length_); + ret_val=munmap(file_ptr->base_, file_ptr->length_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 munmap failed [%d, %m]", errno); + err_flag=true; + } // if #if defined(HAVE_FADVISE) - posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0==file_ptr->metadata_ + || (file_ptr->offset_ + file_ptr->length_ < file_ptr->metadata_)) + { + // must fdatasync for DONTNEED to work + ret_val=fdatasync(file_ptr->fd_); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 fdatasync failed [%d, %m]", errno); + err_flag=true; + } // if + + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_DONTNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise DONTNEED failed [%d, %m]", errno); + err_flag=true; + } // if + } // if + else + { + ret_val=posix_fadvise(file_ptr->fd_, file_ptr->offset_, file_ptr->length_, POSIX_FADV_WILLNEED); + if (0!=ret_val) + { + syslog(LOG_ERR,"BGFileUnmapper2 posix_fadvise WILLNEED failed [%d, %m]", errno); + err_flag=true; + } // if + } // else #endif delete file_ptr; gPerfCounters->Inc(ePerfRWFileUnmap); + if (err_flag) + gPerfCounters->Inc(ePerfBGWriteError); + return; } // BGFileUnmapper2 diff --git a/util/perf_count.cc b/util/perf_count.cc index 91e29924..7aee5d99 100644 --- a/util/perf_count.cc +++ b/util/perf_count.cc @@ -514,7 +514,8 @@ PerformanceCounters * gPerfCounters(&LocalStartupCounters); "ThrottleMicros1", "ThrottleKeys1", "ThrottleBacklog1", - "ThrottleCompacts1" + "ThrottleCompacts1", + "BGWriteError" };