From 037ca46b53dbf81335756c906eafaa1c50ae52eb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Mon, 3 Jul 2017 12:13:04 +0800
Subject: [PATCH 1/6] mds: don't issue caps for frozen inode

When the mds goes into the active state, it re-issues caps for all
inodes. It's possible that some inodes are in the frozen state at that
point. (The mds starts to process slave requests in the clientreplay
state; a slave rename may freeze an inode.)

Signed-off-by: "Yan, Zheng"
---
 src/mds/MDCache.cc | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 1c6936823bcb0..84f8002723243 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6080,6 +6080,20 @@ void MDCache::rejoin_send_acks()
   rejoin_imported_caps.clear();
 }
 
+class C_MDC_ReIssueCaps : public MDCacheContext {
+  CInode *in;
+public:
+  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
+    MDCacheContext(mdc), in(i)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+  void finish(int r) override {
+    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
+      mdcache->mds->locker->issue_caps(in);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
 
 void MDCache::reissue_all_caps()
 {
@@ -6090,6 +6104,11 @@
        ++p) {
     CInode *in = p->second;
     if (in->is_head() && in->is_any_caps()) {
+      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
+      if (in->is_frozen_inode()) {
+        in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
+        continue;
+      }
       if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
         mds->locker->issue_caps(in);
     }
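The new context class above follows the usual MDCache pin-then-wait pattern: pin the inode so the raw pointer stays valid across the wait, register a WAIT_UNFREEZE waiter, and redo the cap evaluation once the inode is unfrozen. The standalone sketch below is not Ceph code; Inode, issue_caps and reissue_caps are simplified stand-ins for CInode, Locker::issue_caps and MDCache::reissue_all_caps, and it only illustrates that shape.

    // Minimal sketch of the pin-then-wait-for-unfreeze pattern (stand-in types).
    #include <functional>
    #include <iostream>
    #include <vector>

    struct Inode {
      bool frozen = false;
      int ref = 0;                                 // stands in for PIN_PTRWAITER refs
      std::vector<std::function<void()>> waiters;  // stands in for WAIT_UNFREEZE list

      void get() { ++ref; }
      void put() { --ref; }
      void add_waiter(std::function<void()> fin) { waiters.push_back(std::move(fin)); }
      void unfreeze() {
        frozen = false;
        auto w = std::move(waiters);
        waiters.clear();
        for (auto& fin : w) fin();                 // finish waiters once unfrozen
      }
    };

    void issue_caps(Inode& in) { std::cout << "issuing caps, ref=" << in.ref << "\n"; }

    void reissue_caps(Inode& in) {
      if (in.frozen) {
        in.get();                                  // pin so the pointer stays valid
        in.add_waiter([&in] { issue_caps(in); in.put(); });
        return;                                    // skip for now, retry on unfreeze
      }
      issue_caps(in);
    }

    int main() {
      Inode in;
      in.frozen = true;
      reissue_caps(in);   // deferred: inode is frozen
      in.unfreeze();      // waiter fires, caps issued, pin dropped
    }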
From a0b8fa6944ccc80b580d29406df8a727574e8b1a Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Tue, 4 Jul 2017 21:59:59 +0800
Subject: [PATCH 2/6] mds: force client to flush data when waiting on LOCK_SNAP_SYNC lock

The client does not send the CAP_FLUSHSNAP message until it has flushed
all snapshotted data. To make the client flush snapshotted data and send
the CAP_FLUSHSNAP message quickly, we should revoke the Fb cap.

Signed-off-by: "Yan, Zheng"
---
 src/mds/Locker.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 6cb9a73511130..e8a28231a13ea 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1306,10 +1306,13 @@ bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
       // okay, we actually need to kick the head's lock to get ourselves synced up.
       CInode *head = mdcache->get_inode(in->ino());
       assert(head);
-      SimpleLock *hlock = head->get_lock(lock->get_type());
+      SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
+      if (hlock->get_state() == LOCK_SYNC)
+        hlock = head->get_lock(lock->get_type());
+
       if (hlock->get_state() != LOCK_SYNC) {
         dout(10) << "rdlock_start trying head inode " << *head << dendl;
-        if (!rdlock_start(head->get_lock(lock->get_type()), mut, true)) // ** as_anon, no rdlock on EXCL **
+        if (!rdlock_start(hlock, mut, true)) // ** as_anon, no rdlock on EXCL **
           return false; // oh, check our lock again then
       }

From 4b95fbe327814049ff63cb1e1dc4ed0e7cbd3874 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Wed, 5 Jul 2017 17:22:46 +0800
Subject: [PATCH 3/6] mds: properly do null snapflush

The client always sends snapflushes in snapid order. If the mds receives
a snapflush that is not the oldest one in client_need_snapflush, it means
the client didn't make changes for the older snaps.

Signed-off-by: "Yan, Zheng"
---
 src/mds/Locker.cc | 8 +++++---
 src/mds/Locker.h  | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index e8a28231a13ea..b7de5ab59bad7 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2509,11 +2509,11 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
 
 
 
-void Locker::_do_null_snapflush(CInode *head_in, client_t client)
+void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
 {
   dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
-  compact_map<snapid_t, std::set<client_t> >::iterator p = head_in->client_need_snapflush.begin();
-  while (p != head_in->client_need_snapflush.end()) {
+  for (auto p = head_in->client_need_snapflush.begin();
+       p != head_in->client_need_snapflush.end() && p->first < last; ) {
     snapid_t snapid = p->first;
     set<client_t>& clients = p->second;
     ++p;  // be careful, q loop below depends on this
@@ -2739,6 +2739,8 @@
       // this cap now follows a later snap (i.e. the one initiating this flush, or later)
       if (in == head_in)
         cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
+      else if (head_in->client_need_snapflush.begin()->first < snap)
+        _do_null_snapflush(head_in, client, snap);
 
       _do_snap_update(in, snap, m->get_dirty(), follows, client, m, ack);
 
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 32b7d63016e4c..b27447246a7cd 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -192,7 +192,7 @@ class Locker {
   void handle_client_caps(class MClientCaps *m);
   void _update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi);
   void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, MClientCaps *m, MClientCaps *ack);
-  void _do_null_snapflush(CInode *head_in, client_t client);
+  void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP);
   bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows,
                       MClientCaps *m, MClientCaps *ack=0, bool *need_flush=NULL);
   void handle_client_cap_release(class MClientCapRelease *m);
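The new `last` parameter lets handle_client_caps() complete every pending snapflush older than the one that actually arrived, relying on the client flushing snaps in snapid order. The standalone sketch below is not Ceph code; it swaps compact_map for std::map and uses made-up names and constants, only to show how keeping client_need_snapflush sorted by snapid makes the "everything older is a null flush" step a simple bounded scan.

    // Minimal sketch of bounded null-snapflush handling (stand-in types).
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>

    using snapid_t = uint64_t;
    using client_t = int64_t;
    const snapid_t NOSNAP = UINT64_MAX;              // stands in for CEPH_NOSNAP

    void do_null_snapflush(std::map<snapid_t, std::set<client_t>>& need_snapflush,
                           client_t client, snapid_t last = NOSNAP) {
      // Entries are sorted by snapid; stop at the snap that has real data to flush.
      for (auto p = need_snapflush.begin();
           p != need_snapflush.end() && p->first < last; ) {
        auto q = p++;                                // advance first, erase below
        if (q->second.erase(client)) {
          std::cout << "null snapflush for snap " << q->first << "\n";
          if (q->second.empty())
            need_snapflush.erase(q);
        }
      }
    }

    int main() {
      std::map<snapid_t, std::set<client_t>> need_snapflush = {
          {10, {1}}, {20, {1}}, {30, {1}}};
      // A real flush arrives for snap 30: snaps 10 and 20 get null flushes first.
      do_null_snapflush(need_snapflush, 1, 30);
      std::cout << "remaining entries: " << need_snapflush.size() << "\n";  // 1
    }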
From b0f911e83f0f03916061ebea4a16c93cab216c79 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 6 Jul 2017 16:54:08 +0800
Subject: [PATCH 4/6] ceph: kick purge queue when mds becomes active

Otherwise the purge queue does not start to work until the stray manager
adds a new item.

Signed-off-by: "Yan, Zheng"
---
 src/mds/PurgeQueue.cc   | 15 +++++++++++++++
 src/mds/PurgeQueue.h    |  1 +
 src/mds/StrayManager.cc |  1 +
 3 files changed, 17 insertions(+)

diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc
index 430d3eee80d8f..4da872c47b45a 100644
--- a/src/mds/PurgeQueue.cc
+++ b/src/mds/PurgeQueue.cc
@@ -116,6 +116,21 @@ void PurgeQueue::init()
   timer.init();
 }
 
+void PurgeQueue::activate()
+{
+  Mutex::Locker l(lock);
+  if (journaler.get_read_pos() == journaler.get_write_pos())
+    return;
+
+  if (in_flight.empty()) {
+    dout(4) << "start work (by drain)" << dendl;
+    finisher.queue(new FunctionContext([this](int r) {
+          Mutex::Locker l(lock);
+          _consume();
+        }));
+  }
+}
+
 void PurgeQueue::shutdown()
 {
   Mutex::Locker l(lock);
diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h
index b9699ddf3e3d4..aed66c94ebcdd 100644
--- a/src/mds/PurgeQueue.h
+++ b/src/mds/PurgeQueue.h
@@ -133,6 +133,7 @@ class PurgeQueue
 
 public:
   void init();
+  void activate();
   void shutdown();
 
   void create_logger();
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
index b2102ddc2d09b..042aa0eb2fc28 100644
--- a/src/mds/StrayManager.cc
+++ b/src/mds/StrayManager.cc
@@ -545,6 +545,7 @@ void StrayManager::activate()
 {
   dout(10) << __func__ << dendl;
   started = true;
+  purge_queue.activate();
 }
 
 bool StrayManager::eval_stray(CDentry *dn, bool delay)

From 9b374207758ca70ac040cee92f6670fa6023fc84 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 6 Jul 2017 17:46:12 +0800
Subject: [PATCH 5/6] mds: fix "wait for stray manager to start"

The stray manager may not get started immediately after the mds becomes
active. If the mds is already active, contexts added to waiting_for_active
never get executed.

Signed-off-by: "Yan, Zheng"
---
 src/mds/StrayManager.cc | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
index 042aa0eb2fc28..caa717024d165 100644
--- a/src/mds/StrayManager.cc
+++ b/src/mds/StrayManager.cc
@@ -314,6 +314,8 @@ class C_OpenSnapParents : public StrayManagerContext {
 
 void StrayManager::_enqueue(CDentry *dn, bool trunc)
 {
+  assert(started);
+
   CInode *in = dn->get_linkage()->get_inode();
   if (in->snaprealm &&
       !in->snaprealm->have_past_parents_open() &&
@@ -322,22 +324,6 @@ void StrayManager::_enqueue(CDentry *dn, bool trunc)
     return;
   }
 
-  if (!started) {
-    // If the MDS is not yet active, defer executing this purge
-    // in order to avoid the mdlog writes we do on purge completion.
-    mds->wait_for_active(
-        new MDSInternalContextWrapper(mds,
-          new FunctionContext([this, dn, trunc](int r){
-            // It is safe to hold on to this CDentry* pointer
-            // because the dentry is pinned with PIN_PURGING
-            _enqueue(dn, trunc);
-          })
-        )
-      );
-
-    return;
-  }
-
   if (trunc) {
     truncate(dn);
   } else {
@@ -348,6 +334,9 @@
 
 void StrayManager::advance_delayed()
 {
+  if (!started)
+    return;
+
   for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
     CDentry *dn = *p;
     ++p;
@@ -435,6 +424,9 @@ bool StrayManager::_eval_stray(CDentry *dn, bool delay)
     return false;
   }
 
+  if (!started)
+    delay = true;
+
   if (dn->item_stray.is_on_list()) {
     if (delay)
       return false;
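Patches 4 and 5 move the stray/purge machinery from "queue a wait_for_active context" to "gate on a started flag and kick the work from activate()". The standalone sketch below is not Ceph code; StrayQueue, delayed and enqueue are made-up stand-ins for StrayManager, delayed_eval_stray and the purge queue submission, and it only illustrates that defer-and-replay shape.

    // Minimal sketch of gating work on a started flag and replaying it on activate().
    #include <iostream>
    #include <string>
    #include <vector>

    struct StrayQueue {
      bool started = false;
      std::vector<std::string> delayed;            // stands in for delayed_eval_stray

      void enqueue(const std::string& dn) { std::cout << "purging " << dn << "\n"; }

      void eval_stray(const std::string& dn) {
        if (!started) {                            // not active yet: just remember it
          delayed.push_back(dn);
          return;
        }
        enqueue(dn);
      }

      void advance_delayed() {
        if (!started)
          return;                                  // nothing to do before activation
        auto items = std::move(delayed);
        delayed.clear();
        for (auto& dn : items)
          enqueue(dn);
      }

      void activate() {                            // called once the rank goes active
        started = true;
        advance_delayed();
      }
    };

    int main() {
      StrayQueue q;
      q.eval_stray("stray0/100001");   // deferred: manager not started yet
      q.activate();                    // replays the deferred entry
      q.eval_stray("stray0/100002");   // purged immediately now
    }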
From 2b98f4701e9a12e50f8d017c93e5101eb02f7992 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Mon, 10 Jul 2017 19:20:14 +0800
Subject: [PATCH 6/6] mds: wait for auth pinned objects when deactivating mds

It is possible that mdsdir and the stray directories are auth pinned
(they have an unstable lock, or they are being stored).

Fixes: http://tracker.ceph.com/issues/20537
Signed-off-by: "Yan, Zheng"
---
 src/mds/MDCache.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 84f8002723243..3e079e716350a 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -7627,6 +7627,11 @@ bool MDCache::shutdown_pass()
   assert(!migrator->is_exporting());
   assert(!migrator->is_importing());
 
+  if ((myin && myin->is_auth_pinned()) ||
+      (mydir && mydir->is_auth_pinned())) {
+    dout(7) << "still have auth pinned objects" << dendl;
+    return false;
+  }
 
   // flush what we can from the log
   mds->mdlog->trim(0);
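MDCache::shutdown_pass() is called repeatedly while the rank winds down, so returning false here simply means "not done yet, retry on a later pass" once the auth pins on mdsdir or the stray directories are dropped. The standalone sketch below is not Ceph code; Dir, shutdown_pass and the loop in main are simplified stand-ins used only to show that retry-until-clean shape.

    // Minimal sketch of a shutdown pass that re-checks blocking conditions.
    #include <iostream>

    struct Dir {
      int auth_pins = 0;
      bool is_auth_pinned() const { return auth_pins > 0; }
    };

    // One shutdown attempt; false means "not done yet, call me again".
    bool shutdown_pass(const Dir& mydir) {
      if (mydir.is_auth_pinned()) {
        std::cout << "still have auth pinned objects\n";
        return false;
      }
      std::cout << "shutdown pass complete\n";
      return true;
    }

    int main() {
      Dir mydir;
      mydir.auth_pins = 1;             // e.g. an unstable lock or a pending store
      while (!shutdown_pass(mydir))
        mydir.auth_pins = 0;           // the pin is eventually dropped; retry succeeds
    }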