From 037ca46b53dbf81335756c906eafaa1c50ae52eb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Mon, 3 Jul 2017 12:13:04 +0800
Subject: [PATCH 1/6] mds: don't issue caps for frozen inode

When the mds goes into the active state, it re-issues caps for all
inodes. It's possible that some inodes are in the frozen state at that
point. (The mds starts to process slave requests in the clientreplay
state; a slave rename may freeze an inode.)

Signed-off-by: "Yan, Zheng"
---
 src/mds/MDCache.cc | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 1c6936823bcb0..84f8002723243 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6080,6 +6080,20 @@ void MDCache::rejoin_send_acks()
   rejoin_imported_caps.clear();
 }
 
+class C_MDC_ReIssueCaps : public MDCacheContext {
+  CInode *in;
+public:
+  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
+    MDCacheContext(mdc), in(i)
+  {
+    in->get(CInode::PIN_PTRWAITER);
+  }
+  void finish(int r) override {
+    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
+      mdcache->mds->locker->issue_caps(in);
+    in->put(CInode::PIN_PTRWAITER);
+  }
+};
 
 void MDCache::reissue_all_caps()
 {
@@ -6090,6 +6104,11 @@
        ++p) {
     CInode *in = p->second;
     if (in->is_head() && in->is_any_caps()) {
+      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
+      if (in->is_frozen_inode()) {
+        in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
+        continue;
+      }
       if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
         mds->locker->issue_caps(in);
     }
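The new context class above follows the usual MDCache pin-then-wait pattern: pin the inode so the raw pointer stays valid across the wait, register a WAIT_UNFREEZE waiter, and redo the cap evaluation once the inode is unfrozen. The standalone sketch below is not Ceph code; Inode, issue_caps and reissue_caps are simplified stand-ins for CInode, Locker::issue_caps and MDCache::reissue_all_caps, and it only illustrates that shape.

    // Minimal sketch of the pin-then-wait-for-unfreeze pattern (stand-in types).
    #include <functional>
    #include <iostream>
    #include <vector>

    struct Inode {
      bool frozen = false;
      int ref = 0;                                 // stands in for PIN_PTRWAITER refs
      std::vector<std::function<void()>> waiters;  // stands in for WAIT_UNFREEZE list

      void get() { ++ref; }
      void put() { --ref; }
      void add_waiter(std::function<void()> fin) { waiters.push_back(std::move(fin)); }
      void unfreeze() {
        frozen = false;
        auto w = std::move(waiters);
        waiters.clear();
        for (auto& fin : w) fin();                 // finish waiters once unfrozen
      }
    };

    void issue_caps(Inode& in) { std::cout << "issuing caps, ref=" << in.ref << "\n"; }

    void reissue_caps(Inode& in) {
      if (in.frozen) {
        in.get();                                  // pin so the pointer stays valid
        in.add_waiter([&in] { issue_caps(in); in.put(); });
        return;                                    // skip for now, retry on unfreeze
      }
      issue_caps(in);
    }

    int main() {
      Inode in;
      in.frozen = true;
      reissue_caps(in);   // deferred: inode is frozen
      in.unfreeze();      // waiter fires, caps issued, pin dropped
    }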
From a0b8fa6944ccc80b580d29406df8a727574e8b1a Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Tue, 4 Jul 2017 21:59:59 +0800
Subject: [PATCH 2/6] mds: force client to flush data when waiting on LOCK_SNAP_SYNC lock

The client does not send the CAP_FLUSHSNAP message until it has flushed
all snapshotted data. To make the client flush snapshotted data and send
the CAP_FLUSHSNAP message quickly, we should revoke the Fb cap.

Signed-off-by: "Yan, Zheng"
---
 src/mds/Locker.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 6cb9a73511130..e8a28231a13ea 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1306,10 +1306,13 @@ bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
       // okay, we actually need to kick the head's lock to get ourselves synced up.
       CInode *head = mdcache->get_inode(in->ino());
       assert(head);
-      SimpleLock *hlock = head->get_lock(lock->get_type());
+      SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
+      if (hlock->get_state() == LOCK_SYNC)
+        hlock = head->get_lock(lock->get_type());
+
       if (hlock->get_state() != LOCK_SYNC) {
         dout(10) << "rdlock_start trying head inode " << *head << dendl;
-        if (!rdlock_start(head->get_lock(lock->get_type()), mut, true)) // ** as_anon, no rdlock on EXCL **
+        if (!rdlock_start(hlock, mut, true)) // ** as_anon, no rdlock on EXCL **
           return false; // oh, check our lock again then
       }

From 4b95fbe327814049ff63cb1e1dc4ed0e7cbd3874 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Wed, 5 Jul 2017 17:22:46 +0800
Subject: [PATCH 3/6] mds: properly do null snapflush

The client always sends snapflushes in snapid order. If the mds receives
a snapflush that is not the oldest one in client_need_snapflush, it means
the client didn't make changes for the older snaps.

Signed-off-by: "Yan, Zheng"
---
 src/mds/Locker.cc | 8 +++++---
 src/mds/Locker.h  | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index e8a28231a13ea..b7de5ab59bad7 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2509,11 +2509,11 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
 
 
 
-void Locker::_do_null_snapflush(CInode *head_in, client_t client)
+void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
 {
   dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
-  compact_map<snapid_t, std::set<client_t> >::iterator p = head_in->client_need_snapflush.begin();
-  while (p != head_in->client_need_snapflush.end()) {
+  for (auto p = head_in->client_need_snapflush.begin();
+       p != head_in->client_need_snapflush.end() && p->first < last; ) {
     snapid_t snapid = p->first;
     set<client_t>& clients = p->second;
     ++p;  // be careful, q loop below depends on this
@@ -2739,6 +2739,8 @@
       // this cap now follows a later snap (i.e. the one initiating this flush, or later)
       if (in == head_in)
         cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
+      else if (head_in->client_need_snapflush.begin()->first < snap)
+        _do_null_snapflush(head_in, client, snap);
 
       _do_snap_update(in, snap, m->get_dirty(), follows, client, m, ack);
 
diff --git a/src/mds/Locker.h b/src/mds/Locker.h
index 32b7d63016e4c..b27447246a7cd 100644
--- a/src/mds/Locker.h
+++ b/src/mds/Locker.h
@@ -192,7 +192,7 @@ class Locker {
   void handle_client_caps(class MClientCaps *m);
   void _update_cap_fields(CInode *in, int dirty, MClientCaps *m, inode_t *pi);
   void _do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, MClientCaps *m, MClientCaps *ack);
-  void _do_null_snapflush(CInode *head_in, client_t client);
+  void _do_null_snapflush(CInode *head_in, client_t client, snapid_t last=CEPH_NOSNAP);
   bool _do_cap_update(CInode *in, Capability *cap, int dirty, snapid_t follows,
                       MClientCaps *m, MClientCaps *ack=0, bool *need_flush=NULL);
   void handle_client_cap_release(class MClientCapRelease *m);
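The new `last` parameter lets handle_client_caps() complete every pending snapflush older than the one that actually arrived, relying on the client flushing snaps in snapid order. The standalone sketch below is not Ceph code; it swaps compact_map for std::map and uses made-up names and constants, only to show how keeping client_need_snapflush sorted by snapid makes the "everything older is a null flush" step a simple bounded scan.

    // Minimal sketch of bounded null-snapflush handling (stand-in types).
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <set>

    using snapid_t = uint64_t;
    using client_t = int64_t;
    const snapid_t NOSNAP = UINT64_MAX;              // stands in for CEPH_NOSNAP

    void do_null_snapflush(std::map<snapid_t, std::set<client_t>>& need_snapflush,
                           client_t client, snapid_t last = NOSNAP) {
      // Entries are sorted by snapid; stop at the snap that has real data to flush.
      for (auto p = need_snapflush.begin();
           p != need_snapflush.end() && p->first < last; ) {
        auto q = p++;                                // advance first, erase below
        if (q->second.erase(client)) {
          std::cout << "null snapflush for snap " << q->first << "\n";
          if (q->second.empty())
            need_snapflush.erase(q);
        }
      }
    }

    int main() {
      std::map<snapid_t, std::set<client_t>> need_snapflush = {
          {10, {1}}, {20, {1}}, {30, {1}}};
      // A real flush arrives for snap 30: snaps 10 and 20 get null flushes first.
      do_null_snapflush(need_snapflush, 1, 30);
      std::cout << "remaining entries: " << need_snapflush.size() << "\n";  // 1
    }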
From b0f911e83f0f03916061ebea4a16c93cab216c79 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 6 Jul 2017 16:54:08 +0800
Subject: [PATCH 4/6] ceph: kick purge queue when mds becomes active

Otherwise the purge queue does not start to work until the stray manager
adds a new item.

Signed-off-by: "Yan, Zheng"
---
 src/mds/PurgeQueue.cc   | 15 +++++++++++++++
 src/mds/PurgeQueue.h    |  1 +
 src/mds/StrayManager.cc |  1 +
 3 files changed, 17 insertions(+)

diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc
index 430d3eee80d8f..4da872c47b45a 100644
--- a/src/mds/PurgeQueue.cc
+++ b/src/mds/PurgeQueue.cc
@@ -116,6 +116,21 @@ void PurgeQueue::init()
   timer.init();
 }
 
+void PurgeQueue::activate()
+{
+  Mutex::Locker l(lock);
+  if (journaler.get_read_pos() == journaler.get_write_pos())
+    return;
+
+  if (in_flight.empty()) {
+    dout(4) << "start work (by drain)" << dendl;
+    finisher.queue(new FunctionContext([this](int r) {
+          Mutex::Locker l(lock);
+          _consume();
+        }));
+  }
+}
+
 void PurgeQueue::shutdown()
 {
   Mutex::Locker l(lock);
diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h
index b9699ddf3e3d4..aed66c94ebcdd 100644
--- a/src/mds/PurgeQueue.h
+++ b/src/mds/PurgeQueue.h
@@ -133,6 +133,7 @@ class PurgeQueue
 
 public:
   void init();
+  void activate();
   void shutdown();
 
   void create_logger();
diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
index b2102ddc2d09b..042aa0eb2fc28 100644
--- a/src/mds/StrayManager.cc
+++ b/src/mds/StrayManager.cc
@@ -545,6 +545,7 @@ void StrayManager::activate()
 {
   dout(10) << __func__ << dendl;
   started = true;
+  purge_queue.activate();
 }
 
 bool StrayManager::eval_stray(CDentry *dn, bool delay)

From 9b374207758ca70ac040cee92f6670fa6023fc84 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 6 Jul 2017 17:46:12 +0800
Subject: [PATCH 5/6] mds: fix "wait for stray manager to start"

The stray manager may not get started immediately after the mds becomes
active. If the mds is already active, contexts added to waiting_for_active
never get executed.

Signed-off-by: "Yan, Zheng"
---
 src/mds/StrayManager.cc | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/mds/StrayManager.cc b/src/mds/StrayManager.cc
index 042aa0eb2fc28..caa717024d165 100644
--- a/src/mds/StrayManager.cc
+++ b/src/mds/StrayManager.cc
@@ -314,6 +314,8 @@ class C_OpenSnapParents : public StrayManagerContext {
 
 void StrayManager::_enqueue(CDentry *dn, bool trunc)
 {
+  assert(started);
+
   CInode *in = dn->get_linkage()->get_inode();
   if (in->snaprealm &&
       !in->snaprealm->have_past_parents_open() &&
@@ -322,22 +324,6 @@ void StrayManager::_enqueue(CDentry *dn, bool trunc)
     return;
   }
 
-  if (!started) {
-    // If the MDS is not yet active, defer executing this purge
-    // in order to avoid the mdlog writes we do on purge completion.
-    mds->wait_for_active(
-        new MDSInternalContextWrapper(mds,
-          new FunctionContext([this, dn, trunc](int r){
-            // It is safe to hold on to this CDentry* pointer
-            // because the dentry is pinned with PIN_PURGING
-            _enqueue(dn, trunc);
-          })
-        )
-      );
-
-    return;
-  }
-
   if (trunc) {
     truncate(dn);
   } else {
@@ -348,6 +334,9 @@
 
 void StrayManager::advance_delayed()
 {
+  if (!started)
+    return;
+
   for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
     CDentry *dn = *p;
     ++p;
@@ -435,6 +424,9 @@ bool StrayManager::_eval_stray(CDentry *dn, bool delay)
     return false;
   }
 
+  if (!started)
+    delay = true;
+
   if (dn->item_stray.is_on_list()) {
     if (delay)
       return false;
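Patches 4 and 5 move the stray/purge machinery from "queue a wait_for_active context" to "gate on a started flag and kick the work from activate()". The standalone sketch below is not Ceph code; StrayQueue, delayed and enqueue are made-up stand-ins for StrayManager, delayed_eval_stray and the purge queue submission, and it only illustrates that defer-and-replay shape.

    // Minimal sketch of gating work on a started flag and replaying it on activate().
    #include <iostream>
    #include <string>
    #include <vector>

    struct StrayQueue {
      bool started = false;
      std::vector<std::string> delayed;            // stands in for delayed_eval_stray

      void enqueue(const std::string& dn) { std::cout << "purging " << dn << "\n"; }

      void eval_stray(const std::string& dn) {
        if (!started) {                            // not active yet: just remember it
          delayed.push_back(dn);
          return;
        }
        enqueue(dn);
      }

      void advance_delayed() {
        if (!started)
          return;                                  // nothing to do before activation
        auto items = std::move(delayed);
        delayed.clear();
        for (auto& dn : items)
          enqueue(dn);
      }

      void activate() {                            // called once the rank goes active
        started = true;
        advance_delayed();
      }
    };

    int main() {
      StrayQueue q;
      q.eval_stray("stray0/100001");   // deferred: manager not started yet
      q.activate();                    // replays the deferred entry
      q.eval_stray("stray0/100002");   // purged immediately now
    }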
From 2b98f4701e9a12e50f8d017c93e5101eb02f7992 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Mon, 10 Jul 2017 19:20:14 +0800
Subject: [PATCH 6/6] mds: wait for auth pinned objects when deactivating mds

It is possible that mdsdir and the stray directories are auth pinned
(they have an unstable lock, or they are being stored).

Fixes: http://tracker.ceph.com/issues/20537
Signed-off-by: "Yan, Zheng"
---
 src/mds/MDCache.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 84f8002723243..3e079e716350a 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -7627,6 +7627,11 @@ bool MDCache::shutdown_pass()
   assert(!migrator->is_exporting());
   assert(!migrator->is_importing());
 
+  if ((myin && myin->is_auth_pinned()) ||
+      (mydir && mydir->is_auth_pinned())) {
+    dout(7) << "still have auth pinned objects" << dendl;
+    return false;
+  }
 
   // flush what we can from the log
   mds->mdlog->trim(0);
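MDCache::shutdown_pass() is called repeatedly while the rank winds down, so returning false here simply means "not done yet, retry on a later pass" once the auth pins on mdsdir or the stray directories are dropped. The standalone sketch below is not Ceph code; Dir, shutdown_pass and the loop in main are simplified stand-ins used only to show that retry-until-clean shape.

    // Minimal sketch of a shutdown pass that re-checks blocking conditions.
    #include <iostream>

    struct Dir {
      int auth_pins = 0;
      bool is_auth_pinned() const { return auth_pins > 0; }
    };

    // One shutdown attempt; false means "not done yet, call me again".
    bool shutdown_pass(const Dir& mydir) {
      if (mydir.is_auth_pinned()) {
        std::cout << "still have auth pinned objects\n";
        return false;
      }
      std::cout << "shutdown pass complete\n";
      return true;
    }

    int main() {
      Dir mydir;
      mydir.auth_pins = 1;             // e.g. an unstable lock or a pending store
      while (!shutdown_pass(mydir))
        mydir.auth_pins = 0;           // the pin is eventually dropped; retry succeeds
    }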