Skip to content

Commit

Permalink
mon/MgrMonitor: blacklist previous instance
Browse files Browse the repository at this point in the history
This wasn't realized as necessary early on in the ceph-mgr development
because the mgr didn't interact with RADOS. However, now it is becoming
common for plugins to store data there. It's important that the previous
instance can no longer interact with RADOS while the new mgr takes over.

In particular, this means that the mgr's client sessions with the MDS
are automatically evicted once the MDS receives the new OSDMap. This
avoids a pesky "unresponsive client" warning in the cluster logs.

Fixes: https://tracker.ceph.com/issues/42939
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
  • Loading branch information
batrick committed Dec 4, 2019
1 parent cc6422a commit f2986a4
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 23 deletions.
8 changes: 8 additions & 0 deletions src/common/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2089,6 +2089,14 @@ std::vector<Option> get_global_options() {
"daemons remain in the OSD map")
.set_flag(Option::FLAG_RUNTIME),

Option("mon_mgr_blacklist_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(1_day)
.set_min(1_hr)
.add_service("mon")
.set_description("Duration in seconds that blacklist entries for mgr "
"daemons remain in the OSD map")
.set_flag(Option::FLAG_RUNTIME),

Option("mon_osd_crush_smoke_test", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.add_service("mon")
Expand Down
41 changes: 24 additions & 17 deletions src/mgr/Mgr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -248,22 +248,6 @@ void Mgr::init()
register_async_signal_handler_oneshot(SIGINT, handle_mgr_signal);
register_async_signal_handler_oneshot(SIGTERM, handle_mgr_signal);

// Start communicating with daemons to learn statistics etc
int r = server.init(monc->get_global_id(), client_messenger->get_myaddrs());
if (r < 0) {
derr << "Initialize server fail: " << cpp_strerror(r) << dendl;
// This is typically due to a bind() failure, so let's let
// systemd restart us.
exit(1);
}
dout(4) << "Initialized server at " << server.get_myaddrs() << dendl;

// Preload all daemon metadata (will subsequently keep this
// up to date by watching maps, so do the initial load before
// we subscribe to any maps)
dout(4) << "Loading daemon metadata..." << dendl;
load_all_metadata();

// subscribe to all the maps
monc->sub_want("log-info", 0, 0);
monc->sub_want("mgrdigest", 0, 0);
Expand All @@ -281,9 +265,32 @@ void Mgr::init()

// Start Objecter and wait for OSD map
lock.unlock(); // Drop lock because OSDMap dispatch calls into my ms_dispatch
objecter->wait_for_osd_map();
epoch_t e;
cluster_state.with_mgrmap([&e](const MgrMap& m) {
e = m.last_failure_osd_epoch;
});
/* wait for any blacklists to be applied to previous mgr instance */
dout(4) << "Waiting for new OSDMap (e=" << e
<< ") that may blacklist prior active." << dendl;
objecter->wait_for_osd_map(e);
lock.lock();

// Start communicating with daemons to learn statistics etc
int r = server.init(monc->get_global_id(), client_messenger->get_myaddrs());
if (r < 0) {
derr << "Initialize server fail: " << cpp_strerror(r) << dendl;
// This is typically due to a bind() failure, so let's let
// systemd restart us.
exit(1);
}
dout(4) << "Initialized server at " << server.get_myaddrs() << dendl;

// Preload all daemon metadata (will subsequently keep this
// up to date by watching maps, so do the initial load before
// we subscribe to any maps)
dout(4) << "Loading daemon metadata..." << dendl;
load_all_metadata();

// Populate PGs in ClusterState
cluster_state.with_osdmap_and_pgmap([this](const OSDMap &osd_map,
const PGMap& pg_map) {
Expand Down
9 changes: 8 additions & 1 deletion src/mon/MgrMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ class MgrMap
};

epoch_t epoch = 0;
epoch_t last_failure_osd_epoch = 0;

/// global_id of the ceph-mgr instance selected as a leader
uint64_t active_gid = 0;
Expand Down Expand Up @@ -255,6 +256,7 @@ class MgrMap
std::map<std::string, std::string> services;

epoch_t get_epoch() const { return epoch; }
epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; }
entity_addrvec_t get_active_addrs() const { return active_addrs; }
uint64_t get_active_gid() const { return active_gid; }
bool get_available() const { return available; }
Expand Down Expand Up @@ -379,7 +381,7 @@ class MgrMap
ENCODE_FINISH(bl);
return;
}
ENCODE_START(9, 6, bl);
ENCODE_START(10, 6, bl);
encode(epoch, bl);
encode(active_addrs, bl, features);
encode(active_gid, bl);
Expand All @@ -392,6 +394,7 @@ class MgrMap
encode(active_change, bl);
encode(always_on_modules, bl);
encode(active_mgr_features, bl);
encode(last_failure_osd_epoch, bl);
ENCODE_FINISH(bl);
return;
}
Expand Down Expand Up @@ -440,6 +443,9 @@ class MgrMap
if (struct_v >= 9) {
decode(active_mgr_features, p);
}
if (struct_v >= 10) {
decode(last_failure_osd_epoch, p);
}
DECODE_FINISH(p);
}

Expand Down Expand Up @@ -491,6 +497,7 @@ class MgrMap
}
f->close_section();
}
f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
f->close_section();
}

Expand Down
29 changes: 28 additions & 1 deletion src/mon/MgrMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,12 @@ bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
mon->clog->info() << "Active manager daemon " << m->get_name()
<< " restarted";
if (!mon->osdmon()->is_writeable()) {
dout(1) << __func__ << ": waiting for osdmon writeable to"
" blacklist old instance." << dendl;
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}
drop_active();
}

Expand Down Expand Up @@ -744,7 +750,8 @@ void MgrMonitor::tick()
}

if (pending_map.active_gid != 0
&& last_beacon.at(pending_map.active_gid) < cutoff) {
&& last_beacon.at(pending_map.active_gid) < cutoff
&& mon->osdmon()->is_writeable()) {
const std::string old_active_name = pending_map.active_name;
drop_active();
propose = true;
Expand Down Expand Up @@ -814,10 +821,21 @@ bool MgrMonitor::promote_standby()

void MgrMonitor::drop_active()
{
ceph_assert(mon->osdmon()->is_writeable());

if (last_beacon.count(pending_map.active_gid) > 0) {
last_beacon.erase(pending_map.active_gid);
}

ceph_assert(pending_map.active_gid > 0);
auto until = ceph_clock_now();
until += g_conf().get_val<double>("mon_mgr_blacklist_interval");
dout(5) << "blacklisting previous mgr." << pending_map.active_name << "."
<< pending_map.active_gid << " ("
<< pending_map.active_addrs << ")" << dendl;
auto blacklist_epoch = mon->osdmon()->blacklist(pending_map.active_addrs, until);
request_proposal(mon->osdmon());

pending_metadata_rm.insert(pending_map.active_name);
pending_metadata.erase(pending_map.active_name);
pending_map.active_name = "";
Expand All @@ -827,6 +845,7 @@ void MgrMonitor::drop_active()
pending_map.available = false;
pending_map.active_addrs = entity_addrvec_t();
pending_map.services.clear();
pending_map.last_failure_osd_epoch = blacklist_epoch;

// So that when new active mgr subscribes to mgrdigest, it will
// get an immediate response instead of waiting for next timer
Expand Down Expand Up @@ -1023,6 +1042,10 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
if (!err.empty()) {
// Does not parse as a gid, treat it as a name
if (pending_map.active_name == who) {
if (!mon->osdmon()->is_writeable()) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}
drop_active();
changed = true;
} else {
Expand All @@ -1042,6 +1065,10 @@ bool MgrMonitor::prepare_command(MonOpRequestRef op)
}
} else {
if (pending_map.active_gid == gid) {
if (!mon->osdmon()->is_writeable()) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return false;
}
drop_active();
changed = true;
} else if (pending_map.standbys.count(gid) > 0) {
Expand Down
6 changes: 3 additions & 3 deletions src/osdc/Objecter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1911,10 +1911,10 @@ void Objecter::close_session(OSDSession *s)
logger->set(l_osdc_osd_sessions, osd_sessions.size());
}

void Objecter::wait_for_osd_map()
void Objecter::wait_for_osd_map(epoch_t e)
{
unique_lock l(rwlock);
if (osdmap->get_epoch()) {
if (osdmap->get_epoch() >= e) {
l.unlock();
return;
}
Expand All @@ -1925,7 +1925,7 @@ void Objecter::wait_for_osd_map()
bool done;
std::unique_lock mlock{lock};
C_SafeCond *context = new C_SafeCond(lock, cond, &done, NULL);
waiting_for_map[0].push_back(pair<Context*, int>(context, 0));
waiting_for_map[e].push_back(pair<Context*, int>(context, 0));
l.unlock();
cond.wait(mlock, [&done] { return done; });
}
Expand Down
2 changes: 1 addition & 1 deletion src/osdc/Objecter.h
Original file line number Diff line number Diff line change
Expand Up @@ -2185,7 +2185,7 @@ class Objecter : public md_config_obs_t, public Dispatcher {
void handle_osd_backoff(class MOSDBackoff *m);
void handle_watch_notify(class MWatchNotify *m);
void handle_osd_map(class MOSDMap *m);
void wait_for_osd_map();
void wait_for_osd_map(epoch_t e=0);

/**
* Get std::list of entities blacklisted since this was last called,
Expand Down

0 comments on commit f2986a4

Please sign in to comment.