Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/groups/mqb/mqbc/mqbc_clusterstatemanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1144,7 +1144,7 @@ void ClusterStateManager::onWatchDogDispatched()
}

BALL_LOG_WARN << d_clusterData_p->identity().description()
<< ": Watch dog triggered because node startup healing "
<< ": Watchdog triggered because node startup healing "
<< "sequence was not completed in the configured time of "
<< d_watchDogTimeoutInterval.totalSeconds() << " seconds.";

Expand Down
24 changes: 12 additions & 12 deletions src/groups/mqb/mqbc/mqbc_clusterstatemanager.t.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2425,19 +2425,19 @@ static void test23_selectLeaderFromFollower()

static void test24_watchDogLeader()
// ------------------------------------------------------------------------
// WATCH DOG LEADER
// WATCHDOG LEADER
//
// Concerns:
// Verify that the watchdog triggers upon timeout when the leader is
// healing.
//
// Testing:
// Watch dog for healing leader
// Watchdog for healing leader
// ------------------------------------------------------------------------
// ------------------------------------------------------------------------
{
bmqtst::TestHelper::printTestName("CLUSTER STATE MANAGER - "
"WATCH DOG LEADER");
"WATCHDOG LEADER");

Tester tester(true); // isLeader
BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
Expand All @@ -2456,11 +2456,11 @@ static void test24_watchDogLeader()
tester.verifyFollowerLSNRequestsSent();
tester.clearChannels();

// 1.b.) Trigger watch dog via timeout
// 1.b.) Trigger watchdog via timeout
tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
tester.d_cluster_mp->waitForScheduler();

// Verify that the watch dog triggers re-transition to Leader Healing
// Verify that the watchdog triggers re-transition to Leader Healing
// Stage 1, where we send follower LSN requests again.
BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
mqbc::ClusterStateTableState::e_LDR_HEALING_STG1);
Expand Down Expand Up @@ -2494,11 +2494,11 @@ static void test24_watchDogLeader()
BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
mqbc::ClusterStateTableState::e_LDR_HEALING_STG2);

// 2.b.) Trigger watch dog via timeout
// 2.b.) Trigger watchdog via timeout
tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
tester.d_cluster_mp->waitForScheduler();

// Verify that the watch dog triggers re-transition to Leader Healing
// Verify that the watchdog triggers re-transition to Leader Healing
// Stage 1, where we send follower LSN requests again.
BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
mqbc::ClusterStateTableState::e_LDR_HEALING_STG1);
Expand All @@ -2520,30 +2520,30 @@ static void test24_watchDogLeader()
BSLS_ASSERT_OPT(tester.d_clusterStateManager_mp->healthState() ==
mqbc::ClusterStateTableState::e_LDR_HEALED);

// 3.b.) Attempt to trigger watch dog via timeout, but should fail
// 3.b.) Attempt to trigger watchdog via timeout, but should fail
tester.d_cluster_mp->advanceTime(k_WATCHDOG_TIMEOUT_DURATION);
tester.d_cluster_mp->waitForScheduler();

// Verify that watch dog did not trigger
// Verify that watchdog did not trigger
BMQTST_ASSERT_EQ(tester.d_clusterStateManager_mp->healthState(),
mqbc::ClusterStateTableState::e_LDR_HEALED);
}

static void test25_watchDogFollower()
// ------------------------------------------------------------------------
// WATCH DOG FOLLOWER
// WATCHDOG FOLLOWER
//
// Concerns:
// Verify that the watchdog triggers upon timeout when the follower is
// healing.
//
// Testing:
// Watch dog for healing follower
// Watchdog for healing follower
// ------------------------------------------------------------------------
// ------------------------------------------------------------------------
{
bmqtst::TestHelper::printTestName("CLUSTER STATE MANAGER - "
"WATCH DOG FOLLOWER");
"WATCHDOG FOLLOWER");

Tester tester(false); // isLeader

Expand Down
14 changes: 3 additions & 11 deletions src/groups/mqb/mqbc/mqbc_storagemanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ void StorageManager::onWatchDogDispatched(int partitionId)
BMQTSK_ALARMLOG_ALARM("RECOVERY")
<< d_clusterData_p->identity().description() << " Partition ["
<< partitionId
<< "]: " << "Watch dog triggered because partition startup healing "
<< "]: " << "Watchdog triggered because partition startup healing "
<< "sequence was not completed in the configured time of "
<< d_watchDogTimeoutInterval.totalSeconds() << " seconds."
<< BMQTSK_ALARMLOG_END;
Expand Down Expand Up @@ -1154,14 +1154,8 @@ void StorageManager::do_startWatchDog(const PartitionFSMArgsSp& args)

const int partitionId = eventDataVec[0].partitionId();

if (static_cast<const bdlmt::EventSchedulerEventHandle::Event*>(
d_watchDogEventHandles[partitionId]) != 0) {
BALL_LOG_WARN << d_clusterData_p->identity().description()
<< " Partition [" << partitionId << "]: "
<< "Not starting watchdog since it has already been "
<< "started.";
return; // RETURN
}
// Clear any existing watchdog before starting the timer anew.
d_watchDogEventHandles[partitionId].release();

d_clusterData_p->scheduler().scheduleEvent(
&d_watchDogEventHandles[partitionId],
Expand Down Expand Up @@ -1191,8 +1185,6 @@ void StorageManager::do_stopWatchDog(const PartitionFSMArgsSp& args)
<< " Partition [" << partitionId << "]: "
<< "Failed to cancel WatchDog, rc: " << rc;
}

d_watchDogEventHandles[partitionId].release();
}

void StorageManager::do_openRecoveryFileSet(const PartitionFSMArgsSp& args)
Expand Down
8 changes: 4 additions & 4 deletions src/groups/mqb/mqbc/mqbc_storagemanager.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,14 @@ class StorageManager BSLS_KEYWORD_FINAL
/// Whether this StorageMgr has started.
bsls::AtomicBool d_isStarted;

/// List of event handles for the watch dog, indexed by partitionId.
/// List of event handles for the watchdog, indexed by partitionId.
///
/// THREAD: Except during the ctor, the i-th index of this data member
/// **must** be accessed in the associated Queue dispatcher thread
/// for the i-th partitionId.
EventHandles d_watchDogEventHandles;

/// Timeout interval for the watch dog.
/// Timeout interval for the watchdog.
const bsls::TimeInterval d_watchDogTimeoutInterval;

/// Flag to denote if a low disk space warning was issued. This flag is
Expand Down Expand Up @@ -403,13 +403,13 @@ class StorageManager BSLS_KEYWORD_FINAL
void recoveredQueuesCb(int partitionId,
const QueueKeyInfoMap& queueKeyInfoMap);

/// Process the watch dog trigger event for the specified `partitionId`,
/// Process the watchdog trigger event for the specified `partitionId`,
/// indicating unhealthiness in the Partition FSM.
///
/// THREAD: Executed by the scheduler thread.
void onWatchDog(int partitionId);

/// Process the watch dog trigger event for the specified `partitionId`,
/// Process the watchdog trigger event for the specified `partitionId`,
/// indicating unhealthiness in the Partition FSM.
///
/// THREAD: This method is invoked in the associated cluster's
Expand Down
Loading