@@ -43,14 +43,44 @@ namespace NSentinel {
4343
4444/// TPDiskStatusComputer
4545
// Constructs the status computer from caller-supplied limits.
//   defaultStateLimit - stored in DefaultStateLimit.
//   goodStateLimit    - stored in GoodStateLimit (parameter added by this change); Compute()
//                       compares StateCounter against it before reporting ACTIVE.
//   stateLimits       - stored in StateLimits.
// StateCounter starts at 0; AddState() treats a zero counter as "no state recorded yet".
// NOTE(review): the limits are taken by const reference - presumably the members are
// references as well, so the referenced config must outlive this object; confirm in the header.
// NOTE(review): HadBadStateRecently is not initialised here - presumably it has an in-class
// initialiser; confirm in the header.
46- TPDiskStatusComputer::TPDiskStatusComputer (const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
46+ TPDiskStatusComputer::TPDiskStatusComputer (const ui32& defaultStateLimit, const ui32& goodStateLimit, const TLimitsMap& stateLimits)
4747 : DefaultStateLimit(defaultStateLimit)
48+ , GoodStateLimit(goodStateLimit)
4849 , StateLimits(stateLimits)
4950 , StateCounter(0 )
5051{
5152}
5253
53- void TPDiskStatusComputer::AddState (EPDiskState state) {
54+ bool IsGoodState (EPDiskState state) {
55+ switch (state) {
56+ case NKikimrBlobStorage::TPDiskState::Unknown:
57+ return false ;
58+ case NKikimrBlobStorage::TPDiskState::Initial:
59+ case NKikimrBlobStorage::TPDiskState::InitialFormatRead:
60+ case NKikimrBlobStorage::TPDiskState::InitialSysLogRead:
61+ case NKikimrBlobStorage::TPDiskState::InitialCommonLogRead:
62+ case NKikimrBlobStorage::TPDiskState::Normal:
63+ return true ;
64+ case NKikimrBlobStorage::TPDiskState::InitialFormatReadError:
65+ case NKikimrBlobStorage::TPDiskState::InitialSysLogReadError:
66+ case NKikimrBlobStorage::TPDiskState::InitialSysLogParseError:
67+ case NKikimrBlobStorage::TPDiskState::InitialCommonLogReadError:
68+ case NKikimrBlobStorage::TPDiskState::InitialCommonLogParseError:
69+ case NKikimrBlobStorage::TPDiskState::CommonLoggerInitError:
70+ case NKikimrBlobStorage::TPDiskState::OpenFileError:
71+ case NKikimrBlobStorage::TPDiskState::ChunkQuotaError:
72+ case NKikimrBlobStorage::TPDiskState::DeviceIoError:
73+ case NKikimrBlobStorage::TPDiskState::Reserved14:
74+ case NKikimrBlobStorage::TPDiskState::Reserved15:
75+ case NKikimrBlobStorage::TPDiskState::Reserved16:
76+ case NKikimrBlobStorage::TPDiskState::Missing:
77+ case NKikimrBlobStorage::TPDiskState::Timeout:
78+ case NKikimrBlobStorage::TPDiskState::NodeDisconnected:
79+ return false ;
80+ }
81+ }
82+
83+ void TPDiskStatusComputer::AddState (EPDiskState state, bool isNodeLocked) {
5484 if (StateCounter && state == State) {
5585 if (StateCounter != Max<ui64>()) {
5686 ++StateCounter;
@@ -59,6 +89,12 @@ void TPDiskStatusComputer::AddState(EPDiskState state) {
5989 PrevState = std::exchange (State, state);
6090 StateCounter = 1 ;
6191 }
92+
93+ if (!isNodeLocked && !IsGoodState (state)) {
94+ // If node is not locked (i.e. it is not in maintenance mode),
95+ // then we should remember that we had a bad state recently
96+ HadBadStateRecently = true ;
97+ }
6298}
6399
64100EPDiskStatus TPDiskStatusComputer::Compute (EPDiskStatus current, TString& reason) const {
@@ -81,12 +117,18 @@ EPDiskStatus TPDiskStatusComputer::Compute(EPDiskStatus current, TString& reason
81117 << " State# " << State
82118 << " StateCounter# " << StateCounter
83119 << " current# " << current;
84- switch (PrevState) {
85- case NKikimrBlobStorage::TPDiskState::Unknown:
86- return current;
87- default :
88- return EPDiskStatus::INACTIVE;
120+
121+ if (PrevState == NKikimrBlobStorage::TPDiskState::Unknown) {
122+ return current;
89123 }
124+
125+ if (IsGoodState (PrevState) && State == NKikimrBlobStorage::TPDiskState::Normal) {
126+ if (!HadBadStateRecently && (StateCounter >= GoodStateLimit)) {
127+ return EPDiskStatus::ACTIVE;
128+ }
129+ }
130+
131+ return EPDiskStatus::INACTIVE;
90132 }
91133
92134 reason = TStringBuilder ()
@@ -99,6 +141,7 @@ EPDiskStatus TPDiskStatusComputer::Compute(EPDiskStatus current, TString& reason
99141
100142 switch (State) {
101143 case NKikimrBlobStorage::TPDiskState::Normal:
144+ HadBadStateRecently = false ;
102145 return EPDiskStatus::ACTIVE;
103146 default :
104147 return EPDiskStatus::FAULTY;
@@ -135,15 +178,15 @@ void TPDiskStatusComputer::ResetForcedStatus() {
135178
136179/// TPDiskStatus
137180
// Forwards defaultStateLimit, goodStateLimit and stateLimits to TPDiskStatusComputer,
// records initialStatus as Current and starts with ChangingAllowed set to true.
// goodStateLimit is the parameter introduced by this change; it is passed through unchanged.
138- TPDiskStatus::TPDiskStatus (EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
139- : TPDiskStatusComputer(defaultStateLimit, stateLimits)
181+ TPDiskStatus::TPDiskStatus (EPDiskStatus initialStatus, const ui32& defaultStateLimit, const ui32& goodStateLimit, const TLimitsMap& stateLimits)
182+ : TPDiskStatusComputer(defaultStateLimit, goodStateLimit, stateLimits)
140183 , Current(initialStatus)
141184 , ChangingAllowed(true )
142185{
143186}
144187
// Thin forwarder: hands the observed state and the node-lock flag straight to
// TPDiskStatusComputer::AddState without any additional processing.
145- void TPDiskStatus::AddState (EPDiskState state) {
146- TPDiskStatusComputer::AddState (state);
188+ void TPDiskStatus::AddState (EPDiskState state, bool isNodeLocked ) {
189+ TPDiskStatusComputer::AddState (state, isNodeLocked );
147190}
148191
149192bool TPDiskStatus::IsChanged () const {
@@ -198,15 +241,15 @@ void TPDiskStatus::DisallowChanging() {
198241
199242/// TPDiskInfo
200243
// Forwards initialStatus and the limits (including the new goodStateLimit) to TPDiskStatus,
// also records initialStatus as ActualStatus, and finishes by calling Touch().
201- TPDiskInfo::TPDiskInfo (EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
202- : TPDiskStatus(initialStatus, defaultStateLimit, stateLimits)
244+ TPDiskInfo::TPDiskInfo (EPDiskStatus initialStatus, const ui32& defaultStateLimit, const ui32& goodStateLimit, const TLimitsMap& stateLimits)
245+ : TPDiskStatus(initialStatus, defaultStateLimit, goodStateLimit, stateLimits)
203246 , ActualStatus(initialStatus)
204247{
205248 Touch ();
206249}
207250
// Records the observed state (together with the node-lock flag) via TPDiskStatus::AddState
// and then calls Touch().
// NOTE(review): Touch() presumably sets the flag that IsTouched() reports, which
// MarkNodePDisks() checks before adding a state - confirm against the Touch() definition.
208- void TPDiskInfo::AddState (EPDiskState state) {
209- TPDiskStatus::AddState (state);
251+ void TPDiskInfo::AddState (EPDiskState state, bool isNodeLocked ) {
252+ TPDiskStatus::AddState (state, isNodeLocked );
210253 Touch ();
211254}
212255
@@ -476,7 +519,7 @@ class TConfigUpdater: public TUpdaterBase<TEvSentinel::TEvConfigUpdated, TConfig
476519 continue ;
477520 }
478521
479- pdisks.emplace (id, new TPDiskInfo (pdisk.GetDriveStatus (), Config.DefaultStateLimit , Config.StateLimits ));
522+ pdisks.emplace (id, new TPDiskInfo (pdisk.GetDriveStatus (), Config.DefaultStateLimit , Config.GoodStateLimit , Config. StateLimits ));
480523 }
481524
482525 SentinelState->ConfigUpdaterState .GotBSCResponse = true ;
@@ -570,16 +613,31 @@ class TStateUpdater: public TUpdaterBase<TEvSentinel::TEvStateUpdated, TStateUpd
570613 Reply ();
571614 }
572615
616+ bool IsNodeLocked (ui32 nodeId) const {
617+ const auto & clusterInfo = CmsState->ClusterInfo ;
618+
619+ if (clusterInfo && clusterInfo->HasNode (nodeId)) {
620+ const auto & node = clusterInfo->Node (nodeId);
621+ TErrorInfo unused;
622+ if (node.IsLocked (unused, TDuration::Zero (), TInstant::Zero (), TDuration::Zero ())) {
623+ return true ;
624+ }
625+ }
626+
627+ return false ;
628+ }
629+
// Adds `state` to every PDisk of node `nodeId` found in SentinelState->PDisks.
//   nodeId      - node whose PDisks are visited.
//   state       - state passed to AddState() for each visited PDisk.
//   skipTouched - when true, PDisks for which IsTouched() is already true are left alone;
//                 when false, meeting such a PDisk trips Y_ABORT_UNLESS below.
// The node-lock flag is looked up once, before the loop, and passed to each AddState() call.
573630 void MarkNodePDisks (ui32 nodeId, EPDiskState state, bool skipTouched = false ) {
631+ bool isNodeLocked = IsNodeLocked (nodeId);
// Start at the first entry whose key is not less than (nodeId, 0) and walk forward while
// entries still belong to this node.
// NOTE(review): this relies on TPDiskID ordering by NodeId first - confirm.
574632 auto it = SentinelState->PDisks .lower_bound (TPDiskID (nodeId, 0 ));
575633 while (it != SentinelState->PDisks .end () && it->first .NodeId == nodeId) {
576634 if (skipTouched && it->second ->IsTouched ()) {
577635 ++it;
578636 continue ;
579637 }
580-
638+
// A PDisk reaching this point must not have been touched yet.
581639 Y_ABORT_UNLESS (!it->second ->IsTouched ());
582- it->second ->AddState (state);
640+ it->second ->AddState (state, isNodeLocked );
583641 ++it;
584642 }
585643 }
@@ -613,6 +671,7 @@ class TStateUpdater: public TUpdaterBase<TEvSentinel::TEvStateUpdated, TStateUpd
613671 << " : nodeId# " << nodeId);
614672 MarkNodePDisks (nodeId, NKikimrBlobStorage::TPDiskState::Missing);
615673 } else {
674+ const bool isNodeLocked = IsNodeLocked (nodeId);
616675 for (const auto & info : record.GetPDiskStateInfo ()) {
617676 auto it = SentinelState->PDisks .find (TPDiskID (nodeId, info.GetPDiskId ()));
618677 if (it == SentinelState->PDisks .end ()) {
@@ -625,7 +684,7 @@ class TStateUpdater: public TUpdaterBase<TEvSentinel::TEvStateUpdated, TStateUpd
625684 << " , original# " << (ui32)info.GetState ()
626685 << " , safeState# " << safeState);
627686
628- it->second ->AddState (safeState);
687+ it->second ->AddState (safeState, isNodeLocked );
629688 }
630689
631690 MarkNodePDisks (nodeId, NKikimrBlobStorage::TPDiskState::Missing, true );
0 commit comments