Skip to content

Commit dd8f422

Browse files
serbel324 and blinkov
authored and committed
Restart VDisk instead of resetting internal queues when queue is stuck (#14815)
1 parent 3ba0b36 commit dd8f422

File tree

2 files changed

+26
-21
lines changed

2 files changed

+26
-21
lines changed

ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -761,5 +761,18 @@ public:
761761
COUNTER_DEF(DroppingStuckInternalQueue);
762762
};
763763

764+
///////////////////////////////////////////////////////////////////////////////////
765+
// TTimerGroup
766+
///////////////////////////////////////////////////////////////////////////////////
767+
class TTimerGroup : public TBase {
768+
public:
769+
GROUP_CONSTRUCTOR(TTimerGroup)
770+
{
771+
COUNTER_INIT(SkeletonFrontUptimeSeconds, false);
772+
}
773+
774+
COUNTER_DEF(SkeletonFrontUptimeSeconds);
775+
};
776+
764777
} // NMonGroup
765778
} // NKikimr

ydb/core/blobstorage/vdisk/skeleton/blobstorage_skeletonfront.cpp

Lines changed: 13 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -439,20 +439,6 @@ namespace NKikimr {
439439
return InFlightCount > 0 && TActivationContext::Monotonic() - LastUpdate > StuckQueueThreshold;
440440
}
441441

442-
void ResetQueue() {
443-
InFlightCount = 0;
444-
InFlightCost = 0;
445-
InFlightBytes = 0;
446-
447-
*SkeletonFrontInFlightCount = 0;
448-
*SkeletonFrontInFlightCost = 0;
449-
*SkeletonFrontInFlightBytes = 0;
450-
*SkeletonFrontCostProcessed = 0;
451-
452-
Msgs.clear();
453-
UpdateState();
454-
}
455-
456442
TString GenerateHtmlState() const {
457443
// NOTE: warning policy:
458444
// 1. For InFlightCount and InFlightCost we output them in yellow, if
@@ -701,13 +687,15 @@ namespace NKikimr {
701687
NMonGroup::TSyncerGroup SyncerMonGroup;
702688
NMonGroup::TVDiskStateGroup VDiskMonGroup;
703689
NMonGroup::TCostGroup CostGroup;
704-
NMonGroup::TMalfunctionGroup MalfunctionGroup;
690+
NMonGroup::TTimerGroup TimerGroup;
705691
TVDiskIncarnationGuid VDiskIncarnationGuid;
706692
bool HasUnreadableBlobs = false;
707693
TInstant LastSanitizeTime = TInstant::Zero();
708694
TInstant LastSanitizeWithErrorTime = TInstant::Zero();
709695
ui64 NextUniqueMessageId = 1;
710696

697+
TMonotonic StartTimestamp = TMonotonic::Zero();
698+
711699
static constexpr TDuration StuckQueueCheckPeriod = TDuration::Seconds(60);
712700

713701
ui64 AllocateMessageId() {
@@ -812,6 +800,8 @@ namespace NKikimr {
812800
ActiveActors.Insert(SkeletonId, __FILE__, __LINE__, ctx, NKikimrServices::BLOBSTORAGE);
813801

814802
SetupMonitoring(ctx);
803+
StartTimestamp = TActivationContext::Monotonic();
804+
TimerGroup.SkeletonFrontUptimeSeconds() = 0;
815805
Become(&TThis::StateLocalRecoveryInProgress);
816806
}
817807

@@ -2077,18 +2067,20 @@ namespace NKikimr {
20772067
}
20782068

20792069
void HandleWakeup(const TActorContext& ctx) {
2070+
TMonotonic now = TActivationContext::Monotonic();
2071+
TimerGroup.SkeletonFrontUptimeSeconds() = (now - StartTimestamp).Seconds();
20802072
for (TIntQueueClass* queue : { IntQueueAsyncGets.get(), IntQueueFastGets.get(),
20812073
IntQueueDiscover.get(), IntQueueLowGets.get(), IntQueueLogPuts.get(),
20822074
IntQueueHugePutsForeground.get(), IntQueueHugePutsBackground.get() }) {
20832075
if (queue->IsStuck()) {
2084-
queue->DropWithError(ctx, *this);
2085-
queue->ResetQueue();
2086-
DisconnectClients(ctx);
20872076
LOG_CRIT_S(ctx, NKikimrServices::BS_SKELETON, VCtx->VDiskLogPrefix
2088-
<< "Stuck internal queue detected, dropping queues, "
2077+
<< "Stuck internal queue detected, restarting VDisk, "
20892078
<< " Queue.Name# " << queue->Name
20902079
<< " Marker# BSVSF08");
2091-
++MalfunctionGroup.DroppingStuckInternalQueue();
2080+
TActorId wardenId = MakeBlobStorageNodeWardenID(SelfId().NodeId());
2081+
ctx.Send(wardenId, new TEvBlobStorage::TEvAskRestartVDisk(
2082+
Config->BaseInfo.PDiskId, SelfVDiskId));
2083+
return;
20922084
}
20932085
}
20942086
Schedule(StuckQueueCheckPeriod, new TEvents::TEvWakeup);
@@ -2266,7 +2258,7 @@ namespace NKikimr {
22662258
, SyncerMonGroup(VDiskCounters, "subsystem", "syncer")
22672259
, VDiskMonGroup(VDiskCounters, "subsystem", "state")
22682260
, CostGroup(VDiskCounters, "subsystem", "cost")
2269-
, MalfunctionGroup(VDiskCounters, "subsystem", "malfunction")
2261+
, TimerGroup(VDiskCounters, "subsystem", "timer")
22702262
{
22712263
ReplMonGroup.ReplUnreplicatedVDisks() = 1;
22722264
VDiskMonGroup.VDiskState(NKikimrWhiteboard::EVDiskState::Initial);

0 commit comments

Comments
 (0)