Skip to content

Commit 3b95e6f

Browse files
committed
More detailed metrics for DeepScrubbing and print reports in a single line (#21888)
1 parent 5c2ef6f commit 3b95e6f

File tree

5 files changed

+141
-48
lines changed

5 files changed

+141
-48
lines changed

ydb/core/blobstorage/ut_blobstorage/lib/env.h

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -958,7 +958,32 @@ struct TEnvironmentSetup {
958958
}
959959
}
960960
return ctr;
961-
};
961+
}
962+
963+
template <class TCallback>
964+
ui64 AggregateVDiskCountersWithCallback(TString storagePool, ui32 nodesCount, ui32 groupSize, ui32 groupId,
965+
const std::vector<ui32>& pdiskLayout, TCallback callback) {
966+
ui64 ctr = 0;
967+
968+
for (ui32 nodeId = 1; nodeId <= nodesCount; ++nodeId) {
969+
auto* appData = Runtime->GetNode(nodeId)->AppData.get();
970+
for (ui32 i = 0; i < groupSize; ++i) {
971+
TStringStream ss;
972+
ss << LeftPad(i, 2, '0');
973+
TString orderNumber = ss.Str();
974+
ss.Clear();
975+
ss << LeftPad(pdiskLayout[i], 9, '0');
976+
TString pdisk = ss.Str();
977+
ctr += callback(GetServiceCounters(appData->Counters, "vdisks")->
978+
GetSubgroup("storagePool", storagePool)->
979+
GetSubgroup("group", std::to_string(groupId))->
980+
GetSubgroup("orderNumber", orderNumber)->
981+
GetSubgroup("pdisk", pdisk)->
982+
GetSubgroup("media", "rot"));
983+
}
984+
}
985+
return ctr;
986+
}
962987

963988
void SetIcbControl(ui32 nodeId, TString controlName, ui64 value) {
964989
if (nodeId == 0) {

ydb/core/blobstorage/ut_blobstorage/scrub.cpp

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,25 @@ Y_UNIT_TEST_SUITE(DeepScrubbing) {
433433
, PartCorruptionMask(partCorruptionMask)
434434
{}
435435

436+
struct TAggregateScrubMetrics {
437+
TAggregateScrubMetrics(TString counterName, bool isHuge, TErasureType::EErasureSpecies erasure)
438+
: CounterName(counterName)
439+
, IsHuge(isHuge)
440+
, Erasure(erasure)
441+
{}
442+
443+
ui64 operator()(TIntrusivePtr<NMonitoring::TDynamicCounters> counters) const {
444+
return counters->GetSubgroup("subsystem", "deepScrubbing")
445+
->GetSubgroup("blobSize", IsHuge ? "huge" : "small")
446+
->GetSubgroup("erasure", TErasureType::ErasureSpeciesName(Erasure))
447+
->GetCounter(CounterName, true)->Val();
448+
}
449+
450+
TString CounterName;
451+
bool IsHuge;
452+
TErasureType::EErasureSpecies Erasure;
453+
};
454+
436455
void RunTest() {
437456
Initialize();
438457
AllocateEdgeActor(true);
@@ -506,19 +525,16 @@ Y_UNIT_TEST_SUITE(DeepScrubbing) {
506525
}
507526
}
508527

528+
bool isHuge = (BlobSize == EBlobSize::Val_HugeBlob);
529+
509530
std::vector<ui32> pdiskLayout = MakePDiskLayout(BaseConfig, groupInfo->GetTopology(), GroupId);
510531

511532
ui64 blobsScrubbed =
512-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
513-
GroupId, pdiskLayout, "deepScrubbing", "SmallBlobsChecked", false) +
514-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
515-
GroupId, pdiskLayout, "deepScrubbing", "HugeBlobsChecked", false);
516-
533+
Env->AggregateVDiskCountersWithCallback(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
534+
GroupId, pdiskLayout, TAggregateScrubMetrics("BlobsChecked", isHuge, Erasure.GetErasure()));
517535
ui64 dataIssues =
518-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
519-
GroupId, pdiskLayout, "deepScrubbing", "DataIssuesSmallBlobs", false) +
520-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
521-
GroupId, pdiskLayout, "deepScrubbing", "DataIssuesHugeBlobs", false);
536+
Env->AggregateVDiskCountersWithCallback(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
537+
GroupId, pdiskLayout, TAggregateScrubMetrics("DataIssues", isHuge, Erasure.GetErasure()));
522538

523539
UNIT_ASSERT_VALUES_UNEQUAL_C(blobsScrubbed, 0, makePrefix());
524540
UNIT_ASSERT_VALUES_UNEQUAL_C(dataIssues, 0, makePrefix()

ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -676,33 +676,60 @@ public:
676676
COUNTER_DEF(BlobsFixed);
677677
};
678678

679+
///////////////////////////////////////////////////////////////////////////////////
680+
// TDeepScrubbingGroup
681+
///////////////////////////////////////////////////////////////////////////////////
679682
// Counter set for the deep scrubbing subsystem. Each COUNTER_INIT below is paired
// with a COUNTER_DEF of the same name further down.
// NOTE(review): the second COUNTER_INIT argument is presumably the "derivative" flag
// of the underlying counter — confirm against the macro definition.
class TDeepScrubbingGroup : public TBase {
680683
public:
681684
GROUP_CONSTRUCTOR(TDeepScrubbingGroup)
682685
{
683-
COUNTER_INIT(SmallBlobsChecked, true);
684-
COUNTER_INIT(HugeBlobsChecked, true);
685-
COUNTER_INIT(CheckIntegritySuccesses, false);
686-
COUNTER_INIT(CheckIntegrityErrors, false);
687-
688-
COUNTER_INIT(PlacementIssuesSmallBlobs, false);
689-
COUNTER_INIT(DataIssuesSmallBlobs, false);
690-
691-
COUNTER_INIT(PlacementIssuesHugeBlobs, false);
692-
COUNTER_INIT(DataIssuesHugeBlobs, false);
686+
// Blob size is no longer encoded in the counter names.
COUNTER_INIT(BlobsChecked, true);
687+
COUNTER_INIT(CheckIntegritySuccesses, true);
688+
COUNTER_INIT(CheckIntegrityErrors, true);
689+
COUNTER_INIT(UnknownDataStatus, true);
690+
COUNTER_INIT(UnknownPlacementStatus, true);
691+
COUNTER_INIT(DataIssues, true);
692+
COUNTER_INIT(PlacementIssues, true);
693693
}
694694

695-
COUNTER_DEF(SmallBlobsChecked);
696-
COUNTER_DEF(HugeBlobsChecked);
697-
695+
COUNTER_DEF(BlobsChecked);
698696
COUNTER_DEF(CheckIntegritySuccesses);
699697
COUNTER_DEF(CheckIntegrityErrors);
698+
COUNTER_DEF(UnknownDataStatus);
699+
COUNTER_DEF(UnknownPlacementStatus);
700+
COUNTER_DEF(DataIssues);
701+
COUNTER_DEF(PlacementIssues);
702+
};
700703

701-
COUNTER_DEF(PlacementIssuesSmallBlobs);
702-
COUNTER_DEF(DataIssuesSmallBlobs);
704+
class TDeepScrubbingSubgroups {
705+
public:
706+
TDeepScrubbingSubgroups(TIntrusivePtr<NMonitoring::TDynamicCounters> counters) {
707+
for (bool isHuge : {true, false}) {
708+
for (TErasureType::EErasureSpecies erasure :
709+
{TErasureType::ErasureNone, TErasureType::Erasure4Plus2Block,
710+
TErasureType::ErasureMirror3of4, TErasureType::ErasureMirror3dc}) {
711+
::NMonitoring::TDynamicCounterPtr subgroup = counters
712+
->GetSubgroup("blobSize", isHuge ? "huge" : "small")
713+
->GetSubgroup("erasure", TErasureType::ErasureSpeciesName(erasure));
714+
Subgroups.insert({GetKey(isHuge, erasure), TDeepScrubbingGroup(subgroup)});
715+
}
716+
}
717+
}
718+
719+
TDeepScrubbingGroup* GetCounters(bool isHuge, TErasureType::EErasureSpecies erasure) {
720+
auto it = Subgroups.find(GetKey(isHuge, erasure));
721+
if (it == Subgroups.end()) {
722+
return nullptr;
723+
}
724+
return &it->second;
725+
}
703726

704-
COUNTER_DEF(PlacementIssuesHugeBlobs);
705-
COUNTER_DEF(DataIssuesHugeBlobs);
727+
private:
728+
std::unordered_map<ui64, TDeepScrubbingGroup> Subgroups;
729+
730+
ui64 GetKey(bool isHuge, TErasureType::EErasureSpecies erasure) {
731+
return ((ui64)isHuge << 32) + (ui64)erasure;
732+
}
706733
};
707734

708735
} // NMonGroup

ydb/core/blobstorage/vdisk/scrub/scrub_actor.cpp

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ namespace NKikimr {
1414
, LogPrefix(VCtx->VDiskLogPrefix)
1515
, Counters(VCtx->VDiskCounters->GetSubgroup("subsystem", "scrub"))
1616
, MonGroup(Counters)
17-
, DeepScrubbingGroup(VCtx->VDiskCounters->GetSubgroup("subsystem", "deepScrubbing"))
17+
, DeepScrubbingSubgroups(VCtx->VDiskCounters->GetSubgroup("subsystem", "deepScrubbing"))
1818
, Arena(&TScrubCoroImpl::AllocateRopeArenaChunk)
1919
, ScrubEntrypoint(std::move(scrubEntrypoint))
2020
, ScrubEntrypointLsn(scrubEntrypointLsn)
@@ -239,38 +239,63 @@ namespace NKikimr {
239239

240240
// Sends a TEvCheckIntegrity request for blobId to the BS proxy of this VDisk's group,
// waits for the TEvCheckIntegrityResult reply and records the outcome in the deep
// scrubbing counters. isHuge only selects which counter set is updated.
void TScrubCoroImpl::CheckIntegrity(const TLogoBlobID& blobId, bool isHuge) {
241241
SendToBSProxy(SelfActorId, Info->GroupID, new TEvBlobStorage::TEvCheckIntegrity(blobId, TInstant::Max(),
242-
NKikimrBlobStorage::EGetHandleClass::LowRead));
242+
// NOTE(review): meaning of the trailing `true` is not visible here — confirm against
// the TEvCheckIntegrity constructor.
NKikimrBlobStorage::EGetHandleClass::LowRead, true));
243243
auto res = WaitForPDiskEvent<TEvBlobStorage::TEvCheckIntegrityResult>();
244244

245-
if (isHuge) {
246-
++DeepScrubbingGroup.HugeBlobsChecked();
247-
} else {
248-
++DeepScrubbingGroup.SmallBlobsChecked();
245+
TErasureType::EErasureSpecies erasure = Info->Type.GetErasure();
246+
247+
// GetCounters() may return nullptr, so every counter update below is guarded.
NMonGroup::TDeepScrubbingGroup* counters = DeepScrubbingSubgroups.GetCounters(isHuge, erasure);
248+
if (counters) {
249+
++counters->BlobsChecked();
249250
}
250251

251252
// A non-OK status means the request itself failed; placement and data status are not examined.
if (res->Get()->Status != NKikimrProto::OK) {
252253
STLOGX(GetActorContext(), PRI_WARN, BS_VDISK_SCRUB, VDS97, VDISKP(LogPrefix, "TEvCheckIntegrity request failed"),
253254
(BlobId, blobId), (ErrorReason, res->Get()->ErrorReason));
254-
++DeepScrubbingGroup.CheckIntegrityErrors();
255+
if (counters) {
256+
++counters->CheckIntegrityErrors();
257+
}
255258
} else {
256-
++DeepScrubbingGroup.CheckIntegritySuccesses();
257-
if (res->Get()->PlacementStatus != TEvBlobStorage::TEvCheckIntegrityResult::PS_OK) {
259+
if (counters) {
260+
++counters->CheckIntegritySuccesses();
261+
}
262+
263+
// Placement: unknown / replication-in-progress are only counted; lost / recoverable
// are logged at PRI_CRIT and counted as placement issues.
switch (res->Get()->PlacementStatus) {
264+
case TEvBlobStorage::TEvCheckIntegrityResult::PS_UNKNOWN:
265+
case TEvBlobStorage::TEvCheckIntegrityResult::PS_REPLICATION_IN_PROGRESS:
266+
if (counters) {
267+
++counters->UnknownPlacementStatus();
268+
}
269+
break;
270+
case TEvBlobStorage::TEvCheckIntegrityResult::PS_BLOB_IS_LOST:
271+
case TEvBlobStorage::TEvCheckIntegrityResult::PS_BLOB_IS_RECOVERABLE:
258272
STLOGX(GetActorContext(), PRI_CRIT, BS_VDISK_SCRUB, VDS98, VDISKP(LogPrefix, "TEvCheckIntegrity discovered placement issue"),
259-
(BlobId, blobId), (CheckIntegrityResult, res->Get()->ToString()));
260-
if (isHuge) {
261-
++DeepScrubbingGroup.PlacementIssuesHugeBlobs();
262-
} else {
263-
++DeepScrubbingGroup.PlacementIssuesSmallBlobs();
273+
(BlobId, blobId), (Erasure, TErasureType::ErasureSpeciesName(erasure)), (CheckIntegrityResult, res->Get()->ToString()));
274+
if (counters) {
275+
++counters->PlacementIssues();
264276
}
277+
break;
278+
case TEvBlobStorage::TEvCheckIntegrityResult::PS_OK:
279+
default:
280+
break; // nothing to do
265281
}
266-
if (res->Get()->DataStatus != TEvBlobStorage::TEvCheckIntegrityResult::DS_OK) {
282+
283+
// Data: unknown is only counted; DS_ERROR is logged at PRI_CRIT and counted as a data issue.
switch (res->Get()->DataStatus) {
284+
case TEvBlobStorage::TEvCheckIntegrityResult::DS_UNKNOWN:
285+
if (counters) {
286+
++counters->UnknownDataStatus();
287+
}
288+
break;
289+
case TEvBlobStorage::TEvCheckIntegrityResult::DS_ERROR:
267290
STLOGX(GetActorContext(), PRI_CRIT, BS_VDISK_SCRUB, VDS99, VDISKP(LogPrefix, "TEvCheckIntegrity discovered data issue"),
268-
(BlobId, blobId), (CheckIntegrityResult, res->Get()->ToString()));
269-
if (isHuge) {
270-
++DeepScrubbingGroup.DataIssuesHugeBlobs();
271-
} else {
272-
++DeepScrubbingGroup.DataIssuesSmallBlobs();
291+
(BlobId, blobId), (Erasure, TErasureType::ErasureSpeciesName(erasure)), (CheckIntegrityResult, res->Get()->ToString()));
292+
if (counters) {
293+
++counters->DataIssues();
273294
}
295+
break;
296+
case TEvBlobStorage::TEvCheckIntegrityResult::DS_OK:
297+
default:
298+
break; // nothing to do
274299
}
275300
}
276301
}

ydb/core/blobstorage/vdisk/scrub/scrub_actor_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ namespace NKikimr {
2525

2626
::NMonitoring::TDynamicCounterPtr Counters;
2727
NMonGroup::TScrubGroup MonGroup;
28-
NMonGroup::TDeepScrubbingGroup DeepScrubbingGroup;
28+
NMonGroup::TDeepScrubbingSubgroups DeepScrubbingSubgroups;
2929

3030
TRopeArena Arena;
3131

0 commit comments

Comments
 (0)