Skip to content

Commit 36a3f78

Browse files
authored
Improve DeepScrubbing output (#21946)
2 parents ea806b6 + 3b95e6f commit 36a3f78

File tree

9 files changed

+200
-68
lines changed

9 files changed

+200
-68
lines changed

ydb/core/base/blobstorage.h

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,17 +1322,20 @@ struct TEvBlobStorage {
13221322
TLogoBlobID Id;
13231323
TInstant Deadline;
13241324
NKikimrBlobStorage::EGetHandleClass GetHandleClass;
1325+
bool SingleLine; // Print DataInfo in single line
13251326

13261327
ui32 RestartCounter = 0;
13271328
std::shared_ptr<TExecutionRelay> ExecutionRelay;
13281329

13291330
TEvCheckIntegrity(
13301331
const TLogoBlobID& id,
13311332
TInstant deadline,
1332-
NKikimrBlobStorage::EGetHandleClass getHandleClass)
1333+
NKikimrBlobStorage::EGetHandleClass getHandleClass,
1334+
bool singleLine = false)
13331335
: Id(id)
13341336
, Deadline(deadline)
13351337
, GetHandleClass(getHandleClass)
1338+
, SingleLine(singleLine)
13361339
{}
13371340

13381341
TString Print(bool /*isFull*/) const {
@@ -1373,13 +1376,44 @@ struct TEvBlobStorage {
13731376
PS_BLOB_IS_RECOVERABLE = 4, // blob parts are definitely placed incorrectly or there are missing parts but blob may be recovered
13741377
PS_BLOB_IS_LOST = 5, // blob is lost/unrecoverable
13751378
};
1376-
EPlacementStatus PlacementStatus = PS_OK;
1379+
1380+
// Returns the enumerator name of a placement status for use in textual output.
// Any value that does not match a known enumerator yields "BAD_PLACEMENT_STATUS".
static TString PlacementStatusToString(EPlacementStatus status) {
    const char* name = "BAD_PLACEMENT_STATUS";
    switch (status) {
        case PS_OK:
            name = "PS_OK";
            break;
        case PS_REPLICATION_IN_PROGRESS:
            name = "PS_REPLICATION_IN_PROGRESS";
            break;
        case PS_UNKNOWN:
            name = "PS_UNKNOWN";
            break;
        case PS_BLOB_IS_RECOVERABLE:
            name = "PS_BLOB_IS_RECOVERABLE";
            break;
        case PS_BLOB_IS_LOST:
            name = "PS_BLOB_IS_LOST";
            break;
        default:
            break; // keep the fallback name
    }
    return name;
}
13771396

13781397
enum EDataStatus {
13791398
DS_OK = 1, // all data parts contain valid data
13801399
DS_UNKNOWN = 2, // status is unknown because of missing disks or network problems
13811400
DS_ERROR = 3, // some parts definitely contain invalid data
13821401
};
1402+
1403+
// Returns the enumerator name of a data status for use in textual output.
// Any value that does not match a known enumerator yields "BAD_DATA_STATUS".
static TString DataStatusToString(EDataStatus status) {
    const char* name = "BAD_DATA_STATUS";
    switch (status) {
        case DS_OK:
            name = "DS_OK";
            break;
        case DS_UNKNOWN:
            name = "DS_UNKNOWN";
            break;
        case DS_ERROR:
            name = "DS_ERROR";
            break;
        default:
            break; // keep the fallback name
    }
    return name;
}
1415+
1416+
EPlacementStatus PlacementStatus = PS_OK;
13831417
EDataStatus DataStatus = DS_OK;
13841418
TString DataInfo; // textual info about checks in blob data
13851419

@@ -1395,8 +1429,8 @@ struct TEvBlobStorage {
13951429
<< " Id# " << Id
13961430
<< " Status# " << NKikimrProto::EReplyStatus_Name(Status)
13971431
<< " ErrorReason# " << ErrorReason
1398-
<< " PlacementStatus# " << (int)PlacementStatus
1399-
<< " DataStatus# " << (int)DataStatus
1432+
<< " PlacementStatus# " << PlacementStatusToString(PlacementStatus)
1433+
<< " DataStatus# " << DataStatusToString(DataStatus)
14001434
<< " DataInfo# " << DataInfo
14011435
<< " }";
14021436
return str.Str();

ydb/core/blobstorage/dsproxy/dsproxy_check_integrity_get.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class TBlobStorageGroupCheckIntegrityRequest
1414
const TLogoBlobID Id;
1515
const TInstant Deadline;
1616
const NKikimrBlobStorage::EGetHandleClass GetHandleClass;
17+
const bool SingleLine;
1718

1819
TGroupQuorumTracker QuorumTracker;
1920

@@ -162,8 +163,10 @@ class TBlobStorageGroupCheckIntegrityRequest
162163
}
163164
}
164165

166+
char separator = SingleLine ? ' ' : '\n';
167+
165168
const auto& dataChecker = Info->GetTopology().GetDataIntegrityChecker();
166-
auto partsState = dataChecker.GetDataState(Id, PartsData);
169+
auto partsState = dataChecker.GetDataState(Id, PartsData, separator);
167170

168171
if (partsState.IsOk) {
169172
PendingResult->DataStatus = (PendingResult->PlacementStatus == TEvCheckIntegrityResult::PS_UNKNOWN) ?
@@ -173,10 +176,10 @@ class TBlobStorageGroupCheckIntegrityRequest
173176
}
174177

175178
TStringStream str;
176-
str << "Disks:" << Endl;
179+
str << "Disks:" << separator;
177180
for (ui32 diskIdx = 0; diskIdx < Info->Type.BlobSubgroupSize(); ++diskIdx) {
178181
auto vDiskIdShort = Info->GetTopology().GetVDiskInSubgroup(diskIdx, Id.Hash());
179-
str << diskIdx << ": " << Info->CreateVDiskID(vDiskIdShort) << Endl;
182+
str << diskIdx << ": " << Info->CreateVDiskID(vDiskIdShort) << separator;
180183
}
181184

182185
PendingResult->DataInfo = str.Str();
@@ -199,6 +202,7 @@ class TBlobStorageGroupCheckIntegrityRequest
199202
, Id(params.Common.Event->Id)
200203
, Deadline(params.Common.Event->Deadline)
201204
, GetHandleClass(params.Common.Event->GetHandleClass)
205+
, SingleLine(params.Common.Event->SingleLine)
202206
, QuorumTracker(Info.Get())
203207
{}
204208

ydb/core/blobstorage/groupinfo/blobstorage_groupinfo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ class TBlobStorageGroupInfo : public TThrRefBase {
172172
TString DataInfo;
173173
};
174174

175-
virtual TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData) const = 0;
175+
virtual TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData, char separator) const = 0;
176176
};
177177

178178
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

ydb/core/blobstorage/groupinfo/blobstorage_groupinfo_data_check.h

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@ class TDataIntegrityCheckerTrivial : public TDataIntegrityCheckerBase {
1616
public:
1717
using TDataIntegrityCheckerBase::TDataIntegrityCheckerBase;
1818

19-
TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData) const override {
19+
// Trivial checker: ignores every argument and returns a default-constructed TPartsState.
TPartsState GetDataState(const TLogoBlobID& /*id*/, const TPartsData& /*partsData*/, char /*separator*/) const override {
    return TPartsState{};
}
2425
};
@@ -27,7 +28,7 @@ class TDataIntegrityCheckerBlock42 : public TDataIntegrityCheckerBase {
2728
public:
2829
using TDataIntegrityCheckerBase::TDataIntegrityCheckerBase;
2930

30-
TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData) const override {
31+
TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData, char separator) const override {
3132
Y_ABORT_UNLESS(partsData.Parts.size() == 6);
3233

3334
TPartsState partsState;
@@ -58,7 +59,7 @@ class TDataIntegrityCheckerBlock42 : public TDataIntegrityCheckerBase {
5859

5960
// checking layout
6061
TStringStream layoutReport;
61-
layoutReport << "Layout info:" << Endl;
62+
layoutReport << "Layout info:" << separator;
6263

6364
TStringStream str;
6465
bool hasUnequalParts = false;
@@ -80,19 +81,19 @@ class TDataIntegrityCheckerBlock42 : public TDataIntegrityCheckerBase {
8081
str << "]";
8182
++ver;
8283
}
83-
str << Endl;
84+
str << separator;
8485
}
8586

8687
layoutReport << str.Str();
8788
if (hasUnequalParts) {
8889
partsState.IsOk = false;
89-
layoutReport << "ERROR: There are unequal parts" << Endl;
90+
layoutReport << "ERROR: There are unequal parts" << separator;
9091
}
9192
partsState.DataInfo = layoutReport.Str();
9293

9394
// checking erasure
9495
TStringStream erasureReport;
95-
erasureReport << "Erasure info:" << Endl;
96+
erasureReport << "Erasure info:" << separator;
9697

9798
std::vector<ui32> partIds;
9899
partIds.reserve(6);
@@ -144,7 +145,7 @@ class TDataIntegrityCheckerBlock42 : public TDataIntegrityCheckerBase {
144145
if (cmp) {
145146
erasureError = true;
146147
} else {
147-
str << "OK" << Endl;
148+
str << "OK" << separator;
148149
erasureReport << str.Str(); // report only successful restore
149150
}
150151
}
@@ -199,7 +200,7 @@ class TDataIntegrityCheckerBlock42 : public TDataIntegrityCheckerBase {
199200

200201
if (erasureError) {
201202
partsState.IsOk = false;
202-
erasureReport << "ERROR: There are erasure restore fails" << Endl;
203+
erasureReport << "ERROR: There are erasure restore fails" << separator;
203204
}
204205

205206
partsState.DataInfo += erasureReport.Str();
@@ -214,7 +215,7 @@ class TDataIntegrityCheckerMirror : public TDataIntegrityCheckerBase {
214215
public:
215216
using TDataIntegrityCheckerBase::TDataIntegrityCheckerBase;
216217

217-
TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData) const override {
218+
TPartsState GetDataState(const TLogoBlobID& id, const TPartsData& partsData, char separator) const override {
218219
Y_UNUSED(id);
219220
Y_ABORT_UNLESS(partsData.Parts.size() == 3);
220221

@@ -244,7 +245,7 @@ class TDataIntegrityCheckerMirror : public TDataIntegrityCheckerBase {
244245
}
245246

246247
TStringStream layoutReport;
247-
layoutReport << "Layout info:" << Endl;
248+
layoutReport << "Layout info:" << separator;
248249

249250
TStringStream str;
250251
bool hasUnequalParts = (seenParts.size() > 1);
@@ -260,12 +261,12 @@ class TDataIntegrityCheckerMirror : public TDataIntegrityCheckerBase {
260261
str << "]";
261262
++ver;
262263
}
263-
str << Endl;
264+
str << separator;
264265
layoutReport << str.Str();
265266

266267
if (hasUnequalParts) {
267268
partsState.IsOk = false;
268-
layoutReport << "ERROR: There are unequal parts" << Endl;
269+
layoutReport << "ERROR: There are unequal parts" << separator;
269270
}
270271
partsState.DataInfo = layoutReport.Str();
271272

ydb/core/blobstorage/ut_blobstorage/lib/env.h

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -958,7 +958,32 @@ struct TEnvironmentSetup {
958958
}
959959
}
960960
return ctr;
961-
};
961+
}
962+
963+
template <class TCallback>
964+
ui64 AggregateVDiskCountersWithCallback(TString storagePool, ui32 nodesCount, ui32 groupSize, ui32 groupId,
965+
const std::vector<ui32>& pdiskLayout, TCallback callback) {
966+
ui64 ctr = 0;
967+
968+
for (ui32 nodeId = 1; nodeId <= nodesCount; ++nodeId) {
969+
auto* appData = Runtime->GetNode(nodeId)->AppData.get();
970+
for (ui32 i = 0; i < groupSize; ++i) {
971+
TStringStream ss;
972+
ss << LeftPad(i, 2, '0');
973+
TString orderNumber = ss.Str();
974+
ss.Clear();
975+
ss << LeftPad(pdiskLayout[i], 9, '0');
976+
TString pdisk = ss.Str();
977+
ctr += callback(GetServiceCounters(appData->Counters, "vdisks")->
978+
GetSubgroup("storagePool", storagePool)->
979+
GetSubgroup("group", std::to_string(groupId))->
980+
GetSubgroup("orderNumber", orderNumber)->
981+
GetSubgroup("pdisk", pdisk)->
982+
GetSubgroup("media", "rot"));
983+
}
984+
}
985+
return ctr;
986+
}
962987

963988
void SetIcbControl(ui32 nodeId, TString controlName, ui64 value) {
964989
if (nodeId == 0) {

ydb/core/blobstorage/ut_blobstorage/scrub.cpp

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,25 @@ Y_UNIT_TEST_SUITE(DeepScrubbing) {
433433
, PartCorruptionMask(partCorruptionMask)
434434
{}
435435

436+
struct TAggregateScrubMetrics {
437+
TAggregateScrubMetrics(TString counterName, bool isHuge, TErasureType::EErasureSpecies erasure)
438+
: CounterName(counterName)
439+
, IsHuge(isHuge)
440+
, Erasure(erasure)
441+
{}
442+
443+
ui64 operator()(TIntrusivePtr<NMonitoring::TDynamicCounters> counters) const {
444+
return counters->GetSubgroup("subsystem", "deepScrubbing")
445+
->GetSubgroup("blobSize", IsHuge ? "huge" : "small")
446+
->GetSubgroup("erasure", TErasureType::ErasureSpeciesName(Erasure))
447+
->GetCounter(CounterName, true)->Val();
448+
}
449+
450+
TString CounterName;
451+
bool IsHuge;
452+
TErasureType::EErasureSpecies Erasure;
453+
};
454+
436455
void RunTest() {
437456
Initialize();
438457
AllocateEdgeActor(true);
@@ -506,19 +525,16 @@ Y_UNIT_TEST_SUITE(DeepScrubbing) {
506525
}
507526
}
508527

528+
bool isHuge = (BlobSize == EBlobSize::Val_HugeBlob);
529+
509530
std::vector<ui32> pdiskLayout = MakePDiskLayout(BaseConfig, groupInfo->GetTopology(), GroupId);
510531

511532
ui64 blobsScrubbed =
512-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
513-
GroupId, pdiskLayout, "deepScrubbing", "SmallBlobsChecked", false) +
514-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
515-
GroupId, pdiskLayout, "deepScrubbing", "HugeBlobsChecked", false);
516-
533+
Env->AggregateVDiskCountersWithCallback(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
534+
GroupId, pdiskLayout, TAggregateScrubMetrics("BlobsChecked", isHuge, Erasure.GetErasure()));
517535
ui64 dataIssues =
518-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
519-
GroupId, pdiskLayout, "deepScrubbing", "DataIssuesSmallBlobs", false) +
520-
Env->AggregateVDiskCounters(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
521-
GroupId, pdiskLayout, "deepScrubbing", "DataIssuesHugeBlobs", false);
536+
Env->AggregateVDiskCountersWithCallback(Env->StoragePoolName, NodeCount, Erasure.BlobSubgroupSize(),
537+
GroupId, pdiskLayout, TAggregateScrubMetrics("DataIssues", isHuge, Erasure.GetErasure()));
522538

523539
UNIT_ASSERT_VALUES_UNEQUAL_C(blobsScrubbed, 0, makePrefix());
524540
UNIT_ASSERT_VALUES_UNEQUAL_C(dataIssues, 0, makePrefix()

ydb/core/blobstorage/vdisk/common/vdisk_mongroups.h

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -676,33 +676,60 @@ public:
676676
COUNTER_DEF(BlobsFixed);
677677
};
678678

679+
///////////////////////////////////////////////////////////////////////////////////
680+
// TDeepScrubbingGroup
681+
///////////////////////////////////////////////////////////////////////////////////
679682
class TDeepScrubbingGroup : public TBase {
680683
public:
681684
GROUP_CONSTRUCTOR(TDeepScrubbingGroup)
682685
{
683-
COUNTER_INIT(SmallBlobsChecked, true);
684-
COUNTER_INIT(HugeBlobsChecked, true);
685-
COUNTER_INIT(CheckIntegritySuccesses, false);
686-
COUNTER_INIT(CheckIntegrityErrors, false);
687-
688-
COUNTER_INIT(PlacementIssuesSmallBlobs, false);
689-
COUNTER_INIT(DataIssuesSmallBlobs, false);
690-
691-
COUNTER_INIT(PlacementIssuesHugeBlobs, false);
692-
COUNTER_INIT(DataIssuesHugeBlobs, false);
686+
COUNTER_INIT(BlobsChecked, true);
687+
COUNTER_INIT(CheckIntegritySuccesses, true);
688+
COUNTER_INIT(CheckIntegrityErrors, true);
689+
COUNTER_INIT(UnknownDataStatus, true);
690+
COUNTER_INIT(UnknownPlacementStatus, true);
691+
COUNTER_INIT(DataIssues, true);
692+
COUNTER_INIT(PlacementIssues, true);
693693
}
694694

695-
COUNTER_DEF(SmallBlobsChecked);
696-
COUNTER_DEF(HugeBlobsChecked);
697-
695+
COUNTER_DEF(BlobsChecked);
698696
COUNTER_DEF(CheckIntegritySuccesses);
699697
COUNTER_DEF(CheckIntegrityErrors);
698+
COUNTER_DEF(UnknownDataStatus);
699+
COUNTER_DEF(UnknownPlacementStatus);
700+
COUNTER_DEF(DataIssues);
701+
COUNTER_DEF(PlacementIssues);
702+
};
700703

701-
COUNTER_DEF(PlacementIssuesSmallBlobs);
702-
COUNTER_DEF(DataIssuesSmallBlobs);
704+
class TDeepScrubbingSubgroups {
705+
public:
706+
// Builds one TDeepScrubbingGroup per (blob size class, erasure species) pair.
// Each group lives in the subgroup blobSize=(huge|small) / erasure=<species name> of
// `counters` and is stored in Subgroups under GetKey(isHuge, erasure).
TDeepScrubbingSubgroups(TIntrusivePtr<NMonitoring::TDynamicCounters> counters) {
    const TErasureType::EErasureSpecies species[] = {
        TErasureType::ErasureNone,
        TErasureType::Erasure4Plus2Block,
        TErasureType::ErasureMirror3of4,
        TErasureType::ErasureMirror3dc,
    };

    for (bool isHuge : {true, false}) {
        const char* sizeLabel = isHuge ? "huge" : "small";
        for (TErasureType::EErasureSpecies erasure : species) {
            auto bySize = counters->GetSubgroup("blobSize", sizeLabel);
            ::NMonitoring::TDynamicCounterPtr subgroup =
                bySize->GetSubgroup("erasure", TErasureType::ErasureSpeciesName(erasure));
            const ui64 key = GetKey(isHuge, erasure);
            Subgroups.insert({key, TDeepScrubbingGroup(subgroup)});
        }
    }
}
718+
719+
// Looks up the counter group stored for the given blob size class and erasure species.
// Returns a pointer into Subgroups, or nullptr when no group is stored under that key.
TDeepScrubbingGroup* GetCounters(bool isHuge, TErasureType::EErasureSpecies erasure) {
    const auto found = Subgroups.find(GetKey(isHuge, erasure));
    return found != Subgroups.end() ? &found->second : nullptr;
}
703726

704-
COUNTER_DEF(PlacementIssuesHugeBlobs);
705-
COUNTER_DEF(DataIssuesHugeBlobs);
727+
private:
728+
std::unordered_map<ui64, TDeepScrubbingGroup> Subgroups;
729+
730+
// Packs the pair into one map key: the huge flag is shifted left by 32 bits and the
// erasure species value is added to it.
ui64 GetKey(bool isHuge, TErasureType::EErasureSpecies erasure) {
    const ui64 hugeBits = static_cast<ui64>(isHuge) << 32;
    const ui64 erasureBits = static_cast<ui64>(erasure);
    return hugeBits + erasureBits;
}
706733
};
707734

708735
} // NMonGroup

0 commit comments

Comments
 (0)