@@ -783,6 +783,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
783783 }
784784
785785 void Bootstrap () {
786+ Cerr << " iiiiii Bootstrap " << SelfId () << Endl;
786787 FilterDatabase = Request->Database ;
787788 if (Request->Request .operation_params ().has_operation_timeout ()) {
788789 Timeout = GetDuration (Request->Request .operation_params ().operation_timeout ());
@@ -837,6 +838,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
837838 }
838839
839840 void Handle (TEvNodeWardenStorageConfig::TPtr ev) {
841+ Cerr << " aaaaa TEvNodeWardenStorageConfig" << Endl;
840842 NodeWardenStorageConfig->Set (std::move (ev));
841843 if (const NKikimrBlobStorage::TStorageConfig& config = *NodeWardenStorageConfig->Get ()->Config ; config.HasBlobStorageConfig ()) {
842844 if (const auto & bsConfig = config.GetBlobStorageConfig (); bsConfig.HasServiceSet ()) {
@@ -868,6 +870,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
868870 }
869871
870872 auto groupId = vDisk.GetVDiskID ().GetGroupID ();
873+ Cerr << " aaaaa TEvNodeWardenStorageConfig 2" << Endl;
871874 if (NeedWhiteboardInfoForGroup (groupId)) {
872875 BLOG_D (" Requesting whiteboard for group " << groupId);
873876 RequestStorageNode (vDisk.GetVDiskLocation ().GetNodeID ());
@@ -1115,6 +1118,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11151118 }
11161119
11171120 void RequestStorageNode (TNodeId nodeId) {
1121+ Cerr << " aaaaaaa RequestStorageNode " << nodeId << Endl;
11181122 if (StorageNodeIds.emplace (nodeId).second ) {
11191123 RequestGenericNode (nodeId);
11201124 if (NodeVDiskState.count (nodeId) == 0 ) {
@@ -1155,24 +1159,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11551159 }
11561160
11571161 void Handle (TEvPrivate::TEvRetryNodeWhiteboard::TPtr& ev) {
1162+ Cerr << " !!!!!!! Handle RetryNodeWhiteboard " << Endl;
11581163 auto eventId = ev->Get ()->EventId ;
11591164 auto nodeId = ev->Get ()->NodeId ;
11601165 switch (eventId) {
11611166 case TEvWhiteboard::EvSystemStateRequest:
1162- NodeSystemState.erase (nodeId);
1163- NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId);
1167+ Cerr << " !!!!!!! Handle RetryNodeWhiteboard EvSystemStateRequest " << Endl;
1168+ // if (!NodeSystemState[nodeId].IsDone()) {
1169+ NodeSystemState.erase (nodeId);
1170+ NodeSystemState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId);
1171+ // }
11641172 break ;
11651173 case TEvWhiteboard::EvVDiskStateRequest:
1166- NodeVDiskState.erase (nodeId);
1167- NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1174+ if (!NodeVDiskState[nodeId].IsDone ()) {
1175+ NodeVDiskState.erase (nodeId);
1176+ NodeVDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvVDiskStateRequest>(nodeId);
1177+ }
11681178 break ;
11691179 case TEvWhiteboard::EvPDiskStateRequest:
1170- NodePDiskState.erase (nodeId);
1171- NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1180+ if (!NodePDiskState[nodeId].IsDone ()) {
1181+ NodePDiskState.erase (nodeId);
1182+ NodePDiskState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvPDiskStateRequest>(nodeId);
1183+ }
11721184 break ;
11731185 case TEvWhiteboard::EvBSGroupStateRequest:
1174- NodeBSGroupState.erase (nodeId);
1175- NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1186+ if (!NodeBSGroupState[nodeId].IsDone ()) {
1187+ NodeBSGroupState.erase (nodeId);
1188+ NodeBSGroupState[nodeId] = RequestNodeWhiteboard<TEvWhiteboard::TEvBSGroupStateRequest>(nodeId);
1189+ }
11761190 break ;
11771191 default :
11781192 RequestDone (" unsupported event scheduled" );
@@ -1182,6 +1196,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11821196
11831197 template <typename TEvent>
11841198 bool RetryRequestNodeWhiteboard (TNodeId nodeId) {
1199+ Cerr << " !!!!!!! RetryRequestNodeWhiteboard " << nodeId << Endl;
11851200 if (NodeRetries[{nodeId, TEvent::EventType}]++ < MaxRetries) {
11861201 Schedule (RetryDelay, new TEvPrivate::TEvRetryNodeWhiteboard (nodeId, TEvent::EventType));
11871202 return true ;
@@ -1190,6 +1205,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
11901205 }
11911206
11921207 void Handle (TEvents::TEvUndelivered::TPtr& ev) {
1208+ Cerr << " iiiiiiiii Undelivered " << Endl;
11931209 ui32 nodeId = ev.Get ()->Cookie ;
11941210 TString error = " Undelivered" ;
11951211 if (ev->Get ()->SourceType == TEvWhiteboard::EvSystemStateRequest) {
@@ -1226,6 +1242,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
12261242 }
12271243
12281244 void Disconnected (TEvInterconnect::TEvNodeDisconnected::TPtr& ev) {
1245+ Cerr << " iiiiiiiii Disconnected " << Endl;
12291246 ui32 nodeId = ev->Get ()->NodeId ;
12301247 TString error = " NodeDisconnected" ;
12311248 if (NodeSystemState.count (nodeId) && NodeSystemState[nodeId].Error (error)) {
@@ -1310,8 +1327,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
13101327 }
13111328
13121329 void HandleTimeout (TEvents::TEvWakeup::TPtr& ev) {
1330+ Cerr << " aaaaa HandleTimeout" << Endl;
13131331 switch (ev->Get ()->Tag ) {
13141332 case TimeoutBSC:
1333+ Cerr << " aaaaa TimeoutBSC" << Endl;
13151334 Span.Event (" TimeoutBSC" );
13161335 if (!HaveAllBSControllerInfo ()) {
13171336 if (FilterDatabase.empty () || FilterDatabase == DomainPath) {
@@ -1632,14 +1651,32 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
16321651 RequestDone (" TEvListTenantsResponse" );
16331652 }
16341653
1654+ // void Handle(TEvWhiteboard::TEvSystemStateResponse::TPtr& ev) {
1655+ // TNodeId nodeId = ev.Get()->Cookie;
1656+ // Cerr << "iiiiiiii TEvSystemStateResponse: nodeId: " << nodeId << Endl;
1657+ // if (!NodeSystemState[nodeId].Done()) {
1658+ // auto& nodeSystemState(NodeSystemState[nodeId]);
1659+ // nodeSystemState.Set(std::move(ev));
1660+ // for (NKikimrWhiteboard::TSystemStateInfo& state : *nodeSystemState->Record.MutableSystemStateInfo()) {
1661+ // state.set_nodeid(nodeId);
1662+ // MergedNodeSystemState[nodeId] = &state;
1663+ // }
1664+ // }
1665+ // RequestDone("TEvSystemStateResponse");
1666+ // }
1667+
16351668 void Handle (TEvWhiteboard::TEvSystemStateResponse::TPtr& ev) {
16361669 TNodeId nodeId = ev.Get ()->Cookie ;
1670+ Cerr << " iiiiiiii TEvSystemStateResponse: nodeId: " << nodeId << Endl;
1671+
16371672 auto & nodeSystemState (NodeSystemState[nodeId]);
16381673 nodeSystemState.Set (std::move (ev));
16391674 for (NKikimrWhiteboard::TSystemStateInfo& state : *nodeSystemState->Record .MutableSystemStateInfo ()) {
1675+ Cerr << " iiiiiiii Fill " << Endl;
16401676 state.set_nodeid (nodeId);
16411677 MergedNodeSystemState[nodeId] = &state;
16421678 }
1679+
16431680 RequestDone (" TEvSystemStateResponse" );
16441681 }
16451682
@@ -1891,10 +1928,12 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
18911928 rrContext.ReportStatus (Ydb::Monitoring::StatusFlag::GREEN);
18921929 }
18931930
1931+ Cerr << " iiiiiiii FillComputeNodeStatus: nodeId: " << nodeId << Endl;
18941932 auto itNodeSystemState = MergedNodeSystemState.find (nodeId);
18951933 if (itNodeSystemState != MergedNodeSystemState.end ()) {
18961934 const NKikimrWhiteboard::TSystemStateInfo& nodeSystemState (*itNodeSystemState->second );
18971935
1936+ Cerr << " iiiiiiii poolstats: " << nodeSystemState.poolstats_size () << Endl;
18981937 for (const auto & poolStat : nodeSystemState.poolstats ()) {
18991938 TSelfCheckContext poolContext (&context, " COMPUTE_POOL" );
19001939 poolContext.Location .mutable_compute ()->mutable_pool ()->set_name (poolStat.name ());
@@ -1943,6 +1982,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
19431982 }
19441983 }
19451984 }
1985+ Cerr << " iiiiiiii nodeSystemState: 2 " << Endl;
19461986 } else {
19471987 // context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
19481988 // TStringBuilder() << "Compute node is not available",
@@ -2306,45 +2346,61 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
23062346
23072347 void Handle (TEvWhiteboard::TEvVDiskStateResponse::TPtr& ev) {
23082348 TNodeId nodeId = ev.Get ()->Cookie ;
2309- auto & nodeVDiskState (NodeVDiskState[nodeId]);
2310- nodeVDiskState.Set (std::move (ev));
2311- for (NKikimrWhiteboard::TVDiskStateInfo& state : *nodeVDiskState->Record .MutableVDiskStateInfo ()) {
2312- state.set_nodeid (nodeId);
2313- auto id = GetVDiskId (state.vdiskid ());
2314- MergedVDiskState[id] = &state;
2349+ if (!NodeVDiskState.count (nodeId)) {
2350+ auto & nodeVDiskState (NodeVDiskState[nodeId]);
2351+ nodeVDiskState.Set (std::move (ev));
2352+ for (NKikimrWhiteboard::TVDiskStateInfo& state : *nodeVDiskState->Record .MutableVDiskStateInfo ()) {
2353+ state.set_nodeid (nodeId);
2354+ auto id = GetVDiskId (state.vdiskid ());
2355+ MergedVDiskState[id] = &state;
2356+ }
2357+ }
2358+
2359+ TString error = " NodeDisconnected" ;
2360+ if (NodeSystemState.count (nodeId) && NodeSystemState[nodeId].Error (error)) {
2361+ if (!RetryRequestNodeWhiteboard<TEvWhiteboard::TEvSystemStateRequest>(nodeId)) {
2362+ Cerr << " iiiiiii Retry" << Endl;
2363+ RequestDone (" node disconnected with TEvSystemStateRequest" );
2364+ UnavailableComputeNodes.insert (nodeId);
2365+ }
23152366 }
2367+
23162368 RequestDone (" TEvVDiskStateResponse" );
23172369 }
23182370
23192371 void Handle (TEvWhiteboard::TEvPDiskStateResponse::TPtr& ev) {
23202372 TNodeId nodeId = ev.Get ()->Cookie ;
2321- auto & nodePDiskState (NodePDiskState[nodeId]);
2322- nodePDiskState.Set (std::move (ev));
2323- for (NKikimrWhiteboard::TPDiskStateInfo& state : *nodePDiskState->Record .MutablePDiskStateInfo ()) {
2324- state.set_nodeid (nodeId);
2325- auto id = GetPDiskId (state);
2326- MergedPDiskState[id] = &state;
2373+ if (!NodePDiskState.count (nodeId)) {
2374+ auto & nodePDiskState (NodePDiskState[nodeId]);
2375+ nodePDiskState.Set (std::move (ev));
2376+ for (NKikimrWhiteboard::TPDiskStateInfo& state : *nodePDiskState->Record .MutablePDiskStateInfo ()) {
2377+ state.set_nodeid (nodeId);
2378+ auto id = GetPDiskId (state);
2379+ MergedPDiskState[id] = &state;
2380+ }
23272381 }
23282382 RequestDone (" TEvPDiskStateResponse" );
23292383 }
23302384
23312385 void Handle (TEvWhiteboard::TEvBSGroupStateResponse::TPtr& ev) {
23322386 ui64 nodeId = ev.Get ()->Cookie ;
2333- auto & nodeBSGroupState (NodeBSGroupState[nodeId]);
2334- nodeBSGroupState.Set (std::move (ev));
2335- for (NKikimrWhiteboard::TBSGroupStateInfo& state : *nodeBSGroupState->Record .MutableBSGroupStateInfo ()) {
2336- state.set_nodeid (nodeId);
2337- TString storagePoolName = state.storagepoolname ();
2338- TGroupID groupId (state.groupid ());
2339- const NKikimrWhiteboard::TBSGroupStateInfo*& current (MergedBSGroupState[state.groupid ()]);
2340- if (current == nullptr || current->GetGroupGeneration () < state.GetGroupGeneration ()) {
2341- current = &state;
2342- }
2343- if (storagePoolName.empty () && groupId.ConfigurationType () != EGroupConfigurationType::Static) {
2344- continue ;
2387+ if (!NodeBSGroupState.count (nodeId)) {
2388+ auto & nodeBSGroupState (NodeBSGroupState[nodeId]);
2389+ nodeBSGroupState.Set (std::move (ev));
2390+ for (NKikimrWhiteboard::TBSGroupStateInfo& state : *nodeBSGroupState->Record .MutableBSGroupStateInfo ()) {
2391+ state.set_nodeid (nodeId);
2392+ TString storagePoolName = state.storagepoolname ();
2393+ TGroupID groupId (state.groupid ());
2394+ const NKikimrWhiteboard::TBSGroupStateInfo*& current (MergedBSGroupState[state.groupid ()]);
2395+ if (current == nullptr || current->GetGroupGeneration () < state.GetGroupGeneration ()) {
2396+ current = &state;
2397+ }
2398+ if (storagePoolName.empty () && groupId.ConfigurationType () != EGroupConfigurationType::Static) {
2399+ continue ;
2400+ }
2401+ StoragePoolStateByName[storagePoolName].Groups .emplace (state.groupid ());
2402+ StoragePoolStateByName[storagePoolName].Name = storagePoolName;
23452403 }
2346- StoragePoolStateByName[storagePoolName].Groups .emplace (state.groupid ());
2347- StoragePoolStateByName[storagePoolName].Name = storagePoolName;
23482404 }
23492405 RequestDone (" TEvBSGroupStateResponse" );
23502406 }
0 commit comments