44#include " viewer.h"
55#include " viewer_helper.h"
66#include " viewer_tabletinfo.h"
7+ #include < ydb/library/actors/interconnect/interconnect.h>
78#include < ydb/public/api/protos/ydb_bridge_common.pb.h>
89
910namespace NKikimr ::NViewer {
@@ -32,6 +33,10 @@ class TJsonCluster : public TViewerPipeClient {
3233 std::unordered_map<TNodeId, TRequestResponse<TEvViewer::TEvViewerResponse>> SystemViewerResponse;
3334 std::unordered_map<TNodeId, TRequestResponse<TEvViewer::TEvViewerResponse>> TabletViewerResponse;
3435
// Health-check state: at most one of the two results below is requested per query.
36+ std::optional<TRequestResponse<NHealthCheck::TEvSelfCheckResult>> SelfCheckResult; // from the local health check
37+ std::optional<TRequestResponse<NHealthCheck::TEvSelfCheckResultProto>> SelfCheckResultProto; // from the metadata cache service
38+ TViewerPipeClient::TRequestResponse<TEvStateStorage::TEvBoardInfo> MetadataCacheEndpointsLookup; // board lookup whose entries are used to pick a metadata cache node
39+
3540 struct TNode {
3641 TEvInterconnect::TNodeInfo NodeInfo;
3742 NKikimrWhiteboard::TSystemStateInfo SystemState;
@@ -91,26 +96,27 @@ class TJsonCluster : public TViewerPipeClient {
9196 std::unordered_set<TTabletId> FilterTablets;
9297 bool OffloadMerge = true ;
9398 size_t OffloadMergeAttempts = 2 ;
99+ bool UseHealthCheck = true ;
100+ bool UseHealthCheckCache = false ; // doesn't work for domain ?
101+ TString DomainName;
94102 TTabletId RootHiveId = 0 ;
95- TJsonSettings JsonSettings;
96- ui32 Timeout;
97103 bool Tablets = false ;
98104
99105public:
// Constructor: forwards the viewer and the HTTP request to TViewerPipeClient.
// The lines marked '-' below are the old in-constructor parameter parsing that this
// change removes; the request parameters are now read in Bootstrap() instead.
100106 TJsonCluster (IViewer* viewer, NMon::TEvHttpInfo::TPtr& ev)
101107 : TViewerPipeClient(viewer, ev)
102108 {
103- const auto & params (Event->Get ()->Request .GetParams ());
104- JsonSettings.EnumAsNumbers = !FromStringWithDefault<bool >(params.Get (" enums" ), true );
105- JsonSettings.UI64AsString = !FromStringWithDefault<bool >(params.Get (" ui64" ), false );
106- InitConfig (params);
107- Tablets = FromStringWithDefault<bool >(params.Get (" tablets" ), false );
108- Timeout = FromStringWithDefault<ui32>(params.Get (" timeout" ), 10000 );
109- OffloadMerge = FromStringWithDefault<bool >(params.Get (" offload_merge" ), OffloadMerge);
110- OffloadMergeAttempts = FromStringWithDefault<bool >(params.Get (" offload_merge_attempts" ), OffloadMergeAttempts);
111109 }
112110
113111 void Bootstrap () override {
112+ Tablets = FromStringWithDefault<bool >(Params.Get (" tablets" ), false );
113+ OffloadMerge = FromStringWithDefault<bool >(Params.Get (" offload_merge" ), OffloadMerge);
114+ OffloadMergeAttempts = FromStringWithDefault<bool >(Params.Get (" offload_merge_attempts" ), OffloadMergeAttempts);
115+ UseHealthCheck = FromStringWithDefault<bool >(Params.Get (" use_health_check" ), UseHealthCheck);
116+ UseHealthCheckCache = FromStringWithDefault<bool >(Params.Get (" use_health_check_cache" ), UseHealthCheckCache);
117+ if (!UseHealthCheck) {
118+ UseHealthCheckCache = false ;
119+ }
114120 NodesInfoResponse = MakeRequest<TEvInterconnect::TEvNodesInfo>(GetNameserviceActorId (), new TEvInterconnect::TEvListNodes ());
115121 if (AppData ()->BridgeModeEnabled ) {
116122 NodeWardenStorageConfigResponse = MakeRequest<TEvNodeWardenStorageConfig>(MakeBlobStorageNodeWardenID (SelfId ().NodeId ()),
@@ -124,7 +130,8 @@ class TJsonCluster : public TViewerPipeClient {
124130 ListTenantsResponse = MakeRequestConsoleListTenants ();
125131 if (AppData ()->DomainsInfo && AppData ()->DomainsInfo ->Domain ) {
126132 TIntrusivePtr<TDomainsInfo> domains = AppData ()->DomainsInfo ;
127- ClusterInfo.SetDomain (TStringBuilder () << " /" << AppData ()->DomainsInfo ->Domain ->Name );
133+ DomainName = TStringBuilder () << " /" << AppData ()->DomainsInfo ->Domain ->Name ;
134+ ClusterInfo.SetDomain (DomainName);
128135 if (const auto & domain = domains->Domain ) {
129136 for (TTabletId id : domain->Coordinators ) {
130137 FilterTablets.insert (id);
@@ -146,8 +153,16 @@ class TJsonCluster : public TViewerPipeClient {
146153 FilterTablets.insert (MakeNodeBrokerID ());
147154 FilterTablets.insert (MakeTenantSlotBrokerID ());
148155 FilterTablets.insert (MakeConsoleID ());
156+
// Choose where the self-check result comes from: when the cache is requested and the
// EnableDbMetadataCache feature flag is set, first look up metadata cache endpoints
// for DomainName; in every other case ask the health check directly.
157+ if (UseHealthCheck) {
158+ if (UseHealthCheckCache && AppData ()->FeatureFlags .GetEnableDbMetadataCache ()) {
159+ MetadataCacheEndpointsLookup = MakeRequestStateStorageMetadataCacheEndpointsLookup (DomainName);
160+ } else {
161+ SendHealthCheckRequest ();
162+ }
163+ }
149164 }
150- Become (&TThis::StateWork, TDuration::MilliSeconds ( Timeout) , new TEvents::TEvWakeup ());
165+ Become (&TThis::StateWork, Timeout, new TEvents::TEvWakeup ());
151166 }
152167
153168private:
@@ -421,7 +436,7 @@ class TJsonCluster : public TViewerPipeClient {
421436 if (SystemViewerResponse.count (nodeId) == 0 ) {
422437 auto viewerRequest = std::make_unique<TEvViewer::TEvViewerRequest>();
423438 InitSystemWhiteboardRequest (viewerRequest->Record .MutableSystemRequest ());
424- viewerRequest->Record .SetTimeout (Timeout / 2 );
439+ viewerRequest->Record .SetTimeout (Timeout. MilliSeconds () / 2 );
425440 for (const TNode* node : batch.NodesToAskAbout ) {
426441 viewerRequest->Record .MutableLocation ()->AddNodeId (node->NodeId );
427442 }
@@ -432,7 +447,7 @@ class TJsonCluster : public TViewerPipeClient {
432447 if (Tablets && batch.HasStaticNodes && TabletViewerResponse.count (nodeId) == 0 ) {
433448 auto viewerRequest = std::make_unique<TEvViewer::TEvViewerRequest>();
434449 InitTabletWhiteboardRequest (viewerRequest->Record .MutableTabletRequest ());
435- viewerRequest->Record .SetTimeout (Timeout / 2 );
450+ viewerRequest->Record .SetTimeout (Timeout. MilliSeconds () / 2 );
436451 for (const TNode* node : batch.NodesToAskAbout ) {
437452 if (node->Static ) {
438453 viewerRequest->Record .MutableLocation ()->AddNodeId (node->NodeId );
@@ -639,6 +654,45 @@ class TJsonCluster : public TViewerPipeClient {
639654 RequestDone ();
640655 }
641656
657+ std::unique_ptr<NHealthCheck::TEvSelfCheckRequest> MakeSelfCheckRequest () {
658+ auto request = std::make_unique<NHealthCheck::TEvSelfCheckRequest>();
659+ request->Database = DomainName;
660+ return request;
661+ }
662+
663+ void SendHealthCheckRequest () {
664+ SelfCheckResult = MakeRequest<NHealthCheck::TEvSelfCheckResult>(NHealthCheck::MakeHealthCheckID (), MakeSelfCheckRequest ().release ());
665+ }
666+
667+ void Handle (NHealthCheck::TEvSelfCheckResult::TPtr& ev) {
668+ if (SelfCheckResult->Set (std::move (ev))) {
669+ RequestDone ();
670+ }
671+ }
672+
673+ void Handle (NHealthCheck::TEvSelfCheckResultProto::TPtr& ev) {
674+ if (SelfCheckResultProto->Set (std::move (ev))) {
675+ RequestDone ();
676+ }
677+ }
678+
679+ void Handle (TEvStateStorage::TEvBoardInfo::TPtr& ev) {
680+ if (MetadataCacheEndpointsLookup.Set (std::move (ev))) {
681+ if (MetadataCacheEndpointsLookup.IsOk ()) {
682+ auto activeNode = TDatabaseMetadataCache::PickActiveNode (MetadataCacheEndpointsLookup->InfoEntries );
683+ if (activeNode != 0 ) {
684+ TActorId cache = MakeDatabaseMetadataCacheId (activeNode);
685+ auto request = std::make_unique<NHealthCheck::TEvSelfCheckRequestProto>();
686+ SelfCheckResultProto = MakeRequest<NHealthCheck::TEvSelfCheckResultProto>(cache, request.release (), IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession, activeNode);
687+ }
688+ }
689+ if (!SelfCheckResultProto) {
690+ SendHealthCheckRequest ();
691+ }
692+ RequestDone ();
693+ }
694+ }
695+
642696 void Handle (TEvInterconnect::TEvNodesInfo::TPtr& ev) {
643697 if (NodesInfoResponse->Set (std::move (ev))) {
644698 ProcessResponses ();
@@ -903,6 +957,8 @@ class TJsonCluster : public TViewerPipeClient {
903957 hFunc (NSysView::TEvSysView::TEvGetVSlotsResponse, Handle);
904958 hFunc (NSysView::TEvSysView::TEvGetGroupsResponse, Handle);
905959 hFunc (TEvHive::TEvResponseHiveNodeStats, Handle);
960+ hFunc (NHealthCheck::TEvSelfCheckResult, Handle);
961+ hFunc (NHealthCheck::TEvSelfCheckResultProto, Handle);
906962 hFunc (TEvents::TEvUndelivered, Undelivered);
907963 hFunc (TEvInterconnect::TEvNodeDisconnected, Disconnected);
908964 hFunc (TEvTabletPipe::TEvClientConnected, Handle);
@@ -922,6 +978,21 @@ class TJsonCluster : public TViewerPipeClient {
922978 }
923979 }
924980
981+ static NKikimrWhiteboard::EFlag GetClusterStateFromSelfCheck (const Ydb::Monitoring::SelfCheckResult& result) {
982+ switch (result.self_check_result ()) {
983+ case Ydb::Monitoring::SelfCheck::GOOD:
984+ return NKikimrWhiteboard::EFlag::Green;
985+ case Ydb::Monitoring::SelfCheck::DEGRADED:
986+ return NKikimrWhiteboard::EFlag::Yellow;
987+ case Ydb::Monitoring::SelfCheck::MAINTENANCE_REQUIRED:
988+ return NKikimrWhiteboard::EFlag::Red;
989+ case Ydb::Monitoring::SelfCheck::EMERGENCY:
990+ return NKikimrWhiteboard::EFlag::Red;
991+ default :
992+ return NKikimrWhiteboard::EFlag::Grey;
993+ }
994+ }
995+
925996 void ReplyAndPassAway () override {
926997 if (StorageStatsResponse && StorageStatsResponse->IsOk ()) {
927998 for (NKikimrSysView::TStorageStatsEntry& entry : *StorageStatsResponse->Get ()->Record .MutableEntries ()) {
@@ -1018,20 +1089,26 @@ class TJsonCluster : public TViewerPipeClient {
10181089 if (CachedDataMaxAge) {
10191090 ClusterInfo.SetCachedDataMaxAge (CachedDataMaxAge.MilliSeconds ());
10201091 }
1021- NKikimrWhiteboard::EFlag worstState = NKikimrWhiteboard::EFlag::Grey;
1022- ui64 worstNodes = 0 ;
1023- for (NKikimrWhiteboard::EFlag flag = NKikimrWhiteboard::EFlag::Grey; flag <= NKikimrWhiteboard::EFlag::Red; flag = NKikimrWhiteboard::EFlag (flag + 1 )) {
1024- auto itNodes = ClusterInfo.GetMapNodeStates ().find (NKikimrWhiteboard::EFlag_Name (flag));
1025- if (itNodes == ClusterInfo.GetMapNodeStates ().end ()) {
1026- continue ;
1027- }
1028- auto & nodes = itNodes->second ;
1029- if (nodes > worstNodes / 100 ) { // only if it's more than 1% of all nodes
1030- worstState = flag;
1092+ NKikimrWhiteboard::EFlag clusterState = NKikimrWhiteboard::EFlag::Grey;
1093+ if (SelfCheckResult && SelfCheckResult->IsOk ()) {
1094+ clusterState = GetClusterStateFromSelfCheck (SelfCheckResult->Get ()->Result );
1095+ } else if (SelfCheckResultProto && SelfCheckResultProto->IsOk ()) {
1096+ clusterState = GetClusterStateFromSelfCheck (SelfCheckResultProto->Get ()->Record );
1097+ } else {
1098+ ui64 worstNodes = 0 ;
1099+ for (NKikimrWhiteboard::EFlag flag = NKikimrWhiteboard::EFlag::Grey; flag <= NKikimrWhiteboard::EFlag::Red; flag = NKikimrWhiteboard::EFlag (flag + 1 )) {
1100+ auto itNodes = ClusterInfo.GetMapNodeStates ().find (NKikimrWhiteboard::EFlag_Name (flag));
1101+ if (itNodes == ClusterInfo.GetMapNodeStates ().end ()) {
1102+ continue ;
1103+ }
1104+ auto & nodes = itNodes->second ;
1105+ if (nodes > worstNodes / 100 ) { // only if it's more than 1% of all nodes
1106+ clusterState = flag;
1107+ }
1108+ worstNodes += nodes;
10311109 }
1032- worstNodes += nodes;
10331110 }
1034- ClusterInfo.SetOverall (GetViewerFlag (worstState ));
1111+ ClusterInfo.SetOverall (GetViewerFlag (clusterState ));
10351112 TBase::ReplyAndPassAway (GetHTTPOKJSON (ClusterInfo));
10361113 }
10371114
0 commit comments