@@ -52,62 +52,6 @@ void CreateTableWithGlobalIndex(TTestEnv& env, const TString& databaseName, cons
5252 FillTable (env, databaseName, tableName, rowCount);
5353}
5454
55- void ValidateRowCount (TTestActorRuntime& runtime, ui32 nodeIndex, TPathId pathId, size_t expectedRowCount) {
56- auto statServiceId = NStat::MakeStatServiceID (runtime.GetNodeId (nodeIndex));
57- ui64 rowCount = 0 ;
58- while (rowCount == 0 ) {
59- NStat::TRequest req;
60- req.PathId = pathId;
61-
62- auto evGet = std::make_unique<TEvStatistics::TEvGetStatistics>();
63- evGet->StatType = NStat::EStatType::SIMPLE;
64- evGet->StatRequests .push_back (req);
65-
66- auto sender = runtime.AllocateEdgeActor (nodeIndex);
67- runtime.Send (statServiceId, sender, evGet.release (), nodeIndex, true );
68- auto evResult = runtime.GrabEdgeEventRethrow <TEvStatistics::TEvGetStatisticsResult>(sender);
69-
70- UNIT_ASSERT (evResult);
71- UNIT_ASSERT (evResult->Get ());
72- UNIT_ASSERT (evResult->Get ()->StatResponses .size () == 1 );
73-
74- auto rsp = evResult->Get ()->StatResponses [0 ];
75- auto stat = rsp.Simple ;
76-
77- rowCount = stat.RowCount ;
78-
79- if (rowCount != 0 ) {
80- UNIT_ASSERT (stat.RowCount == expectedRowCount);
81- break ;
82- }
83-
84- runtime.SimulateSleep (TDuration::Seconds (1 ));
85- }
86- }
87-
88- ui64 GetRowCount (TTestActorRuntime& runtime, ui32 nodeIndex, TPathId pathId) {
89- auto statServiceId = NStat::MakeStatServiceID (runtime.GetNodeId (nodeIndex));
90- NStat::TRequest req;
91- req.PathId = pathId;
92-
93- auto evGet = std::make_unique<TEvStatistics::TEvGetStatistics>();
94- evGet->StatType = NStat::EStatType::SIMPLE;
95- evGet->StatRequests .push_back (req);
96-
97- auto sender = runtime.AllocateEdgeActor (nodeIndex);
98- runtime.Send (statServiceId, sender, evGet.release (), nodeIndex, true );
99- auto evResult = runtime.GrabEdgeEventRethrow <TEvStatistics::TEvGetStatisticsResult>(sender);
100-
101- UNIT_ASSERT (evResult);
102- UNIT_ASSERT (evResult->Get ());
103- UNIT_ASSERT (evResult->Get ()->StatResponses .size () == 1 );
104-
105- auto rsp = evResult->Get ()->StatResponses [0 ];
106- auto stat = rsp.Simple ;
107-
108- return stat.RowCount ;
109- }
110-
11155} // namespace
11256
11357Y_UNIT_TEST_SUITE (BasicStatistics) {
@@ -375,6 +319,101 @@ Y_UNIT_TEST_SUITE(BasicStatistics) {
375319 UNIT_ASSERT_VALUES_EQUAL (sendCount, 2 ); // events from 2 serverless schemeshards
376320 UNIT_ASSERT_VALUES_EQUAL (propagateCount, 2 ); // SA -> node1 and node1 -> node2
377321 }
322+
// Scenario: checks which row count the statistics service reports while the
// aggregator tablet (saTabletId) cannot persist new data, and after it recovers.
//   1. Create a table with rowCount1 rows and wait until nodeIdx reports it.
//   2. Block TEvBlobStorage::TEvPut events whose blob id belongs to saTabletId,
//      call FillTable with rowCount2, wait for the matching TEvSchemeShardStats
//      and a TEvPropagateStatistics to nodeIdx, then expect rowCount1 to still
//      be reported.
//   3. Reboot tablet ssTabletId, answer every blocked put with
//      NKikimrProto::ERROR and wait for a TEvTablet::EvBoot.
//   4. Expect otherNodeIdx (whose propagate events were blocked until now) to
//      report rowCount1, then release TEvSchemeShardStats and wait for rowCount2.
// NOTE(review): the final expectation is rowCount2 rather than
// rowCount1 + rowCount2, so FillTable presumably overwrites existing keys --
// confirm against FillTable.
323+ Y_UNIT_TEST (PersistenceWithStorageFailuresAndReboots) {
324+ TTestEnv env (1 , 2 );
325+ auto & runtime = *env.GetServer ().GetRuntime ();
326+
327+ const size_t rowCount1 = 5 ;
328+
329+ CreateDatabase (env, " Database" , 2 );
330+ CreateTable (env, " Database" , " Table" , rowCount1);
331+
// ResolvePathId also fills saTabletId through its last out-parameter.
// pathId.OwnerId is used below as the id of the tablet to reboot
// (presumably the owning SchemeShard, judging by the variable name -- confirm).
332+ ui64 saTabletId = 0 ;
333+ auto pathId = ResolvePathId (runtime, " /Root/Database/Table" , nullptr , &saTabletId);
334+ ui64 ssTabletId = pathId.OwnerId ;
335+
336+ const ui32 nodeIdx = 1 ;
337+ const ui32 otherNodeIdx = 2 ;
338+
339+ // Block propagate events that go to node with otherNodeIdx. We will use this
340+ // node later as a clean slate.
341+ TBlockEvents<TEvStatistics::TEvPropagateStatistics> blockPropagate (runtime,
342+ [&](const TEvStatistics::TEvPropagateStatistics::TPtr& ev) {
343+ return ev->Recipient .NodeId () == runtime.GetNodeId (otherNodeIdx);
344+ });
345+
346+ // Wait until correct statistics get reported
347+ ValidateRowCount (runtime, nodeIdx, pathId, rowCount1);
348+
349+ // Block persisting new updates from schemeshards on the aggregator.
350+ // This should result in old statistics being reported, even after new
351+ // updates arrive.
// The filter matches puts by the tablet id embedded in the blob id.
352+ TBlockEvents<TEvBlobStorage::TEvPut> blockPersistStats (runtime,
353+ [&](const TEvBlobStorage::TEvPut::TPtr& ev) {
354+ return ev->Get ()->Id .TabletID () == saTabletId;
355+ });
356+
357+ // Upsert some more data
358+ const size_t rowCount2 = 7 ;
359+ FillTable (env, " Database" , " Table" , rowCount2);
360+
361+ {
362+ // Wait for an update from SchemeShard with new row count.
363+
// The observer only sets a flag; it fires when an entry for pathId carries
// full stats with RowCount == rowCount2.
// NOTE(review): both observer holders are locals of this scope; presumably
// the observers are removed when the holders are destroyed -- confirm
// against AddObserver.
364+ bool statsUpdateSent = false ;
365+ auto sendObserver = runtime.AddObserver <TEvStatistics::TEvSchemeShardStats>([&](auto & ev){
366+ NKikimrStat::TSchemeShardStats statRecord;
367+ UNIT_ASSERT (statRecord.ParseFromString (ev->Get ()->Record .GetStats ()));
368+ for (const auto & entry : statRecord.GetEntries ()) {
369+ if (TPathId::FromProto (entry.GetPathId ()) == pathId
370+ && entry.GetAreStatsFull ()
371+ && entry.GetRowCount () == rowCount2) {
372+ statsUpdateSent = true ;
373+ }
374+ }
375+ });
376+ runtime.WaitFor (" TEvSchemeShardStats" , [&]{ return statsUpdateSent; });
377+
// This observer is installed only after the stats update above was seen, so
// it waits for a propagate event to nodeIdx that is sent after that point.
378+ bool propagateSent = false ;
379+ auto propagateObserver = runtime.AddObserver <TEvStatistics::TEvPropagateStatistics>([&](auto & ev){
380+ if (ev->Recipient .NodeId () == runtime.GetNodeId (nodeIdx)) {
381+ propagateSent = true ;
382+ }
383+ });
384+ runtime.WaitFor (" TEvPropagateStatistics" , [&]{ return propagateSent; });
385+ }
// Persisting is still blocked, so the previously reported value is expected.
386+ UNIT_ASSERT_VALUES_EQUAL (GetRowCount (runtime, nodeIdx, pathId), rowCount1);
387+
388+ TActorId sender = runtime.AllocateEdgeActor ();
389+ RebootTablet (runtime, ssTabletId, sender);
390+
391+ // Simulate storage failure, StatisticsAggregator will reboot.
392+
// TEvSchemeShardStats is blocked (no filter, i.e. all of them) before the
// failed puts are delivered, and released only at the very end of the test.
393+ TBlockEvents<TEvStatistics::TEvSchemeShardStats> blockSSUpdates (runtime);
394+ UNIT_ASSERT_GT (blockPersistStats.size (), 0 );
395+ blockPersistStats.Stop ();
// Each captured put is answered with an ERROR response sent from the put's
// recipient (the proxy) back to the put's original sender.
396+ for (auto & ev : blockPersistStats) {
397+ auto proxy = ev->Recipient ;
398+ ui32 groupId = GroupIDFromBlobStorageProxyID (proxy);
399+ auto res = ev->Get ()->MakeErrorResponse (
400+ NKikimrProto::ERROR, " Something went wrong" , TGroupId::FromValue (groupId));
// NOTE(review): this nodeIdx shadows the outer `const ui32 nodeIdx` inside
// the loop body; here it is the sender's node id minus the first node id.
401+ ui32 nodeIdx = ev->Sender .NodeId () - runtime.GetFirstNodeId ();
402+ runtime.Send (new IEventHandle (ev->Sender , proxy, res.release ()), nodeIdx, true );
403+ }
// Run the runtime with TEvTablet::EvBoot registered as a final event.
404+ TDispatchOptions rebootOptions;
405+ rebootOptions.FinalEvents .emplace_back (TEvTablet::EvBoot);
406+ runtime.DispatchEvents (rebootOptions);
407+
408+ // Check that after reboot the old value is still persisted by the Aggregator
409+ // and returned to the Service.
410+ blockPropagate.Stop ();
411+ UNIT_ASSERT_VALUES_EQUAL (GetRowCount (runtime, otherNodeIdx, pathId), rowCount1);
412+
413+ // After everything is healed, stats should get updated.
414+ blockSSUpdates.Stop ();
415+ WaitForRowCount (runtime, otherNodeIdx, pathId, rowCount2);
416+ }
378417}
379418
380419} // NSysView
0 commit comments