@@ -53,62 +53,6 @@ void CreateTableWithGlobalIndex(TTestEnv& env, const TString& databaseName, cons
5353 FillTable (env, databaseName, tableName, rowCount);
5454}
5555
56- void ValidateRowCount (TTestActorRuntime& runtime, ui32 nodeIndex, TPathId pathId, size_t expectedRowCount) {
57- auto statServiceId = NStat::MakeStatServiceID (runtime.GetNodeId (nodeIndex));
58- ui64 rowCount = 0 ;
59- while (rowCount == 0 ) {
60- NStat::TRequest req;
61- req.PathId = pathId;
62-
63- auto evGet = std::make_unique<TEvStatistics::TEvGetStatistics>();
64- evGet->StatType = NStat::EStatType::SIMPLE;
65- evGet->StatRequests .push_back (req);
66-
67- auto sender = runtime.AllocateEdgeActor (nodeIndex);
68- runtime.Send (statServiceId, sender, evGet.release (), nodeIndex, true );
69- auto evResult = runtime.GrabEdgeEventRethrow <TEvStatistics::TEvGetStatisticsResult>(sender);
70-
71- UNIT_ASSERT (evResult);
72- UNIT_ASSERT (evResult->Get ());
73- UNIT_ASSERT (evResult->Get ()->StatResponses .size () == 1 );
74-
75- auto rsp = evResult->Get ()->StatResponses [0 ];
76- auto stat = rsp.Simple ;
77-
78- rowCount = stat.RowCount ;
79-
80- if (rowCount != 0 ) {
81- UNIT_ASSERT (stat.RowCount == expectedRowCount);
82- break ;
83- }
84-
85- runtime.SimulateSleep (TDuration::Seconds (1 ));
86- }
87- }
88-
89- ui64 GetRowCount (TTestActorRuntime& runtime, ui32 nodeIndex, TPathId pathId) {
90- auto statServiceId = NStat::MakeStatServiceID (runtime.GetNodeId (nodeIndex));
91- NStat::TRequest req;
92- req.PathId = pathId;
93-
94- auto evGet = std::make_unique<TEvStatistics::TEvGetStatistics>();
95- evGet->StatType = NStat::EStatType::SIMPLE;
96- evGet->StatRequests .push_back (req);
97-
98- auto sender = runtime.AllocateEdgeActor (nodeIndex);
99- runtime.Send (statServiceId, sender, evGet.release (), nodeIndex, true );
100- auto evResult = runtime.GrabEdgeEventRethrow <TEvStatistics::TEvGetStatisticsResult>(sender);
101-
102- UNIT_ASSERT (evResult);
103- UNIT_ASSERT (evResult->Get ());
104- UNIT_ASSERT (evResult->Get ()->StatResponses .size () == 1 );
105-
106- auto rsp = evResult->Get ()->StatResponses [0 ];
107- auto stat = rsp.Simple ;
108-
109- return stat.RowCount ;
110- }
111-
11256} // namespace
11357
11458Y_UNIT_TEST_SUITE (BasicStatistics) {
@@ -291,6 +235,103 @@ Y_UNIT_TEST_SUITE(BasicStatistics) {
291235 auto pathId = ResolvePathId (runtime, " /Root/Serverless/Table/ValueIndex/indexImplTable" );
292236 ValidateRowCount (runtime, 1 , pathId, 5 );
293237 }
238+
239+ Y_UNIT_TEST (PersistenceWithStorageFailuresAndReboots) {
240+ TTestEnv env (1 , 2 );
241+ auto & runtime = *env.GetServer ().GetRuntime ();
242+
243+ const size_t rowCount1 = 5 ;
244+
245+ CreateDatabase (env, " Database" , 2 );
246+ CreateTable (env, " Database" , " Table" , rowCount1);
247+
248+ ui64 saTabletId = 0 ;
249+ auto pathId = ResolvePathId (runtime, " /Root/Database/Table" , nullptr , &saTabletId);
250+ ui64 ssTabletId = pathId.OwnerId ;
251+
252+ const ui32 nodeIdx = 1 ;
253+ const ui32 otherNodeIdx = 2 ;
254+
255+ // Block propagate events that go to node with otherNodeIdx. We will use this
256+ // node later as a clean slate.
257+ TBlockEvents<TEvStatistics::TEvPropagateStatistics> blockPropagate (runtime,
258+ [&](const TEvStatistics::TEvPropagateStatistics::TPtr& ev) {
259+ return ev->Recipient .NodeId () == runtime.GetNodeId (otherNodeIdx);
260+ });
261+
262+ // Wait until correct statistics gets reported
263+ ValidateRowCount (runtime, nodeIdx, pathId, rowCount1);
264+
265+ // Block persisting new updates from schemeshards on the aggregator.
266+ // This should result in old statistics being reported, even after new
267+ // updates arrive.
268+ TBlockEvents<TEvBlobStorage::TEvPut> blockPersistStats (runtime,
269+ [&](const TEvBlobStorage::TEvPut::TPtr& ev) {
270+ return ev->Get ()->Id .TabletID () == saTabletId;
271+ });
272+
273+ // Upsert some more data
274+ const size_t rowCount2 = 7 ;
275+ FillTable (env, " Database" , " Table" , rowCount2);
276+
277+ {
278+ // Wait for an update from SchemeShard with new row count.
279+
280+ bool statsUpdateSent = false ;
281+ auto sendObserver = runtime.AddObserver <TEvStatistics::TEvSchemeShardStats>([&](auto & ev){
282+ NKikimrStat::TSchemeShardStats statRecord;
283+ UNIT_ASSERT (statRecord.ParseFromString (ev->Get ()->Record .GetStats ()));
284+ for (const auto & entry : statRecord.GetEntries ()) {
285+ if (TPathId::FromProto (entry.GetPathId ()) == pathId
286+ && entry.GetAreStatsFull ()
287+ && entry.GetRowCount () == rowCount2) {
288+ statsUpdateSent = true ;
289+ }
290+ }
291+ });
292+ runtime.WaitFor (" TEvSchemeShardStats" , [&]{ return statsUpdateSent; });
293+
294+ bool propagateSent = false ;
295+ auto propagateObserver = runtime.AddObserver <TEvStatistics::TEvPropagateStatistics>([&](auto & ev){
296+ if (ev->Recipient .NodeId () == runtime.GetNodeId (nodeIdx)) {
297+ propagateSent = true ;
298+ }
299+ });
300+ runtime.WaitFor (" TEvPropagateStatistics" , [&]{ return propagateSent; });
301+ }
302+ UNIT_ASSERT_VALUES_EQUAL (GetRowCount (runtime, nodeIdx, pathId), rowCount1);
303+
304+ TActorId sender = runtime.AllocateEdgeActor ();
305+ RebootTablet (runtime, ssTabletId, sender);
306+
307+ // Simulate storage failure, StatisticsAggregator will reboot.
308+
309+ TBlockEvents<TEvStatistics::TEvSchemeShardStats> blockSSUpdates (runtime);
310+ UNIT_ASSERT_GT (blockPersistStats.size (), 0 );
311+ blockPersistStats.Stop ();
312+ for (auto & ev : blockPersistStats) {
313+ auto proxy = ev->Recipient ;
314+ ui32 groupId = GroupIDFromBlobStorageProxyID (proxy);
315+ auto res = ev->Get ()->MakeErrorResponse (
316+ NKikimrProto::ERROR, " Something went wrong" , TGroupId::FromValue (groupId));
317+ ui32 nodeIdx = ev->Sender .NodeId () - runtime.GetFirstNodeId ();
318+ runtime.Send (new IEventHandle (ev->Sender , proxy, res.release ()), nodeIdx, true );
319+ }
320+ TDispatchOptions rebootOptions;
321+ rebootOptions.FinalEvents .emplace_back (TEvTablet::EvBoot);
322+ runtime.DispatchEvents (rebootOptions);
323+
324+ // Check that after reboot the old value is still persisted by the Aggregator
325+ // and returned to the Service.
326+ blockPropagate.Stop ();
327+ UNIT_ASSERT_VALUES_EQUAL (GetRowCount (runtime, otherNodeIdx, pathId), rowCount1);
328+
329+ // After everything is healed, stats should get updated.
330+ blockSSUpdates.Stop ();
331+ // Wait takes a long time because of long send intervals in schemeshard, raise the limit.
332+ runtime.SetScheduledLimit (200000 );
333+ WaitForRowCount (runtime, otherNodeIdx, pathId, rowCount2);
334+ }
294335}
295336
296337} // NSysView
0 commit comments