@@ -2283,6 +2283,120 @@ Y_UNIT_TEST_SUITE(DataShardVolatile) {
2283
2283
UNIT_ASSERT_VALUES_EQUAL (volatileTxs, 2u );
2284
2284
}
2285
2285
2286
+ // Regression test for KIKIMR-21156
2287
+ Y_UNIT_TEST (VolatileCommitOnBlobStorageFailure) {
2288
+ TPortManager pm;
2289
+ TServerSettings serverSettings (pm.GetPort (2134 ));
2290
+ serverSettings.SetDomainName (" Root" )
2291
+ .SetUseRealThreads (false )
2292
+ .SetDomainPlanResolution (1000 )
2293
+ .SetEnableDataShardVolatileTransactions (true );
2294
+
2295
+ Tests::TServer::TPtr server = new TServer (serverSettings);
2296
+ auto &runtime = *server->GetRuntime ();
2297
+ auto sender = runtime.AllocateEdgeActor ();
2298
+
2299
+ runtime.SetLogPriority (NKikimrServices::TX_DATASHARD, NLog::PRI_TRACE);
2300
+ runtime.SetLogPriority (NKikimrServices::TX_PROXY, NLog::PRI_DEBUG);
2301
+ runtime.SetLogPriority (NKikimrServices::KQP_EXECUTER, NLog::PRI_TRACE);
2302
+ runtime.SetLogPriority (NKikimrServices::KQP_SESSION, NLog::PRI_TRACE);
2303
+
2304
+ InitRoot (server, sender);
2305
+
2306
+ TDisableDataShardLogBatching disableDataShardLogBatching;
2307
+ CreateShardedTable (server, sender, " /Root" , " table-1" , 1 );
2308
+ CreateShardedTable (server, sender, " /Root" , " table-2" , 1 );
2309
+
2310
+ // Make sure read flags are persisted by performing a snapshot read
2311
+ UNIT_ASSERT_VALUES_EQUAL (
2312
+ KqpSimpleExec (runtime, R"(
2313
+ SELECT key, value FROM `/Root/table-1`
2314
+ UNION ALL
2315
+ SELECT key, value FROM `/Root/table-2`
2316
+ ORDER BY key
2317
+ )" ),
2318
+ " " );
2319
+
2320
+ // Insert initial values
2321
+ ExecSQL (server, sender, Q_ (" UPSERT INTO `/Root/table-1` (key, value) VALUES (1, 10);" ));
2322
+ ExecSQL (server, sender, Q_ (" UPSERT INTO `/Root/table-2` (key, value) VALUES (2, 20);" ));
2323
+
2324
+ // Start blocking commits for table-1
2325
+ const auto shards1 = GetTableShards (server, sender, " /Root/table-1" );
2326
+ UNIT_ASSERT_VALUES_EQUAL (shards1.size (), 1u );
2327
+ std::deque<TEvBlobStorage::TEvPut::TPtr> blockedPuts;
2328
+ auto blockCommits = runtime.AddObserver <TEvBlobStorage::TEvPut>([&](TEvBlobStorage::TEvPut::TPtr& ev) {
2329
+ auto * msg = ev->Get ();
2330
+ // Drop all put requests for table-1
2331
+ if (msg->Id .TabletID () == shards1.at (0 )) {
2332
+ Cerr << " ... blocking put " << msg->Id << Endl;
2333
+ blockedPuts.push_back (std::move (ev));
2334
+ }
2335
+ });
2336
+
2337
+ // Start an upsert to table-1, this will block further readonly localdb tx completions
2338
+ Cerr << " ... starting an upsert to table-1" << Endl;
2339
+ auto firstUpsertFuture = KqpSimpleSend (runtime, R"(
2340
+ UPSERT INTO `/Root/table-1` (key, value) VALUES (3, 30);
2341
+ )" );
2342
+
2343
+ // Wait until puts are blocked
2344
+ WaitFor (runtime, [&]{ return blockedPuts.size () > 0 ; }, " blocked puts" );
2345
+ auto firstUpsertPuts = std::move (blockedPuts);
2346
+ UNIT_ASSERT (blockedPuts.empty ());
2347
+
2348
+ // Read from table-2 and write to table-1 based on the result
2349
+ // This will result in a two-shard volatile tx writing to table-1
2350
+ Cerr << " ... starting distributed tx between table-1 and table-2" << Endl;
2351
+ auto volatileFuture = KqpSimpleSend (runtime, R"(
2352
+ UPSERT INTO `/Root/table-1`
2353
+ SELECT key + 2u AS key, value + 2u AS value
2354
+ FROM `/Root/table-2`;
2355
+ )" );
2356
+
2357
+ // Wait until it also tries to commit
2358
+ WaitFor (runtime, [&]{ return blockedPuts.size () > 0 ; }, " blocked puts" );
2359
+
2360
+ // Now unblock the first upsert puts
2361
+ blockCommits.Remove ();
2362
+ for (auto & ev : firstUpsertPuts) {
2363
+ runtime.Send (ev.Release (), 0 , true );
2364
+ }
2365
+ firstUpsertPuts.clear ();
2366
+
2367
+ // And wait for it to finish successfully
2368
+ Cerr << " ... waiting for first upsert result" << Endl;
2369
+ UNIT_ASSERT_VALUES_EQUAL (
2370
+ FormatResult (AwaitResponse (runtime, std::move (firstUpsertFuture))),
2371
+ " <empty>" );
2372
+
2373
+ // Reply to everything previously blocked with an error, the shard will restart
2374
+ for (auto & ev : blockedPuts) {
2375
+ auto proxy = ev->Recipient ;
2376
+ ui32 groupId = GroupIDFromBlobStorageProxyID (proxy);
2377
+ auto res = ev->Get ()->MakeErrorResponse (NKikimrProto::ERROR, " Something went wrong" , groupId);
2378
+ runtime.Send (new IEventHandle (ev->Sender , proxy, res.release ()), 0 , true );
2379
+ }
2380
+
2381
+ // Wait for the volatile tx result
2382
+ Cerr << " ... waiting for volatile tx result" << Endl;
2383
+ auto result = FormatResult (AwaitResponse (runtime, std::move (volatileFuture)));
2384
+ if (result == " <empty>" ) {
2385
+ // A success result is not ok now, but in the future we might migrate state
2386
+ // Check that the supposedly committed row actually exists
2387
+ UNIT_ASSERT_VALUES_EQUAL (
2388
+ KqpSimpleExec (runtime, R"(
2389
+ SELECT key, value FROM `/Root/table-1` ORDER BY key;
2390
+ )" ),
2391
+ " { items { uint32_value: 1 } items { uint32_value: 10 } }, "
2392
+ " { items { uint32_value: 3 } items { uint32_value: 30 } }, "
2393
+ " { items { uint32_value: 4 } items { uint32_value: 22 } }" );
2394
+ } else {
2395
+ // Otherwise the result must be undetermined
2396
+ UNIT_ASSERT_VALUES_EQUAL (result, " ERROR: UNDETERMINED" );
2397
+ }
2398
+ }
2399
+
2286
2400
} // Y_UNIT_TEST_SUITE(DataShardVolatile)
2287
2401
2288
2402
} // namespace NKikimr
0 commit comments