Skip to content

Commit

Permalink
[BACKPORT 2.14][#20510] Docdb: Handle backfill responses getting inte…
Browse files Browse the repository at this point in the history
…rleaved across different backfill operations

Summary:
Original commit: 0fe1bba / D31571
If the master is slow, or if network delays cause a backfill response to be processed when a different backfill operation is running on the same table, we may run into

```
[m-1] [libprotobuf FATAL /opt/yb-build/thirdparty/yugabyte-db-thirdparty-v20231120041920-e307bff3a7-macos-arm64/installed/uninstrumented/include/google/protobuf/map.h:1064] CHECK failed: it != end(): key not found: ebd5a1637b4746f9bf4d6ab7218ffeee
```

It may affect both ysql and ycql backfills. However, it is less likely to be seen on ysql because the create index call in ysql is synchronous.

This change looks to check if the reponse being handled and the current backfill job are for the same indexes, if not it bails out gracefully.
Jira: DB-9516

Test Plan: ./yb_build.sh --cxx-test integration-tests_cassandra_cpp_driver-test --gtest_filter CppCassandraDriverTest.ConcurrentBackfillIndexFailures

Reviewers: jason, hsunder

Reviewed By: jason

Subscribers: ybase, bogdan

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D31601
  • Loading branch information
amitanandaiyer committed Jan 24, 2024
1 parent 3f49fa4 commit b0b6e14
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 0 deletions.
46 changes: 46 additions & 0 deletions src/yb/integration-tests/cassandra_cpp_driver-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2058,6 +2058,52 @@ TEST_F_EX(CppCassandraDriverTest, TestBackfillBatchingDisabled, CppCassandraDriv
testBackfillBatching(false);
}

TEST_F_EX(CppCassandraDriverTest, ConcurrentBackfillIndexFailures, CppCassandraDriverTest) {
ASSERT_OK(session_.ExecuteQuery("use test;"));
ASSERT_OK(
session_.ExecuteQuery("create table test.test_table (k int, v text, PRIMARY KEY (k)) "
"with transactions = {'enabled' : true} AND tablets = 10;"));

LOG(INFO) << "Creating two indexes that will NOT backfill together";
ASSERT_OK(cluster_->SetFlagOnMasters("allow_batching_non_deferred_indexes", "false"));
ASSERT_OK(cluster_->SetFlagOnMasters("TEST_block_do_backfill", "true"));

ASSERT_OK(
cluster_->SetFlagOnTServers("TEST_index_backfill_fail_after_random_wait_upto_ms", "1000"));
ASSERT_OK(session_.ExecuteQuery("CREATE INDEX test_table_index_by_v0 ON test_table (v)"));
ASSERT_OK(session_.ExecuteQuery("CREATE INDEX test_table_index_by_v1 ON test_table (v)"));

constexpr auto kNamespace = "test";
const YBTableName table_name(YQL_DATABASE_CQL, kNamespace, "test_table");
const YBTableName index_table_name0(YQL_DATABASE_CQL, kNamespace, "test_table_index_by_v0");
const YBTableName index_table_name1(YQL_DATABASE_CQL, kNamespace, "test_table_index_by_v1");

for (const auto& idx : {index_table_name0, index_table_name1}) {
auto perm = ASSERT_RESULT(client_->WaitUntilIndexPermissionsAtLeast(
table_name, idx, IndexPermissions::INDEX_PERM_DO_BACKFILL));
ASSERT_EQ(perm, IndexPermissions::INDEX_PERM_DO_BACKFILL);
}

// Allow backfill to get past GetSafeTime
ASSERT_OK(WaitForBackfillSafeTimeOn(cluster_.get(), table_name));

ASSERT_OK(cluster_->SetFlagOnMasters("TEST_block_do_backfill", "false"));

// Wait for the backfill to actually run to completion/failure.
for (const auto& idx : {index_table_name0, index_table_name1}) {
auto res = client_->WaitUntilIndexPermissionsAtLeast(
table_name, idx, IndexPermissions::INDEX_PERM_NOT_USED);
EXPECT_NOT_OK(res);
EXPECT_TRUE(res.status().IsNotFound());
}

// Wait for responses from "slower tablets" to also be received, as
// this is what caused the master to crash prior to #20510.
SleepFor(MonoDelta::FromSeconds(10));

ASSERT_TRUE(cluster_->master()->IsProcessAlive());
}

TEST_F_EX(
CppCassandraDriverTest,
DeleteIndexWhileBackfilling,
Expand Down
6 changes: 6 additions & 0 deletions src/yb/master/backfill_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,12 @@ Status BackfillTable::MarkIndexesAsDesired(
}
auto* backfill_state_pb = indexed_table_pb.mutable_backfill_jobs(0)->mutable_backfill_state();
for (const auto& idx_id : index_ids_set) {
auto iter = backfill_state_pb->find(idx_id);
if (iter == backfill_state_pb->end()) {
LOG(INFO) << "Index " << idx_id << " is not being backfilled. Current backfill_job: "
<< indexed_table_pb.backfill_jobs(0).ShortDebugString();
return STATUS_FORMAT(InvalidArgument, "Index $0 is not being backfilled", idx_id);
}
backfill_state_pb->at(idx_id) = state;
VLOG(2) << "Marking index " << idx_id << " as " << BackfillJobPB_State_Name(state);
}
Expand Down
14 changes: 14 additions & 0 deletions src/yb/tserver/tablet_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ DEFINE_int32(index_backfill_additional_delay_before_backfilling_ms, 0,
TAG_FLAG(index_backfill_additional_delay_before_backfilling_ms, evolving);
TAG_FLAG(index_backfill_additional_delay_before_backfilling_ms, runtime);

DEFINE_test_flag(int32, index_backfill_fail_after_random_wait_upto_ms, 0,
"If set to > 0 BackfillIndex calls will be failed after randomly waiting.");

DEFINE_int32(index_backfill_wait_for_old_txns_ms, 0,
"Index backfill needs to wait for transactions that started before the "
"WRITE_AND_DELETE phase to commit or abort before choosing a time for "
Expand Down Expand Up @@ -652,6 +655,17 @@ void TabletServiceAdminImpl::BackfillIndex(
return;
}

auto max_sleep_ms = GetAtomicFlag(&FLAGS_TEST_index_backfill_fail_after_random_wait_upto_ms);
if (max_sleep_ms > 0) {
auto rand_wait = RandomUniformInt(0, max_sleep_ms);
LOG(INFO) << "Randomly sleeping for " << rand_wait << " ms before failing";
SleepFor(1ms * rand_wait);
SetupErrorAndRespond(
resp->mutable_error(), STATUS_FORMAT(InvalidArgument, "Failing as requested for testing."),
TabletServerErrorPB::OPERATION_NOT_SUPPORTED, &context);
return;
}

const uint32_t our_schema_version = tablet.peer->tablet_metadata()->schema_version();
const uint32_t their_schema_version = req->schema_version();
bool all_at_backfill = true;
Expand Down

0 comments on commit b0b6e14

Please sign in to comment.