Skip to content

Commit

Permalink
[#18124] YSQL: make backends catalog version check upgrade-aware
Browse files Browse the repository at this point in the history
Summary:
Commit a1729c3 titled

    [#7376] ysql: fix state waits for online CREATE INDEX

fixes CREATE INDEX state waits by adding WaitForBackendsCatalogVersion
RPCs that fan out to tservers.  It was not made rolling upgrade safe
because DDLs are normally not supported during rolling upgrade.
However, it is nice to still make a best effort to support the case, so
handle it by ignoring the version check when the following two cases
happen:

- tservers are on old version: then they don't recognize the RPC.
- YSQL upgrade did not run: if the src->dst src version is too old, it
  may not have the column "catalog_version" in pg_stat_activity.

One minor difference is that, previously, it had a hard-coded 1s sleep.
With the ignore logic, it may delay for less than that, but this likely
won't be the case in a real-world setting where RPC latencies are
decently high.

Also add a missing space in an existing log message.
Jira: DB-7155

Test Plan:
1. Do release build: ./yb_build.sh release.
1. Get an old release:

       curl "https://downloads.yugabyte.com/releases/2.16.4.0/yugabyte-2.16.4.0-b32-linux-x86_64.tar.gz" \
         | tar xzv -C /tmp

1. Run the following script passing in that dir:

       ./script.sh /tmp/yugabyte-2.16.4.0

   where

       #!/usr/bin/env bash
       # script.sh
       set -euxo pipefail

       if [ $# -ne 1 ]; then
         echo "Missing argument path to old release" >&2
         exit 1
       fi

       OLD_RELEASE_PATH=$1

       "$OLD_RELEASE_PATH"/bin/yb-ctl destroy
       "$OLD_RELEASE_PATH"/bin/yb-ctl create --rf 3

       "$OLD_RELEASE_PATH"/bin/ysqlsh -c 'CREATE TABLE t (i int)'

       YB_ADMIN_CMD_PREFIX=(
         build/latest/bin/yb-admin
         "--master_addresses=127.0.0.1,127.0.0.2,127.0.0.3"
       )

       function test_case() {
         master_idx=$1
         num_tservers_not_migrated=$2
         num_tservers_old_version=$3

         # Use fresh master for each test case in order to get a fresh set of logs.
         master_uuid=$("${YB_ADMIN_CMD_PREFIX[@]}" list_all_masters \
                         | sed "$((master_idx + 1))q;d" \
                         | awk '{print$1}')
         "${YB_ADMIN_CMD_PREFIX[@]}" master_leader_stepdown "$master_uuid"
         "${YB_ADMIN_CMD_PREFIX[@]}" list_all_masters \
           | sed "$((master_idx + 1))q;d" \
           | grep -q LEADER

         sleep 5

         bin/ysqlsh -c 'CREATE INDEX ON t (i)'
         # 2 backends catalog version requests per CREATE INDEX, hence x2.
         log_file=~/yugabyte-data/node-"$master_idx"/disk-1/yb-data/master/logs/yb-master.INFO
         test "$(grep -c "but ignoring for backwards compatibility" "$log_file")" \
              -eq $((num_tservers_not_migrated * 2))
         test "$(grep -c "is on an older version" "$log_file")" \
              -eq $((num_tservers_old_version * 2))
       }

       bin/yb-ctl restart_node 1 --master
       bin/yb-ctl restart_node 2 --master
       bin/yb-ctl restart_node 3 --master

       # Test case: not all tservers upgraded.
       # - 1 tserver: up-to-date version, but YSQL upgrade did not happen.
       # - 2 tservers: old version.
       bin/yb-ctl restart_node 1
       test_case 1 1 2

       # Test case: all tservers upgraded, but ysql upgrade not run.
       bin/yb-ctl restart_node 2
       bin/yb-ctl restart_node 3
       test_case 2 3 0

       # Test case: everything upgraded.
       "${YB_ADMIN_CMD_PREFIX[@]}" upgrade_ysql
       test_case 3 0 0

Close: #18124

Reviewers: skedia

Reviewed By: skedia

Subscribers: ybase, bogdan

Differential Revision: https://phorge.dev.yugabyte.com/D26727
  • Loading branch information
jaki committed Jul 13, 2023
1 parent 825ce0e commit 811fa92
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 17 deletions.
12 changes: 9 additions & 3 deletions src/yb/master/async_rpc_tasks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,15 +329,21 @@ void RetryingTSRpcTask::DoRpcCallback() {
LOG_WITH_PREFIX(WARNING) << "TS " << target_ts_desc_->permanent_uuid() << ": "
<< type_name() << " RPC failed for tablet "
<< tablet_id() << ": " << rpc_.status().ToString();
if (!target_ts_desc_->IsLive()) {
if (type() == MonitoredTaskType::kBackendsCatalogVersionTs && rpc_.status().IsRemoteError() &&
rpc_.status().message().ToBuffer().find("invalid method name:") != std::string::npos) {
LOG_WITH_PREFIX(WARNING)
<< "TS " << target_ts_desc_->permanent_uuid() << " is on an older version that doesn't"
<< " support backends catalog version RPC. Ignoring.";
TransitionToCompleteState();
} else if (!target_ts_desc_->IsLive()) {
switch (type()) {
case MonitoredTaskType::kBackendsCatalogVersionTs:
// A similar check is done in BackendsCatalogVersionTS::HandleResponse. This check is hit
// when this RPC failed and tserver is dead. That check is hit when this RPC succeeded
// and tserver is dead.
LOG_WITH_PREFIX(WARNING)
<< "TS " << target_ts_desc_->permanent_uuid() << "is DEAD. Assume backends on that TS"
<< " will be resolved to sufficient catalog version";
<< "TS " << target_ts_desc_->permanent_uuid() << " is DEAD. Assume backends on that"
<< " TS will be resolved to sufficient catalog version";
TransitionToCompleteState();
break;
case MonitoredTaskType::kDeleteReplica:
Expand Down
52 changes: 38 additions & 14 deletions src/yb/master/ysql_backends_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,17 @@ void BackendsCatalogVersionTS::HandleResponse(int attempt) {
// load.
should_retry = false;
}
if (status.message().ToBuffer().find("column \"catalog_version\" does not exist") !=
std::string::npos) {
// This likely means ysql upgrade hasn't happened yet. Succeed for backwards
// compatibility.
TransitionToCompleteState();
LOG_WITH_PREFIX(INFO)
<< "wait for backends catalog version failed: " << status
<< ", but ignoring for backwards compatibility";
found_behind_ = true;
return;
}
}
break;
case TabletServerErrorPB::OPERATION_NOT_SUPPORTED:
Expand Down Expand Up @@ -802,22 +813,30 @@ void BackendsCatalogVersionTS::UnregisterAsyncTaskCallback() {
return;
}

// Identify dead tserver.
bool was_dead = found_dead_;
// There are multiple cases where we consider a tserver to be resolved:
// - Num lagging backends is zero: directly resolved
// - Tserver was found dead in HandleResponse: indirectly resolved assuming issue #13369.
// - Tserver was found dead in DoRpcCallback: (same).
// - Tserver was found behind in HandleResponse: indirectly resolved for compatibility during
// upgrade: it is possible backends are actually behind.
// - Tserver was found behind in DoRpcCallback: (same).
bool indirectly_resolved = found_dead_ || found_behind_;
if (!rpc_.status().ok()) {
// Only way for rpc status to be not okay is from ts not being live in
// RetryingTSRpcTask::DoRpcCallback, resulting in TransitionToCompleteState.
// Only way for rpc status to be not okay is from TransitionToCompleteState in
// RetryingTSRpcTask::DoRpcCallback. That can happen in any of the following ways:
// - ts died.
// - ts is on an older version that doesn't support the RPC.
DCHECK_EQ(state(), MonitoredTaskState::kComplete);
DCHECK(!resp_.has_error()) << LogPrefix() << resp_.error().ShortDebugString();
was_dead = true;
indirectly_resolved = true;
}

// Find failures.
Status status;
if (!was_dead) {
// Only live tservers can be considered to have failures since dead tservers are considered
// resolved. This check comes first because dead tservers could still get resp error like
// catalog version too old.
if (!indirectly_resolved) {
// Only live tservers can be considered to have failures since dead or behind tservers are
// considered resolved. Dead tservers could still get resp error like catalog version too old,
// but we don't want to throw error when they are already considered resolved.
// TODO(#13369): ensure dead tservers abort/block ops until they successfully heartbeat.

if (resp_.has_error()) {
Expand All @@ -839,11 +858,16 @@ void BackendsCatalogVersionTS::UnregisterAsyncTaskCallback() {
}

if (auto job = job_.lock()) {
if (was_dead) {
// Dead tservers are assumed to be resolved to latest.
// TODO(#13369): ensure dead tservers abort/block ops until they successfully heartbeat.
LOG_WITH_PREFIX(INFO)
<< "tserver died, so assuming its backends are at latest catalog version";
if (indirectly_resolved) {
// There are three cases of indirectly resolved tservers, outlined in a comment above.
if (found_dead_ || !target_ts_desc_->IsLive()) {
// The two tserver-found-dead cases.
LOG_WITH_PREFIX(INFO)
<< "tserver died, so assuming its backends are at latest catalog version";
} else {
// The two tserver-found-behind cases.
LOG_WITH_PREFIX(INFO) << "tserver behind, so skipping backends catalog version check";
}
job->Update(permanent_uuid(), 0);
} else if (status.ok()) {
DCHECK(resp_.has_num_lagging_backends());
Expand Down
4 changes: 4 additions & 0 deletions src/yb/master/ysql_backends_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,10 @@ class BackendsCatalogVersionTS : public RetryingTSRpcTask {
// master-to-tserver RPC deadline.
const int prev_num_lagging_backends_;

// Whether the tserver is considered behind. This is checked and set true on HandleResponse when
// the response error complains about mismatched schema most likely due to not having run
// upgrade_ysql.
bool found_behind_ = false;
// Whether the tserver is considered dead (expired). This is checked and set true on
// HandleResponse. It may be the case that this is not set and tserver is found dead through a
// different way (e.g. rpc failure).
Expand Down

0 comments on commit 811fa92

Please sign in to comment.