diff --git a/src/yb/master/async_rpc_tasks.cc b/src/yb/master/async_rpc_tasks.cc index 5d61f88b009c..723e97b7c456 100644 --- a/src/yb/master/async_rpc_tasks.cc +++ b/src/yb/master/async_rpc_tasks.cc @@ -329,15 +329,21 @@ void RetryingTSRpcTask::DoRpcCallback() { LOG_WITH_PREFIX(WARNING) << "TS " << target_ts_desc_->permanent_uuid() << ": " << type_name() << " RPC failed for tablet " << tablet_id() << ": " << rpc_.status().ToString(); - if (!target_ts_desc_->IsLive()) { + if (type() == MonitoredTaskType::kBackendsCatalogVersionTs && rpc_.status().IsRemoteError() && + rpc_.status().message().ToBuffer().find("invalid method name:") != std::string::npos) { + LOG_WITH_PREFIX(WARNING) + << "TS " << target_ts_desc_->permanent_uuid() << " is on an older version that doesn't" + << " support backends catalog version RPC. Ignoring."; + TransitionToCompleteState(); + } else if (!target_ts_desc_->IsLive()) { switch (type()) { case MonitoredTaskType::kBackendsCatalogVersionTs: // A similar check is done in BackendsCatalogVersionTS::HandleResponse. This check is hit // when this RPC failed and tserver is dead. That check is hit when this RPC succeeded // and tserver is dead. LOG_WITH_PREFIX(WARNING) - << "TS " << target_ts_desc_->permanent_uuid() << "is DEAD. Assume backends on that TS" - << " will be resolved to sufficient catalog version"; + << "TS " << target_ts_desc_->permanent_uuid() << " is DEAD. Assume backends on that" + << " TS will be resolved to sufficient catalog version"; TransitionToCompleteState(); break; case MonitoredTaskType::kDeleteReplica: diff --git a/src/yb/master/ysql_backends_manager.cc b/src/yb/master/ysql_backends_manager.cc index 8b130f562631..69ce095d9d75 100644 --- a/src/yb/master/ysql_backends_manager.cc +++ b/src/yb/master/ysql_backends_manager.cc @@ -765,6 +765,17 @@ void BackendsCatalogVersionTS::HandleResponse(int attempt) { // load. should_retry = false; } + if (status.message().ToBuffer().find("column \"catalog_version\" does not exist") != + std::string::npos) { + // This likely means ysql upgrade hasn't happened yet. Succeed for backwards + // compatibility. + TransitionToCompleteState(); + LOG_WITH_PREFIX(INFO) + << "wait for backends catalog version failed: " << status + << ", but ignoring for backwards compatibility"; + found_behind_ = true; + return; + } } break; case TabletServerErrorPB::OPERATION_NOT_SUPPORTED: @@ -802,22 +813,30 @@ void BackendsCatalogVersionTS::UnregisterAsyncTaskCallback() { return; } - // Identify dead tserver. - bool was_dead = found_dead_; + // There are multiple cases where we consider a tserver to be resolved: + // - Num lagging backends is zero: directly resolved + // - Tserver was found dead in HandleResponse: indirectly resolved assuming issue #13369. + // - Tserver was found dead in DoRpcCallback: (same). + // - Tserver was found behind in HandleResponse: indirectly resolved for compatibility during + // upgrade: it is possible backends are actually behind. + // - Tserver was found behind in DoRpcCallback: (same). + bool indirectly_resolved = found_dead_ || found_behind_; if (!rpc_.status().ok()) { - // Only way for rpc status to be not okay is from ts not being live in - // RetryingTSRpcTask::DoRpcCallback, resulting in TransitionToCompleteState. + // Only way for rpc status to be not okay is from TransitionToCompleteState in + // RetryingTSRpcTask::DoRpcCallback. That can happen in any of the following ways: + // - ts died. + // - ts is on an older version that doesn't support the RPC. DCHECK_EQ(state(), MonitoredTaskState::kComplete); DCHECK(!resp_.has_error()) << LogPrefix() << resp_.error().ShortDebugString(); - was_dead = true; + indirectly_resolved = true; } // Find failures. Status status; - if (!was_dead) { - // Only live tservers can be considered to have failures since dead tservers are considered - // resolved. This check comes first because dead tservers could still get resp error like - // catalog version too old. + if (!indirectly_resolved) { + // Only live tservers can be considered to have failures since dead or behind tservers are + // considered resolved. Dead tservers could still get resp error like catalog version too old, + // but we don't want to throw error when they are already considered resolved. // TODO(#13369): ensure dead tservers abort/block ops until they successfully heartbeat. if (resp_.has_error()) { @@ -839,11 +858,16 @@ void BackendsCatalogVersionTS::UnregisterAsyncTaskCallback() { } if (auto job = job_.lock()) { - if (was_dead) { - // Dead tservers are assumed to be resolved to latest. - // TODO(#13369): ensure dead tservers abort/block ops until they successfully heartbeat. - LOG_WITH_PREFIX(INFO) - << "tserver died, so assuming its backends are at latest catalog version"; + if (indirectly_resolved) { + // There are three cases of indirectly resolved tservers, outlined in a comment above. + if (found_dead_ || !target_ts_desc_->IsLive()) { + // The two tserver-found-dead cases. + LOG_WITH_PREFIX(INFO) + << "tserver died, so assuming its backends are at latest catalog version"; + } else { + // The two tserver-found-behind cases. + LOG_WITH_PREFIX(INFO) << "tserver behind, so skipping backends catalog version check"; + } job->Update(permanent_uuid(), 0); } else if (status.ok()) { DCHECK(resp_.has_num_lagging_backends()); diff --git a/src/yb/master/ysql_backends_manager.h b/src/yb/master/ysql_backends_manager.h index 38b7f09b3cb6..23851a0e21bb 100644 --- a/src/yb/master/ysql_backends_manager.h +++ b/src/yb/master/ysql_backends_manager.h @@ -397,6 +397,10 @@ class BackendsCatalogVersionTS : public RetryingTSRpcTask { // master-to-tserver RPC deadline. const int prev_num_lagging_backends_; + // Whether the tserver is considered behind. This is checked and set true on HandleResponse when + // the response error complains about mismatched schema most likely due to not having run + // upgrade_ysql. + bool found_behind_ = false; // Whether the tserver is considered dead (expired). This is checked and set true on // HandleResponse. It may be the case that this is not set and tserver is found dead through a // different way (e.g. rpc failure).