Skip to content

24-4: increase logging around tablet unlocks (#14741) #15669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion ydb/core/base/hive.h
Original file line number Diff line number Diff line change
Expand Up @@ -705,15 +705,25 @@ namespace NKikimr {
Record.SetStatus(status);
Record.SetStatusMessage(statusMessage);
}

TString ToString() const override {
TStringStream str;
str << ToStringHeader() << "{Status: " << NKikimrProto::EReplyStatus_Name(Record.GetStatus()).data();
str << " TabletID: " << Record.GetTabletID();
str << " Message: " << Record.GetStatusMessage();
str << "}";
return str.Str();
}
};

struct TEvLockTabletExecutionLost : public TEventPB<TEvLockTabletExecutionLost,
NKikimrHive::TEvLockTabletExecutionLost, EvLockTabletExecutionLost>
{
TEvLockTabletExecutionLost() = default;

explicit TEvLockTabletExecutionLost(ui64 tabletId) {
explicit TEvLockTabletExecutionLost(ui64 tabletId, NKikimrHive::ELockLostReason reason) {
Record.SetTabletID(tabletId);
Record.SetReason(reason);
}
};

Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/hive.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ inline void Out<NKikimr::NHive::TCompleteNotifications>(IOutputStream& o, const
if (it != n.Notifications.begin()) {
o << ',';
}
o << Hex(it->first->Type) << " " << it->first.Get()->Recipient;
o << Hex(it->first->Type) << " " << it->first.Get()->Recipient << " " << it->first->ToString();
}
}
}
Expand Down
9 changes: 6 additions & 3 deletions ydb/core/mind/hive/hive_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "hive.h"
#include "tablet_info.h"
#include "node_info.h"
#include "leader_tablet_info.h"

namespace NKikimr {
namespace NHive {
Expand Down Expand Up @@ -69,12 +70,14 @@ struct TEvPrivate {
struct TEvUnlockTabletReconnectTimeout : TEventLocal<TEvUnlockTabletReconnectTimeout, EvUnlockTabletReconnectTimeout> {
ui64 TabletId;
ui64 SeqNo;
NKikimrHive::ELockLostReason Reason;

TEvUnlockTabletReconnectTimeout() = default;

explicit TEvUnlockTabletReconnectTimeout(ui64 tabletId, ui64 seqNo)
: TabletId(tabletId)
, SeqNo(seqNo)
explicit TEvUnlockTabletReconnectTimeout(const TLeaderTabletInfo& tablet, NKikimrHive::ELockLostReason reason)
: TabletId(tablet.Id)
, SeqNo(tablet.PendingUnlockSeqNo)
, Reason(reason)
{}
};

Expand Down
4 changes: 2 additions & 2 deletions ydb/core/mind/hive/hive_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
ITransaction* CreateReassignGroupsOnDecommit(ui32 groupId, std::unique_ptr<IEventHandle> reply);
ITransaction* CreateLockTabletExecution(const NKikimrHive::TEvLockTabletExecution& rec, const TActorId& sender, const ui64 cookie);
ITransaction* CreateUnlockTabletExecution(const NKikimrHive::TEvUnlockTabletExecution& rec, const TActorId& sender, const ui64 cookie);
ITransaction* CreateUnlockTabletExecution(ui64 tabletId, ui64 seqNo);
ITransaction* CreateUnlockTabletExecution(ui64 tabletId, ui64 seqNo, NKikimrHive::ELockLostReason reason);
ITransaction* CreateRequestTabletSequence(TEvHive::TEvRequestTabletIdSequence::TPtr event);
ITransaction* CreateResponseTabletSequence(TEvHive::TEvResponseTabletIdSequence::TPtr event);
ITransaction* CreateDisconnectNode(THolder<TEvInterconnect::TEvNodeDisconnected> event);
Expand Down Expand Up @@ -977,7 +977,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
void DeleteTabletWithoutStorage(TLeaderTabletInfo* tablet);
void DeleteTabletWithoutStorage(TLeaderTabletInfo* tablet, TSideEffects& sideEffects);
TInstant GetAllowedBootingTime();
void ScheduleUnlockTabletExecution(TNodeInfo& node);
void ScheduleUnlockTabletExecution(TNodeInfo& node, NKikimrHive::ELockLostReason reason);
TString DebugDomainsActiveNodes() const;
TResourceNormalizedValues GetStDevResourceValues() const;
bool IsTabletMoveExpedient(const TTabletInfo& tablet, const TNodeInfo& node) const;
Expand Down
17 changes: 9 additions & 8 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5791,12 +5791,13 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_VALUES_EQUAL(result->Record.GetStatus(), expectedStatus);
}

void VerifyLockTabletExecutionLost(TTestActorRuntime& runtime, ui64 tabletId, const TActorId& owner) {
void VerifyLockTabletExecutionLost(TTestActorRuntime& runtime, ui64 tabletId, const TActorId& owner, NKikimrHive::ELockLostReason reason) {
TAutoPtr<IEventHandle> handle;
auto result = runtime.GrabEdgeEventRethrow<TEvHive::TEvLockTabletExecutionLost>(handle);
UNIT_ASSERT(result);
UNIT_ASSERT_VALUES_EQUAL(handle->GetRecipientRewrite(), owner);
UNIT_ASSERT_VALUES_EQUAL(result->Record.GetTabletID(), tabletId);
UNIT_ASSERT_VALUES_EQUAL(static_cast<i32>(result->Record.GetReason()), static_cast<i32>(reason));
}

Y_UNIT_TEST(TestLockTabletExecution) {
Expand Down Expand Up @@ -5870,7 +5871,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
WaitForTabletIsUp(runtime, tabletId, 0);

// Hive should try to notify owner on unlocking
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_NODE_DISCONNECTED);
}

Y_UNIT_TEST(TestLockTabletExecutionRebootTimeout) {
Expand Down Expand Up @@ -5899,7 +5900,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
WaitForTabletIsUp(runtime, tabletId, 0);

// Hive should try to notify owner on unlocking
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_HIVE_RESTART);
}

Y_UNIT_TEST(TestLockTabletExecutionDelete) {
Expand Down Expand Up @@ -5935,7 +5936,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
}

// Hive should try to notify owner on unlocking
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_TABLET_DELETED);
}

Y_UNIT_TEST(TestLockTabletExecutionDeleteReboot) {
Expand Down Expand Up @@ -5976,7 +5977,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
RebootTablet(runtime, hiveTablet, runtime.AllocateEdgeActor(0));

// Hive should try to notify owner on unlocking
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_TABLET_DELETED);
}

void MakeSureTabletStaysDown(TTestActorRuntime& runtime, ui64 tabletId, const TDuration& timeout) {
Expand Down Expand Up @@ -6086,7 +6087,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
runtime.Send(new IEventHandle(proxy, disconnecter, new TEvInterconnect::TEvDisconnect()), 0);

// wait for the lost lock notification
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_NODE_DISCONNECTED);

// lock reconnect should fail
SendLockTabletExecution(runtime, hiveTablet, tabletId, 1, NKikimrProto::ERROR, owner, 500, true);
Expand Down Expand Up @@ -6155,7 +6156,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
WaitForTabletIsUp(runtime, tabletId, 0);

// Hive should try to notify owner on unlocking
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_UNLOCKED);
}

Y_UNIT_TEST(TestLockTabletExecutionStealLock) {
Expand All @@ -6181,7 +6182,7 @@ Y_UNIT_TEST_SUITE(THiveTest) {
SendLockTabletExecution(runtime, hiveTablet, tabletId, 1, NKikimrProto::OK, owner2);

// Hive should notify the old owner on unlocking
VerifyLockTabletExecutionLost(runtime, tabletId, owner);
VerifyLockTabletExecutionLost(runtime, tabletId, owner, NKikimrHive::LOCK_LOST_REASON_NEW_LOCK);
}

Y_UNIT_TEST(TestExternalBoot) {
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tx__delete_tablet_result.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class TTxDeleteTabletResult : public TTransactionBase<THive> {
TActorId unlockedFromActor = tablet->ClearLockedToActor();
if (unlockedFromActor) {
// Notify lock owner that lock has been lost
SideEffects.Send(unlockedFromActor, new TEvHive::TEvLockTabletExecutionLost(TabletId));
SideEffects.Send(unlockedFromActor, new TEvHive::TEvLockTabletExecutionLost(TabletId, NKikimrHive::LOCK_LOST_REASON_TABLET_DELETED));
}
Self->PendingCreateTablets.erase({tablet->Owner.first, tablet->Owner.second});
Self->DeleteTablet(tablet->Id);
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tx__disconnect_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class TTxDisconnectNode : public TTransactionBase<THive> {
BLOG_D("THive::TTxDisconnectNode()::Execute");
TNodeInfo* node = Self->FindNode(Event->NodeId);
if (node != nullptr) {
Self->ScheduleUnlockTabletExecution(*node);
Self->ScheduleUnlockTabletExecution(*node, NKikimrHive::LOCK_LOST_REASON_NODE_DISCONNECTED);
if (node->BecomeDisconnecting()) {
THolder<TEvPrivate::TEvProcessDisconnectNode> event = MakeHolder<TEvPrivate::TEvProcessDisconnectNode>();
event->NodeId = node->Id;
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tx__load_everything.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,7 @@ class TTxLoadEverything : public TTransactionBase<THive> {
ctx.Send(Self->SelfId(), new TEvPrivate::TEvBootTablets());

for (auto it = Self->Nodes.begin(); it != Self->Nodes.end(); ++it) {
Self->ScheduleUnlockTabletExecution(it->second);
Self->ScheduleUnlockTabletExecution(it->second, NKikimrHive::LOCK_LOST_REASON_HIVE_RESTART);
}
}
};
Expand Down
13 changes: 11 additions & 2 deletions ydb/core/mind/hive/tx__lock_tablet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {

TSideEffects SideEffects;
TActorId PreviousOwner;
bool Success = true;

public:
TTxLockTabletExecution(const NKikimrHive::TEvLockTabletExecution& rec, const TActorId& sender, const ui64 cookie, THive* hive)
Expand Down Expand Up @@ -43,6 +44,7 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {
NKikimrProto::ERROR,
TStringBuilder() << "Trying to lock tablet " << TabletId << " to an invalid owner actor"
), 0, Cookie);
Success = false;
return true;
}

Expand All @@ -53,6 +55,7 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {
NKikimrProto::ERROR,
TStringBuilder() << "Trying to lock tablet " << TabletId << ", which doesn't exist"
), 0, Cookie);
Success = false;
return true;
}

Expand All @@ -62,6 +65,7 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {
NKikimrProto::ERROR,
TStringBuilder() << "Trying to lock tablet " << TabletId << " to " << OwnerActor << ", which is on a different node"
), 0, Cookie);
Success = false;
return true;
}

Expand All @@ -71,6 +75,7 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {
NKikimrProto::ERROR,
TStringBuilder() << "Trying to restore lock to tablet " << TabletId << ", which has expired"
), 0, Cookie);
Success = false;
return true;
}

Expand All @@ -86,7 +91,7 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {
ui32 flags = 0;
if (PreviousOwner && PreviousOwner != OwnerActor) {
// Notify previous owner that its lock ownership has been lost
SideEffects.Send(PreviousOwner, new TEvHive::TEvLockTabletExecutionLost(TabletId));
SideEffects.Send(PreviousOwner, new TEvHive::TEvLockTabletExecutionLost(TabletId, NKikimrHive::LOCK_LOST_REASON_NEW_LOCK));
}

if (tablet->IsLockedToActor()) {
Expand All @@ -106,7 +111,11 @@ class TTxLockTabletExecution : public TTransactionBase<THive> {
}

void Complete(const TActorContext& ctx) override {
BLOG_D("THive::TTxLockTabletExecution::Complete TabletId: " << TabletId << " SideEffects: " << SideEffects);
if (Success) {
BLOG_D("THive::TTxLockTabletExecution::Complete TabletId: " << TabletId << " SideEffects: " << SideEffects);
} else {
BLOG_NOTICE("THive::TTxLockTabletExecution::Complete TabletId: " << TabletId << " SideEffects: " << SideEffects);
}
SideEffects.Complete(ctx);
}

Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tx__release_tablets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class TTxReleaseTablets : public TTransactionBase<THive> {
SideEffects.Complete(ctx);
for (const auto& unlockedFromActor : UnlockedFromActor) {
// Notify lock owner that lock has been lost
ctx.Send(unlockedFromActor.second, new TEvHive::TEvLockTabletExecutionLost(unlockedFromActor.first));
ctx.Send(unlockedFromActor.second, new TEvHive::TEvLockTabletExecutionLost(unlockedFromActor.first, NKikimrHive::LOCK_LOST_REASON_TABLET_RELEASED));
}
ctx.Send(Request->Sender, Response.Release());
if (NeedToProcessPendingOperations) {
Expand Down
24 changes: 14 additions & 10 deletions ydb/core/mind/hive/tx__unlock_tablet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class TTxUnlockTabletExecution : public TTransactionBase<THive> {
const ui64 TabletId;
const TActorId OwnerActor;
const ui64 SeqNo;
const NKikimrHive::ELockLostReason Reason = NKikimrHive::LOCK_LOST_REASON_UNKNOWN;

const TActorId Sender;
const ui64 Cookie;
Expand All @@ -21,6 +22,7 @@ class TTxUnlockTabletExecution : public TTransactionBase<THive> {
, TabletId(rec.GetTabletID())
, OwnerActor(GetOwnerActor(rec, sender))
, SeqNo(0)
, Reason(NKikimrHive::LOCK_LOST_REASON_UNLOCKED)
, Sender(sender)
, Cookie(cookie)
{
Expand All @@ -29,15 +31,16 @@ class TTxUnlockTabletExecution : public TTransactionBase<THive> {

TTxType GetTxType() const override { return NHive::TXTYPE_UNLOCK_TABLET_EXECUTION; }

TTxUnlockTabletExecution(ui64 tabletId, ui64 seqNo, THive* hive)
TTxUnlockTabletExecution(ui64 tabletId, ui64 seqNo, NKikimrHive::ELockLostReason reason, THive* hive)
: TBase(hive)
, TabletId(tabletId)
, SeqNo(seqNo)
, Reason(reason)
, Cookie(0)
{}

bool Execute(TTransactionContext& txc, const TActorContext&) override {
BLOG_D("THive::TTxUnlockTabletExecution::Execute TabletId: " << TabletId);
BLOG_NOTICE("THive::TTxUnlockTabletExecution::Execute TabletId: " << TabletId);
SideEffects.Reset(Self->SelfId());
TLeaderTabletInfo* tablet = Self->FindTabletEvenInDeleting(TabletId);
if (tablet == nullptr) {
Expand Down Expand Up @@ -70,7 +73,7 @@ class TTxUnlockTabletExecution : public TTransactionBase<THive> {

if (PreviousOwner) {
// Notify previous owner that its lock ownership has been lost
SideEffects.Send(PreviousOwner, new TEvHive::TEvLockTabletExecutionLost(TabletId));
SideEffects.Send(PreviousOwner, new TEvHive::TEvLockTabletExecutionLost(TabletId, Reason));
}

if (!tablet->IsLockedToActor()) {
Expand All @@ -82,7 +85,7 @@ class TTxUnlockTabletExecution : public TTransactionBase<THive> {
}

void Complete(const TActorContext& ctx) override {
BLOG_D("THive::TTxUnlockTabletExecution::Complete TabletId: " << TabletId << " SideEffects: " << SideEffects);
BLOG_NOTICE("THive::TTxUnlockTabletExecution::Complete TabletId: " << TabletId << " SideEffects: " << SideEffects);
SideEffects.Complete(ctx);
}

Expand All @@ -100,19 +103,20 @@ ITransaction* THive::CreateUnlockTabletExecution(const NKikimrHive::TEvUnlockTab
return new TTxUnlockTabletExecution(rec, sender, cookie, this);
}

ITransaction* THive::CreateUnlockTabletExecution(ui64 tabletId, ui64 seqNo) {
return new TTxUnlockTabletExecution(tabletId, seqNo, this);
ITransaction* THive::CreateUnlockTabletExecution(ui64 tabletId, ui64 seqNo, NKikimrHive::ELockLostReason reason) {
return new TTxUnlockTabletExecution(tabletId, seqNo, reason, this);
}

void THive::ScheduleUnlockTabletExecution(TNodeInfo& node) {
void THive::ScheduleUnlockTabletExecution(TNodeInfo& node, NKikimrHive::ELockLostReason reason) {
// Unlock tablets that have been locked by this node
BLOG_NOTICE("ScheduleUnlockTabletExecution(" << node.Id << ", " << NKikimrHive::ELockLostReason_Name(reason) << ")");
for (TLeaderTabletInfo* tablet : node.LockedTablets) {
Y_ABORT_UNLESS(FindTabletEvenInDeleting(tablet->Id) == tablet);
Y_ABORT_UNLESS(tablet->LockedToActor.NodeId() == node.Id);
if (tablet->PendingUnlockSeqNo == 0) {
tablet->PendingUnlockSeqNo = NextTabletUnlockSeqNo++;
Y_ABORT_UNLESS(tablet->PendingUnlockSeqNo != 0);
auto event = new TEvPrivate::TEvUnlockTabletReconnectTimeout(tablet->Id, tablet->PendingUnlockSeqNo);
auto event = new TEvPrivate::TEvUnlockTabletReconnectTimeout(*tablet, reason);
if (tablet->LockedReconnectTimeout) {
Schedule(tablet->LockedReconnectTimeout, event);
} else {
Expand All @@ -125,7 +129,7 @@ void THive::ScheduleUnlockTabletExecution(TNodeInfo& node) {
void THive::Handle(TEvPrivate::TEvUnlockTabletReconnectTimeout::TPtr& ev) {
TTabletId tabletId = ev->Get()->TabletId;
ui64 seqNo = ev->Get()->SeqNo;
BLOG_D("THive::Handle::TEvUnlockTabletReconnectTimeout TabletId=" << tabletId);
BLOG_NOTICE("THive::Handle::TEvUnlockTabletReconnectTimeout TabletId=" << tabletId << " Reason=" << NKikimrHive::ELockLostReason_Name(ev->Get()->Reason));
TLeaderTabletInfo* tablet = FindTabletEvenInDeleting(tabletId);
if (tablet != nullptr && tablet->IsLockedToActor() && tablet->PendingUnlockSeqNo == seqNo) {
// We use sequence numbers to make sure unlock happens only if some
Expand All @@ -141,7 +145,7 @@ void THive::Handle(TEvPrivate::TEvUnlockTabletReconnectTimeout::TPtr& ev) {
// - reconnect timeout (execute, failure)
// tablet is not unlocked, because logically lock/reconnect
// transaction was scheduled before the timeout really happened.
Execute(CreateUnlockTabletExecution(tabletId, seqNo));
Execute(CreateUnlockTabletExecution(tabletId, seqNo, ev->Get()->Reason));
}
}

Expand Down
11 changes: 11 additions & 0 deletions ydb/core/protos/hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ enum EDrainDownPolicy {
DRAIN_POLICY_KEEP_DOWN = 2;
}

enum ELockLostReason {
LOCK_LOST_REASON_UNKNOWN = 0;
LOCK_LOST_REASON_NODE_DISCONNECTED = 1;
LOCK_LOST_REASON_HIVE_RESTART = 2;
LOCK_LOST_REASON_TABLET_DELETED = 3;
LOCK_LOST_REASON_NEW_LOCK = 4;
LOCK_LOST_REASON_TABLET_RELEASED = 5;
LOCK_LOST_REASON_UNLOCKED = 6;
}

message TChannelInfo {
message THistorySlot {
optional uint32 FromGeneration = 1;
Expand Down Expand Up @@ -405,6 +415,7 @@ message TEvLockTabletExecution {

message TEvLockTabletExecutionLost {
optional fixed64 TabletID = 1;
optional ELockLostReason Reason = 2;
}

message TEvLockTabletExecutionResult {
Expand Down
Loading