Skip to content

Commit 35dd6e3

Browse files
committed
Re-run workers (ydb-platform#5163)
1 parent df0ce0f commit 35dd6e3

File tree

9 files changed

+130
-32
lines changed

9 files changed

+130
-32
lines changed

ydb/core/tx/replication/controller/controller.cpp

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ void TController::Handle(TEvService::TEvWorkerStatus::TPtr& ev, const TActorCont
333333
return;
334334
}
335335

336-
const auto& session = Sessions[nodeId];
336+
auto& session = Sessions[nodeId];
337337
const auto& record = ev->Get()->Record;
338338
const auto id = TWorkerId::Parse(record.GetWorker());
339339

@@ -344,7 +344,16 @@ void TController::Handle(TEvService::TEvWorkerStatus::TPtr& ev, const TActorCont
344344
}
345345
break;
346346
case NKikimrReplication::TEvWorkerStatus::STOPPED:
347-
MaybeRemoveWorker(id, ctx);
347+
if (!MaybeRemoveWorker(id, ctx)) {
348+
session.DetachWorker(id);
349+
if (IsValidWorker(id)) {
350+
auto* worker = GetOrCreateWorker(id);
351+
worker->ClearSession();
352+
if (worker->HasCommand()) {
353+
BootQueue.insert(id);
354+
}
355+
}
356+
}
348357
break;
349358
default:
350359
CLOG_W(ctx, "Unknown worker status"
@@ -589,10 +598,13 @@ void TController::RemoveWorker(const TWorkerId& id, const TActorContext& ctx) {
589598
target->Progress(ctx);
590599
}
591600

592-
void TController::MaybeRemoveWorker(const TWorkerId& id, const TActorContext& ctx) {
593-
if (RemoveQueue.contains(id)) {
594-
RemoveWorker(id, ctx);
601+
bool TController::MaybeRemoveWorker(const TWorkerId& id, const TActorContext& ctx) {
602+
if (!RemoveQueue.contains(id)) {
603+
return false;
595604
}
605+
606+
RemoveWorker(id, ctx);
607+
return true;
596608
}
597609

598610
void TController::Handle(TEvInterconnect::TEvNodeDisconnected::TPtr& ev, const TActorContext& ctx) {

ydb/core/tx/replication/controller/controller_impl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ class TController
100100
void BootWorker(ui32 nodeId, const TWorkerId& id, const NKikimrReplication::TRunWorkerCommand& cmd);
101101
void StopWorker(ui32 nodeId, const TWorkerId& id);
102102
void RemoveWorker(const TWorkerId& id, const TActorContext& ctx);
103-
void MaybeRemoveWorker(const TWorkerId& id, const TActorContext& ctx);
103+
bool MaybeRemoveWorker(const TWorkerId& id, const TActorContext& ctx);
104104

105105
// local transactions
106106
class TTxInitSchema;

ydb/core/tx/replication/controller/tx_resolve_secret_result.cpp

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,6 @@ class TController::TTxResolveSecretResult: public TTxBase {
2929
return true;
3030
}
3131

32-
if (Replication->GetState() != TReplication::EState::Ready) {
33-
CLOG_W(ctx, "Replication state mismatch"
34-
<< ": rid# " << rid
35-
<< ", state# " << Replication->GetState());
36-
return true;
37-
}
38-
3932
if (Ev->Get()->IsSuccess()) {
4033
CLOG_N(ctx, "Secret resolved"
4134
<< ": rid# " << rid);

ydb/core/tx/replication/service/service.cpp

Lines changed: 79 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,21 +45,51 @@ class TSessionInfo {
4545
return Workers.contains(id);
4646
}
4747

48-
void RegisterWorker(IActorOps* ops, const TWorkerId& id, IActor* actor) {
48+
bool HasWorker(const TActorId& id) const {
49+
return ActorIdToWorkerId.contains(id);
50+
}
51+
52+
TActorId GetWorkerActorId(const TWorkerId& id) const {
53+
auto it = Workers.find(id);
54+
Y_ABORT_UNLESS(it != Workers.end());
55+
return it->second;
56+
}
57+
58+
TActorId RegisterWorker(IActorOps* ops, const TWorkerId& id, IActor* actor) {
4959
auto res = Workers.emplace(id, ops->Register(actor));
5060
Y_ABORT_UNLESS(res.second);
5161

52-
ops->Send(ActorId, new TEvService::TEvWorkerStatus(id, NKikimrReplication::TEvWorkerStatus::RUNNING));
62+
const auto actorId = res.first->second;
63+
ActorIdToWorkerId.emplace(actorId, id);
64+
65+
SendWorkerStatus(ops, id, NKikimrReplication::TEvWorkerStatus::RUNNING);
66+
return actorId;
5367
}
5468

5569
void StopWorker(IActorOps* ops, const TWorkerId& id) {
5670
auto it = Workers.find(id);
5771
Y_ABORT_UNLESS(it != Workers.end());
5872

5973
ops->Send(it->second, new TEvents::TEvPoison());
74+
SendWorkerStatus(ops, id, NKikimrReplication::TEvWorkerStatus::STOPPED);
75+
76+
ActorIdToWorkerId.erase(it->second);
6077
Workers.erase(it);
78+
}
79+
80+
void StopWorker(IActorOps* ops, const TActorId& id) {
81+
auto it = ActorIdToWorkerId.find(id);
82+
Y_ABORT_UNLESS(it != ActorIdToWorkerId.end());
83+
84+
// actor already stopped
85+
SendWorkerStatus(ops, it->second, NKikimrReplication::TEvWorkerStatus::STOPPED);
86+
87+
Workers.erase(it->second);
88+
ActorIdToWorkerId.erase(it);
89+
}
6190

62-
ops->Send(ActorId, new TEvService::TEvWorkerStatus(id, NKikimrReplication::TEvWorkerStatus::STOPPED));
91+
void SendWorkerStatus(IActorOps* ops, const TWorkerId& id, NKikimrReplication::TEvWorkerStatus::EStatus status) {
92+
ops->Send(ActorId, new TEvService::TEvWorkerStatus(id, status));
6393
}
6494

6595
void SendStatus(IActorOps* ops) const {
@@ -83,6 +113,7 @@ class TSessionInfo {
83113
TActorId ActorId;
84114
ui64 Generation;
85115
THashMap<TWorkerId, TActorId> Workers;
116+
THashMap<TActorId, TWorkerId> ActorIdToWorkerId;
86117

87118
}; // TSessionInfo
88119

@@ -243,7 +274,7 @@ class TReplicationService: public TActorBootstrapped<TReplicationService> {
243274
}
244275

245276
if (session.HasWorker(id)) {
246-
return;
277+
return session.SendWorkerStatus(this, id, NKikimrReplication::TEvWorkerStatus::RUNNING);
247278
}
248279

249280
LOG_I("Run worker"
@@ -253,7 +284,9 @@ class TReplicationService: public TActorBootstrapped<TReplicationService> {
253284
// TODO: validate settings
254285
const auto& readerSettings = cmd.GetRemoteTopicReader();
255286
const auto& writerSettings = cmd.GetLocalTableWriter();
256-
session.RegisterWorker(this, id, CreateWorker(ReaderFn(readerSettings), WriterFn(writerSettings)));
287+
const auto actorId = session.RegisterWorker(this, id,
288+
CreateWorker(SelfId(), ReaderFn(readerSettings), WriterFn(writerSettings)));
289+
WorkerActorIdToSession[actorId] = controller.GetTabletId();
257290
}
258291

259292
void Handle(TEvService::TEvStopWorker::TPtr& ev) {
@@ -282,11 +315,46 @@ class TReplicationService: public TActorBootstrapped<TReplicationService> {
282315
return;
283316
}
284317

285-
if (session.HasWorker(id)) {
286-
LOG_I("Stop worker"
287-
<< ": worker# " << id);
288-
session.StopWorker(this, id);
318+
if (!session.HasWorker(id)) {
319+
return session.SendWorkerStatus(this, id, NKikimrReplication::TEvWorkerStatus::STOPPED);
289320
}
321+
322+
LOG_I("Stop worker"
323+
<< ": worker# " << id);
324+
WorkerActorIdToSession.erase(session.GetWorkerActorId(id));
325+
session.StopWorker(this, id);
326+
}
327+
328+
void Handle(TEvWorker::TEvGone::TPtr& ev) {
329+
LOG_T("Handle " << ev->Get()->ToString());
330+
331+
auto wit = WorkerActorIdToSession.find(ev->Sender);
332+
if (wit == WorkerActorIdToSession.end()) {
333+
LOG_W("Unknown worker has gone"
334+
<< ": worker# " << ev->Sender);
335+
return;
336+
}
337+
338+
auto it = Sessions.find(wit->second);
339+
if (it == Sessions.end()) {
340+
LOG_E("Cannot find session"
341+
<< ": worker# " << ev->Sender
342+
<< ", session# " << wit->second);
343+
return;
344+
}
345+
346+
auto& session = it->second;
347+
if (!session.HasWorker(ev->Sender)) {
348+
LOG_E("Cannot find worker"
349+
<< ": worker# " << ev->Sender
350+
<< ", session# " << wit->second);
351+
return;
352+
}
353+
354+
LOG_I("Worker has gone"
355+
<< ": worker# " << ev->Sender);
356+
WorkerActorIdToSession.erase(ev->Sender);
357+
session.StopWorker(this, ev->Sender);
290358
}
291359

292360
void PassAway() override {
@@ -320,6 +388,7 @@ class TReplicationService: public TActorBootstrapped<TReplicationService> {
320388
hFunc(TEvService::TEvHandshake, Handle);
321389
hFunc(TEvService::TEvRunWorker, Handle);
322390
hFunc(TEvService::TEvStopWorker, Handle);
391+
hFunc(TEvWorker::TEvGone, Handle);
323392
sFunc(TEvents::TEvPoison, PassAway);
324393
}
325394
}
@@ -329,6 +398,7 @@ class TReplicationService: public TActorBootstrapped<TReplicationService> {
329398
TActorId BoardPublisher;
330399
THashMap<ui64, TSessionInfo> Sessions;
331400
THashMap<TCredentialsKey, TActorId> YdbProxies;
401+
THashMap<TActorId, ui64> WorkerActorIdToSession;
332402

333403
}; // TReplicationService
334404

ydb/core/tx/replication/service/table_writer.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ class TTablePartitionWriter: public TActorBootstrapped<TTablePartitionWriter> {
139139
}
140140

141141
void Leave(bool hardError = false) {
142+
LOG_I("Leave"
143+
<< ": hard error# " << hardError);
144+
142145
Send(Parent, new NChangeExchange::TEvChangeExchangePrivate::TEvGone(TabletId, hardError));
143146
PassAway();
144147
}
@@ -495,6 +498,9 @@ class TLocalTableWriter
495498
}
496499

497500
void Leave(TEvWorker::TEvGone::EStatus status) {
501+
LOG_I("Leave"
502+
<< ": status# " << status);
503+
498504
Send(Worker, new TEvWorker::TEvGone(status));
499505
PassAway();
500506
}

ydb/core/tx/replication/service/topic_reader.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,9 @@ class TRemoteTopicReader: public TActor<TRemoteTopicReader> {
7676
}
7777

7878
void Leave(TEvWorker::TEvGone::EStatus status) {
79-
LOG_I("Leave");
79+
LOG_I("Leave"
80+
<< ": status# " << status);
81+
8082
Send(Worker, new TEvWorker::TEvGone(status));
8183
PassAway();
8284
}

ydb/core/tx/replication/service/worker.cpp

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -191,12 +191,15 @@ class TWorker: public TActorBootstrapped<TWorker> {
191191
}
192192
[[fallthrough]];
193193
default:
194-
return Leave();
194+
return Leave(status);
195195
}
196196
}
197197

198-
void Leave() {
199-
// TODO: signal to parent
198+
void Leave(TEvWorker::TEvGone::EStatus status) {
199+
LOG_I("Leave"
200+
<< ": status# " << status);
201+
202+
Send(Parent, new TEvWorker::TEvGone(status));
200203
PassAway();
201204
}
202205

@@ -213,8 +216,12 @@ class TWorker: public TActorBootstrapped<TWorker> {
213216
return NKikimrServices::TActivity::REPLICATION_WORKER;
214217
}
215218

216-
explicit TWorker(std::function<IActor*(void)>&& createReaderFn, std::function<IActor*(void)>&& createWriterFn)
217-
: Reader(std::move(createReaderFn))
219+
explicit TWorker(
220+
const TActorId& parent,
221+
std::function<IActor*(void)>&& createReaderFn,
222+
std::function<IActor*(void)>&& createWriterFn)
223+
: Parent(parent)
224+
, Reader(std::move(createReaderFn))
218225
, Writer(std::move(createWriterFn))
219226
{
220227
}
@@ -239,14 +246,19 @@ class TWorker: public TActorBootstrapped<TWorker> {
239246

240247
private:
241248
static constexpr ui32 MaxAttempts = 3;
249+
const TActorId Parent;
242250
mutable TMaybe<TString> LogPrefix;
243251
TActorInfo Reader;
244252
TActorInfo Writer;
245253
THolder<TEvWorker::TEvData> InFlightData;
246254
};
247255

248-
IActor* CreateWorker(std::function<IActor*(void)>&& createReaderFn, std::function<IActor*(void)>&& createWriterFn) {
249-
return new TWorker(std::move(createReaderFn), std::move(createWriterFn));
256+
IActor* CreateWorker(
257+
const TActorId& parent,
258+
std::function<IActor*(void)>&& createReaderFn,
259+
std::function<IActor*(void)>&& createWriterFn)
260+
{
261+
return new TWorker(parent, std::move(createReaderFn), std::move(createWriterFn));
250262
}
251263

252264
}

ydb/core/tx/replication/service/worker.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,10 @@ struct TEvWorker {
5757
};
5858
};
5959

60-
IActor* CreateWorker(std::function<IActor*(void)>&& createReaderFn, std::function<IActor*(void)>&& createWriterFn);
60+
IActor* CreateWorker(
61+
const TActorId& parent,
62+
std::function<IActor*(void)>&& createReaderFn,
63+
std::function<IActor*(void)>&& createWriterFn);
6164

6265
}
6366

ydb/core/tx/replication/service/worker_ut.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ Y_UNIT_TEST_SUITE(Worker) {
5656
return CreateLocalTableWriter(tablePathId);
5757
};
5858

59-
auto worker = env.GetRuntime().Register(CreateWorker(std::move(createReaderFn), std::move(createWriterFn)));
59+
auto worker = env.GetRuntime().Register(CreateWorker(env.GetSender(), std::move(createReaderFn), std::move(createWriterFn)));
6060
Y_UNUSED(worker);
6161

6262
UNIT_ASSERT(WriteTopic(env, "/Root/topic", R"({"key":[1], "update":{"value":"10"}})"));

0 commit comments

Comments
 (0)