forked from apple/foundationdb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCoordinatedState.actor.cpp
363 lines (322 loc) · 14.1 KB
/
CoordinatedState.actor.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
/*
* CoordinatedState.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbserver/CoordinatedState.h"
#include "fdbserver/CoordinationInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h"
#include "fdbserver/LeaderElection.h"
#include "flow/actorcompiler.h" // has to be last include
// Sends the read request `req` to the stream `to` through retryBrokenPromise() and yields the reply.
//
// When SERVER_KNOBS->BUGGIFY_ALL_COORDINATION is set, or BUGGIFY fires, a delay of
// BUGGIFIED_EVENTUAL_CONSISTENCY scaled by a random factor in [0, 1) is inserted. This check is made
// independently before the request is sent and again before the reply is returned.
ACTOR Future<GenerationRegReadReply> waitAndSendRead(RequestStream<GenerationRegReadRequest> to,
                                                     GenerationRegReadRequest req) {
    // Optional injected latency on the way out.
    if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) {
        wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
    }

    // `state` keeps the reply alive across the second (optional) wait below.
    state GenerationRegReadReply response = wait(retryBrokenPromise(to, req));

    // Optional injected latency on the way back.
    if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) {
        wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
    }

    return response;
}
// Sends the write request `req` to the stream `to` through retryBrokenPromise() and yields the
// UniqueGeneration the replica answers with.
//
// Mirrors waitAndSendRead(): when SERVER_KNOBS->BUGGIFY_ALL_COORDINATION is set, or BUGGIFY fires, a
// randomly scaled delay of up to BUGGIFIED_EVENTUAL_CONSISTENCY is inserted, checked independently
// before sending and before returning.
ACTOR Future<UniqueGeneration> waitAndSendWrite(RequestStream<GenerationRegWriteRequest> to,
                                                GenerationRegWriteRequest req) {
    // Optional injected latency on the way out.
    if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) {
        wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
    }

    // `state` keeps the answer alive across the second (optional) wait below.
    state UniqueGeneration writtenGen = wait(retryBrokenPromise(to, req));

    // Optional injected latency on the way back.
    if (SERVER_KNOBS->BUGGIFY_ALL_COORDINATION || BUGGIFY) {
        wait(delay(SERVER_KNOBS->BUGGIFIED_EVENTUAL_CONSISTENCY * deterministicRandom()->random01()));
    }

    return writtenGen;
}
// Filter over a read reply: a reply whose gen.generation is nonzero is passed through unchanged;
// a reply with generation 0 (an "empty" reply) makes this actor wait on Never(), so it never completes.
ACTOR Future<GenerationRegReadReply> emptyToNever(Future<GenerationRegReadReply> f) {
    state GenerationRegReadReply reply = wait(f);

    // Non-empty reply: hand it straight back.
    if (reply.gen.generation != 0) {
        return reply;
    }

    // Empty reply: block forever so that quorum() over these futures only counts non-empty replies.
    wait(Future<Void>(Never()));
    return reply; // not reached; Never() does not become ready
}
// Filter over a read reply, the inverse of emptyToNever(): a reply whose gen.generation is 0 is passed
// through unchanged; a reply with a nonzero generation makes this actor wait on Never(), so it never
// completes.
ACTOR Future<GenerationRegReadReply> nonemptyToNever(Future<GenerationRegReadReply> f) {
    state GenerationRegReadReply reply = wait(f);

    // Empty reply: hand it straight back.
    if (reply.gen.generation == 0) {
        return reply;
    }

    // Non-empty reply: block forever so that quorum() over these futures only counts empty replies.
    wait(Future<Void>(Never()));
    return reply; // not reached; Never() does not become ready
}
// Implementation behind CoordinatedState. Performs generation-stamped reads and writes against every
// replica in coordinators.stateServers and tracks the caller's progress in `stage`:
//   0 constructed -> 1 first read round -> 2 second read round -> 3 then 4 read complete
//   -> 5 write in flight -> 6 write answered.
// read(), onConflict() and setExclusive() ASSERT the stage they expect on entry.
struct CoordinatedStateImpl {
ServerCoordinators coordinators;
// Progress marker; see the stage list in the struct comment.
int stage;
// Generation chosen by read() (conflictGen plus a random UID); used for the second read round and
// for the write issued by setExclusive().
UniqueGeneration gen;
// Running maximum of the generation numbers seen in replies (incremented by one after the first read
// round). Exposed through getConflict().
uint64_t conflictGen;
// Set by read() when the second read round already reports a generation greater than `gen`;
// onConflict() then returns immediately.
bool doomed;
ActorCollection ac; // Errors are not reported
// True when the second read round reported generation 0. replicatedWrite() then waits for every
// replica instead of a majority.
bool initial;
// Starts at stage 0 with conflictGen 0 and both flags false; `gen` is default-constructed.
CoordinatedStateImpl(ServerCoordinators const& c)
: coordinators(c), stage(0), conflictGen(0), doomed(false), ac(false), initial(false) {}
// Returns the current value of conflictGen.
uint64_t getConflict() { return conflictGen; }
// True when the reply's written generation is strictly greater than our own `gen`.
bool isDoomed(GenerationRegReadReply const& rep) {
return rep.gen > gen // setExclusive is doomed, because there was a write at least started at a higher
// generation, which means a read completed at that higher generation
// || rep.rgen > gen // setExclusive isn't absolutely doomed, but it may/probably will fail
;
}
// Two-round read; must be called at stage 0.
// Round 1 reads with a default-constructed UniqueGeneration, then sets conflictGen to one more than
// the largest generation number seen and picks `gen` = (conflictGen, random UID).
// Round 2 reads again with `gen`, updates conflictGen, records `doomed` and `initial`, moves to
// stage 4 and returns the stored value, or an empty Value when the reply carries none.
ACTOR static Future<Value> read(CoordinatedStateImpl* self) {
ASSERT(self->stage == 0);
{
self->stage = 1;
GenerationRegReadReply rep = wait(self->replicatedRead(
self, GenerationRegReadRequest(self->coordinators.clusterKey, UniqueGeneration())));
// The +1 makes our generation number strictly larger than anything this round reported.
self->conflictGen = std::max(self->conflictGen, std::max(rep.gen.generation, rep.rgen.generation)) + 1;
self->gen = UniqueGeneration(self->conflictGen, deterministicRandom()->randomUniqueID());
}
{
self->stage = 2;
GenerationRegReadReply rep =
wait(self->replicatedRead(self, GenerationRegReadRequest(self->coordinators.clusterKey, self->gen)));
self->stage = 3;
self->conflictGen = std::max(self->conflictGen, std::max(rep.gen.generation, rep.rgen.generation));
if (self->isDoomed(rep))
self->doomed = true;
self->initial = rep.gen.generation == 0;
self->stage = 4;
return rep.value.present() ? rep.value.get() : Value();
}
}
// Must be called at stage 4 (after read()). Returns at once if read() already marked us doomed.
// Otherwise polls every COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL with a default-generation read and
// returns as soon as a reply satisfies isDoomed(). If the stage has moved past 4 when a poll reply
// arrives (setExclusive() has started), polling stops and the returned future never becomes ready.
ACTOR static Future<Void> onConflict(CoordinatedStateImpl* self) {
ASSERT(self->stage == 4);
if (self->doomed)
return Void();
loop {
wait(delay(SERVER_KNOBS->COORDINATED_STATE_ONCONFLICT_POLL_INTERVAL));
GenerationRegReadReply rep = wait(self->replicatedRead(
self, GenerationRegReadRequest(self->coordinators.clusterKey, UniqueGeneration())));
if (self->stage > 4)
break;
self->conflictGen = std::max(self->conflictGen, std::max(rep.gen.generation, rep.rgen.generation));
if (self->isDoomed(rep))
return Void();
}
wait(Future<Void>(Never()));
return Void();
}
// Must be called at stage 4. Writes `v` under coordinators.clusterKey at generation `gen`, logs the
// outcome, and succeeds only if the generation reported back equals `gen`. Otherwise raises
// conflictGen to at least the reported generation and throws coordinated_state_conflict.
ACTOR static Future<Void> setExclusive(CoordinatedStateImpl* self, Value v) {
ASSERT(self->stage == 4);
self->stage = 5;
UniqueGeneration wgen = wait(self->replicatedWrite(
self, GenerationRegWriteRequest(KeyValueRef(self->coordinators.clusterKey, v), self->gen)));
self->stage = 6;
TraceEvent("CoordinatedStateSet")
.detail("Gen", self->gen.generation)
.detail("Wgen", wgen.generation)
.detail("Genu", self->gen.uid)
.detail("Wgenu", wgen.uid)
.detail("Cgen", self->conflictGen);
if (wgen == self->gen)
return Void();
else {
self->conflictGen = std::max(self->conflictGen, wgen.generation);
throw coordinated_state_conflict();
}
}
// Sends the read to every replica and returns one representative reply.
// Each reply is wrapped twice: nonemptyToNever() only completes for generation-0 replies and
// emptyToNever() only completes for nonzero-generation replies, so the two quorums below count the
// empty and non-empty replies separately. The raw reply future is also added to self->ac.
// Result: if enough empty replies arrived, the ready empty reply with the greatest rgen; otherwise the
// ready non-empty reply with the greatest gen, ties broken by the greater rgen.
ACTOR static Future<GenerationRegReadReply> replicatedRead(CoordinatedStateImpl* self,
GenerationRegReadRequest req) {
state std::vector<GenerationRegInterface>& replicas = self->coordinators.stateServers;
state std::vector<Future<GenerationRegReadReply>> rep_empty_reply;
state std::vector<Future<GenerationRegReadReply>> rep_reply;
for (int i = 0; i < replicas.size(); i++) {
Future<GenerationRegReadReply> reply =
waitAndSendRead(replicas[i].read, GenerationRegReadRequest(req.key, req.gen));
rep_empty_reply.push_back(nonemptyToNever(reply));
rep_reply.push_back(emptyToNever(reply));
self->ac.add(success(reply));
}
state Future<Void> majorityEmpty =
quorum(rep_empty_reply,
(replicas.size() + 1) / 2); // enough empty to ensure we cannot achieve a majority non-empty
// Proceed on whichever happens first: a strict majority of non-empty replies, or enough empty ones.
wait(quorum(rep_reply, replicas.size() / 2 + 1) || majorityEmpty);
if (majorityEmpty.isReady()) {
int best = -1;
for (int i = 0; i < rep_empty_reply.size(); i++)
if (rep_empty_reply[i].isReady() && !rep_empty_reply[i].isError()) {
if (best < 0 || rep_empty_reply[i].get().rgen > rep_empty_reply[best].get().rgen)
best = i;
}
ASSERT(best >= 0);
auto result = rep_empty_reply[best].get();
return result;
} else {
int best = -1;
for (int i = 0; i < rep_reply.size(); i++)
if (rep_reply[i].isReady() && !rep_reply[i].isError()) {
if (best < 0 || rep_reply[i].get().gen > rep_reply[best].get().gen ||
(rep_reply[i].get().gen == rep_reply[best].get().gen &&
rep_reply[i].get().rgen > rep_reply[best].get().rgen))
best = i;
}
ASSERT(best >= 0);
auto result = rep_reply[best].get();
return result;
}
}
// Sends the write to every replica (each reply future is also added to self->ac) and waits for all
// replicas when self->initial is set, otherwise for a strict majority. Returns the greatest
// UniqueGeneration among the replies that are ready at that point.
// NOTE(review): unlike replicatedRead(), the loop below calls get() without checking isError();
// confirm whether a reply that is ready with an error can occur here.
ACTOR static Future<UniqueGeneration> replicatedWrite(CoordinatedStateImpl* self, GenerationRegWriteRequest req) {
state std::vector<GenerationRegInterface>& replicas = self->coordinators.stateServers;
state std::vector<Future<UniqueGeneration>> wrep_reply;
for (int i = 0; i < replicas.size(); i++) {
Future<UniqueGeneration> reply =
waitAndSendWrite(replicas[i].write, GenerationRegWriteRequest(req.kv, req.gen));
wrep_reply.push_back(reply);
self->ac.add(success(reply));
}
wait(quorum(wrep_reply, self->initial ? replicas.size() : replicas.size() / 2 + 1));
UniqueGeneration maxGen;
for (int i = 0; i < wrep_reply.size(); i++)
if (wrep_reply[i].isReady())
maxGen = std::max(maxGen, wrep_reply[i].get());
return maxGen;
}
};
CoordinatedState::CoordinatedState(ServerCoordinators const& coord)
: impl(std::make_unique<CoordinatedStateImpl>(coord)) {}
// Defaulted here, in the translation unit where CoordinatedStateImpl is fully defined.
// NOTE(review): presumably `impl` is a std::unique_ptr to a type that is only forward-declared in the
// header, which is why the destructor cannot be defaulted there - confirm against CoordinatedState.h.
CoordinatedState::~CoordinatedState() = default;
// Starts the two-round read implemented by CoordinatedStateImpl::read() and returns its future.
Future<Value> CoordinatedState::read() {
    auto* const self = impl.get();
    return CoordinatedStateImpl::read(self);
}
// Returns the future produced by CoordinatedStateImpl::onConflict(), which becomes ready once a
// generation greater than our own has been observed.
Future<Void> CoordinatedState::onConflict() {
    auto* const self = impl.get();
    return CoordinatedStateImpl::onConflict(self);
}
// Writes `v` through CoordinatedStateImpl::setExclusive(). The returned future fails with
// coordinated_state_conflict when the write was not accepted at our generation.
Future<Void> CoordinatedState::setExclusive(Value v) {
    auto* const self = impl.get();
    return CoordinatedStateImpl::setExclusive(self, v);
}
// Returns the implementation's conflictGen, i.e. the largest generation number it has tracked so far.
uint64_t CoordinatedState::getConflict() {
    const uint64_t conflict = impl->getConflict();
    return conflict;
}
// Envelope serialized into the coordinated state by MovableCoordinatedStateImpl.
//   value - the caller's payload
//   mode  - one of MoveState (defaults to Active)
//   other - a cluster connection string, used by the non-Active modes
struct MovableValue {
    // Active:     ordinary value, written by setExclusive().
    // MaybeTo:    written to the current coordinators by moveTo(); `other` names the new coordinators,
    //             and read() resumes the move when it finds this mode.
    // MovingFrom: written to the new coordinators by move(); `other` names the old coordinators.
    enum MoveState { MaybeTo = 1, Active = 2, MovingFrom = 3 };

    Value value;
    int32_t mode = Active;
    Optional<Value> other; // a cluster connection string

    MovableValue() = default;

    MovableValue(Value const& v, int mode, Optional<Value> other = Optional<Value>())
      : value(v), mode(mode), other(other) {}

    // To change this serialization, ProtocolVersion::MovableCoordinatedStateV2 must be updated, and downgrades need to
    // be considered
    template <class Ar>
    void serialize(Ar& ar) {
        // Only archives whose protocol version knows about movable coordinated state may use this format.
        ASSERT(ar.protocolVersion().hasMovableCoordinatedState());
        serializer(ar, value, mode, other);
    }
};
// Implementation behind MovableCoordinatedState. Wraps a CoordinatedState whose stored bytes are a
// serialized MovableValue, and adds the ability to move that state to a different set of coordinators.
struct MovableCoordinatedStateImpl {
ServerCoordinators coordinators;
CoordinatedState cs;
Optional<Value> lastValue, // The value passed to setExclusive()
lastCSValue; // The value passed to cs.setExclusive()
MovableCoordinatedStateImpl(ServerCoordinators const& c) : coordinators(c), cs(c) {}
// Reads the raw bytes from `cs` and decodes them. Empty bytes leave a default MovableValue (mode
// Active, empty value). If the embedded protocol version predates movable coordinated state, the raw
// bytes are taken as the value itself. If the decoded mode is MaybeTo, the interrupted move is resumed
// through moveTo(), which ends by throwing rather than returning. Otherwise returns the payload.
ACTOR static Future<Value> read(MovableCoordinatedStateImpl* self) {
state MovableValue moveState;
Value rawValue = wait(self->cs.read());
if (rawValue.size()) {
BinaryReader r(rawValue, IncludeVersion());
if (!r.protocolVersion().hasMovableCoordinatedState()) {
// Old coordinated state, not a MovableValue
moveState.value = rawValue;
} else
r >> moveState;
}
// SOMEDAY: If moveState.mode == MovingFrom, read (without locking) old state and assert that it corresponds
// with our state and is ReallyTo(coordinators)
if (moveState.mode == MovableValue::MaybeTo) {
TEST(true); // Maybe moveto state
ASSERT(moveState.other.present());
wait(self->moveTo(
self, &self->cs, ClusterConnectionString(moveState.other.get().toString()), moveState.value));
}
return moveState.value;
}
// Forwards to the wrapped CoordinatedState.
Future<Void> onConflict() { return cs.onConflict(); }
// Remembers `v` and its encoded form (an Active MovableValue at protocol version
// MovableCoordinatedStateV2) for a later move(), then writes the encoded form through `cs`.
Future<Void> setExclusive(Value v) {
lastValue = v;
lastCSValue = BinaryWriter::toValue(MovableValue(v, MovableValue::Active),
IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()));
return cs.setExclusive(lastCSValue.get());
}
// Does not return normally: every path ends in a throw (coordinators_changed after a completed move,
// new_coordinators_timed_out, or coordinated_state_conflict).
ACTOR static Future<Void> move(MovableCoordinatedStateImpl* self, ClusterConnectionString nc) {
// Call only after setExclusive returns. Attempts to move the coordinated state
// permanently to the new ServerCoordinators, which must be uninitialized. Returns when the process has
// reached the point where a leader elected by the new coordinators should be doing the rest of the work
// (and therefore the caller should die).
// Fresh CoordinatedState objects: `cs` for the current coordinators, `nccs` for the new ones.
state CoordinatedState cs(self->coordinators);
state CoordinatedState nccs(ServerCoordinators(makeReference<ClusterConnectionMemoryRecord>(nc)));
// One 30-second budget shared by the read and the write against the new coordinators.
state Future<Void> creationTimeout = delay(30);
ASSERT(self->lastValue.present() && self->lastCSValue.present());
TraceEvent("StartMove").detail("ConnectionString", nc.toString());
// Step 1: read the new coordinators and require that they hold nothing.
choose {
when(wait(creationTimeout)) { throw new_coordinators_timed_out(); }
when(Value ncInitialValue = wait(nccs.read())) {
ASSERT(!ncInitialValue.size()); // The new coordinators must be uninitialized!
}
}
TraceEvent("FinishedRead").detail("ConnectionString", nc.toString());
// Step 2: seed the new coordinators with our last value, marked MovingFrom the current
// connection string.
choose {
when(wait(creationTimeout)) { throw new_coordinators_timed_out(); }
when(wait(nccs.setExclusive(
BinaryWriter::toValue(MovableValue(self->lastValue.get(),
MovableValue::MovingFrom,
self->coordinators.ccr->getConnectionString().toString()),
IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()))))) {}
}
if (BUGGIFY)
wait(delay(5));
// Step 3: re-read the current coordinators; abort if someone else wrote in the meantime.
Value oldQuorumState = wait(cs.read());
if (oldQuorumState != self->lastCSValue.get()) {
TEST(true); // Quorum change aborted by concurrent write to old coordination state
TraceEvent("QuorumChangeAbortedByConcurrency").log();
throw coordinated_state_conflict();
}
// Step 4: commit the move on the current coordinators. moveTo() itself ends with a throw.
wait(self->moveTo(self, &cs, nc, self->lastValue.get()));
throw coordinators_changed();
}
// Writes a MaybeTo record naming `nc` (carrying `value`) through `coordinatedState`, then calls
// changeLeaderCoordinators() on self->coordinators with the new connection string, and finally throws
// coordinators_changed. Never returns normally.
ACTOR static Future<Void> moveTo(MovableCoordinatedStateImpl* self,
CoordinatedState* coordinatedState,
ClusterConnectionString nc,
Value value) {
wait(coordinatedState->setExclusive(
BinaryWriter::toValue(MovableValue(value, MovableValue::MaybeTo, nc.toString()),
IncludeVersion(ProtocolVersion::withMovableCoordinatedStateV2()))));
if (BUGGIFY)
wait(delay(5));
// SOMEDAY: If we are worried about someone magically getting the new cluster ID and interfering, do a second
// cs.setExclusive( encode( ReallyTo, ... ) )
TraceEvent("ChangingQuorum").detail("ConnectionString", nc.toString());
wait(changeLeaderCoordinators(self->coordinators, StringRef(nc.toString())));
TraceEvent("ChangedQuorum").detail("ConnectionString", nc.toString());
throw coordinators_changed();
}
};
// Move assignment, defaulted here in the translation unit where MovableCoordinatedStateImpl is fully
// defined.
MovableCoordinatedState& MovableCoordinatedState::operator=(MovableCoordinatedState&&) = default;
MovableCoordinatedState::MovableCoordinatedState(class ServerCoordinators const& coord)
: impl(std::make_unique<MovableCoordinatedStateImpl>(coord)) {}
// Defaulted here, in the translation unit where MovableCoordinatedStateImpl is fully defined.
// NOTE(review): presumably required because `impl` is a std::unique_ptr to a type that is only
// forward-declared in the header - confirm against CoordinatedState.h.
MovableCoordinatedState::~MovableCoordinatedState() = default;
// Reads and decodes the stored value via MovableCoordinatedStateImpl::read(). If the stored record
// shows a move in progress, that actor resumes the move instead of returning a value.
Future<Value> MovableCoordinatedState::read() {
    auto* const self = impl.get();
    return MovableCoordinatedStateImpl::read(self);
}
// Forwards to the implementation, which in turn forwards to the wrapped CoordinatedState.
Future<Void> MovableCoordinatedState::onConflict() {
    auto& self = *impl;
    return self.onConflict();
}
// Forwards `v` to the implementation, which wraps it in an Active MovableValue before writing it.
Future<Void> MovableCoordinatedState::setExclusive(Value v) {
    auto& self = *impl;
    return self.setExclusive(v);
}
// Starts moving the coordinated state to the coordinators named by `nc`, via
// MovableCoordinatedStateImpl::move(). That actor always finishes by throwing (coordinators_changed once
// the move has gone through, new_coordinators_timed_out or coordinated_state_conflict otherwise), so the
// returned future never completes with a value.
Future<Void> MovableCoordinatedState::move(ClusterConnectionString const& nc) {
    auto* const self = impl.get();
    return MovableCoordinatedStateImpl::move(self, nc);
}