Skip to content

Commit 2d5118f

Browse files
MichaelCuevasfacebook-github-bot
authored andcommitted
TakeoverClient: add simulated errors for testing
Summary: # This diff Adds a new config that will force the TakeoverClient to crash prior to receiving takeover data from the existing server. # Context Crashes during graceful restart are plaguing macOS. Although these happen in the background, these greatly affect users because they cause Eden daemons to exit uncleanly. Because of the unclean exit, mount points get left in the mount table and cause all sorts of issues. The biggest issues are: 1) Any attempts to perform IO into the mount point fail 2) Any subsequent attempts to remount the mount point fail. These attempts fail because of #1, and Eden's startup/mount codepaths haven't been taught how to deal with this. The goal of this stack is to: 1) Introduce a mechanism to easily repro graceful restart crashes (both crashes that occur in the TakeoverClient and TakeoverServer) 2) Use the new mechanism to troubleshoot issue #2 above 3) Use the learnings from #2 to make EdenFS automatically recover from graceful restart crashes (unmount stale mounts and successfully remount any mounts that were mounted prior to the crash). Reviewed By: jdelliot Differential Revision: D72490700 fbshipit-source-id: f86b7d1317ce8b5accbca3c6102d83e277b213ce
1 parent 05d8a85 commit 2d5118f

File tree

6 files changed

+57
-9
lines changed

6 files changed

+57
-9
lines changed

eden/fs/config/EdenConfig.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,18 @@ class EdenConfig : private ConfigSettingManager {
231231
12,
232232
this};
233233

234+
/**
235+
* Whether graceful takeover client should purposefully return an exception
236+
* during the takeover process. Intended to be used only in integration tests.
237+
*
238+
* NOTE: if you want the takeover server to throw an exception, you can use
239+
* FaultInjector with the key/value pair: ("takeover", "error during send")
240+
*/
241+
ConfigSetting<bool> clientThrowsDuringTakeover{
242+
"core:client-throws-during-takeover",
243+
false,
244+
this};
245+
234246
// [config]
235247

236248
/**

eden/fs/service/EdenServer.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1143,6 +1143,12 @@ Future<Unit> EdenServer::prepare(std::shared_ptr<StartupLogger> logger) {
11431143
[this] { runningState_.wlock()->state = RunState::RUNNING; });
11441144
}
11451145

1146+
namespace {
1147+
bool shouldThrowDuringTakeover(const EdenConfig& config) {
1148+
return config.clientThrowsDuringTakeover.getValue();
1149+
}
1150+
} // namespace
1151+
11461152
Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
11471153
bool doingTakeover = false;
11481154
if (!edenDir_.acquireLock()) {
@@ -1177,7 +1183,9 @@ Future<Unit> EdenServer::prepareImpl(std::shared_ptr<StartupLogger> logger) {
11771183
logger->log(
11781184
"Requesting existing edenfs process to gracefully "
11791185
"transfer its mount points...");
1180-
takeoverData = takeoverMounts(takeoverPath);
1186+
takeoverData = takeoverMounts(
1187+
takeoverPath,
1188+
shouldThrowDuringTakeover(*serverState_->getEdenConfig()));
11811189
logger->log(
11821190
"Received takeover information for ",
11831191
takeoverData.mountPoints.size(),

eden/fs/takeover/TakeoverClient.cpp

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <folly/io/async/EventBase.h>
1414
#include <folly/logging/xlog.h>
1515
#include <thrift/lib/cpp2/protocol/Serializer.h>
16+
#include "eden/common/utils/FaultInjector.h"
1617
#include "eden/common/utils/FutureUnixSocket.h"
1718
#include "eden/fs/takeover/TakeoverData.h"
1819
#include "eden/fs/takeover/gen-cpp2/takeover_types.h"
@@ -33,6 +34,7 @@ namespace facebook::eden {
3334

3435
TakeoverData takeoverMounts(
3536
AbsolutePathPiece socketPath,
37+
bool shouldThrowDuringTakeover,
3638
bool shouldPing,
3739
const std::set<int32_t>& supportedVersions,
3840
const uint64_t supportedTakeoverCapabilities) {
@@ -61,17 +63,32 @@ TakeoverData takeoverMounts(
6163
auto timeout = std::chrono::seconds(FLAGS_takeoverReceiveTimeout);
6264
return socket.receive(timeout);
6365
})
64-
.thenValue([&socket, shouldPing](UnixSocket::Message&& msg) {
66+
.thenValue([&socket, shouldPing, shouldThrowDuringTakeover](
67+
UnixSocket::Message&& msg) mutable {
6568
if (TakeoverData::isPing(&msg.data)) {
6669
if (shouldPing) {
6770
// Just send an empty message back here, the server knows it sent a
6871
// ping so it does not need to parse the message.
6972
UnixSocket::Message ping;
70-
return socket.send(std::move(ping)).thenValue([&socket](auto&&) {
71-
// Wait for the takeover data response
72-
auto timeout = std::chrono::seconds(FLAGS_takeoverReceiveTimeout);
73-
return socket.receive(timeout);
74-
});
73+
return socket.send(std::move(ping))
74+
.thenValue([&socket,
75+
shouldThrowDuringTakeover](auto&&) mutable {
76+
// Possibly simulate a takeover error during data transfer
77+
// for testing purposes. While we would prefer to use
78+
// fault injection here, it's not possible to inject an
79+
// error into the TakeoverClient because the thrift server
80+
// is not yet running.
81+
if (shouldThrowDuringTakeover) {
82+
// throw std::runtime_error("simulated takeover error");
83+
return folly::makeFuture<UnixSocket::Message>(
84+
folly::exception_wrapper(
85+
std::runtime_error("simulated takeover error")));
86+
}
87+
// Wait for the takeover data response
88+
auto timeout =
89+
std::chrono::seconds(FLAGS_takeoverReceiveTimeout);
90+
return socket.receive(timeout);
91+
});
7592
} else {
7693
// This should only be hit during integration tests.
7794
return folly::makeFuture<UnixSocket::Message>(

eden/fs/takeover/TakeoverClient.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ TakeoverData takeoverMounts(
2121
AbsolutePathPiece socketPath,
2222
// the following parameters are present for testing purposes and should not
2323
// normally be used in the production build.
24+
bool shouldThrowDuringTakeover = false,
2425
bool shouldPing = true,
2526
const std::set<int32_t>& supportedTakeoverVersions =
2627
kSupportedTakeoverVersions,

eden/fs/takeover/test/TakeoverTest.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Future<TakeoverData> takeoverViaEventBase(
7979
promise.setWith([&] {
8080
return takeoverMounts(
8181
path,
82+
/*shouldThrowDuringTakeover=*/false,
8283
/*shouldPing=*/true,
8384
supportedVersions,
8485
supportedCapabilities);

eden/integration/helpers/TakeoverTool.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ DEFINE_bool(
2929
true,
3030
"This is used by integration tests to avoid sending a ping");
3131

32+
DEFINE_bool(
33+
shouldThrowDuringTakeover,
34+
false,
35+
"This can be used by tests to initiate an error in the TakeoverClient");
36+
3237
using namespace facebook::eden::path_literals;
3338

3439
/*
@@ -55,11 +60,15 @@ int main(int argc, char* argv[]) {
5560

5661
facebook::eden::TakeoverData data;
5762
if (FLAGS_takeoverVersion == 0) {
58-
data = facebook::eden::takeoverMounts(takeoverSocketPath, FLAGS_shouldPing);
63+
data = facebook::eden::takeoverMounts(
64+
takeoverSocketPath, FLAGS_shouldThrowDuringTakeover, FLAGS_shouldPing);
5965
} else {
6066
auto takeoverVersion = std::set<int32_t>{FLAGS_takeoverVersion};
6167
data = facebook::eden::takeoverMounts(
62-
takeoverSocketPath, FLAGS_shouldPing, takeoverVersion);
68+
takeoverSocketPath,
69+
FLAGS_shouldThrowDuringTakeover,
70+
FLAGS_shouldPing,
71+
takeoverVersion);
6372
}
6473
for (const auto& mount : data.mountPoints) {
6574
const folly::File* mountFD = nullptr;

0 commit comments

Comments
 (0)