Skip to content

Commit 432d674

Browse files
Zhengchao Liufacebook-github-bot
authored andcommitted
sample FS trace events for logging
Summary: The current fb303 counters only report aggregated latency while we want to track Eden performance under different version, os, channel, and configs. So I am setting up a new logging mechanism for this purpose. This diff introduces the class `FsEventLogger` for sampling and logging. There are 3 configs introduced by this diff. The configs are reloaded every 30 minutes. 1. `telemetry:request-sampling-config-allowlist` A list of config keys that we want to attach to scuba events. 2. `telemetry:request-samples-per-minute` Max number of events logged to scuba per minute per mount. 3. `telemetry:request-sampling-group-denominators` * Each type of operation has a "sampling group" (defaulted to 0, which is dropping all). * We use this sampling group as index to look up its denominator in this config. * The denominator is then used for sampling. e.g. `1/x` of the events are send to scuba, if we haven't reached the cap specified by #2. Example workflow: 1. receive tracing event 2. look up denominator of the sampling group of the operation type 3. sample based on the denominator 4. check that we have not exceeded the logging cap per min 5. create sample and send to scribe Reviewed By: xavierd Differential Revision: D30288054 fbshipit-source-id: 8f2b95c11c718550a8162f4d1259a25628f499ff
1 parent 025929c commit 432d674

File tree

10 files changed

+281
-17
lines changed

10 files changed

+281
-17
lines changed

eden/fs/config/EdenConfig.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,60 @@ class EdenConfig : private ConfigSettingManager {
529529
std::nullopt,
530530
this};
531531

532+
/**
533+
* Controls sample denominator for each request sampling group.
534+
* We assign request types into sampling groups based on their usage and
535+
* set a sample denominator for each sampling group so that we have the
536+
* flexibility of up/down-sampling different requests but also avoid having to
537+
* set a sampling rate for each of the dozens of request types. For example,
538+
* `mkdir` and `rmdir` can be assigned to a sampling group that have a high
539+
* sampling rate while `getattr` and `getxattr` to another sampling group with
540+
* low sampling rate as they happen very frequently.
541+
*
542+
* Sampling rates are calculated from sampling denominators. A denominator of
543+
* 0 indicates dropping all requests in the group. Group 0's value is ignored
544+
* as it's always considered as having denominator of 0. A positive
545+
* denominator means that the requests in the group are sampled at 1/x (so
546+
* denominator of 1 drops no events).
547+
*
548+
* We use sampling group as indexes into this vector to look
549+
* up their denominators. Thus, the size of this vector should match the
550+
* number of sampling groups defined by the enum `SamplingGroup`. If the
551+
* vector has fewer elements than the number of sampling groups, look-ups will
552+
* fail for the higher sampling groups and we will consider them having
553+
* denominator of 0. For example, if the vector has size of 3, all requests of
554+
* sampling group 4 will be dropped.
555+
* Keeping this vector in ascending order is recommended but not required.
556+
* e.g. {0, 10, 100, 1000, 10000}
557+
*/
558+
ConfigSetting<std::vector<uint32_t>> requestSamplingGroupDenominators{
559+
"telemetry:request-sampling-group-denominators",
560+
std::vector<uint32_t>{0, 0, 0, 0, 0},
561+
this};
562+
563+
/**
564+
* Controls the max number of requests per minute per mount that can be sent
565+
* for logging.
566+
* A request is first sampled based on its sampling group denominators. Then
567+
* if we have not reached this cap, the request is sent for logging.
568+
*/
569+
ConfigSetting<uint32_t> requestSamplesPerMinute{
570+
"telemetry:request-samples-per-minute",
571+
0,
572+
this};
573+
574+
/**
575+
* Controls which configs we want to send with the request logging.
576+
* The elements are full config keys, e.g. "hg:import-batch-size".
577+
* Elements not valid or not present in the config map are silently ignored.
578+
* This is only intended for facilitating A/B testing and should be empty if
579+
* there is no active experiment.
580+
*/
581+
ConfigSetting<std::vector<std::string>> requestSamplingConfigAllowlist{
582+
"telemetry:request-sampling-config-allowlist",
583+
std::vector<std::string>{},
584+
this};
585+
532586
// [experimental]
533587

534588
/**

eden/fs/fuse/FuseChannel.cpp

Lines changed: 51 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,12 @@
1616
#include <folly/logging/xlog.h>
1717
#include <folly/system/ThreadName.h>
1818
#include <signal.h>
19+
#include <chrono>
1920
#include <type_traits>
2021
#include "eden/fs/fuse/DirList.h"
2122
#include "eden/fs/fuse/FuseDispatcher.h"
2223
#include "eden/fs/fuse/FuseRequestContext.h"
24+
#include "eden/fs/telemetry/FsEventLogger.h"
2325
#include "eden/fs/utils/Bug.h"
2426
#include "eden/fs/utils/IDGen.h"
2527
#include "eden/fs/utils/Synchronized.h"
@@ -234,8 +236,14 @@ struct HandlerEntry {
234236
Handler h,
235237
FuseArgRenderer r,
236238
ChannelThreadStats::StatPtr s,
237-
AccessType at = AccessType::FsChannelOther)
238-
: name{n}, handler{h}, argRenderer{r}, stat{s}, accessType{at} {}
239+
AccessType at = AccessType::FsChannelOther,
240+
SamplingGroup samplingGroup = SamplingGroup::DropAll)
241+
: name{n},
242+
handler{h},
243+
argRenderer{r},
244+
stat{s},
245+
samplingGroup{samplingGroup},
246+
accessType{at} {}
239247

240248
std::string getShortName() const {
241249
if (name.startsWith("FUSE_")) {
@@ -260,6 +268,7 @@ struct HandlerEntry {
260268
Handler handler = nullptr;
261269
FuseArgRenderer argRenderer = nullptr;
262270
ChannelThreadStats::StatPtr stat = nullptr;
271+
SamplingGroup samplingGroup = SamplingGroup::DropAll;
263272
AccessType accessType = AccessType::FsChannelOther;
264273
};
265274

@@ -351,13 +360,15 @@ constexpr auto kFuseHandlers = [] {
351360
&FuseChannel::fuseRead,
352361
&argrender::read,
353362
&ChannelThreadStats::read,
354-
Read};
363+
Read,
364+
SamplingGroup::Three};
355365
handlers[FUSE_WRITE] = {
356366
"FUSE_WRITE",
357367
&FuseChannel::fuseWrite,
358368
&argrender::write,
359369
&ChannelThreadStats::write,
360-
Write};
370+
Write,
371+
SamplingGroup::Four};
361372
handlers[FUSE_STATFS] = {
362373
"FUSE_STATFS",
363374
&FuseChannel::fuseStatFs,
@@ -415,7 +426,8 @@ constexpr auto kFuseHandlers = [] {
415426
&FuseChannel::fuseReadDir,
416427
&argrender::readdir,
417428
&ChannelThreadStats::readdir,
418-
Read};
429+
Read,
430+
SamplingGroup::Two};
419431
handlers[FUSE_RELEASEDIR] = {
420432
"FUSE_RELEASEDIR",
421433
&FuseChannel::fuseReleaseDir,
@@ -590,6 +602,11 @@ iovec make_iovec(const T& t) {
590602
return iov;
591603
}
592604

605+
SamplingGroup fuseOpcodeSamplingGroup(uint32_t opcode) {
606+
auto* entry = lookupFuseHandlerEntry(opcode);
607+
return entry ? entry->samplingGroup : SamplingGroup::DropAll;
608+
}
609+
593610
} // namespace
594611

595612
StringPiece fuseOpcodeName(uint32_t opcode) {
@@ -779,6 +796,7 @@ FuseChannel::FuseChannel(
779796
std::unique_ptr<FuseDispatcher> dispatcher,
780797
const folly::Logger* straceLogger,
781798
std::shared_ptr<ProcessNameCache> processNameCache,
799+
std::shared_ptr<FsEventLogger> fsEventLogger,
782800
folly::Duration requestTimeout,
783801
Notifications* notifications,
784802
CaseSensitivity caseSensitive,
@@ -804,20 +822,43 @@ FuseChannel::FuseChannel(
804822
installSignalHandler();
805823

806824
traceSubscriptionHandles_.push_back(traceBus_->subscribeFunction(
807-
"FuseChannel request tracking", [this](const FuseTraceEvent& event) {
825+
"FuseChannel request tracking",
826+
[this,
827+
fsEventLogger = std::move(fsEventLogger)](const FuseTraceEvent& event) {
808828
switch (event.getType()) {
809829
case FuseTraceEvent::START: {
810830
auto state = telemetryState_.wlock();
811831
auto [iter, inserted] = state->requests.emplace(
812832
event.getUnique(),
813-
OutstandingRequest{event.getUnique(), event.getRequest()});
833+
OutstandingRequest{
834+
event.getUnique(),
835+
event.getRequest(),
836+
event.monotonicTime});
814837
XCHECK(inserted) << "duplicate fuse start event";
815838
break;
816839
}
817840
case FuseTraceEvent::FINISH: {
818-
auto state = telemetryState_.wlock();
819-
auto erased = state->requests.erase(event.getUnique());
820-
XCHECK(erased) << "duplicate fuse finish event";
841+
uint64_t durationNs = 0;
842+
{
843+
auto state = telemetryState_.wlock();
844+
auto it = state->requests.find(event.getUnique());
845+
XCHECK(it != state->requests.end())
846+
<< "duplicate fuse finish event";
847+
durationNs =
848+
std::chrono::duration_cast<std::chrono::nanoseconds>(
849+
event.monotonicTime - it->second.requestStartTime)
850+
.count();
851+
state->requests.erase(it);
852+
}
853+
854+
if (fsEventLogger) {
855+
auto opcode = event.getRequest().opcode;
856+
fsEventLogger->log({
857+
durationNs,
858+
fuseOpcodeSamplingGroup(opcode),
859+
fuseOpcodeName(opcode),
860+
});
861+
}
821862
break;
822863
}
823864
}

eden/fs/fuse/FuseChannel.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ struct Unit;
4141
namespace facebook::eden {
4242

4343
class Notifications;
44+
class FsEventLogger;
4445
class FuseRequestContext;
4546

4647
using TraceDetailedArgumentsHandle = std::shared_ptr<void>;
@@ -205,6 +206,7 @@ class FuseChannel {
205206
struct OutstandingRequest {
206207
uint64_t unique;
207208
FuseTraceEvent::RequestHeader request;
209+
std::chrono::steady_clock::time_point requestStartTime;
208210
};
209211

210212
/**
@@ -225,6 +227,7 @@ class FuseChannel {
225227
std::unique_ptr<FuseDispatcher> dispatcher,
226228
const folly::Logger* straceLogger,
227229
std::shared_ptr<ProcessNameCache> processNameCache,
230+
std::shared_ptr<FsEventLogger> fsEventLogger,
228231
folly::Duration requestTimeout,
229232
Notifications* FOLLY_NULLABLE notifications,
230233
CaseSensitivity caseSensitive,

eden/fs/fuse/fuse_tester/main.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,11 +133,12 @@ int main(int argc, char** argv) {
133133
std::move(dispatcher),
134134
&straceLogger,
135135
std::make_shared<ProcessNameCache>(),
136+
/*fsEventLogger=*/nullptr,
136137
std::chrono::seconds(60),
137-
nullptr,
138+
/*notifications=*/nullptr,
138139
CaseSensitivity::Sensitive,
139-
true,
140-
12 /* the default on Linux */));
140+
/*requireUtf8Path=*/true,
141+
/*maximumBackgroundRequests=*/12 /* the default on Linux */));
141142

142143
XLOG(INFO) << "Starting FUSE...";
143144
auto completionFuture = channel->initialize().get();

eden/fs/fuse/test/FuseChannelTest.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,12 @@ class FuseChannelTest : public ::testing::Test {
7676
std::move(testDispatcher),
7777
&straceLogger,
7878
std::make_shared<ProcessNameCache>(),
79+
/*fsEventLogger=*/nullptr,
7980
std::chrono::seconds(60),
80-
nullptr,
81+
/*notifications=*/nullptr,
8182
CaseSensitivity::Sensitive,
82-
true,
83-
12));
83+
/*requireUtf8Path=*/true,
84+
/*maximumBackgroundRequests=*/12));
8485
}
8586

8687
FuseChannel::StopFuture performInit(

eden/fs/inodes/EdenMount.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1381,6 +1381,7 @@ std::unique_ptr<FuseChannel, FuseChannelDeleter> makeFuseChannel(
13811381
EdenDispatcherFactory::makeFuseDispatcher(mount),
13821382
&mount->getStraceLogger(),
13831383
mount->getServerState()->getProcessNameCache(),
1384+
mount->getServerState()->getFsEventLogger(),
13841385
std::chrono::duration_cast<folly::Duration>(
13851386
edenConfig->fuseRequestTimeout.getValue()),
13861387
mount->getServerState()->getNotifications(),

eden/fs/inodes/ServerState.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "eden/fs/config/EdenConfig.h"
1414
#include "eden/fs/model/git/TopLevelIgnores.h"
15+
#include "eden/fs/telemetry/FsEventLogger.h"
1516
#include "eden/fs/utils/Clock.h"
1617
#include "eden/fs/utils/FaultInjector.h"
1718
#include "eden/fs/utils/UnboundedQueueExecutor.h"
@@ -22,6 +23,14 @@ DEFINE_bool(
2223
"Block mount attempts via the fault injection framework. "
2324
"Requires --enable_fault_injection.");
2425

26+
namespace {
27+
#if defined(EDEN_HAVE_HIVE_LOGGER)
28+
constexpr auto kHasHiveLogger = true;
29+
#else
30+
constexpr auto kHasHiveLogger = false;
31+
#endif
32+
} // namespace
33+
2534
namespace facebook {
2635
namespace eden {
2736

@@ -58,7 +67,11 @@ ServerState::ServerState(
5867
systemIgnoreFileMonitor_{CachedParsedFileMonitor<GitIgnoreFileParser>{
5968
edenConfig->systemIgnoreFile.getValue(),
6069
kSystemIgnoreMinPollSeconds}},
61-
notifications_(config_) {
70+
notifications_(config_),
71+
fsEventLogger_{
72+
(kHasHiveLogger && edenConfig->requestSamplesPerMinute.getValue())
73+
? std::make_shared<FsEventLogger>(config_, hiveLogger_)
74+
: nullptr} {
6275
// It would be nice if we eventually built a more generic mechanism for
6376
// defining faults to be configured on start up. (e.g., loading this from the
6477
// EdenConfig).

eden/fs/inodes/ServerState.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class Clock;
2727
class EdenConfig;
2828
class FaultInjector;
2929
class IHiveLogger;
30+
class FsEventLogger;
3031
class ProcessNameCache;
3132
class StructuredLogger;
3233
class TopLevelIgnores;
@@ -155,6 +156,15 @@ class ServerState {
155156
return hiveLogger_.get();
156157
}
157158

159+
/**
160+
* Returns a pointer to the FsEventLogger for logging FS event samples, if the
161+
* platform supports it. Otherwise, returns nullptr. The caller is responsible
162+
* for null checking.
163+
*/
164+
std::shared_ptr<FsEventLogger> getFsEventLogger() const {
165+
return fsEventLogger_;
166+
}
167+
158168
FaultInjector& getFaultInjector() {
159169
return *faultInjector_;
160170
}
@@ -182,6 +192,7 @@ class ServerState {
182192
folly::Synchronized<CachedParsedFileMonitor<GitIgnoreFileParser>>
183193
systemIgnoreFileMonitor_;
184194
Notifications notifications_;
195+
std::shared_ptr<FsEventLogger> fsEventLogger_;
185196
};
186197
} // namespace eden
187198
} // namespace facebook

0 commit comments

Comments
 (0)