Skip to content

Commit 14b8c98

Browse files
Log chain shutdown duration (ava-labs#1545)
1 parent a2ae740 commit 14b8c98

File tree

4 files changed

+95
-49
lines changed

4 files changed

+95
-49
lines changed

snow/networking/handler/handler.go

+30-3
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,13 @@ type Handler interface {
6363
Start(ctx context.Context, recoverPanic bool)
6464
Push(ctx context.Context, msg Message)
6565
Len() int
66+
6667
Stop(ctx context.Context)
6768
StopWithError(ctx context.Context, err error)
68-
Stopped() chan struct{}
69+
// AwaitStopped returns an error if the call would block and [ctx] is done.
70+
// Even if [ctx] is done when passed into this function, this function will
71+
// return a nil error if it will not block.
72+
AwaitStopped(ctx context.Context) (time.Duration, error)
6973
}
7074

7175
// handler passes incoming messages from the network to the consensus engine.
@@ -104,6 +108,8 @@ type handler struct {
104108
timeouts chan struct{}
105109

106110
closeOnce sync.Once
111+
startClosingTime time.Time
112+
totalClosingTime time.Duration
107113
closingChan chan struct{}
108114
numDispatchersClosed int
109115
// Closed when this handler and [engine] are done shutting down
@@ -305,6 +311,8 @@ func (h *handler) RegisterTimeout(d time.Duration) {
305311
// Note: It is possible for Stop to be called before/concurrently with Start.
306312
func (h *handler) Stop(ctx context.Context) {
307313
h.closeOnce.Do(func() {
314+
h.startClosingTime = h.clock.Time()
315+
308316
// Must hold the locks here to ensure there's no race condition in where
309317
// we check the value of [h.closing] after the call to [Signal].
310318
h.syncMessageQueue.Shutdown()
@@ -340,8 +348,19 @@ func (h *handler) StopWithError(ctx context.Context, err error) {
340348
h.Stop(ctx)
341349
}
342350

343-
func (h *handler) Stopped() chan struct{} {
344-
return h.closed
351+
func (h *handler) AwaitStopped(ctx context.Context) (time.Duration, error) {
352+
select {
353+
case <-h.closed:
354+
return h.totalClosingTime, nil
355+
default:
356+
}
357+
358+
select {
359+
case <-ctx.Done():
360+
return 0, ctx.Err()
361+
case <-h.closed:
362+
return h.totalClosingTime, nil
363+
}
345364
}
346365

347366
func (h *handler) dispatchSync(ctx context.Context) {
@@ -977,9 +996,17 @@ func (h *handler) shutdown(ctx context.Context) {
977996
if h.onStopped != nil {
978997
go h.onStopped()
979998
}
999+
1000+
h.totalClosingTime = h.clock.Time().Sub(h.startClosingTime)
9801001
close(h.closed)
9811002
}()
9821003

1004+
// shutdown may be called during Start, so we populate the start closing
1005+
// time here in case Stop was never called.
1006+
if h.startClosingTime.IsZero() {
1007+
h.startClosingTime = h.clock.Time()
1008+
}
1009+
9831010
state := h.ctx.State.Get()
9841011
engine, ok := h.engineManager.Get(state.Type).Get(state.State)
9851012
if !ok {

snow/networking/handler/mock_handler.go

+15-14
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

snow/networking/router/chain_router.go

+26-13
Original file line numberDiff line numberDiff line change
@@ -368,15 +368,21 @@ func (cr *ChainRouter) Shutdown(ctx context.Context) {
368368
chain.Stop(ctx)
369369
}
370370

371-
ticker := time.NewTicker(cr.closeTimeout)
372-
defer ticker.Stop()
371+
ctx, cancel := context.WithTimeout(ctx, cr.closeTimeout)
372+
defer cancel()
373373

374374
for _, chain := range prevChains {
375-
select {
376-
case <-chain.Stopped():
377-
case <-ticker.C:
378-
cr.log.Warn("timed out while shutting down the chains")
379-
return
375+
shutdownDuration, err := chain.AwaitStopped(ctx)
376+
377+
chainLog := chain.Context().Log
378+
if err != nil {
379+
chainLog.Warn("timed out while shutting down",
380+
zap.Error(err),
381+
)
382+
} else {
383+
chainLog.Info("chain shutdown",
384+
zap.Duration("shutdownDuration", shutdownDuration),
385+
)
380386
}
381387
}
382388
}
@@ -646,12 +652,19 @@ func (cr *ChainRouter) removeChain(ctx context.Context, chainID ids.ID) {
646652

647653
chain.Stop(ctx)
648654

649-
ticker := time.NewTicker(cr.closeTimeout)
650-
defer ticker.Stop()
651-
select {
652-
case <-chain.Stopped():
653-
case <-ticker.C:
654-
chain.Context().Log.Warn("timed out while shutting down")
655+
ctx, cancel := context.WithTimeout(ctx, cr.closeTimeout)
656+
shutdownDuration, err := chain.AwaitStopped(ctx)
657+
cancel()
658+
659+
chainLog := chain.Context().Log
660+
if err != nil {
661+
chainLog.Warn("timed out while shutting down",
662+
zap.Error(err),
663+
)
664+
} else {
665+
chainLog.Info("chain shutdown",
666+
zap.Duration("shutdownDuration", shutdownDuration),
667+
)
655668
}
656669

657670
if cr.onFatal != nil && cr.criticalChains.Contains(chainID) {

snow/networking/router/chain_router_test.go

+24-19
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,12 @@ const (
4242
)
4343

4444
func TestShutdown(t *testing.T) {
45+
require := require.New(t)
46+
4547
vdrs := validators.NewSet()
4648
err := vdrs.Add(ids.GenerateTestNodeID(), nil, ids.Empty, 1)
47-
require.NoError(t, err)
49+
require.NoError(err)
50+
4851
benchlist := benchlist.NewNoBenchlist()
4952
tm, err := timeout.NewManager(
5053
&timer.AdaptiveTimeoutConfig{
@@ -58,7 +61,7 @@ func TestShutdown(t *testing.T) {
5861
"",
5962
prometheus.NewRegistry(),
6063
)
61-
require.NoError(t, err)
64+
require.NoError(err)
6265
go tm.Dispatch()
6366

6467
chainRouter := ChainRouter{}
@@ -75,29 +78,30 @@ func TestShutdown(t *testing.T) {
7578
"",
7679
prometheus.NewRegistry(),
7780
)
78-
require.NoError(t, err)
81+
require.NoError(err)
7982

8083
shutdownCalled := make(chan struct{}, 1)
8184

82-
ctx := snow.DefaultConsensusContextTest()
85+
chainCtx := snow.DefaultConsensusContextTest()
8386
resourceTracker, err := tracker.NewResourceTracker(
8487
prometheus.NewRegistry(),
8588
resource.NoUsage,
8689
meter.ContinuousFactory{},
8790
time.Second,
8891
)
89-
require.NoError(t, err)
92+
require.NoError(err)
93+
9094
h, err := handler.New(
91-
ctx,
95+
chainCtx,
9296
vdrs,
9397
nil,
9498
time.Second,
9599
testThreadPoolSize,
96100
resourceTracker,
97101
validators.UnhandledSubnetConnector,
98-
subnets.New(ctx.NodeID, subnets.Config{}),
102+
subnets.New(chainCtx.NodeID, subnets.Config{}),
99103
)
100-
require.NoError(t, err)
104+
require.NoError(err)
101105

102106
bootstrapper := &common.BootstrapperTest{
103107
BootstrapableTest: common.BootstrapableTest{
@@ -110,7 +114,7 @@ func TestShutdown(t *testing.T) {
110114
bootstrapper.Default(true)
111115
bootstrapper.CantGossip = false
112116
bootstrapper.ContextF = func() *snow.ConsensusContext {
113-
return ctx
117+
return chainCtx
114118
}
115119
bootstrapper.ShutdownF = func(context.Context) error {
116120
shutdownCalled <- struct{}{}
@@ -125,7 +129,7 @@ func TestShutdown(t *testing.T) {
125129
engine.Default(true)
126130
engine.CantGossip = false
127131
engine.ContextF = func() *snow.ConsensusContext {
128-
return ctx
132+
return chainCtx
129133
}
130134
engine.ShutdownF = func(context.Context) error {
131135
shutdownCalled <- struct{}{}
@@ -147,7 +151,7 @@ func TestShutdown(t *testing.T) {
147151
Consensus: engine,
148152
},
149153
})
150-
ctx.State.Set(snow.EngineState{
154+
chainCtx.State.Set(snow.EngineState{
151155
Type: engineType,
152156
State: snow.NormalOp, // assumed bootstrapping is done
153157
})
@@ -161,18 +165,19 @@ func TestShutdown(t *testing.T) {
161165

162166
chainRouter.Shutdown(context.Background())
163167

164-
ticker := time.NewTicker(250 * time.Millisecond)
168+
ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
169+
defer cancel()
170+
165171
select {
166-
case <-ticker.C:
167-
t.Fatalf("Handler shutdown was not called or timed out after 250ms during chainRouter shutdown")
172+
case <-ctx.Done():
173+
require.FailNow("Handler shutdown was not called or timed out after 250ms during chainRouter shutdown")
168174
case <-shutdownCalled:
169175
}
170176

171-
select {
172-
case <-h.Stopped():
173-
default:
174-
t.Fatal("handler shutdown but never closed its closing channel")
175-
}
177+
shutdownDuration, err := h.AwaitStopped(ctx)
178+
require.NoError(err)
179+
require.GreaterOrEqual(shutdownDuration, time.Duration(0))
180+
require.Less(shutdownDuration, 250*time.Millisecond)
176181
}
177182

178183
func TestShutdownTimesOut(t *testing.T) {

0 commit comments

Comments
 (0)