Skip to content

Commit d1b4d92

Browse files
committed
Address code review comments III
Signed-off-by: Yacov Manevich <yacov.manevich@avalabs.org>
1 parent c587e6f commit d1b4d92

15 files changed

+230
-333
lines changed

RELEASES.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22

33
## [v1.12.3] (pending)
44

5-
- Added a health check that alerts if a primary network validator has no nodes connected to it. Runs a configurable time after bootstrapping or 20 minutes by default.
5+
- Extended the network health check by also alerting if a primary network validator has no nodes connected to it. Runs a configurable time after startup or 10 minutes by default.
66

77
### Configs
8-
- How long after bootstrapping the aforementioned health check runs can be configued via:
9-
`--no-ingress-validator-connection-timeout`
8+
- How long after startup the aforementioned health check runs can be configured via:
9+
`--network-no-ingress-connections-grace-period`
1010

1111

1212
## [v1.12.2](https://github.com/ava-labs/avalanchego/releases/tag/v1.12.2)

config/config.go

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -327,13 +327,14 @@ func getNetworkConfig(
327327
},
328328

329329
HealthConfig: network.HealthConfig{
330-
Enabled: sybilProtectionEnabled,
331-
MaxTimeSinceMsgSent: v.GetDuration(NetworkHealthMaxTimeSinceMsgSentKey),
332-
MaxTimeSinceMsgReceived: v.GetDuration(NetworkHealthMaxTimeSinceMsgReceivedKey),
333-
MaxPortionSendQueueBytesFull: v.GetFloat64(NetworkHealthMaxPortionSendQueueFillKey),
334-
MinConnectedPeers: v.GetUint(NetworkHealthMinPeersKey),
335-
MaxSendFailRate: v.GetFloat64(NetworkHealthMaxSendFailRateKey),
336-
SendFailRateHalflife: halflife,
330+
Enabled: sybilProtectionEnabled,
331+
MaxTimeSinceMsgSent: v.GetDuration(NetworkHealthMaxTimeSinceMsgSentKey),
332+
MaxTimeSinceMsgReceived: v.GetDuration(NetworkHealthMaxTimeSinceMsgReceivedKey),
333+
MaxPortionSendQueueBytesFull: v.GetFloat64(NetworkHealthMaxPortionSendQueueFillKey),
334+
MinConnectedPeers: v.GetUint(NetworkHealthMinPeersKey),
335+
MaxSendFailRate: v.GetFloat64(NetworkHealthMaxSendFailRateKey),
336+
SendFailRateHalflife: halflife,
337+
NoIngressValidatorConnectionGracePeriod: v.GetDuration(NetworkNoIngressValidatorConnectionsGracePeriodKey),
337338
},
338339

339340
ProxyEnabled: v.GetBool(NetworkTCPProxyEnabledKey),
@@ -1437,7 +1438,7 @@ func GetNodeConfig(v *viper.Viper) (node.Config, error) {
14371438
nodeConfig.SystemTrackerProcessingHalflife = v.GetDuration(SystemTrackerProcessingHalflifeKey)
14381439
nodeConfig.SystemTrackerCPUHalflife = v.GetDuration(SystemTrackerCPUHalflifeKey)
14391440
nodeConfig.SystemTrackerDiskHalflife = v.GetDuration(SystemTrackerDiskHalflifeKey)
1440-
nodeConfig.NoIngressValidatorConnectionTimeout = v.GetDuration(NoIngressValidatorConnectionTimeoutKey)
1441+
nodeConfig.NoIngressValidatorConnectionTimeout = v.GetDuration(NetworkNoIngressValidatorConnectionsGracePeriodKey)
14411442

14421443
nodeConfig.RequiredAvailableDiskSpace, nodeConfig.WarningThresholdAvailableDiskSpace, err = getDiskSpaceConfig(v)
14431444
if err != nil {

config/flags.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ func addNodeFlags(fs *pflag.FlagSet) {
169169
fs.Duration(NetworkReadHandshakeTimeoutKey, constants.DefaultNetworkReadHandshakeTimeout, "Timeout value for reading handshake messages")
170170
fs.Duration(NetworkPingTimeoutKey, constants.DefaultPingPongTimeout, "Timeout value for Ping-Pong with a peer")
171171
fs.Duration(NetworkPingFrequencyKey, constants.DefaultPingFrequency, "Frequency of pinging other peers")
172-
fs.Duration(NoIngressValidatorConnectionTimeoutKey, constants.DefaultNoIngressValidatorConnectionTimeout, "Time after which nodes are expected to be connected to us if we are a primary network validator, otherwise a health check fails")
172+
fs.Duration(NetworkNoIngressValidatorConnectionsGracePeriodKey, constants.DefaultNoIngressValidatorConnectionGracePeriod, "Time after which nodes are expected to be connected to us if we are a primary network validator, otherwise a health check fails")
173173
fs.String(NetworkCompressionTypeKey, constants.DefaultNetworkCompressionType.String(), fmt.Sprintf("Compression type for outbound messages. Must be one of [%s, %s]", compression.TypeZstd, compression.TypeNone))
174174

175175
fs.Duration(NetworkMaxClockDifferenceKey, constants.DefaultNetworkMaxClockDifference, "Max allowed clock difference value between this node and peers")

config/keys.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ const (
120120
NetworkInboundThrottlerMaxConnsPerSecKey = "network-inbound-connection-throttling-max-conns-per-sec"
121121
NetworkOutboundConnectionThrottlingRpsKey = "network-outbound-connection-throttling-rps"
122122
NetworkOutboundConnectionTimeoutKey = "network-outbound-connection-timeout"
123+
NetworkNoIngressValidatorConnectionsGracePeriodKey = "network-no-ingress-connections-grace-period"
123124
BenchlistFailThresholdKey = "benchlist-fail-threshold"
124125
BenchlistDurationKey = "benchlist-duration"
125126
BenchlistMinFailingDurationKey = "benchlist-min-failing-duration"
@@ -191,7 +192,6 @@ const (
191192
SystemTrackerDiskHalflifeKey = "system-tracker-disk-halflife"
192193
SystemTrackerRequiredAvailableDiskSpaceKey = "system-tracker-disk-required-available-space"
193194
SystemTrackerWarningThresholdAvailableDiskSpaceKey = "system-tracker-disk-warning-threshold-available-space"
194-
NoIngressValidatorConnectionTimeoutKey = "no-ingress-validator-connection-timeout"
195195
DiskVdrAllocKey = "throttler-inbound-disk-validator-alloc"
196196
DiskMaxNonVdrUsageKey = "throttler-inbound-disk-max-non-validator-usage"
197197
DiskMaxNonVdrNodeUsageKey = "throttler-inbound-disk-max-non-validator-node-usage"

network/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ type HealthConfig struct {
2626
// Marks if the health check should be enabled
2727
Enabled bool `json:"-"`
2828

29+
// NoIngressValidatorConnectionGracePeriod denotes the time after which the health check fails
30+
// for primary network validators with no ingress connections.
31+
NoIngressValidatorConnectionGracePeriod time.Duration
32+
2933
// MinConnectedPeers is the minimum number of peers that the network should
3034
// be connected to be considered healthy.
3135
MinConnectedPeers uint `json:"minConnectedPeers"`

network/network.go

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,11 @@ import (
4242
)
4343

4444
const (
45-
ConnectedPeersKey = "connectedPeers"
46-
TimeSinceLastMsgReceivedKey = "timeSinceLastMsgReceived"
47-
TimeSinceLastMsgSentKey = "timeSinceLastMsgSent"
48-
SendFailRateKey = "sendFailRate"
45+
PrimaryNetworkValidatorHealthKey = "primary network validator health"
46+
ConnectedPeersKey = "connectedPeers"
47+
TimeSinceLastMsgReceivedKey = "timeSinceLastMsgReceived"
48+
TimeSinceLastMsgSentKey = "timeSinceLastMsgSent"
49+
SendFailRateKey = "sendFailRate"
4950
)
5051

5152
var (
@@ -156,6 +157,9 @@ type network struct {
156157
connectedPeers peer.Set
157158
closing bool
158159

160+
ingressConnAlerter noIngressConnAlert
161+
startupTime time.Time
162+
159163
// router is notified about all peer [Connected] and [Disconnected] events
160164
// as well as all non-handshake peer messages.
161165
//
@@ -259,38 +263,34 @@ func NewNetwork(
259263
ipTracker.ManuallyTrack(nodeID)
260264
}
261265

262-
// ingressConnCount tracks connections to all remote peers,
263-
// so for safety we set it as pointer, in case the config object is copied.
264-
var ingressConnCount atomic.Uint64
265-
266266
peerConfig := &peer.Config{
267-
ReadBufferSize: config.PeerReadBufferSize,
268-
WriteBufferSize: config.PeerWriteBufferSize,
269-
Metrics: peerMetrics,
270-
MessageCreator: msgCreator,
271-
IngressConnectionCount: &ingressConnCount,
272-
Log: log,
273-
InboundMsgThrottler: inboundMsgThrottler,
274-
Network: nil, // This is set below.
275-
Router: router,
276-
VersionCompatibility: version.GetCompatibility(minCompatibleTime),
277-
MyNodeID: config.MyNodeID,
278-
MySubnets: config.TrackedSubnets,
279-
Beacons: config.Beacons,
280-
Validators: config.Validators,
281-
NetworkID: config.NetworkID,
282-
PingFrequency: config.PingFrequency,
283-
PongTimeout: config.PingPongTimeout,
284-
MaxClockDifference: config.MaxClockDifference,
285-
SupportedACPs: config.SupportedACPs.List(),
286-
ObjectedACPs: config.ObjectedACPs.List(),
287-
ResourceTracker: config.ResourceTracker,
288-
UptimeCalculator: config.UptimeCalculator,
289-
IPSigner: peer.NewIPSigner(config.MyIPPort, config.TLSKey, config.BLSKey),
267+
ReadBufferSize: config.PeerReadBufferSize,
268+
WriteBufferSize: config.PeerWriteBufferSize,
269+
Metrics: peerMetrics,
270+
MessageCreator: msgCreator,
271+
Log: log,
272+
InboundMsgThrottler: inboundMsgThrottler,
273+
Network: nil, // This is set below.
274+
Router: router,
275+
VersionCompatibility: version.GetCompatibility(minCompatibleTime),
276+
MyNodeID: config.MyNodeID,
277+
MySubnets: config.TrackedSubnets,
278+
Beacons: config.Beacons,
279+
Validators: config.Validators,
280+
NetworkID: config.NetworkID,
281+
PingFrequency: config.PingFrequency,
282+
PongTimeout: config.PingPongTimeout,
283+
MaxClockDifference: config.MaxClockDifference,
284+
SupportedACPs: config.SupportedACPs.List(),
285+
ObjectedACPs: config.ObjectedACPs.List(),
286+
ResourceTracker: config.ResourceTracker,
287+
UptimeCalculator: config.UptimeCalculator,
288+
IPSigner: peer.NewIPSigner(config.MyIPPort, config.TLSKey, config.BLSKey),
290289
}
291290

292291
onCloseCtx, cancel := context.WithCancel(context.Background())
293292
n := &network{
293+
startupTime: time.Now(),
294294
config: config,
295295
peerConfig: peerConfig,
296296
metrics: metrics,
@@ -318,6 +318,11 @@ func NewNetwork(
318318
router: router,
319319
}
320320
n.peerConfig.Network = n
321+
n.ingressConnAlerter = noIngressConnAlert{
322+
ingressConnections: n,
323+
validators: config.Validators,
324+
selfID: config.MyNodeID,
325+
}
321326
return n, nil
322327
}
323328

@@ -408,6 +413,15 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) {
408413
details[SendFailRateKey] = sendFailRate
409414
n.metrics.sendFailRate.Set(sendFailRate)
410415

416+
reachablePrimaryNetworkValidator := true
417+
// Make sure if we're a primary network validator, we have ingress connections
418+
if time.Since(n.startupTime) > n.config.NoIngressValidatorConnectionGracePeriod {
419+
connectedPrimaryValidatorInfo, isConnectedPrimaryValidatorErr := n.ingressConnAlerter.checkHealth()
420+
reachablePrimaryNetworkValidator = isConnectedPrimaryValidatorErr == nil
421+
details[PrimaryNetworkValidatorHealthKey] = connectedPrimaryValidatorInfo
422+
}
423+
healthy = healthy && reachablePrimaryNetworkValidator
424+
411425
// emit metrics about the lifetime of peer connections
412426
n.metrics.updatePeerConnectionLifetimeMetrics()
413427

@@ -434,6 +448,11 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) {
434448
if !isMsgFailRate {
435449
errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate))
436450
}
451+
452+
if !reachablePrimaryNetworkValidator {
453+
errorReasons = append(errorReasons, "primary network validator is unreachable")
454+
}
455+
437456
return details, fmt.Errorf("network layer is unhealthy reason: %s", strings.Join(errorReasons, ", "))
438457
}
439458

network/network_test.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,11 @@ func TestIngressConnCount(t *testing.T) {
325325

326326
wg.Done()
327327

328+
for _, net := range networks {
329+
net.config.NoIngressValidatorConnectionGracePeriod = 0
330+
net.config.HealthConfig.Enabled = true
331+
}
332+
328333
require.Eventually(func() bool {
329334
result := true
330335
for _, net := range networks {
@@ -334,15 +339,20 @@ func TestIngressConnCount(t *testing.T) {
334339
}, time.Minute, time.Millisecond*10)
335340

336341
ingressConnections := make([]int, 0, len(networks))
342+
healthCheckErrors := make([]error, 0, len(networks))
337343

338344
for _, net := range networks {
339345
ingressConnections = append(ingressConnections, net.IngressConnCount())
346+
_, err := net.HealthCheck(context.Background())
347+
healthCheckErrors = append(healthCheckErrors, err)
340348
}
341349

342350
// First node has all nodes connected to it.
343351
// Second node has only the third node connected to it.
344352
// Third node has no node connected to it, as it connects to the first and second node.
345353
require.Equal([]int{2, 1, 0}, ingressConnections)
354+
require.Equal([]error{nil, nil}, healthCheckErrors[:2])
355+
require.EqualError(healthCheckErrors[2], "network layer is unhealthy reason: primary network validator is unreachable")
346356
}
347357

348358
func TestSend(t *testing.T) {

network/no_ingress_conn_alert.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
2+
// See the file LICENSE for licensing terms.
3+
4+
package network
5+
6+
import (
7+
"errors"
8+
9+
"github.com/ava-labs/avalanchego/ids"
10+
"github.com/ava-labs/avalanchego/snow/validators"
11+
"github.com/ava-labs/avalanchego/utils/constants"
12+
)
13+
14+
// ErrNoIngressConnections denotes that no node is connected to this validator.
15+
var ErrNoIngressConnections = errors.New("no ingress connections")
16+
17+
type ingressConnectionCounter interface {
18+
IngressConnCount() int
19+
}
20+
21+
type validatorRetriever interface {
22+
GetValidator(subnetID ids.ID, nodeID ids.NodeID) (*validators.Validator, bool)
23+
}
24+
25+
type noIngressConnAlert struct {
26+
selfID ids.NodeID
27+
ingressConnections ingressConnectionCounter
28+
validators validatorRetriever
29+
}
30+
31+
func ingressConnResult(n int, areWeValidator bool) map[string]interface{} {
32+
return map[string]interface{}{"ingressConnectionCount": n, "primary network validator": areWeValidator}
33+
}
34+
35+
func (nica *noIngressConnAlert) checkHealth() (interface{}, error) {
36+
connCount := nica.ingressConnections.IngressConnCount()
37+
_, areWeValidator := nica.validators.GetValidator(constants.PrimaryNetworkID, nica.selfID)
38+
39+
if connCount > 0 {
40+
return ingressConnResult(connCount, areWeValidator), nil
41+
}
42+
43+
if !areWeValidator {
44+
return ingressConnResult(connCount, areWeValidator), nil
45+
}
46+
47+
return ingressConnResult(connCount, areWeValidator), ErrNoIngressConnections
48+
}

network/no_ingress_conn_alert_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
2+
// See the file LICENSE for licensing terms.
3+
4+
package network
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
11+
"github.com/ava-labs/avalanchego/ids"
12+
"github.com/ava-labs/avalanchego/snow/validators"
13+
)
14+
15+
type fakeValidatorRetriever struct {
16+
result bool
17+
}
18+
19+
func (m *fakeValidatorRetriever) GetValidator(ids.ID, ids.NodeID) (*validators.Validator, bool) {
20+
return nil, m.result
21+
}
22+
23+
type fakeIngressConnectionCounter struct {
24+
res int
25+
}
26+
27+
func (m *fakeIngressConnectionCounter) IngressConnCount() int {
28+
return m.res
29+
}
30+
31+
func TestNoIngressConnAlertHealthCheck(t *testing.T) {
32+
for _, testCase := range []struct {
33+
name string
34+
getValidatorResult bool
35+
ingressConnCountResult int
36+
expectedErr error
37+
expectedResult interface{}
38+
}{
39+
{
40+
name: "not a validator of a primary network",
41+
expectedResult: map[string]interface{}{"ingressConnectionCount": 0, "primary network validator": false},
42+
},
43+
{
44+
name: "a validator of the primary network",
45+
getValidatorResult: true,
46+
expectedResult: map[string]interface{}{
47+
"ingressConnectionCount": 0, "primary network validator": true,
48+
},
49+
expectedErr: ErrNoIngressConnections,
50+
},
51+
{
52+
name: "a validator with ingress connections",
53+
expectedResult: map[string]interface{}{"ingressConnectionCount": 42, "primary network validator": true},
54+
expectedErr: nil,
55+
ingressConnCountResult: 42,
56+
getValidatorResult: true,
57+
},
58+
} {
59+
t.Run(testCase.name, func(t *testing.T) {
60+
nica := &noIngressConnAlert{
61+
selfID: ids.EmptyNodeID,
62+
validators: &fakeValidatorRetriever{result: testCase.getValidatorResult},
63+
ingressConnections: &fakeIngressConnectionCounter{res: testCase.ingressConnCountResult},
64+
}
65+
66+
result, err := nica.checkHealth()
67+
require.Equal(t, testCase.expectedErr, err)
68+
require.Equal(t, testCase.expectedResult, result)
69+
})
70+
}
71+
}

network/peer/config.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,5 @@ type Config struct {
6161
IPSigner *IPSigner
6262

6363
// IngressConnectionCount counts the ingress (to us) connections.
64-
// Needs to be a pointer because it's shared across all peer connections.
65-
IngressConnectionCount *atomic.Uint64
64+
IngressConnectionCount atomic.Uint64
6665
}

0 commit comments

Comments
 (0)