Skip to content

Commit 3ca0c60

Browse files
author
Flavio Crisciani
committed
NetworkDB performance improvements
CPU profile showed how the mRandomNodes was taking ~30% of the CPU time of the gossip cycle. Changing the data structure from a []string to a map allowed to improve the performance of all the functions that were using it. Following the comparison of the benchmarks before and after the change AddNodeNetwork: benchmark old ns/op new ns/op delta BenchmarkAddNetworkNode-4 1859 181 -90.26% benchmark old allocs new allocs delta BenchmarkAddNetworkNode-4 1 1 +0.00% benchmark old bytes new bytes delta BenchmarkAddNetworkNode-4 15 15 +0.00% DelNodeNetwork: benchmark old ns/op new ns/op delta BenchmarkDeleteNetworkNode-4 71.0 75.8 +6.76% benchmark old allocs new allocs delta BenchmarkDeleteNetworkNode-4 0 0 +0.00% benchmark old bytes new bytes delta BenchmarkDeleteNetworkNode-4 3 7 +133.33% RandomNode: benchmark old ns/op new ns/op delta BenchmarkRandomNodes-4 1830 172 -90.60% benchmark old allocs new allocs delta BenchmarkRandomNodes-4 16 1 -93.75% benchmark old bytes new bytes delta BenchmarkRandomNodes-4 535 48 -91.03% Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
1 parent 83862f4 commit 3ca0c60

File tree

4 files changed

+158
-98
lines changed

4 files changed

+158
-98
lines changed

networkdb/benchmark_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
package networkdb
2+
3+
import (
4+
"strconv"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestAddNetworkNode(t *testing.T) {
11+
n := &NetworkDB{config: &Config{NodeID: "node-0"}, networkNodes: make(map[string]map[string]struct{})}
12+
for i := 0; i < 2000; i++ {
13+
n.addNetworkNode("network", "node-"+strconv.Itoa(i%1000))
14+
}
15+
assert.Equal(t, 1000, len(n.networkNodes["network"]))
16+
for i := 0; i < 2000; i++ {
17+
n.addNetworkNode("network"+strconv.Itoa(i%1000), "node-"+strconv.Itoa(i))
18+
}
19+
for i := 0; i < 1000; i++ {
20+
assert.Equal(t, 2, len(n.networkNodes["network"+strconv.Itoa(i%1000)]))
21+
}
22+
}
23+
24+
func TestDeleteNetworkNode(t *testing.T) {
25+
n := &NetworkDB{config: &Config{NodeID: "node-0"}, networkNodes: make(map[string]map[string]struct{})}
26+
for i := 0; i < 1000; i++ {
27+
n.addNetworkNode("network", "node-"+strconv.Itoa(i%1000))
28+
}
29+
assert.Equal(t, 1000, len(n.networkNodes["network"]))
30+
for i := 0; i < 2000; i++ {
31+
n.deleteNetworkNode("network", "node-"+strconv.Itoa(i%1000))
32+
}
33+
assert.Equal(t, 0, len(n.networkNodes["network"]))
34+
for i := 0; i < 2000; i++ {
35+
n.addNetworkNode("network"+strconv.Itoa(i%1000), "node-"+strconv.Itoa(i))
36+
}
37+
for i := 0; i < 1000; i++ {
38+
assert.Equal(t, 2, len(n.networkNodes["network"+strconv.Itoa(i%1000)]))
39+
n.deleteNetworkNode("network"+strconv.Itoa(i%1000), "node-"+strconv.Itoa(i))
40+
assert.Equal(t, 1, len(n.networkNodes["network"+strconv.Itoa(i%1000)]))
41+
}
42+
for i := 1000; i < 2000; i++ {
43+
n.deleteNetworkNode("network"+strconv.Itoa(i%1000), "node-"+strconv.Itoa(i))
44+
assert.Equal(t, 0, len(n.networkNodes["network"+strconv.Itoa(i%1000)]))
45+
}
46+
}
47+
48+
func TestRandomNodes(t *testing.T) {
49+
n := &NetworkDB{config: &Config{NodeID: "node-0"}}
50+
nodes := make(map[string]struct{})
51+
for i := 0; i < 1000; i++ {
52+
nodes["node-"+strconv.Itoa(i)] = struct{}{}
53+
}
54+
nodeHit := make(map[string]int)
55+
for i := 0; i < 5000; i++ {
56+
chosen := n.mRandomNodes(3, nodes)
57+
for _, c := range chosen {
58+
if c == "node-0" {
59+
t.Fatal("should never hit itself")
60+
}
61+
nodeHit[c]++
62+
}
63+
}
64+
65+
// check results
66+
var min, max int
67+
for node, hit := range nodeHit {
68+
if min == 0 {
69+
min = hit
70+
}
71+
if hit == 0 && node != "node-0" {
72+
t.Fatal("node never hit")
73+
}
74+
if hit > max {
75+
max = hit
76+
}
77+
if hit < min {
78+
min = hit
79+
}
80+
}
81+
assert.NotEqual(t, 0, min)
82+
}
83+
84+
func BenchmarkAddNetworkNode(b *testing.B) {
85+
n := &NetworkDB{config: &Config{NodeID: "node-0"}, networkNodes: make(map[string]map[string]struct{})}
86+
for i := 0; i < b.N; i++ {
87+
n.addNetworkNode("network", "node-"+strconv.Itoa(i%1000))
88+
}
89+
}
90+
91+
func BenchmarkDeleteNetworkNode(b *testing.B) {
92+
n := &NetworkDB{config: &Config{NodeID: "node-0"}, networkNodes: make(map[string]map[string]struct{})}
93+
for i := 0; i < 1000; i++ {
94+
n.addNetworkNode("network", "node-"+strconv.Itoa(i))
95+
}
96+
b.ResetTimer()
97+
for i := 0; i < b.N; i++ {
98+
n.deleteNetworkNode("network", "node-"+strconv.Itoa(i))
99+
}
100+
}
101+
102+
func BenchmarkRandomNodes(b *testing.B) {
103+
n := &NetworkDB{config: &Config{NodeID: "node-0"}}
104+
nodes := make(map[string]struct{})
105+
for i := 0; i < 1000; i++ {
106+
nodes["node-"+strconv.Itoa(i)] = struct{}{}
107+
}
108+
b.ResetTimer()
109+
for i := 0; i < b.N; i++ {
110+
n.mRandomNodes(3, nodes)
111+
}
112+
}

networkdb/cluster.go

Lines changed: 15 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,9 @@ package networkdb
22

33
import (
44
"bytes"
5-
"crypto/rand"
65
"encoding/hex"
76
"fmt"
87
"log"
9-
"math/big"
108
rnd "math/rand"
119
"net"
1210
"strings"
@@ -283,7 +281,7 @@ func (nDB *NetworkDB) reconnectNode() {
283281
}
284282
nDB.RUnlock()
285283

286-
node := nodes[randomOffset(len(nodes))]
284+
node := nodes[rnd.Int31()%int32(len(nodes))]
287285
addr := net.UDPAddr{IP: node.Addr, Port: int(node.Port)}
288286

289287
if _, err := nDB.memberlist.Join([]string{addr.String()}); err != nil {
@@ -295,7 +293,7 @@ func (nDB *NetworkDB) reconnectNode() {
295293
}
296294

297295
logrus.Debugf("Initiating bulk sync with node %s after reconnect", node.Name)
298-
nDB.bulkSync([]string{node.Name}, true)
296+
nDB.bulkSync([]string{node.Name})
299297
}
300298

301299
// For timing the entry deletion in the repaer APIs that doesn't use monotonic clock
@@ -378,7 +376,7 @@ func (nDB *NetworkDB) gossip() {
378376
nDB.RLock()
379377
thisNodeNetworks := nDB.networks[nDB.config.NodeID]
380378
for nid := range thisNodeNetworks {
381-
networkNodes[nid] = nDB.networkNodes[nid]
379+
networkNodes[nid] = nDB.mRandomNodes(3, nDB.networkNodes[nid])
382380
}
383381
printStats := time.Since(nDB.lastStatsTimestamp) >= nDB.config.StatsPrintPeriod
384382
printHealth := time.Since(nDB.lastHealthTimestamp) >= nDB.config.HealthPrintPeriod
@@ -393,7 +391,6 @@ func (nDB *NetworkDB) gossip() {
393391
}
394392

395393
for nid, nodes := range networkNodes {
396-
mNodes := nDB.mRandomNodes(3, nodes)
397394
bytesAvail := nDB.config.PacketBufferSize - compoundHeaderOverhead
398395

399396
nDB.RLock()
@@ -432,7 +429,7 @@ func (nDB *NetworkDB) gossip() {
432429
// Create a compound message
433430
compound := makeCompoundMessage(msgs)
434431

435-
for _, node := range mNodes {
432+
for _, node := range nodes {
436433
nDB.RLock()
437434
mnode := nDB.nodes[node]
438435
nDB.RUnlock()
@@ -473,15 +470,15 @@ func (nDB *NetworkDB) bulkSyncTables() {
473470
networks = networks[1:]
474471

475472
nDB.RLock()
476-
nodes := nDB.networkNodes[nid]
473+
nodes := nDB.mRandomNodes(2, nDB.networkNodes[nid])
477474
nDB.RUnlock()
478475

479476
// No peer nodes on this network. Move on.
480477
if len(nodes) == 0 {
481478
continue
482479
}
483480

484-
completed, err := nDB.bulkSync(nodes, false)
481+
completed, err := nDB.bulkSync(nodes)
485482
if err != nil {
486483
logrus.Errorf("periodic bulk sync failure for network %s: %v", nid, err)
487484
continue
@@ -508,13 +505,7 @@ func (nDB *NetworkDB) bulkSyncTables() {
508505
}
509506
}
510507

511-
func (nDB *NetworkDB) bulkSync(nodes []string, all bool) ([]string, error) {
512-
if !all {
513-
// Get 2 random nodes. 2nd node will be tried if the bulk sync to
514-
// 1st node fails.
515-
nodes = nDB.mRandomNodes(2, nodes)
516-
}
517-
508+
func (nDB *NetworkDB) bulkSync(nodes []string) ([]string, error) {
518509
if len(nodes) == 0 {
519510
return nil, nil
520511
}
@@ -527,12 +518,7 @@ func (nDB *NetworkDB) bulkSync(nodes []string, all bool) ([]string, error) {
527518
}
528519
logrus.Debugf("%v(%v): Initiating bulk sync with node %v", nDB.config.Hostname, nDB.config.NodeID, node)
529520
networks = nDB.findCommonNetworks(node)
530-
err = nDB.bulkSyncNode(networks, node, true)
531-
// if its periodic bulksync stop after the first successful sync
532-
if !all && err == nil {
533-
break
534-
}
535-
if err != nil {
521+
if err = nDB.bulkSyncNode(networks, node, true); err != nil {
536522
err = fmt.Errorf("bulk sync to node %s failed: %v", node, err)
537523
logrus.Warn(err.Error())
538524
}
@@ -649,48 +635,22 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
649635
return nil
650636
}
651637

652-
// Returns a random offset between 0 and n
653-
func randomOffset(n int) int {
654-
if n == 0 {
655-
return 0
656-
}
657-
658-
val, err := rand.Int(rand.Reader, big.NewInt(int64(n)))
659-
if err != nil {
660-
logrus.Errorf("Failed to get a random offset: %v", err)
661-
return 0
662-
}
663-
664-
return int(val.Int64())
665-
}
666-
667638
// mRandomNodes is used to select up to m random nodes. It is possible
668639
// that less than m nodes are returned.
669-
func (nDB *NetworkDB) mRandomNodes(m int, nodes []string) []string {
670-
n := len(nodes)
640+
func (nDB *NetworkDB) mRandomNodes(m int, nodes map[string]struct{}) []string {
671641
mNodes := make([]string, 0, m)
672-
OUTER:
673-
// Probe up to 3*n times, with large n this is not necessary
674-
// since k << n, but with small n we want search to be
675-
// exhaustive
676-
for i := 0; i < 3*n && len(mNodes) < m; i++ {
677-
// Get random node
678-
idx := randomOffset(n)
679-
node := nodes[idx]
680642

643+
var i int
644+
for node := range nodes {
681645
if node == nDB.config.NodeID {
682646
continue
683647
}
684648

685-
// Check if we have this node already
686-
for j := 0; j < len(mNodes); j++ {
687-
if node == mNodes[j] {
688-
continue OUTER
689-
}
690-
}
691-
692-
// Append the node
693649
mNodes = append(mNodes, node)
650+
i++
651+
if i == m {
652+
break
653+
}
694654
}
695655

696656
return mNodes

networkdb/delegate.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -154,18 +154,14 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
154154
// Ignore the table events for networks that are in the process of going away
155155
nDB.RLock()
156156
networks := nDB.networks[nDB.config.NodeID]
157-
network, ok := networks[tEvent.NetworkID]
157+
network, okNetwork := networks[tEvent.NetworkID]
158158
// Check if the owner of the event is still part of the network
159-
nodes := nDB.networkNodes[tEvent.NetworkID]
160159
var nodePresent bool
161-
for _, node := range nodes {
162-
if node == tEvent.NodeName {
163-
nodePresent = true
164-
break
165-
}
160+
if nodes, ok := nDB.networkNodes[tEvent.NetworkID]; ok {
161+
_, nodePresent = nodes[tEvent.NodeName]
166162
}
167163
nDB.RUnlock()
168-
if !ok || network.leaving || !nodePresent {
164+
if !okNetwork || network.leaving || !nodePresent {
169165
// I'm out of the network OR the event owner is not anymore part of the network so do not propagate
170166
return false
171167
}

0 commit comments

Comments
 (0)