Skip to content

Commit

Permalink
Implement NodeKiller -- a util to simulate node failures.
Browse files Browse the repository at this point in the history
  • Loading branch information
mborsz committed Nov 29, 2018
1 parent 973b5d2 commit 9e493e1
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 1 deletion.
11 changes: 10 additions & 1 deletion test/e2e/e2e.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ import (
)

var (
cloudConfig = &framework.TestContext.CloudConfig
cloudConfig = &framework.TestContext.CloudConfig
nodeKillerStopCh = make(chan struct{})
)

// There are certain operations we only want to run once per overall test invocation
Expand Down Expand Up @@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
// Reference common test to make the import valid.
commontest.CurrentSuite = commontest.E2E

if framework.TestContext.NodeKiller.Enabled {
nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
nodeKillerStopCh = make(chan struct{})
go nodeKiller.Run(nodeKillerStopCh)
}
return nil

}, func(data []byte) {
Expand All @@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() {
framework.Logf("Error gathering metrics: %v", err)
}
}
if framework.TestContext.NodeKiller.Enabled {
close(nodeKillerStopCh)
}
})

func gatherTestSuiteMetrics() error {
Expand Down
61 changes: 61 additions & 0 deletions test/e2e/framework/nodes_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ import (
"path"
"path/filepath"
"strings"
"sync"
"time"

"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
)

func EtcdUpgrade(target_storage, target_version string) error {
Expand Down Expand Up @@ -331,3 +334,61 @@ func waitForSSHTunnels() {
return err == nil, nil
})
}

// NodeKiller is a utility to simulate node failures.
// It periodically picks a subset of nodes, stops docker and kubelet on them
// over SSH, and reboots them after a configured downtime.
type NodeKiller struct {
// config holds the failure ratio, interval, jitter and downtime settings.
config NodeKillerConfig
// client is used to list the ready schedulable nodes to pick victims from.
client clientset.Interface
// provider is the cloud provider name passed to IssueSSHCommand.
provider string
}

// NewNodeKiller builds a NodeKiller from the given configuration, API client
// and cloud provider name. It does not start killing nodes; call Run for that.
func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
	killer := &NodeKiller{
		config:   config,
		client:   client,
		provider: provider,
	}
	return killer
}

// Run repeatedly picks a set of nodes and kills them, until stopCh is closed.
// The pause between rounds is derived from the configured Interval and
// JitterFactor.
func (k *NodeKiller) Run(stopCh <-chan struct{}) {
	// One round: choose the victims, then take them down and repair them.
	killRound := func() {
		victims := k.pickNodes()
		k.kill(victims)
	}

	// Passed as JitterUntil's "sliding" argument, so the period is measured
	// from the end of a round rather than from its start.
	const sliding = true
	wait.JitterUntil(killRound, k.config.Interval, k.config.JitterFactor, sliding, stopCh)
}

// pickNodes returns the nodes to kill in the current round.
//
// It lists the ready schedulable nodes, reorders them with shuffleNodes and
// returns the first FailureRatio * len(nodes) of them. The count is truncated
// toward zero and clamped to [0, len(nodes)], so a ratio that is negative or
// greater than 1 cannot cause an out-of-range slice.
func (k *NodeKiller) pickNodes() []v1.Node {
	nodes := GetReadySchedulableNodesOrDie(k.client)
	numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))
	// A negative FailureRatio would otherwise produce a negative slice bound
	// below and panic; treat it as "kill nothing".
	if numNodes < 0 {
		numNodes = 0
	}
	shuffledNodes := shuffleNodes(nodes.Items)
	if len(shuffledNodes) > numNodes {
		return shuffledNodes[:numNodes]
	}
	return shuffledNodes
}

// kill simulates a failure on every given node concurrently and blocks until
// all of them have been processed. For each node it stops docker and kubelet
// over SSH, waits for the configured SimulatedDowntime, and then reboots the
// node to repair it. Errors are logged; a node that could not be stopped is
// not rebooted.
func (k *NodeKiller) kill(nodes []v1.Node) {
	var pending sync.WaitGroup
	for i := range nodes {
		pending.Add(1)
		// Each goroutine receives its own copy of the node.
		go func(target v1.Node) {
			defer pending.Done()

			Logf("Stopping docker and kubelet on %q to simulate failure", target.Name)
			if stopErr := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &target); stopErr != nil {
				Logf("ERROR while stopping node %q: %v", target.Name, stopErr)
				return
			}

			// Keep the node down for the simulated outage window.
			time.Sleep(k.config.SimulatedDowntime)

			Logf("Rebooting %q to repair the node", target.Name)
			if rebootErr := IssueSSHCommand("sudo reboot", k.provider, &target); rebootErr != nil {
				Logf("ERROR while rebooting node %q: %v", target.Name, rebootErr)
			}
		}(nodes[i])
	}
	pending.Wait()
}
27 changes: 27 additions & 0 deletions test/e2e/framework/test_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,26 @@ type TestContextType struct {

// The DNS Domain of the cluster.
ClusterDNSDomain string

// The configuration of NodeKiller.
NodeKiller NodeKillerConfig
}

// NodeKillerConfig describes configuration of NodeKiller -- a utility to
// simulate node failures.
type NodeKillerConfig struct {
// Enabled determines whether NodeKiller should do anything at all.
// All other options below are ignored if Enabled = false.
Enabled bool
// FailureRatio is the fraction (e.g. 0.01 means 1%, not a percentage value)
// of ready schedulable nodes that may fail simultaneously. The resulting
// node count is truncated toward zero.
FailureRatio float64
// Interval is the base time between rounds of node failures.
Interval time.Duration
// JitterFactor is the factor used to jitter node failures.
// The wait between rounds is chosen from
// [Interval, Interval * (1.0 + JitterFactor)].
JitterFactor float64
// SimulatedDowntime is the duration between a node being killed (docker and
// kubelet stopped) and the reboot that repairs it.
SimulatedDowntime time.Duration
}

// NodeTestContextType is part of TestContextType, it is shared by all node e2e test.
Expand Down Expand Up @@ -281,6 +301,13 @@ func RegisterClusterFlags() {
flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.")
flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.")
flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to Cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.")

nodeKiller := &TestContext.NodeKiller
flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Percentage of nodes to be killed")
flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation")
}

// Register flags specific to the node e2e test suite.
Expand Down

0 comments on commit 9e493e1

Please sign in to comment.