Skip to content

Commit

Permalink
Add test case to expose race conditions in drain scheduler
Browse files Browse the repository at this point in the history
The new test helps exposing a current race condition while also helping
preventing similar races in calls `HasSchedule` interacting with
in-progress draining operations.

The race condition also happens during normal draino operation because
the event handler may call `HasSchedule` multiple times for an already
scheduled node.

Data race sample:

```
$ go test -race
Now: 2020-08-20T16:22:10-03:00
==================
WARNING: DATA RACE
Write at 0x00c0000c8098 by goroutine 16:
  github.com/planetlabs/draino/internal/kubernetes.(*DrainSchedules).newSchedule.func1()
      /Users/cezarsa/code/draino/internal/kubernetes/drainSchedule.go:125 +0x235

Previous read at 0x00c0000c8098 by goroutine 12:
  github.com/planetlabs/draino/internal/kubernetes.(*DrainSchedules).HasSchedule()
      /Users/cezarsa/code/draino/internal/kubernetes/drainSchedule.go:59 +0x142
  github.com/planetlabs/draino/internal/kubernetes.TestDrainSchedules_HasSchedule_Polling()
      /Users/cezarsa/code/draino/internal/kubernetes/drainSchedule_test.go:108 +0x54a
  testing.tRunner()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:991 +0x1eb

Goroutine 16 (running) created at:
  time.goFunc()
      /usr/local/Cellar/go/1.14.5/libexec/src/time/sleep.go:168 +0x51

Goroutine 12 (running) created at:
  testing.(*T).Run()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1042 +0x660
  testing.runTests.func1()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1284 +0xa6
  testing.tRunner()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:991 +0x1eb
  testing.runTests()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1282 +0x527
  testing.(*M).Run()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1199 +0x2ff
  main.main()
      _testmain.go:66 +0x223
==================
==================
WARNING: DATA RACE
Write at 0x00c0000c8099 by goroutine 16:
  github.com/planetlabs/draino/internal/kubernetes.(*DrainSchedules).newSchedule.func1()
      /Users/cezarsa/code/draino/internal/kubernetes/drainSchedule.go:131 +0x65b

Previous read at 0x00c0000c8099 by goroutine 12:
  github.com/planetlabs/draino/internal/kubernetes.(*DrainSchedules).HasSchedule()
      /Users/cezarsa/code/draino/internal/kubernetes/drainSchedule.go:59 +0x15f
  github.com/planetlabs/draino/internal/kubernetes.TestDrainSchedules_HasSchedule_Polling()
      /Users/cezarsa/code/draino/internal/kubernetes/drainSchedule_test.go:108 +0x54a
  testing.tRunner()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:991 +0x1eb

Goroutine 16 (running) created at:
  time.goFunc()
      /usr/local/Cellar/go/1.14.5/libexec/src/time/sleep.go:168 +0x51

Goroutine 12 (running) created at:
  testing.(*T).Run()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1042 +0x660
  testing.runTests.func1()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1284 +0xa6
  testing.tRunner()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:991 +0x1eb
  testing.runTests()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1282 +0x527
  testing.(*M).Run()
      /usr/local/Cellar/go/1.14.5/libexec/src/testing/testing.go:1199 +0x2ff
  main.main()
      _testmain.go:66 +0x223
==================
--- FAIL: TestDrainSchedules_HasSchedule_Polling (11.04s)
    testing.go:906: race detected during execution of test
FAIL
exit status 1
FAIL	github.com/planetlabs/draino/internal/kubernetes	13.829s
```
  • Loading branch information
cezarsa committed Aug 20, 2020
1 parent 14fd977 commit 8a24920
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions internal/kubernetes/drainSchedule_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"testing"
"time"

"github.com/pkg/errors"
"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -84,3 +85,45 @@ func TestDrainSchedules_Schedule(t *testing.T) {
})
}
}

type failDrainer struct {
NoopCordonDrainer
}

func (d *failDrainer) Drain(n *v1.Node) error { return errors.New("myerr") }

// Test to ensure there are no races when calling HasSchedule while the
// scheduler is draining a node.
func TestDrainSchedules_HasSchedule_Polling(t *testing.T) {
scheduler := NewDrainSchedules(&failDrainer{}, &record.FakeRecorder{}, 0, zap.NewNop())
node := &v1.Node{ObjectMeta: meta.ObjectMeta{Name: nodeName}}

when, err := scheduler.Schedule(node)
if err != nil {
t.Fatalf("DrainSchedules.Schedule() error = %v", err)
}

timeout := time.After(time.Until(when) + time.Minute)
for {
hasSchedule, _, failed := scheduler.HasSchedule(node.Name)
if !hasSchedule {
t.Fatalf("Missing schedule record for node %v", node.Name)
}
if failed {
// Having `failed` as true is the expected result here since this
// test is using the `failDrainer{}` drainer. It means that
// HasSchedule was successfully called during or after the draining
// function was scheduled and the test can complete successfully.
break
}
select {
case <-time.After(time.Second):
// Small sleep to ensure we're not running the CPU hot while
// polling `HasSchedule`.
case <-timeout:
// This timeout prevents this test from running forever in case
// some bug caused the draining function never to be scheduled.
t.Fatalf("timeout waiting for HasSchedule to fail")
}
}
}

0 comments on commit 8a24920

Please sign in to comment.