-
Notifications
You must be signed in to change notification settings - Fork 84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix data race in drain scheduler #86
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ import ( | |
"testing" | ||
"time" | ||
|
||
"github.com/pkg/errors" | ||
"go.uber.org/zap" | ||
v1 "k8s.io/api/core/v1" | ||
meta "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
|
@@ -55,7 +56,7 @@ func TestDrainSchedules_Schedule(t *testing.T) { | |
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
// Check that node is not yet scheduled for drain | ||
hasSchedule, _, _ := scheduler.HasSchedule(tt.node.Name) | ||
hasSchedule, _ := scheduler.HasSchedule(tt.node.Name) | ||
if hasSchedule { | ||
t.Errorf("Node %v should not have any schedule", tt.node.Name) | ||
} | ||
|
@@ -66,7 +67,7 @@ func TestDrainSchedules_Schedule(t *testing.T) { | |
return | ||
} | ||
// Check that node is scheduled for drain | ||
hasSchedule, _, _ = scheduler.HasSchedule(tt.node.Name) | ||
hasSchedule, _ = scheduler.HasSchedule(tt.node.Name) | ||
if !hasSchedule { | ||
t.Errorf("Missing schedule record for node %v", tt.node.Name) | ||
} | ||
|
@@ -77,10 +78,52 @@ func TestDrainSchedules_Schedule(t *testing.T) { | |
// Deleting schedule | ||
scheduler.DeleteSchedule(tt.node.Name) | ||
// Check that node is no more scheduled for drain | ||
hasSchedule, _, _ = scheduler.HasSchedule(tt.node.Name) | ||
hasSchedule, _ = scheduler.HasSchedule(tt.node.Name) | ||
if hasSchedule { | ||
t.Errorf("Node %v should not been scheduled anymore", tt.node.Name) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
type failDrainer struct { | ||
NoopCordonDrainer | ||
} | ||
|
||
func (d *failDrainer) Drain(n *v1.Node) error { return errors.New("myerr") } | ||
|
||
// Test to ensure there are no races when calling HasSchedule while the | ||
// scheduler is draining a node. | ||
func TestDrainSchedules_HasSchedule_Polling(t *testing.T) { | ||
scheduler := NewDrainSchedules(&failDrainer{}, &record.FakeRecorder{}, 0, zap.NewNop()) | ||
node := &v1.Node{ObjectMeta: meta.ObjectMeta{Name: nodeName}} | ||
|
||
when, err := scheduler.Schedule(node) | ||
if err != nil { | ||
t.Fatalf("DrainSchedules.Schedule() error = %v", err) | ||
} | ||
|
||
timeout := time.After(time.Until(when) + time.Minute) | ||
for { | ||
hasSchedule, failed := scheduler.HasSchedule(node.Name) | ||
if !hasSchedule { | ||
t.Fatalf("Missing schedule record for node %v", node.Name) | ||
} | ||
if failed { | ||
// Having `failed` as true is the expected result here since this | ||
// test is using the `failDrainer{}` drainer. It means that | ||
// HasSchedule was successfully called during or after the draining | ||
// function was scheduled and the test can complete successfully. | ||
break | ||
} | ||
select { | ||
case <-time.After(time.Second): | ||
// Small sleep to ensure we're not running the CPU hot while | ||
// polling `HasSchedule`. | ||
case <-timeout: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When do we hit the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Line 114 |
||
// This timeout prevents this test from running forever in case | ||
// some bug caused the draining function never to be scheduled. | ||
t.Fatalf("timeout waiting for HasSchedule to fail") | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So - we want
failed
when the test passes (counter-intuitive on the first read) because we're inducing an error with thefailDrainer
. The reason we're doing this is because we specifically want to exercise the atomicsetFailed
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd suggest just adding a brief comment before the break statement.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the main reason for adding a test case here is to prevent future races when changing
HasSchedule
. The goal of this test is to ensure we're callingHasSchedule
in parallel with the function passed totime.AfterFunc
being executed. Using thefailed
as a condition was an easy way to ensure that the test has reached its goal of having the timer-triggered function running.I'll add a comment here to better explain this.