Skip to content
This repository was archived by the owner on Sep 18, 2020. It is now read-only.

Commit a4cf321

Browse files
authored
Merge pull request #55 from euank/agent-bootcycle
agent: wait for operator to reset us on reboot
2 parents 9f7d6cb + 26fec5c commit a4cf321

File tree

3 files changed

+78
-14
lines changed

3 files changed

+78
-14
lines changed

internal/agent/agent.go

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@ type Klocksmith struct {
2828
lc *login1.Conn
2929
}
3030

31+
var (
32+
shouldRebootSelector = fields.Set(map[string]string{
33+
constants.AnnotationOkToReboot: constants.True,
34+
constants.AnnotationRebootNeeded: constants.True,
35+
}).AsSelector()
36+
)
37+
3138
func New(node string) (*Klocksmith, error) {
3239
// set up kubernetes in-cluster client
3340
kc, err := k8sutil.InClusterClient()
@@ -127,6 +134,11 @@ func (k *Klocksmith) Run() error {
127134
if err := k8sutil.SetNodeAnnotations(k.nc, k.node, anno); err != nil {
128135
return err
129136
}
137+
// Since we set 'reboot-needed=false', 'ok-to-reboot' should clear.
138+
// Wait for it to do so, else we might start reboot-looping
139+
if err := k.waitForNotOkToReboot(); err != nil {
140+
return err
141+
}
130142

131143
// watch update engine for status updates
132144
go k.watchUpdateStatus(k.updateStatusCallback)
@@ -235,12 +247,17 @@ func (k *Klocksmith) waitForRebootSignal() error {
235247
return nil
236248
}
237249

250+
// waitForOkToReboot waits for both 'ok-to-reboot' and 'needs-reboot' to be true.
238251
func (k *Klocksmith) waitForOkToReboot() error {
239252
n, err := k.nc.Get(k.node)
240253
if err != nil {
241254
return fmt.Errorf("failed to get self node (%q): %v", k.node, err)
242255
}
243256

257+
if n.Annotations[constants.AnnotationOkToReboot] == constants.True && n.Annotations[constants.AnnotationRebootNeeded] == constants.True {
258+
return nil
259+
}
260+
244261
// XXX: set timeout > 0?
245262
watcher, err := k.nc.Watch(v1api.ListOptions{
246263
FieldSelector: fields.OneTermEqualSelector("metadata.name", n.Name).String(),
@@ -252,7 +269,7 @@ func (k *Klocksmith) waitForOkToReboot() error {
252269

253270
// hopefully 24 hours is enough time between indicating we need a
254271
// reboot and the controller telling us to do it
255-
ev, err := watch.Until(time.Hour*24, watcher, k8sutil.NodeAnnotationCondition(constants.AnnotationOkToReboot, constants.True))
272+
ev, err := watch.Until(time.Hour*24, watcher, k8sutil.NodeAnnotationCondition(shouldRebootSelector))
256273
if err != nil {
257274
return fmt.Errorf("waiting for annotation %q failed: %v", constants.AnnotationOkToReboot, err)
258275
}
@@ -270,6 +287,60 @@ func (k *Klocksmith) waitForOkToReboot() error {
270287
return nil
271288
}
272289

290+
func (k *Klocksmith) waitForNotOkToReboot() error {
291+
n, err := k.nc.Get(k.node)
292+
if err != nil {
293+
return fmt.Errorf("failed to get self node (%q): %v", k.node, err)
294+
}
295+
296+
if n.Annotations[constants.AnnotationOkToReboot] != constants.True {
297+
return nil
298+
}
299+
300+
// XXX: set timeout > 0?
301+
watcher, err := k.nc.Watch(v1api.ListOptions{
302+
FieldSelector: fields.OneTermEqualSelector("metadata.name", n.Name).String(),
303+
ResourceVersion: n.ResourceVersion,
304+
})
305+
if err != nil {
306+
return fmt.Errorf("failed to watch self node (%q): %v", k.node, err)
307+
}
308+
309+
// Within 24 hours of indicating we don't need a reboot we should be given a not-ok.
310+
// If that isn't the case, it likely means the operator isn't running, and
311+
// we'll just crash-loop in that case, and hopefully that will help the user realize something's wrong.
312+
// Use a custom condition function to use the more correct 'OkToReboot !=
313+
// true' vs '== False'; due to the operator matching on '== True', and not
314+
// going out of its way to convert '' => 'False', checking the exact inverse
315+
// of what the operator checks is the correct thing to do.
316+
ev, err := watch.Until(time.Hour*24, watcher, watch.ConditionFunc(func(event watch.Event) (bool, error) {
317+
switch event.Type {
318+
case watch.Error:
319+
return false, fmt.Errorf("error watching node: %v", event.Object)
320+
case watch.Deleted:
321+
return false, fmt.Errorf("our node was deleted while we were waiting for ready")
322+
}
323+
324+
no := event.Object.(*v1api.Node)
325+
if no.Annotations[constants.AnnotationOkToReboot] != constants.True {
326+
return true, nil
327+
}
328+
return false, nil
329+
}))
330+
if err != nil {
331+
return fmt.Errorf("waiting for annotation %q failed: %v", constants.AnnotationOkToReboot, err)
332+
}
333+
334+
// sanity check
335+
no := ev.Object.(*v1api.Node)
336+
337+
if no.Annotations[constants.AnnotationOkToReboot] == constants.True {
338+
panic("event did not contain annotation expected")
339+
}
340+
341+
return nil
342+
}
343+
273344
func (k *Klocksmith) getPodsForDeletion() ([]v1api.Pod, error) {
274345
pods, err := drain.GetPodsForDeletion(k.kc, k.node)
275346
if err != nil {

internal/k8sutil/metadata.go

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/golang/glog"
1111
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
1212
v1api "k8s.io/client-go/pkg/api/v1"
13+
"k8s.io/client-go/pkg/fields"
1314
"k8s.io/client-go/pkg/watch"
1415
)
1516

@@ -21,16 +22,12 @@ const (
2122

2223
// NodeAnnotationCondition returns a condition function that succeeds when a
2324
// node being watched has an annotation of key equal to value.
24-
func NodeAnnotationCondition(key, value string) watch.ConditionFunc {
25+
func NodeAnnotationCondition(selector fields.Selector) watch.ConditionFunc {
2526
return func(event watch.Event) (bool, error) {
2627
switch event.Type {
2728
case watch.Modified:
2829
node := event.Object.(*v1api.Node)
29-
if node.Annotations[key] == value {
30-
return true, nil
31-
}
32-
33-
return false, nil
30+
return selector.Matches(fields.Set(node.Annotations)), nil
3431
}
3532

3633
return false, fmt.Errorf("unhandled watch case for %#v", event)

internal/operator/operator.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -268,15 +268,11 @@ func (k *Kontroller) Run(ctx context.Context) error {
268268
func (k *Kontroller) markNodeRebootable(n *v1api.Node) {
269269
glog.V(4).Infof("marking %s ok to reboot", n.Name)
270270
if err := k8sutil.UpdateNodeRetry(k.nc, n.Name, func(node *v1api.Node) {
271-
// TODO; reuse selector if I can figure out how to apply it to a single node
272-
if node.Annotations[constants.AnnotationOkToReboot] == constants.True {
273-
glog.Warningf("Node %v became rebootable while we were trying to mark it so", node.Name)
274-
return
275-
}
276-
if node.Annotations[constants.AnnotationRebootNeeded] != constants.True {
277-
glog.Warningf("Node %v became not-ok-for-reboot while trying to mark it ready", node.Name)
271+
if !wantsRebootSelector.Matches(fields.Set(node.Annotations)) {
272+
glog.Warningf("Node %v no longer wanted to a reboot while we were trying to mark it so: %v", node.Name, node.Annotations)
278273
return
279274
}
275+
280276
node.Annotations[constants.AnnotationOkToReboot] = constants.True
281277
}); err != nil {
282278
glog.Infof("Failed to set annotation %q on node %q: %v", constants.AnnotationOkToReboot, n.Name, err)

0 commit comments

Comments
 (0)