@@ -28,6 +28,13 @@ type Klocksmith struct {
2828 lc * login1.Conn
2929}
3030
31+ var (
32+ shouldRebootSelector = fields .Set (map [string ]string {
33+ constants .AnnotationOkToReboot : constants .True ,
34+ constants .AnnotationRebootNeeded : constants .True ,
35+ }).AsSelector ()
36+ )
37+
3138func New (node string ) (* Klocksmith , error ) {
3239 // set up kubernetes in-cluster client
3340 kc , err := k8sutil .InClusterClient ()
@@ -127,6 +134,11 @@ func (k *Klocksmith) Run() error {
127134 if err := k8sutil .SetNodeAnnotations (k .nc , k .node , anno ); err != nil {
128135 return err
129136 }
137+ // Since we set 'reboot-needed=false', 'ok-to-reboot' should clear.
138+ // Wait for it to do so, else we might start reboot-looping
139+ if err := k .waitForNotOkToReboot (); err != nil {
140+ return err
141+ }
130142
131143 // watch update engine for status updates
132144 go k .watchUpdateStatus (k .updateStatusCallback )
@@ -235,12 +247,17 @@ func (k *Klocksmith) waitForRebootSignal() error {
235247 return nil
236248}
237249
250+ // waitForOkToReboot waits for both the 'ok-to-reboot' and 'reboot-needed' annotations to be true.
238251func (k * Klocksmith ) waitForOkToReboot () error {
239252 n , err := k .nc .Get (k .node )
240253 if err != nil {
241254 return fmt .Errorf ("failed to get self node (%q): %v" , k .node , err )
242255 }
243256
257+ if n .Annotations [constants .AnnotationOkToReboot ] == constants .True && n .Annotations [constants .AnnotationRebootNeeded ] == constants .True {
258+ return nil
259+ }
260+
244261 // XXX: set timeout > 0?
245262 watcher , err := k .nc .Watch (v1api.ListOptions {
246263 FieldSelector : fields .OneTermEqualSelector ("metadata.name" , n .Name ).String (),
@@ -252,7 +269,7 @@ func (k *Klocksmith) waitForOkToReboot() error {
252269
253270 // hopefully 24 hours is enough time between indicating we need a
254271 // reboot and the controller telling us to do it
255- ev , err := watch .Until (time .Hour * 24 , watcher , k8sutil .NodeAnnotationCondition (constants . AnnotationOkToReboot , constants . True ))
272+ ev , err := watch .Until (time .Hour * 24 , watcher , k8sutil .NodeAnnotationCondition (shouldRebootSelector ))
256273 if err != nil {
257274 return fmt .Errorf ("waiting for annotation %q failed: %v" , constants .AnnotationOkToReboot , err )
258275 }
@@ -270,6 +287,60 @@ func (k *Klocksmith) waitForOkToReboot() error {
270287 return nil
271288}
272289
290+ func (k * Klocksmith ) waitForNotOkToReboot () error {
291+ n , err := k .nc .Get (k .node )
292+ if err != nil {
293+ return fmt .Errorf ("failed to get self node (%q): %v" , k .node , err )
294+ }
295+
296+ if n .Annotations [constants .AnnotationOkToReboot ] != constants .True {
297+ return nil
298+ }
299+
300+ // XXX: set timeout > 0?
301+ watcher , err := k .nc .Watch (v1api.ListOptions {
302+ FieldSelector : fields .OneTermEqualSelector ("metadata.name" , n .Name ).String (),
303+ ResourceVersion : n .ResourceVersion ,
304+ })
305+ if err != nil {
306+ return fmt .Errorf ("failed to watch self node (%q): %v" , k .node , err )
307+ }
308+
309+ // Within 24 hours of indicating we don't need a reboot we should be given a not-ok.
310+ // If that isn't the case, it likely means the operator isn't running, and
311+ // we'll just crash-loop in that case, and hopefully that will help the user realize something's wrong.
312+ // Use a custom condition function to use the more correct 'OkToReboot !=
313+ // true' vs '== False'; due to the operator matching on '== True', and not
314+ // going out of its way to convert '' => 'False', checking the exact inverse
315+ // of what the operator checks is the correct thing to do.
316+ ev , err := watch .Until (time .Hour * 24 , watcher , watch .ConditionFunc (func (event watch.Event ) (bool , error ) {
317+ switch event .Type {
318+ case watch .Error :
319+ return false , fmt .Errorf ("error watching node: %v" , event .Object )
320+ case watch .Deleted :
321+ return false , fmt .Errorf ("our node was deleted while we were waiting for ready" )
322+ }
323+
324+ no := event .Object .(* v1api.Node )
325+ if no .Annotations [constants .AnnotationOkToReboot ] != constants .True {
326+ return true , nil
327+ }
328+ return false , nil
329+ }))
330+ if err != nil {
331+ return fmt .Errorf ("waiting for annotation %q failed: %v" , constants .AnnotationOkToReboot , err )
332+ }
333+
334+ // sanity check
335+ no := ev .Object .(* v1api.Node )
336+
337+ if no .Annotations [constants .AnnotationOkToReboot ] == constants .True {
338+ panic ("event did not contain annotation expected" )
339+ }
340+
341+ return nil
342+ }
343+
273344func (k * Klocksmith ) getPodsForDeletion () ([]v1api.Pod , error ) {
274345 pods , err := drain .GetPodsForDeletion (k .kc , k .node )
275346 if err != nil {
0 commit comments