@@ -95,8 +95,8 @@ func (provider CloudProvider) GetName() string {
 }
 
 // getStackName finds the name of a stack matching a given ID.
-func (provider *CloudProvider) getStackName(stackID string) (string, error) {
-	stack, err := stacks.Find(context.TODO(), provider.Heat, stackID).Extract()
+func (provider *CloudProvider) getStackName(ctx context.Context, stackID string) (string, error) {
+	stack, err := stacks.Find(ctx, provider.Heat, stackID).Extract()
 	if err != nil {
 		return "", err
 	}
@@ -108,14 +108,14 @@ func (provider *CloudProvider) getStackName(stackID string) (string, error) {
 // masters and minions(workers). The key in the map is the server/instance ID
 // in Nova and the value is the resource ID and name of the server, and the
 // parent stack ID and name.
-func (provider *CloudProvider) getAllStackResourceMapping(stackName, stackID string) (m map[string]ResourceStackRelationship, err error) {
+func (provider *CloudProvider) getAllStackResourceMapping(ctx context.Context, stackName, stackID string) (m map[string]ResourceStackRelationship, err error) {
 	if provider.ResourceStackMapping != nil {
 		return provider.ResourceStackMapping, nil
 	}
 
 	mapping := make(map[string]ResourceStackRelationship)
 
-	serverPages, err := stackresources.List(provider.Heat, stackName, stackID, stackresources.ListOpts{Depth: 2}).AllPages(context.TODO())
+	serverPages, err := stackresources.List(provider.Heat, stackName, stackID, stackresources.ListOpts{Depth: 2}).AllPages(ctx)
 	if err != nil {
 		return m, err
 	}
@@ -266,7 +266,7 @@ func (provider CloudProvider) waitForServerDetachVolumes(serverID string, timeou
 // will be kept as False, which means the node need to be rebuilt to fix it, otherwise it means the has been processed.
 //
 // The bool type return value means that if the node has been processed from a first time repair PoV
-func (provider CloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID string, firstTimeRebootNodes map[string]healthcheck.NodeInfo) (bool, error) {
+func (provider CloudProvider) firstTimeRepair(ctx context.Context, n healthcheck.NodeInfo, serverID string, firstTimeRebootNodes map[string]healthcheck.NodeInfo) (bool, error) {
 	var firstTimeUnhealthy = true
 	for id := range unHealthyNodes {
 		log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
@@ -281,7 +281,7 @@ func (provider CloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID s
 	if firstTimeUnhealthy {
 		log.Infof("rebooting node %s to repair it", serverID)
 
-		if res := servers.Reboot(context.TODO(), provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
+		if res := servers.Reboot(ctx, provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
 			// Usually it means the node is being rebooted
 			log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
 			if strings.Contains(res.Err.Error(), "reboot_started") {
@@ -351,7 +351,7 @@ func (provider CloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID s
 // - Heat stack ID and resource ID.
 //
 // For worker nodes: Call Magnum resize API directly.
-func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
+func (provider CloudProvider) Repair(ctx context.Context, nodes []healthcheck.NodeInfo) error {
 	if len(nodes) == 0 {
 		return nil
 	}
@@ -370,12 +370,12 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 
 	firstTimeRebootNodes := make(map[string]healthcheck.NodeInfo)
 
-	err := provider.UpdateHealthStatus(masters, workers)
+	err := provider.UpdateHealthStatus(ctx, masters, workers)
 	if err != nil {
 		return fmt.Errorf("failed to update the health status of cluster %s, error: %v", clusterName, err)
 	}
 
-	cluster, err := clusters.Get(context.TODO(), provider.Magnum, clusterName).Extract()
+	cluster, err := clusters.Get(ctx, provider.Magnum, clusterName).Extract()
 	if err != nil {
 		return fmt.Errorf("failed to get the cluster %s, error: %v", clusterName, err)
 	}
@@ -389,7 +389,7 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 				continue
 			}
 
-			if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+			if processed, err := provider.firstTimeRepair(ctx, n, serverID, firstTimeRebootNodes); err != nil {
 				log.Warningf("Failed to process if the node %s is in first time repair , error: %v", serverID, err)
 			} else if processed {
 				log.Infof("Node %s has been processed", serverID)
@@ -405,7 +405,7 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 			}
 
 			nodesToReplace.Insert(serverID)
-			ng, err := provider.getNodeGroup(clusterName, n)
+			ng, err := provider.getNodeGroup(ctx, clusterName, n)
 			ngName := "default-worker"
 			ngNodeCount := &cluster.NodeCount
 			if err == nil {
@@ -419,7 +419,7 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 				NodesToRemove: nodesToReplace.List(),
 			}
 
-			clusters.Resize(context.TODO(), provider.Magnum, clusterName, opts)
+			clusters.Resize(ctx, provider.Magnum, clusterName, opts)
 			// Wait 10 seconds to make sure Magnum has already got the request
 			// to avoid sending all of the resize API calls at the same time.
 			time.Sleep(10 * time.Second)
@@ -432,14 +432,14 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 			log.Infof("Cluster %s resized", clusterName)
 		}
 	} else {
-		clusterStackName, err := provider.getStackName(cluster.StackID)
+		clusterStackName, err := provider.getStackName(ctx, cluster.StackID)
 		if err != nil {
 			return fmt.Errorf("failed to get the Heat stack for cluster %s, error: %v", clusterName, err)
 		}
 
 		// In order to rebuild the nodes by Heat stack update, we need to know the parent stack ID of the resources and
 		// mark them unhealthy first.
-		allMapping, err := provider.getAllStackResourceMapping(clusterStackName, cluster.StackID)
+		allMapping, err := provider.getAllStackResourceMapping(ctx, clusterStackName, cluster.StackID)
 		if err != nil {
 			return fmt.Errorf("failed to get the resource stack mapping for cluster %s, error: %v", clusterName, err)
 		}
@@ -456,7 +456,7 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 				continue
 			}
 
-			if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+			if processed, err := provider.firstTimeRepair(ctx, n, serverID, firstTimeRebootNodes); err != nil {
 				log.Warningf("Failed to process if the node %s is in first time repair , error: %v", serverID, err)
 			} else if processed {
 				log.Infof("Node %s has been processed", serverID)
@@ -468,7 +468,7 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 			} else {
 				// Mark root volume as unhealthy
 				if rootVolumeID != "" {
-					err = stackresources.MarkUnhealthy(context.TODO(), provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, rootVolumeID, opts).ExtractErr()
+					err = stackresources.MarkUnhealthy(ctx, provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, rootVolumeID, opts).ExtractErr()
 					if err != nil {
 						log.Errorf("failed to mark resource %s unhealthy, error: %v", rootVolumeID, err)
 					}
@@ -479,7 +479,7 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 				log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
 				// If the server is failed to delete after 180s, then delete it to avoid the
 				// stack update failure later.
-				res := servers.ForceDelete(context.TODO(), provider.Nova, serverID)
+				res := servers.ForceDelete(ctx, provider.Nova, serverID)
 				if res.Err != nil {
 					log.Warningf("Failed to delete the server %s, error: %v", serverID, err)
 				}
@@ -488,15 +488,15 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 			log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)
 
 			// Mark VM as unhealthy
-			err = stackresources.MarkUnhealthy(context.TODO(), provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, allMapping[serverID].ResourceID, opts).ExtractErr()
+			err = stackresources.MarkUnhealthy(ctx, provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, allMapping[serverID].ResourceID, opts).ExtractErr()
 			if err != nil {
 				log.Errorf("failed to mark resource %s unhealthy, error: %v", serverID, err)
 			}
 
 			delete(unHealthyNodes, serverID)
 		}
 
-		if err := stacks.UpdatePatch(context.TODO(), provider.Heat, clusterStackName, cluster.StackID, stacks.UpdateOpts{}).ExtractErr(); err != nil {
+		if err := stacks.UpdatePatch(ctx, provider.Heat, clusterStackName, cluster.StackID, stacks.UpdateOpts{}).ExtractErr(); err != nil {
 			return fmt.Errorf("failed to update Heat stack to rebuild resources, error: %v", err)
 		}
 
@@ -514,26 +514,26 @@ func (provider CloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
 			log.Infof("Skip node delete for %s because it's repaired by reboot", serverID)
 			continue
 		}
-		if err := provider.KubeClient.CoreV1().Nodes().Delete(context.TODO(), n.KubeNode.Name, metav1.DeleteOptions{}); err != nil {
+		if err := provider.KubeClient.CoreV1().Nodes().Delete(ctx, n.KubeNode.Name, metav1.DeleteOptions{}); err != nil {
 			log.Errorf("Failed to remove the node %s from cluster, error: %v", n.KubeNode.Name, err)
 		}
 	}
 
 	return nil
 }
 
-func (provider CloudProvider) getNodeGroup(clusterName string, node healthcheck.NodeInfo) (nodegroups.NodeGroup, error) {
+func (provider CloudProvider) getNodeGroup(ctx context.Context, clusterName string, node healthcheck.NodeInfo) (nodegroups.NodeGroup, error) {
 	var ng nodegroups.NodeGroup
 
-	ngPages, err := nodegroups.List(provider.Magnum, clusterName, nodegroups.ListOpts{}).AllPages(context.TODO())
+	ngPages, err := nodegroups.List(provider.Magnum, clusterName, nodegroups.ListOpts{}).AllPages(ctx)
 	if err == nil {
 		ngs, err := nodegroups.ExtractNodeGroups(ngPages)
 		if err != nil {
 			log.Warningf("Failed to get node group for cluster %s, error: %v", clusterName, err)
 			return ng, err
 		}
 		for _, ng := range ngs {
-			ngInfo, err := nodegroups.Get(context.TODO(), provider.Magnum, clusterName, ng.UUID).Extract()
+			ngInfo, err := nodegroups.Get(ctx, provider.Magnum, clusterName, ng.UUID).Extract()
 			if err != nil {
 				log.Warningf("Failed to get node group for cluster %s, error: %v", clusterName, err)
 				return ng, err
@@ -555,7 +555,7 @@ func (provider CloudProvider) getNodeGroup(clusterName string, node healthcheck.
 
 // UpdateHealthStatus can update the cluster health status to reflect the
 // real-time health status of the k8s cluster.
-func (provider CloudProvider) UpdateHealthStatus(masters []healthcheck.NodeInfo, workers []healthcheck.NodeInfo) error {
+func (provider CloudProvider) UpdateHealthStatus(ctx context.Context, masters []healthcheck.NodeInfo, workers []healthcheck.NodeInfo) error {
 	log.Infof("start to update cluster health status.")
 	clusterName := provider.Config.ClusterName
 
@@ -600,7 +600,7 @@ func (provider CloudProvider) UpdateHealthStatus(masters []healthcheck.NodeInfo,
 	}
 
 	log.Infof("updating cluster health status as %s for reason %s.", healthStatus, healthStatusReason)
-	res := clusters.Update(context.TODO(), provider.Magnum, clusterName, updateOpts)
+	res := clusters.Update(ctx, provider.Magnum, clusterName, updateOpts)
 
 	if res.Err != nil {
 		return fmt.Errorf("failed to update the health status of cluster %s error: %v", clusterName, res.Err)
@@ -617,10 +617,10 @@ func (provider CloudProvider) UpdateHealthStatus(masters []healthcheck.NodeInfo,
 // There are two conditions that we disable the repair:
 // - The cluster admin disables the auto healing via OpenStack API.
 // - The Magnum cluster is not in stable status.
-func (provider CloudProvider) Enabled() bool {
+func (provider CloudProvider) Enabled(ctx context.Context) bool {
 	clusterName := provider.Config.ClusterName
 
-	cluster, err := clusters.Get(context.TODO(), provider.Magnum, clusterName).Extract()
+	cluster, err := clusters.Get(ctx, provider.Magnum, clusterName).Extract()
 	if err != nil {
 		log.Warningf("failed to get the cluster %s, error: %v", clusterName, err)
 		return false
@@ -644,12 +644,12 @@ func (provider CloudProvider) Enabled() bool {
 		return false
 	}
 
-	clusterStackName, err := provider.getStackName(cluster.StackID)
+	clusterStackName, err := provider.getStackName(ctx, cluster.StackID)
 	if err != nil {
 		log.Warningf("Failed to get the Heat stack ID for cluster %s, error: %v", clusterName, err)
 		return false
 	}
-	stack, err := stacks.Get(context.TODO(), provider.Heat, clusterStackName, cluster.StackID).Extract()
+	stack, err := stacks.Get(ctx, provider.Heat, clusterStackName, cluster.StackID).Extract()
 	if err != nil {
 		log.Warningf("Failed to get Heat stack %s for cluster %s, error: %v", cluster.StackID, clusterName, err)
 		return false
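
The practical effect of this diff is that every Nova, Heat, Magnum and Kubernetes call above receives a caller-supplied context instead of context.TODO(). A minimal caller sketch under that assumption is shown below; the repairer interface, the plain string node IDs and the 10-minute timeout are illustrative stand-ins, not types or values from this repository (the real methods take []healthcheck.NodeInfo).

package autohealing

import (
	"context"
	"fmt"
	"time"
)

// repairer mirrors only the method signatures changed in this diff.
type repairer interface {
	Enabled(ctx context.Context) bool
	Repair(ctx context.Context, nodes []string) error
}

// repairOnce bounds one whole repair cycle with a single cancellable context,
// so the OpenStack and Kubernetes calls that now accept ctx can be timed out
// or cancelled together instead of running unbounded.
func repairOnce(p repairer, unhealthyNodes []string) error {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	defer cancel()

	if !p.Enabled(ctx) {
		return fmt.Errorf("auto healing is currently disabled for the cluster")
	}
	return p.Repair(ctx, unhealthyNodes)
}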