
Commit fe47f9e

Improve the pod moving behavior during the Kubernetes cluster upgrade. (zalando#281)

* Improve the pod moving behavior during the Kubernetes cluster upgrade. Fix an issue of not waiting for at least one replica to become ready (if the StatefulSet indicates there are replicas) when moving the master pod off the decommissioned node. Resolves the first part of zalando#279. Small fixes to error messages.

* Eliminate a race condition during the switchover. When the operator initiates a failover (switchover) that fails and then retries it a second time, the previous wait on the pod event channel may still be active. As a result, the operator subscribes to the former master pod twice, causing a panic. The problem was that the original code did not cancel the waitForPodLabel wait for the new master pod when the failover failed. This commit fixes it by adding a stop channel to that function.

Code review by @zerg-junior
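The race fixed here is a leaked-waiter problem: a goroutine blocked on a pod event channel outlives the failed failover, so a retry subscribes to the same pod a second time. A minimal standalone sketch of the stop-channel pattern the commit introduces (illustrative names, not the operator's actual API):

```go
package main

import (
	"fmt"
	"time"
)

// waitForEvent blocks until an event arrives, the timeout fires,
// or the caller signals cancellation via stopCh.
func waitForEvent(events <-chan string, stopCh <-chan struct{}) (string, error) {
	timeout := time.After(2 * time.Second)
	for {
		select {
		case ev := <-events:
			return ev, nil
		case <-timeout:
			return "", fmt.Errorf("event wait timeout")
		case <-stopCh:
			// Without this case the goroutine could block until the
			// timeout even after the caller gives up, and a retry
			// would create a second subscriber to the same channel.
			return "", fmt.Errorf("event wait cancelled")
		}
	}
}

func main() {
	events := make(chan string)
	stopCh := make(chan struct{})
	done := make(chan error)

	go func() {
		_, err := waitForEvent(events, stopCh)
		done <- err
	}()

	// The operation fails before any event arrives; cancel the waiter
	// so a retry does not end up with two active subscriptions.
	close(stopCh)
	fmt.Println(<-done) // event wait cancelled
}
```

Closing the stop channel unblocks every pending receive on it at once, which is why it works for cancelling a waiter that may or may not be blocked yet.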
1 parent: ebff820

File tree (5 files changed, +71 −29 lines):

pkg/cluster/cluster.go
pkg/cluster/cluster_test.go
pkg/cluster/k8sres.go
pkg/cluster/pod.go
pkg/cluster/util.go


pkg/cluster/cluster.go

Lines changed: 5 additions & 3 deletions
```diff
@@ -858,6 +858,7 @@ func (c *Cluster) GetStatus() *spec.ClusterStatus {
 // ManualFailover does manual failover to a candidate pod
 func (c *Cluster) ManualFailover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
 	c.logger.Debugf("failing over from %q to %q", curMaster.Name, candidate)
+
 	podLabelErr := make(chan error)
 	stopCh := make(chan struct{})
 	defer close(podLabelErr)
@@ -868,11 +869,12 @@ func (c *Cluster) ManualFailover(curMaster *v1.Pod, candidate spec.NamespacedNam
 
 		role := Master
 
-		_, err := c.waitForPodLabel(ch, &role)
-
 		select {
 		case <-stopCh:
-		case podLabelErr <- err:
+		case podLabelErr <- func() error {
+			_, err := c.waitForPodLabel(ch, stopCh, &role)
+			return err
+		}():
 		}
 	}()
 
```
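One subtlety in the rewritten select above: Go evaluates the value being sent in a `case podLabelErr <- ...` clause exactly once, upon entering the select, so the closure wrapping `waitForPodLabel` runs to completion before any case is chosen. Cancellation therefore cannot come from the select itself; it works because `stopCh` is now passed into `waitForPodLabel`, which returns early once the channel is closed. A small sketch of that evaluation order (hypothetical names):

```go
package main

import "fmt"

func compute() int {
	fmt.Println("send value evaluated first")
	return 42
}

func main() {
	results := make(chan int, 1)
	stop := make(chan struct{})
	close(stop) // the stop case is immediately ready

	// Both cases are ready, and select picks one at random. Either way,
	// compute() has already run: select evaluates all send expressions
	// before committing to a case.
	select {
	case <-stop:
		fmt.Println("stop case chosen")
	case results <- compute():
		fmt.Println("send case chosen")
	}
}
```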

pkg/cluster/cluster_test.go

Lines changed: 2 additions & 2 deletions
```diff
@@ -34,8 +34,8 @@ func TestInitRobotUsers(t *testing.T) {
 		{
 			manifestUsers: map[string]spec.UserFlags{"foo": {"superuser", "createdb"}},
 			infraRoles:    map[string]spec.PgUser{"foo": {Origin: spec.RoleOriginInfrastructure, Name: "foo", Password: "bar"}},
-			result: map[string]spec.PgUser{"foo": {Origin: spec.RoleOriginInfrastructure, Name: "foo", Password: "bar"}},
-			err:    nil,
+			result:        map[string]spec.PgUser{"foo": {Origin: spec.RoleOriginInfrastructure, Name: "foo", Password: "bar"}},
+			err:           nil,
 		},
 		{
 			manifestUsers: map[string]spec.UserFlags{"!fooBar": {"superuser", "createdb"}},
```

pkg/cluster/k8sres.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -707,7 +707,7 @@ func (c *Cluster) shouldCreateLoadBalancerForService(role PostgresRole, spec *sp
 	// `enable_load_balancer`` governs LB for a master service
 	// there is no equivalent deprecated operator option for the replica LB
 	if c.OpConfig.EnableLoadBalancer != nil {
-		c.logger.Debugf("The operator configmap sets the deprecated `enable_load_balancer` param. Consider using the `enable_master_load_balancer` or `enable_replica_load_balancer` instead.", c.Name)
+		c.logger.Debugf("The operator configmap sets the deprecated `enable_load_balancer` param. Consider using the `enable_master_load_balancer` or `enable_replica_load_balancer` instead.")
 		return *c.OpConfig.EnableLoadBalancer
 	}
 
```
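This is one of the commit's "small fixes to error messages": the format string has no verbs, so the stray `c.Name` argument is not printed but flagged in the output by the fmt machinery (logrus's Debugf uses fmt-style formatting, and `go vet` reports such mismatches). A quick demonstration:

```go
package main

import "fmt"

func main() {
	// A format string with no verbs plus an extra argument does not
	// silently drop the argument; fmt flags it in the output.
	fmt.Println(fmt.Sprintf("deprecated param set.", "mycluster"))
	// Output: deprecated param set.%!(EXTRA string=mycluster)
}
```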

pkg/cluster/pod.go

Lines changed: 21 additions & 7 deletions
```diff
@@ -149,13 +149,19 @@ func (c *Cluster) movePodFromEndOfLifeNode(pod *v1.Pod) (*v1.Pod, error) {
 }
 
 func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) {
+
+	// Wait until at least one replica pod will come up
+	if err := c.waitForAnyReplicaLabelReady(); err != nil {
+		c.logger.Warningf("could not find at least one ready replica: %v", err)
+	}
+
 	replicas, err := c.getRolePods(Replica)
 	if err != nil {
 		return nil, fmt.Errorf("could not get replica pods: %v", err)
 	}
 
 	if len(replicas) == 0 {
-		c.logger.Warningf("single master pod for cluster %q, migration will cause disruption of the service")
+		c.logger.Warningf("no available master candidates, migration will cause longer downtime of the master instance")
 		return nil, nil
 	}
 
@@ -168,12 +174,16 @@ func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) {
 			}
 		}
 	}
-	c.logger.Debug("no available master candidates on live nodes")
+	c.logger.Warningf("no available master candidates on live nodes")
 	return &replicas[rand.Intn(len(replicas))], nil
 }
 
 // MigrateMasterPod migrates master pod via failover to a replica
 func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
+	var (
+		masterCandidatePod *v1.Pod
+	)
+
 	oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(podName.Name, metav1.GetOptions{})
 
 	if err != nil {
@@ -193,10 +203,13 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
 		c.logger.Warningf("pod %q is not a master", podName)
 		return nil
 	}
-
-	masterCandidatePod, err := c.masterCandidate(oldMaster.Spec.NodeName)
-	if err != nil {
-		return fmt.Errorf("could not get new master candidate: %v", err)
+	if *c.Statefulset.Spec.Replicas == 1 {
+		c.logger.Warningf("single master pod for cluster %q, migration will cause longer downtime of the master instance", c.clusterName())
+	} else {
+		masterCandidatePod, err = c.masterCandidate(oldMaster.Spec.NodeName)
+		if err != nil {
+			return fmt.Errorf("could not get new master candidate: %v", err)
+		}
 	}
 
 	// there are two cases for each postgres cluster that has its master pod on the node to migrate from:
@@ -250,6 +263,7 @@ func (c *Cluster) MigrateReplicaPod(podName spec.NamespacedName, fromNodeName st
 func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
 	ch := c.registerPodSubscriber(podName)
 	defer c.unregisterPodSubscriber(podName)
+	stopChan := make(chan struct{})
 
 	if err := c.KubeClient.Pods(podName.Namespace).Delete(podName.Name, c.deleteOptions); err != nil {
 		return nil, fmt.Errorf("could not delete pod: %v", err)
@@ -258,7 +272,7 @@ func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
 	if err := c.waitForPodDeletion(ch); err != nil {
 		return nil, err
 	}
-	if pod, err := c.waitForPodLabel(ch, nil); err != nil {
+	if pod, err := c.waitForPodLabel(ch, stopChan, nil); err != nil {
 		return nil, err
 	} else {
 		c.logger.Infof("pod %q has been recreated", podName)
```
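The unchanged middle of `masterCandidate` (elided between the hunks above) prefers a replica on a live node and falls back to a random replica. A simplified model of that selection logic, assuming only that pods expose their node via `Spec.NodeName` as the visible lines suggest:

```go
package main

import (
	"fmt"
	"math/rand"
)

type pod struct {
	Name     string
	NodeName string
}

// pickCandidate prefers a replica that is not on the node being
// decommissioned; failing that, it falls back to a random replica.
func pickCandidate(replicas []pod, oldNodeName string) *pod {
	if len(replicas) == 0 {
		return nil // caller must handle the single-master case
	}
	for i := range replicas {
		if replicas[i].NodeName != oldNodeName {
			return &replicas[i]
		}
	}
	return &replicas[rand.Intn(len(replicas))]
}

func main() {
	replicas := []pod{
		{Name: "db-1", NodeName: "node-a"}, // on the node being drained
		{Name: "db-2", NodeName: "node-b"},
	}
	fmt.Println(pickCandidate(replicas, "node-a").Name) // db-2
}
```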

pkg/cluster/util.go

Lines changed: 42 additions & 16 deletions
```diff
@@ -225,7 +225,7 @@ func (c *Cluster) getTeamMembers() ([]string, error) {
 	return teamInfo.Members, nil
 }
 
-func (c *Cluster) waitForPodLabel(podEvents chan spec.PodEvent, role *PostgresRole) (*v1.Pod, error) {
+func (c *Cluster) waitForPodLabel(podEvents chan spec.PodEvent, stopChan chan struct{}, role *PostgresRole) (*v1.Pod, error) {
 	timeout := time.After(c.OpConfig.PodLabelWaitTimeout)
 	for {
 		select {
@@ -241,6 +241,8 @@ func (c *Cluster) waitForPodLabel(podEvents chan spec.PodEvent, role *PostgresRo
 			}
 		case <-timeout:
 			return nil, fmt.Errorf("pod label wait timeout")
+		case <-stopChan:
+			return nil, fmt.Errorf("pod label wait cancelled")
 		}
 	}
 }
@@ -278,7 +280,10 @@ func (c *Cluster) waitStatefulsetReady() error {
 	})
 }
 
-func (c *Cluster) waitPodLabelsReady() error {
+func (c *Cluster) _waitPodLabelsReady(anyReplica bool) error {
+	var (
+		podsNumber int
+	)
 	ls := c.labelsSet(false)
 	namespace := c.Namespace
 
@@ -295,35 +300,56 @@ func (c *Cluster) waitPodLabelsReady() error {
 			c.OpConfig.PodRoleLabel: string(Replica),
 		}).String(),
 	}
-	pods, err := c.KubeClient.Pods(namespace).List(listOptions)
-	if err != nil {
-		return err
+	podsNumber = 1
+	if !anyReplica {
+		pods, err := c.KubeClient.Pods(namespace).List(listOptions)
+		if err != nil {
+			return err
+		}
+		podsNumber = len(pods.Items)
+		c.logger.Debugf("Waiting for %d pods to become ready", podsNumber)
+	} else {
+		c.logger.Debugf("Waiting for any replica pod to become ready")
 	}
-	podsNumber := len(pods.Items)
 
-	err = retryutil.Retry(c.OpConfig.ResourceCheckInterval, c.OpConfig.ResourceCheckTimeout,
+	err := retryutil.Retry(c.OpConfig.ResourceCheckInterval, c.OpConfig.ResourceCheckTimeout,
 		func() (bool, error) {
-			masterPods, err2 := c.KubeClient.Pods(namespace).List(masterListOption)
-			if err2 != nil {
-				return false, err2
+			masterCount := 0
+			if !anyReplica {
+				masterPods, err2 := c.KubeClient.Pods(namespace).List(masterListOption)
+				if err2 != nil {
+					return false, err2
+				}
+				if len(masterPods.Items) > 1 {
+					return false, fmt.Errorf("too many masters (%d pods with the master label found)",
+						len(masterPods.Items))
+				}
+				masterCount = len(masterPods.Items)
 			}
 			replicaPods, err2 := c.KubeClient.Pods(namespace).List(replicaListOption)
 			if err2 != nil {
 				return false, err2
 			}
-			if len(masterPods.Items) > 1 {
-				return false, fmt.Errorf("too many masters")
-			}
-			if len(replicaPods.Items) == podsNumber {
+			replicaCount := len(replicaPods.Items)
+			if anyReplica && replicaCount > 0 {
+				c.logger.Debugf("Found %d running replica pods", replicaCount)
 				return true, nil
 			}
 
-			return len(masterPods.Items)+len(replicaPods.Items) == podsNumber, nil
+			return masterCount+replicaCount >= podsNumber, nil
 		})
 
 	return err
 }
 
+func (c *Cluster) waitForAnyReplicaLabelReady() error {
+	return c._waitPodLabelsReady(true)
+}
+
+func (c *Cluster) waitForAllPodsLabelReady() error {
+	return c._waitPodLabelsReady(false)
+}
+
 func (c *Cluster) waitStatefulsetPodsReady() error {
 	c.setProcessName("waiting for the pods of the statefulset")
 	// TODO: wait for the first Pod only
@@ -332,7 +358,7 @@ func (c *Cluster) waitStatefulsetPodsReady() error {
 	}
 
 	// TODO: wait only for master
-	if err := c.waitPodLabelsReady(); err != nil {
+	if err := c.waitForAllPodsLabelReady(); err != nil {
 		return fmt.Errorf("pod labels error: %v", err)
 	}
 
```