Handle rolling update

If the image requested by the user changes, the operator will now start
rolling the upgrade out to the replicas.

After all the replicas have been updated, the master needs to be updated
as well. For that the user can choose between two behaviors, which can be
specified in the Spec of the cluster (a sample manifest follows the list):

- `switchover`: promote the replica with the least data still to receive
  or to reapply as the new master, and then finish the upgrade;

- `wait`: wait for the user to request a switchover, and then finish the
  rolling update process.

In the cluster status we keep track of when we asked a Pod to restart in
order to be updated, and we use that information to check whether the Pod
has actually been updated or not.

Co-authored-by: Leonardo Cecchi <leonardo.cecchi@2ndquadrant.it>
Co-authored-by: Jonathan Gonzalez V <jonathan.gonzalez@2ndquadrant.com>
Co-authored-by: Marco Nenciarini <marco.nenciarini@2ndquadrant.it>
3 people committed Mar 19, 2020
1 parent 4137ec0 commit 2361be6
Showing 17 changed files with 535 additions and 152 deletions.
56 changes: 51 additions & 5 deletions api/v1alpha1/cluster_types.go
@@ -81,30 +81,55 @@ type ClusterSpec struct {
// Resources requirements of every generated Pod
// +optional
Resources corev1.ResourceRequirements `json:"resources,omitempty"`

// Whether to wait for the user to issue a switchover request or to switch
// over directly to another replica and then update the last instance
MasterUpdateStrategy MasterUpdateStrategy `json:"masterUpdateStrategy,omitempty"`
}

// ClusterStatus defines the observed state of Cluster
type ClusterStatus struct {
// Name of the image used
ImageName string `json:"imageName,omitempty"`

// Total number of instances in the cluster
Instances int32 `json:"instances,omitempty"`

// Total number of ready instances in the cluster
ReadyInstances int32 `json:"readyInstances,omitempty"`

// Total number of instances which are being updated
InstancesBeingUpdated int32 `json:"instancesBeingUpdated,omitempty"`

// ID of the latest generated node (used to avoid node name clashing)
LatestGeneratedNode int32 `json:"latestGeneratedNode,omitempty"`

// Name of the image used
ImageName string `json:"imageName,omitempty"`

// Current primary instance
CurrentPrimary string `json:"currentPrimary,omitempty"`

// Target primary instance, this is different from the previous one
// during a switchover or a failover
TargetPrimary string `json:"targetPrimary,omitempty"`

// ID of the latest generated node (used to avoid node name clashing)
LatestGeneratedNode int32 `json:"latestGeneratedNode,omitempty"`
// The status of the servers that are currently being updated
RollingUpdateStatus map[string]RollingUpdateStatus `json:"rollingUpdateStatus,omitempty"`
}

// MasterUpdateStrategy contains the strategy available to apply an update
// to the master server of the cluster
type MasterUpdateStrategy string

const (
// MasterUpdateStrategyWait means that the operator needs to wait for the
// user to manually issue a switchover request before updating the master
// server
MasterUpdateStrategyWait = "wait"

// MasterUpdateStrategySwitchover means that the operator will switchover
// to another updated replica and then update the master server
MasterUpdateStrategySwitchover = "switchover"
)

// PostgresConfiguration defines the PostgreSQL configuration
type PostgresConfiguration struct {
// PostgreSQL configuration options (postgresql.conf)
@@ -155,6 +180,16 @@ type AffinityConfiguration struct {
TopologyKey string `json:"topologyKey"`
}

// RollingUpdateStatus contains the information about an instance which is
// being updated
type RollingUpdateStatus struct {
// The image which we put into the Pod
ImageName string `json:"imageName"`

// When the update has been started
StartedAt metav1.Time `json:"startedAt,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:subresource:scale:specpath=.spec.instances,statuspath=.status.instances
Expand Down Expand Up @@ -247,6 +282,17 @@ func (cluster *Cluster) GetMaxStopDelay() int32 {
return 30
}

// GetMasterUpdateStrategy returns the cluster master update strategy,
// defaulting to switchover
func (cluster *Cluster) GetMasterUpdateStrategy() MasterUpdateStrategy {
strategy := cluster.Spec.MasterUpdateStrategy
if strategy == "" {
return MasterUpdateStrategySwitchover
}

return strategy
}

func init() {
SchemeBuilder.Register(&Cluster{}, &ClusterList{})
}
17 changes: 17 additions & 0 deletions api/v1alpha1/cluster_types_test.go
@@ -67,3 +67,20 @@ var _ = Describe("Detect persistent storage", func() {
Expect(cluster.IsUsingPersistentStorage()).To(BeTrue())
})
})

var _ = Describe("Master update strategy", func() {
It("defaults to switchover", func() {
emptyCluster := Cluster{}
Expect(emptyCluster.GetMasterUpdateStrategy()).To(BeEquivalentTo(MasterUpdateStrategySwitchover))
})

It("respect the preference of the user", func() {
cluster := Cluster{
Spec: ClusterSpec{
Instances: 0,
MasterUpdateStrategy: MasterUpdateStrategyWait,
},
}
Expect(cluster.GetMasterUpdateStrategy()).To(BeEquivalentTo(MasterUpdateStrategyWait))
})
})
25 changes: 24 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

26 changes: 26 additions & 0 deletions config/crd/bases/postgresql.k8s.2ndq.io_clusters.yaml
@@ -84,6 +84,11 @@ spec:
format: int32
minimum: 1
type: integer
masterUpdateStrategy:
description: Whether to wait for the user to issue a switchover request
or to switch over directly to another replica and then update the last
instance
type: string
postgresql:
description: Configuration of the PostgreSQL server
properties:
@@ -295,6 +300,10 @@ spec:
description: Total number of instances in the cluster
format: int32
type: integer
instancesBeingUpdated:
description: Total number of instances which are being updated
format: int32
type: integer
latestGeneratedNode:
description: ID of the latest generated node (used to avoid node name
clashing)
@@ -304,6 +313,23 @@
description: Total number of ready instances in the cluster
format: int32
type: integer
rollingUpdateStatus:
additionalProperties:
description: RollingUpdateStatus contains the information about an
instance which is being updated
properties:
imageName:
description: The image which we put into the Pod
type: string
startedAt:
description: When the update has been started
format: date-time
type: string
required:
- imageName
type: object
description: The status of the servers that are currently being updated
type: object
targetPrimary:
description: Target primary instance, this is different from the previous
one during a switchover or a failover
77 changes: 50 additions & 27 deletions controllers/cluster_controller.go
@@ -19,6 +19,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/2ndquadrant/cloud-native-postgresql/api/v1alpha1"
"github.com/2ndquadrant/cloud-native-postgresql/pkg/postgres"
)

const (
@@ -99,18 +100,38 @@ func (r *ClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
}

if cluster.Status.CurrentPrimary != "" && cluster.Status.CurrentPrimary != cluster.Status.TargetPrimary {
r.Log.Info("Switchover initiated by PGK, waiting for the cluster to align")
r.Log.Info("Switchover in progress, waiting for the cluster to align")
// TODO: check if the TargetPrimary is active, otherwise recovery?
return ctrl.Result{}, err
}

// Update the status section of this Cluster resource
if err = r.updateResourceStatus(ctx, &cluster, childPods); err != nil {
if apierrs.IsConflict(err) {
// Let's wait for another reconciler loop, since the
// status already changed
return ctrl.Result{}, nil
}

return ctrl.Result{}, err
}

// Find if we have Pods that we are upgrading
if cluster.Status.InstancesBeingUpdated != 0 {
r.Log.V(2).Info("There are nodes being upgraded, waiting for the new image to be applied",
"clusterName", cluster.Name,
"namespace", cluster.Namespace)
return ctrl.Result{}, nil
}

// Get the replication status
var instancesStatus postgres.PostgresqlStatusList
if instancesStatus, err = r.getStatusFromInstances(ctx, childPods); err != nil {
return ctrl.Result{}, err
}

// Update the target primary name from the Pods status
if err = r.updateTargetPrimaryFromPods(ctx, &cluster, childPods); err != nil {
if err = r.updateTargetPrimaryFromPods(ctx, &cluster, instancesStatus); err != nil {
return ctrl.Result{}, err
}

@@ -141,41 +162,43 @@ func (r *ClusterReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
// Find if we have Pods that are not ready
if cluster.Status.ReadyInstances != cluster.Status.Instances {
// A pod is not ready, let's retry
r.Log.Info("Waiting for node to be ready")
r.Log.V(2).Info("Waiting for node to be ready",
"clusterName", cluster.Name,
"namespace", cluster.Namespace)
return ctrl.Result{}, nil
}

// TODO failing nodes?

// Are there missing nodes? Let's create one,
// but only if no failover/switchover is running
if cluster.Status.CurrentPrimary == cluster.Status.TargetPrimary {
if cluster.Status.Instances < cluster.Spec.Instances {
newNodeSerial, err := r.generateNodeSerial(ctx, &cluster)
if err != nil {
return ctrl.Result{}, err
}
// Is there a switchover or failover in progress?
// Let's wait for it to terminate before applying the
// following operations
if cluster.Status.TargetPrimary != cluster.Status.CurrentPrimary {
r.Log.V(2).Info("There is a switchover or a failover "+
"in progress, waiting for the operation to complete",
"clusterName", cluster.Name,
"namespace", cluster.Namespace,
"currentPrimary", cluster.Status.CurrentPrimary,
"targetPrimary", cluster.Status.TargetPrimary)
return ctrl.Result{}, nil
}

return r.joinReplicaInstance(ctx, newNodeSerial, &cluster)
// Are there missing nodes? Let's create one
if cluster.Status.Instances < cluster.Spec.Instances {
newNodeSerial, err := r.generateNodeSerial(ctx, &cluster)
if err != nil {
return ctrl.Result{}, err
}

return ctrl.Result{}, r.joinReplicaInstance(ctx, newNodeSerial, &cluster)
}

// Are there nodes to be removed? Remove one of them
if cluster.Status.Instances > cluster.Spec.Instances {
// Is there one pod to be deleted?
sacrificialPod := getSacrificialPod(childPods.Items)
if sacrificialPod == nil {
r.Log.Info("There are no instances to be sacrificed. Wait for the next sync loop")
return ctrl.Result{}, nil
}
return ctrl.Result{}, r.scaleDownCluster(ctx, &cluster, childPods)
}

r.Log.Info("Too many nodes for cluster, deleting an instance",
"cluster", cluster.Name,
"namespace", cluster.Namespace,
"pod", sacrificialPod.Name)
err = r.Delete(ctx, sacrificialPod)
if err != nil {
r.Log.Error(err, "Cannot kill the Pod to scale down")
// Check if we need to handle a rolling upgrade
if cluster.Status.ImageName != cluster.Spec.ImageName {
if err = r.upgradeCluster(ctx, &cluster, childPods, instancesStatus); err != nil {
return ctrl.Result{}, err
}
}
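
The `upgradeCluster` helper itself is outside the hunks shown here. Purely
as a hypothetical sketch of the flow described in the commit message (the
signature mirrors the call above, while `podImage`, `upgradePod` and
`mostAlignedReplica` are made-up placeholders, not functions from this
repository), it could be shaped roughly like this:

```go
// Hypothetical sketch, not the actual implementation from this commit.
// upgradeCluster restarts out-of-date replicas one at a time, recording each
// request in Status.RollingUpdateStatus, and leaves the primary for last,
// handling it according to the configured master update strategy.
func (r *ClusterReconciler) upgradeCluster(
	ctx context.Context,
	cluster *v1alpha1.Cluster,
	childPods corev1.PodList,
	instancesStatus postgres.PostgresqlStatusList,
) error {
	// Replicas first: upgrade at most one Pod per reconciliation loop.
	for i := range childPods.Items {
		pod := &childPods.Items[i]
		if pod.Name == cluster.Status.CurrentPrimary {
			continue // leave the primary for last
		}
		if podImage(pod) != cluster.Spec.ImageName {
			// Record the restart request in the status and restart the Pod
			return r.upgradePod(ctx, cluster, pod)
		}
	}

	// Every replica is up to date: handle the primary.
	switch cluster.GetMasterUpdateStrategy() {
	case v1alpha1.MasterUpdateStrategyWait:
		// Wait for the user to request a switchover before touching the primary.
		return nil
	default: // v1alpha1.MasterUpdateStrategySwitchover
		// Promote the most aligned replica; the old primary will then be
		// upgraded as a regular replica in a following loop.
		cluster.Status.TargetPrimary = mostAlignedReplica(instancesStatus)
		return r.Status().Update(ctx, cluster)
	}
}
```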
