diff --git a/charts/postgres-operator/templates/clusterrole.yaml b/charts/postgres-operator/templates/clusterrole.yaml index a24571cf1..25b19b89c 100644 --- a/charts/postgres-operator/templates/clusterrole.yaml +++ b/charts/postgres-operator/templates/clusterrole.yaml @@ -141,4 +141,15 @@ rules: - bind resourceNames: - {{ template "postgres-operator.fullname" . }} +- apiGroups: + - batch + resources: + - cronjobs # enables logical backups + verbs: + - create + - delete + - get + - list + - patch + - update {{ end }} diff --git a/charts/postgres-operator/values.yaml b/charts/postgres-operator/values.yaml index 21813fbe9..c5349b55f 100644 --- a/charts/postgres-operator/values.yaml +++ b/charts/postgres-operator/values.yaml @@ -62,6 +62,9 @@ config: pod_management_policy: "ordered_ready" enable_pod_antiaffinity: "false" pod_antiaffinity_topology_key: "kubernetes.io/hostname" + logical_backup_schedule: "30 00 * * *" + logical_backup_docker_image: "registry.opensource.zalan.do/acid/logical-backup" + logical_backup_s3_bucket: "" rbac: # Specifies whether RBAC resources should be created create: true diff --git a/docs/administrator.md b/docs/administrator.md index 32a749e36..f6f37aafb 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -340,9 +340,18 @@ Postgres database cluster: ## Understanding rolling update of Spilo pods -The operator logs reasons for a rolling update with the `info` level and -a diff between the old and new StatefulSet specs with the `debug` level. -To read the latter log entry with the escaped characters rendered, view it -in CLI with `echo -e`. Note that the resultant message will contain some -noise because the `PodTemplate` used by the operator is yet to be updated -with the default values used internally in Kubernetes. +The operator logs reasons for a rolling update with the `info` level and a diff between the old and new StatefulSet specs with the `debug` level. To render the numerous escape characters in the latter log entry, view it in CLI with `echo -e`. Note that the resultant message will contain some noise because the `PodTemplate` used by the operator is yet to be updated with the default values used internally in Kubernetes. + +## Logical backups + +The operator can manage k8s cron jobs to run logical backups of Postgres clusters. The cron job periodically spawns a batch job that runs a single pod. The backup script within this pod's container connects to the database and takes the logical backup. The operator updates cron jobs during Sync if the job schedule changes; the job name acts as the job identifier. These jobs must be enabled for each individual Postgres cluster by setting `enableLogicalBackup: true` in its manifest. Notes: + +1. The provided `registry.opensource.zalan.do/acid/logical-backup` image implements the backup via `pg_dumpall` and uploads the (compressed) results to an S3 bucket; `pg_dumpall` requires `superuser` access to the database and runs on a replica when possible. + +2. Due to the [limitation of Kubernetes cron jobs](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-job-limitations) it is highly advisable to set up additional monitoring for this feature; such monitoring is outside of the scope of operator responsibilities. + +3. The operator does not remove old backups. + +4. You may use your own image by overwriting the relevant field in the operator configuration. 
Any such image must ensure the logical backup is able to finish [in presence of pod restarts](https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#handling-pod-and-container-failures) and [simultaneous invocations](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-job-limitations) of the backup cron job. + +5. For that feature to work, your RBAC policy must enable operations on the `cronjobs` resource from the `batch` API group for the operator service account. See [example RBAC](../manifests/operator-service-account-rbac.yaml) \ No newline at end of file diff --git a/docs/developer.md b/docs/developer.md index a18ac3323..e181357ab 100644 --- a/docs/developer.md +++ b/docs/developer.md @@ -203,7 +203,7 @@ localhost:8080 by doing: The inner 'query' gets the name of the postgres operator pod, and the outer enables port forwarding. Afterwards, you can access the operator API with: - $ curl http://127.0.0.1:8080/$endpoint| jq . + $ curl --location http://127.0.0.1:8080/$endpoint | jq . The available endpoints are listed below. Note that the worker ID is an integer from 0 up to 'workers' - 1 (value configured in the operator configuration and @@ -323,6 +323,9 @@ be updated. As explained [here](reference/operator_parameters.md), it's possible to configure the operator either with a ConfigMap or CRD, but currently we aim to synchronize parameters everywhere. +When choosing a parameter name for a new option in a PG manifest, keep in mind +the naming conventions there. The `snake_case` variables come from the Patroni/Postgres world, while the `camelCase` from the k8s world. + Note: If one option is defined in the operator configuration and in the cluster [manifest](../manifests/complete-postgres-manifest.yaml), the latter takes precedence. diff --git a/docs/reference/cluster_manifest.md b/docs/reference/cluster_manifest.md index f1491525d..842b50cf9 100644 --- a/docs/reference/cluster_manifest.md +++ b/docs/reference/cluster_manifest.md @@ -14,6 +14,8 @@ measurements. Please, refer to the [Kubernetes documentation](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/) for the possible values of those. +:exclamation: If both operator configmap/CRD and a Postgres cluster manifest define the same parameter, the value from the Postgres cluster manifest is applied. + ## Manifest structure A postgres manifest is a `YAML` document. On the top level both individual @@ -45,7 +47,7 @@ Those parameters are grouped under the `metadata` top-level key. ## Top-level parameters -Those are parameters grouped directly under the `spec` key in the manifest. +These parameters are grouped directly under the `spec` key in the manifest. * **teamId** name of the team the cluster belongs to. Changing it after the cluster @@ -117,6 +119,12 @@ Those are parameters grouped directly under the `spec` key in the manifest. is `false`, then no volume will be mounted no matter how operator was configured (so you can override the operator configuration). +* **enableLogicalBackup** + Determines if the logical backup of this cluster should be taken and uploaded to S3. Default: false. + +* **logicalBackupSchedule** + Schedule for the logical backup k8s cron job. Please take [the reference schedule format](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#schedule) into account. Default: "30 00 \* \* \*" + ## Postgres parameters Those parameters are grouped under the `postgresql` top-level key. 
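To make the precedence rule for the new `enableLogicalBackup`/`logicalBackupSchedule` manifest fields concrete, here is a minimal, illustrative Go sketch of how a per-cluster schedule falls back to the operator-wide default when the manifest leaves it empty. The standalone function is an assumption for demonstration only; in the operator this check happens while generating the cron job (see `generateLogicalBackupJob` in the `k8sres.go` hunk further down).

```go
package main

import "fmt"

// effectiveBackupSchedule is an illustrative helper: the value from the
// Postgres cluster manifest wins; an empty value falls back to the
// operator-level default ("30 00 * * *" out of the box).
func effectiveBackupSchedule(manifestSchedule, operatorDefault string) string {
	if manifestSchedule != "" {
		return manifestSchedule
	}
	return operatorDefault
}

func main() {
	fmt.Println(effectiveBackupSchedule("", "30 00 * * *"))          // -> 30 00 * * *
	fmt.Println(effectiveBackupSchedule("0 3 * * *", "30 00 * * *")) // -> 0 3 * * *
}
```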
diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 11fee3846..3e06cf31d 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -51,6 +51,8 @@ parameters, those parameters have no effect and are replaced by the `CRD_READY_WAIT_INTERVAL` and `CRD_READY_WAIT_TIMEOUT` environment variables. They will be deprecated and removed in the future. +For the configmap operator configuration, the [default parameter values](https://github.com/zalando-incubator/postgres-operator/blob/master/pkg/util/config/config.go#L14) mentioned here are likely to be overwritten in your local operator installation via your local version of the operator configmap. In the case you use the operator CRD, all the CRD defaults are provided in the [operator's default configuration manifest](https://github.com/zalando-incubator/postgres-operator/blob/master/manifests/postgresql-operator-default-configuration.yaml) + Variable names are underscore-separated words. @@ -476,4 +478,16 @@ scalyr sidecar. In the CRD-based configuration they are grouped under the Memory limit value for the Scalyr sidecar. The default is `1Gi`. -For the configmap operator configuration, the [default parameter values](https://github.com/zalando/postgres-operator/blob/master/pkg/util/config/config.go#L14) mentioned here are likely to be overwritten in your local operator installation via your local version of the operator configmap. In the case you use the operator CRD, all the CRD defaults are provided in the [operator's default configuration manifest](https://github.com/zalando/postgres-operator/blob/master/manifests/postgresql-operator-default-configuration.yaml) +## Logical backup + + These parameters configure a k8s cron job managed by the operator to produce Postgres logical backups. + In the CRD-based configuration those parameters are grouped under the `logical_backup` key. + + * **logical_backup_schedule** + Backup schedule in the cron format. Please take [the reference schedule format](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#schedule) into account. Default: "30 00 \* \* \*" + + * **logical_backup_docker_image** + Docker image for the pods of the cron job. Must implement backup logic and correctly handle pod and job restarts. The default image runs `pg_dumpall` (on a replica if possible) and uploads compressed results to an S3 bucket under the key `/spilo/pg_cluster_name/cluster_k8s_uuid/logical_backups` Default: "registry.opensource.zalan.do/acid/logical-backup" + + * **logical_backup_s3_bucket** + S3 bucket to store backup results. The bucket has to be present and accessible by Postgres pods. Default: empty. diff --git a/docs/user.md b/docs/user.md index bab09f8c9..1942bab16 100644 --- a/docs/user.md +++ b/docs/user.md @@ -347,3 +347,11 @@ every 6 hours. Note that if the statefulset is scaled down before resizing the size changes are only applied to the volumes attached to the running pods. The size of the volumes that correspond to the previously running pods is not changed. + +## Logical backups + +If you add +``` + enableLogicalBackup: true +``` +to the cluster manifest, the operator will create and sync a k8s cron job to do periodic logical backups of this particular Postgres cluster. 
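To illustrate where the default image places its output, based on the key layout `/spilo/pg_cluster_name/cluster_k8s_uuid/logical_backups` quoted in the operator parameters above, here is a hedged Go sketch. The helper name and its standalone form are hypothetical; the operator itself only passes the bucket, the cluster name (`SCOPE`) and the UID-based suffix to the backup pod as environment variables.

```go
package main

import "fmt"

// backupPrefix assembles the S3 prefix described above for the default
// logical-backup image: /spilo/<cluster name>/<cluster k8s UID>/logical_backups.
func backupPrefix(bucket, clusterName, clusterUID string) string {
	return fmt.Sprintf("s3://%s/spilo/%s/%s/logical_backups", bucket, clusterName, clusterUID)
}

func main() {
	// example values only
	fmt.Println(backupPrefix("my-backup-bucket", "acid-test-cluster", "12345678-abcd-efgh"))
}
```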
Due to the [limitation of Kubernetes cron jobs](https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-job-limitations) it is highly advisable to set up additional monitoring for this feature; such monitoring is outside of the scope of operator responsibilities. See [configuration reference](reference/cluster_manifest.md) and [administrator documentation](administrator.md) for details on how backups are executed. diff --git a/manifests/complete-postgres-manifest.yaml b/manifests/complete-postgres-manifest.yaml index 276e07a1f..b2ebe948e 100644 --- a/manifests/complete-postgres-manifest.yaml +++ b/manifests/complete-postgres-manifest.yaml @@ -64,6 +64,10 @@ spec: # cluster: "acid-batman" # timestamp: "2017-12-19T12:40:33+01:00" # timezone required (offset relative to UTC, see RFC 3339 section 5.6) # s3_wal_path: "s3://custom/path/to/bucket" + + # run periodic backups with k8s cron jobs + # enableLogicalBackup: true + # logicalBackupSchedule: "30 00 * * *" maintenanceWindows: - 01:00-06:00 #UTC - Sat:00:00-04:00 diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index 17830c41f..bd7d11c6a 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -54,3 +54,7 @@ data: resource_check_interval: 3s resource_check_timeout: 10m resync_period: 5m + + # logical_backup_schedule: "30 00 * * *" + # logical_backup_docker_image: "registry.opensource.zalan.do/acid/logical-backup" + # logical_backup_s3_bucket: "" diff --git a/manifests/minimal-postgres-manifest.yaml b/manifests/minimal-postgres-manifest.yaml index 37d772567..e952df374 100644 --- a/manifests/minimal-postgres-manifest.yaml +++ b/manifests/minimal-postgres-manifest.yaml @@ -17,7 +17,6 @@ spec: # role for application foo foo_user: [] - #databases: name->owner databases: foo: zalando diff --git a/manifests/operator-service-account-rbac.yaml b/manifests/operator-service-account-rbac.yaml index 61467bfae..2057c414f 100644 --- a/manifests/operator-service-account-rbac.yaml +++ b/manifests/operator-service-account-rbac.yaml @@ -142,7 +142,17 @@ rules: - bind resourceNames: - zalando-postgres-operator - +- apiGroups: + - batch + resources: + - cronjobs # enables logical backups + verbs: + - create + - delete + - get + - list + - patch + - update --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/manifests/postgresql-operator-default-configuration.yaml b/manifests/postgresql-operator-default-configuration.yaml index 5ea5ba87c..fa27c6956 100644 --- a/manifests/postgresql-operator-default-configuration.yaml +++ b/manifests/postgresql-operator-default-configuration.yaml @@ -91,4 +91,7 @@ configuration: # scalyr_api_key: "" # scalyr_image: "" # scalyr_server_url: "" - + logical_backup: + logical_backup_schedule: "30 00 * * *" + logical_backup_docker_image: "registry.opensource.zalan.do/acid/logical-backup" + logical_backup_s3_bucket: "" diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index d4ea04e15..c6e87d8ea 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -143,25 +143,26 @@ type ScalyrConfiguration struct { // OperatorConfigurationData defines the operation config type OperatorConfigurationData struct { - EtcdHost string `json:"etcd_host,omitempty"` - DockerImage string `json:"docker_image,omitempty"` - Workers uint32 `json:"workers,omitempty"` - MinInstances int32 `json:"min_instances,omitempty"` - 
MaxInstances int32 `json:"max_instances,omitempty"` - ResyncPeriod Duration `json:"resync_period,omitempty"` - RepairPeriod Duration `json:"repair_period,omitempty"` - Sidecars map[string]string `json:"sidecar_docker_images,omitempty"` - PostgresUsersConfiguration PostgresUsersConfiguration `json:"users"` - Kubernetes KubernetesMetaConfiguration `json:"kubernetes"` - PostgresPodResources PostgresPodResourcesDefaults `json:"postgres_pod_resources"` - SetMemoryRequestToLimit bool `json:"set_memory_request_to_limit,omitempty"` - Timeouts OperatorTimeouts `json:"timeouts"` - LoadBalancer LoadBalancerConfiguration `json:"load_balancer"` - AWSGCP AWSGCPConfiguration `json:"aws_or_gcp"` - OperatorDebug OperatorDebugConfiguration `json:"debug"` - TeamsAPI TeamsAPIConfiguration `json:"teams_api"` - LoggingRESTAPI LoggingRESTAPIConfiguration `json:"logging_rest_api"` - Scalyr ScalyrConfiguration `json:"scalyr"` + EtcdHost string `json:"etcd_host,omitempty"` + DockerImage string `json:"docker_image,omitempty"` + Workers uint32 `json:"workers,omitempty"` + MinInstances int32 `json:"min_instances,omitempty"` + MaxInstances int32 `json:"max_instances,omitempty"` + ResyncPeriod Duration `json:"resync_period,omitempty"` + RepairPeriod Duration `json:"repair_period,omitempty"` + Sidecars map[string]string `json:"sidecar_docker_images,omitempty"` + PostgresUsersConfiguration PostgresUsersConfiguration `json:"users"` + Kubernetes KubernetesMetaConfiguration `json:"kubernetes"` + PostgresPodResources PostgresPodResourcesDefaults `json:"postgres_pod_resources"` + SetMemoryRequestToLimit bool `json:"set_memory_request_to_limit,omitempty"` + Timeouts OperatorTimeouts `json:"timeouts"` + LoadBalancer LoadBalancerConfiguration `json:"load_balancer"` + AWSGCP AWSGCPConfiguration `json:"aws_or_gcp"` + OperatorDebug OperatorDebugConfiguration `json:"debug"` + TeamsAPI TeamsAPIConfiguration `json:"teams_api"` + LoggingRESTAPI LoggingRESTAPIConfiguration `json:"logging_rest_api"` + Scalyr ScalyrConfiguration `json:"scalyr"` + LogicalBackup OperatorLogicalBackupConfiguration `json:"logical_backup"` } // OperatorConfigurationUsers defines configration for super user @@ -174,3 +175,9 @@ type OperatorConfigurationUsers struct { //Duration shortens this frequently used name type Duration time.Duration + +type OperatorLogicalBackupConfiguration struct { + Schedule string `json:"logical_backup_schedule,omitempty"` + DockerImage string `json:"logical_backup_docker_image,omitempty"` + S3Bucket string `json:"logical_backup_s3_bucket,omitempty"` +} diff --git a/pkg/apis/acid.zalan.do/v1/postgresql_type.go b/pkg/apis/acid.zalan.do/v1/postgresql_type.go index 4def8ad07..87a079da9 100644 --- a/pkg/apis/acid.zalan.do/v1/postgresql_type.go +++ b/pkg/apis/acid.zalan.do/v1/postgresql_type.go @@ -3,7 +3,7 @@ package v1 import ( "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -43,17 +43,19 @@ type PostgresSpec struct { // load balancers' source ranges are the same for master and replica services AllowedSourceRanges []string `json:"allowedSourceRanges"` - NumberOfInstances int32 `json:"numberOfInstances"` - Users map[string]UserFlags `json:"users"` - MaintenanceWindows []MaintenanceWindow `json:"maintenanceWindows,omitempty"` - Clone CloneDescription `json:"clone"` - ClusterName string `json:"-"` - Databases map[string]string `json:"databases,omitempty"` - Tolerations []v1.Toleration `json:"tolerations,omitempty"` - Sidecars []Sidecar `json:"sidecars,omitempty"` - InitContainers 
[]v1.Container `json:"init_containers,omitempty"` - PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` - ShmVolume *bool `json:"enableShmVolume,omitempty"` + NumberOfInstances int32 `json:"numberOfInstances"` + Users map[string]UserFlags `json:"users"` + MaintenanceWindows []MaintenanceWindow `json:"maintenanceWindows,omitempty"` + Clone CloneDescription `json:"clone"` + ClusterName string `json:"-"` + Databases map[string]string `json:"databases,omitempty"` + Tolerations []v1.Toleration `json:"tolerations,omitempty"` + Sidecars []Sidecar `json:"sidecars,omitempty"` + InitContainers []v1.Container `json:"init_containers,omitempty"` + PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` + ShmVolume *bool `json:"enableShmVolume,omitempty"` + EnableLogicalBackup bool `json:"enableLogicalBackup,omitempty"` + LogicalBackupSchedule string `json:"logicalBackupSchedule,omitempty"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go index e51b2eaa3..7a27bb794 100644 --- a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go +++ b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go @@ -211,6 +211,7 @@ func (in *OperatorConfigurationData) DeepCopyInto(out *OperatorConfigurationData in.TeamsAPI.DeepCopyInto(&out.TeamsAPI) out.LoggingRESTAPI = in.LoggingRESTAPI out.Scalyr = in.Scalyr + out.LogicalBackup = in.LogicalBackup return } @@ -301,6 +302,22 @@ func (in *OperatorDebugConfiguration) DeepCopy() *OperatorDebugConfiguration { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *OperatorLogicalBackupConfiguration) DeepCopyInto(out *OperatorLogicalBackupConfiguration) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OperatorLogicalBackupConfiguration. +func (in *OperatorLogicalBackupConfiguration) DeepCopy() *OperatorLogicalBackupConfiguration { + if in == nil { + return nil + } + out := new(OperatorLogicalBackupConfiguration) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *OperatorTimeouts) DeepCopyInto(out *OperatorTimeouts) { *out = *in diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 21f039149..9cbc46e70 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -81,6 +81,7 @@ type Cluster struct { currentProcess Process processMu sync.RWMutex // protects the current operation for reporting, no need to hold the master mutex specMu sync.RWMutex // protects the spec for reporting, no need to hold the master mutex + } type compareStatefulsetResult struct { @@ -298,6 +299,13 @@ func (c *Cluster) Create() error { c.logger.Infof("databases have been successfully created") } + if c.Postgresql.Spec.EnableLogicalBackup { + if err := c.createLogicalBackupJob(); err != nil { + return fmt.Errorf("could not create a k8s cron job for logical backups: %v", err) + } + c.logger.Info("a k8s cron job for logical backup has been successfully created") + } + if err := c.listResources(); err != nil { c.logger.Errorf("could not list resources: %v", err) } @@ -481,8 +489,10 @@ func compareResoucesAssumeFirstNotNil(a *v1.ResourceRequirements, b *v1.Resource } -// Update changes Kubernetes objects according to the new specification. 
Unlike the sync case, the missing object. -// (i.e. service) is treated as an error. +// Update changes Kubernetes objects according to the new specification. Unlike the sync case, the missing object +// (i.e. service) is treated as an error +// logical backup cron jobs are an exception: a user-initiated Update can enable a logical backup job +// for a cluster that had no such job before. In this case a missing job is not an error. func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error { updateFailed := false @@ -569,6 +579,43 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error { } }() + // logical backup job + func() { + + // create if it did not exist + if !oldSpec.Spec.EnableLogicalBackup && newSpec.Spec.EnableLogicalBackup { + c.logger.Debugf("creating backup cron job") + if err := c.createLogicalBackupJob(); err != nil { + c.logger.Errorf("could not create a k8s cron job for logical backups: %v", err) + updateFailed = true + return + } + } + + // delete if no longer needed + if oldSpec.Spec.EnableLogicalBackup && !newSpec.Spec.EnableLogicalBackup { + c.logger.Debugf("deleting backup cron job") + if err := c.deleteLogicalBackupJob(); err != nil { + c.logger.Errorf("could not delete a k8s cron job for logical backups: %v", err) + updateFailed = true + return + } + + } + + // apply schedule changes + // this is the only parameter of logical backups a user can overwrite in the cluster manifest + if (oldSpec.Spec.EnableLogicalBackup && newSpec.Spec.EnableLogicalBackup) && + (newSpec.Spec.LogicalBackupSchedule != oldSpec.Spec.LogicalBackupSchedule) { + c.logger.Debugf("updating schedule of the backup cron job") + if err := c.syncLogicalBackupJob(); err != nil { + c.logger.Errorf("could not sync logical backup jobs: %v", err) + updateFailed = true + } + } + + }() + // Roles and Databases if !(c.databaseAccessDisabled() || c.getNumberOfInstances(&c.Spec) <= 0) { c.logger.Debugf("syncing roles") @@ -597,6 +644,12 @@ func (c *Cluster) Delete() { c.mu.Lock() defer c.mu.Unlock() + // delete the backup job before the stateful set of the cluster to prevent connections to non-existing pods + // deleting the cron job also removes pods and batch jobs it created + if err := c.deleteLogicalBackupJob(); err != nil { + c.logger.Warningf("could not remove the logical backup k8s cron job; %v", err) + } + if err := c.deleteStatefulSet(); err != nil { c.logger.Warningf("could not delete statefulset: %v", err) } @@ -629,6 +682,7 @@ func (c *Cluster) Delete() { if err := c.deletePatroniClusterObjects(); err != nil { c.logger.Warningf("could not remove leftover patroni objects; %v", err) } + } //NeedsRepair returns true if the cluster should be included in the repair scan (based on its in-memory status). 
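The anonymous function added to `Update()` above reacts to exactly three transitions. A compact, illustrative restatement of that decision logic as a pure function follows; the names here are hypothetical, and the real code instead calls `createLogicalBackupJob`, `deleteLogicalBackupJob` and `syncLogicalBackupJob` on the cluster.

```go
package main

import "fmt"

type backupAction string

const (
	backupCreate backupAction = "create cron job"
	backupDelete backupAction = "delete cron job"
	backupSync   backupAction = "sync cron job (schedule changed)"
	backupNoop   backupAction = "nothing to do"
)

// decideBackupAction distills the Update() logic above: the only transitions
// the operator reacts to are enabling, disabling, and a schedule change while
// the feature stays enabled.
func decideBackupAction(oldEnabled, newEnabled bool, oldSchedule, newSchedule string) backupAction {
	switch {
	case !oldEnabled && newEnabled:
		return backupCreate
	case oldEnabled && !newEnabled:
		return backupDelete
	case oldEnabled && newEnabled && oldSchedule != newSchedule:
		return backupSync
	default:
		return backupNoop
	}
}

func main() {
	fmt.Println(decideBackupAction(false, true, "", "30 00 * * *"))            // create cron job
	fmt.Println(decideBackupAction(true, true, "30 00 * * *", "0 6 * * *"))    // sync cron job (schedule changed)
	fmt.Println(decideBackupAction(true, false, "30 00 * * *", "30 00 * * *")) // delete cron job
}
```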
diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index dbc884657..4953fcfe9 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -20,6 +20,8 @@ import ( "github.com/zalando/postgres-operator/pkg/util" "github.com/zalando/postgres-operator/pkg/util/config" "github.com/zalando/postgres-operator/pkg/util/constants" + batchv1 "k8s.io/api/batch/v1" + batchv1beta1 "k8s.io/api/batch/v1beta1" "k8s.io/apimachinery/pkg/labels" ) @@ -352,7 +354,7 @@ func generateVolumeMounts() []v1.VolumeMount { } } -func generateSpiloContainer( +func generateContainer( name string, dockerImage *string, resourceRequirements *v1.ResourceRequirements, @@ -792,7 +794,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State // generate the spilo container c.logger.Debugf("Generating Spilo container, environment variables: %v", spiloEnvVars) - spiloContainer := generateSpiloContainer(c.containerName(), + spiloContainer := generateContainer(c.containerName(), &effectiveDockerImage, resourceRequirements, spiloEnvVars, @@ -1281,3 +1283,167 @@ func (c *Cluster) getClusterServiceConnectionParameters(clusterName string) (hos port = "5432" return } + +func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) { + + var ( + err error + podTemplate *v1.PodTemplateSpec + resourceRequirements *v1.ResourceRequirements + ) + + // NB: a cron job creates standard batch jobs according to schedule; these batch jobs manage pods and clean-up + + c.logger.Debug("Generating logical backup pod template") + + // allocate for the backup pod the same amount of resources as for normal DB pods + defaultResources := c.makeDefaultResources() + resourceRequirements, err = generateResourceRequirements(c.Spec.Resources, defaultResources) + if err != nil { + return nil, fmt.Errorf("could not generate resource requirements for logical backup pods: %v", err) + } + + envVars := c.generateLogicalBackupPodEnvVars() + logicalBackupContainer := generateContainer( + "logical-backup", + &c.OpConfig.LogicalBackup.LogicalBackupDockerImage, + resourceRequirements, + envVars, + []v1.VolumeMount{}, + c.OpConfig.SpiloPrivileged, // use same value as for normal DB pods + ) + + labels := map[string]string{ + "version": c.Name, + "application": "spilo-logical-backup", + } + podAffinityTerm := v1.PodAffinityTerm{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: labels, + }, + TopologyKey: "kubernetes.io/hostname", + } + podAffinity := v1.Affinity{ + PodAffinity: &v1.PodAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.WeightedPodAffinityTerm{{ + Weight: 1, + PodAffinityTerm: podAffinityTerm, + }, + }, + }} + + // re-use the method that generates DB pod templates + if podTemplate, err = generatePodTemplate( + c.Namespace, + c.labelsSet(true), + logicalBackupContainer, + []v1.Container{}, + []v1.Container{}, + &[]v1.Toleration{}, + nodeAffinity(c.OpConfig.NodeReadinessLabel), + int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), + c.OpConfig.PodServiceAccountName, + c.OpConfig.KubeIAMRole, + "", + false, + false, + ""); err != nil { + return nil, fmt.Errorf("could not generate pod template for logical backup pod: %v", err) + } + + // overwrite specifc params of logical backups pods + podTemplate.Spec.Affinity = &podAffinity + podTemplate.Spec.RestartPolicy = "Never" // affects containers within a pod + + // configure a batch job + + jobSpec := batchv1.JobSpec{ + Template: *podTemplate, + } + + // configure a cron job + + jobTemplateSpec := batchv1beta1.JobTemplateSpec{ + Spec: 
jobSpec, + } + + schedule := c.Postgresql.Spec.LogicalBackupSchedule + if schedule == "" { + schedule = c.OpConfig.LogicalBackupSchedule + } + + cronJob := &batchv1beta1.CronJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: c.getLogicalBackupJobName(), + Namespace: c.Namespace, + Labels: c.labelsSet(true), + }, + Spec: batchv1beta1.CronJobSpec{ + Schedule: schedule, + JobTemplate: jobTemplateSpec, + ConcurrencyPolicy: batchv1beta1.ForbidConcurrent, + }, + } + + return cronJob, nil +} + +func (c *Cluster) generateLogicalBackupPodEnvVars() []v1.EnvVar { + + envVars := []v1.EnvVar{ + { + Name: "SCOPE", + Value: c.Name, + }, + // Bucket env vars + { + Name: "LOGICAL_BACKUP_S3_BUCKET", + Value: c.OpConfig.LogicalBackup.LogicalBackupS3Bucket, + }, + { + Name: "LOGICAL_BACKUP_S3_BUCKET_SCOPE_SUFFIX", + Value: getBucketScopeSuffix(string(c.Postgresql.GetUID())), + }, + // Postgres env vars + { + Name: "PG_VERSION", + Value: c.Spec.PgVersion, + }, + { + Name: "PGPORT", + Value: "5432", + }, + { + Name: "PGUSER", + Value: c.OpConfig.SuperUsername, + }, + { + Name: "PGDATABASE", + Value: c.OpConfig.SuperUsername, + }, + { + Name: "PGSSLMODE", + Value: "require", + }, + { + Name: "PGPASSWORD", + ValueFrom: &v1.EnvVarSource{ + SecretKeyRef: &v1.SecretKeySelector{ + LocalObjectReference: v1.LocalObjectReference{ + Name: c.credentialSecretName(c.OpConfig.SuperUsername), + }, + Key: "password", + }, + }, + }, + } + + c.logger.Debugf("Generated logical backup env vars %v", envVars) + + return envVars +} + +// getLogicalBackupJobName returns the name; the job itself may not exists +func (c *Cluster) getLogicalBackupJobName() (jobName string) { + return "logical-backup-" + c.clusterName().Name +} diff --git a/pkg/cluster/resources.go b/pkg/cluster/resources.go index 18c295804..e8674283a 100644 --- a/pkg/cluster/resources.go +++ b/pkg/cluster/resources.go @@ -6,7 +6,8 @@ import ( "strings" "k8s.io/api/apps/v1beta1" - "k8s.io/api/core/v1" + batchv1beta1 "k8s.io/api/batch/v1beta1" + v1 "k8s.io/api/core/v1" policybeta1 "k8s.io/api/policy/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -609,6 +610,51 @@ func (c *Cluster) createRoles() (err error) { return c.syncRoles() } +func (c *Cluster) createLogicalBackupJob() (err error) { + + c.setProcessName("creating a k8s cron job for logical backups") + + logicalBackupJobSpec, err := c.generateLogicalBackupJob() + if err != nil { + return fmt.Errorf("could not generate k8s cron job spec: %v", err) + } + c.logger.Debugf("Generated cronJobSpec: %v", logicalBackupJobSpec) + + _, err = c.KubeClient.CronJobsGetter.CronJobs(c.Namespace).Create(logicalBackupJobSpec) + if err != nil { + return fmt.Errorf("could not create k8s cron job: %v", err) + } + + return nil +} + +func (c *Cluster) patchLogicalBackupJob(newJob *batchv1beta1.CronJob) error { + c.setProcessName("patching logical backup job") + + patchData, err := specPatch(newJob.Spec) + if err != nil { + return fmt.Errorf("could not form patch for the logical backup job: %v", err) + } + + // update the backup job spec + _, err = c.KubeClient.CronJobsGetter.CronJobs(c.Namespace).Patch( + c.getLogicalBackupJobName(), + types.MergePatchType, + patchData, "") + if err != nil { + return fmt.Errorf("could not patch logical backup job: %v", err) + } + + return nil +} + +func (c *Cluster) deleteLogicalBackupJob() error { + + c.logger.Info("removing the logical backup job") + + return c.KubeClient.CronJobsGetter.CronJobs(c.Namespace).Delete(c.getLogicalBackupJobName(), c.deleteOptions) +} + // 
GetServiceMaster returns cluster's kubernetes master Service func (c *Cluster) GetServiceMaster() *v1.Service { return c.Services[Master] diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index 9ba2ee40a..f5ae30b81 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -3,7 +3,8 @@ package cluster import ( "fmt" - "k8s.io/api/core/v1" + batchv1beta1 "k8s.io/api/batch/v1beta1" + v1 "k8s.io/api/core/v1" policybeta1 "k8s.io/api/policy/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -92,6 +93,16 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error { return err } + // create a logical backup job unless we are running without pods or disable that feature explicitly + if c.Spec.EnableLogicalBackup && c.getNumberOfInstances(&c.Spec) > 0 { + + c.logger.Debug("syncing logical backup job") + if err = c.syncLogicalBackupJob(); err != nil { + err = fmt.Errorf("could not sync the logical backup job: %v", err) + return err + } + } + return err } @@ -519,3 +530,56 @@ func (c *Cluster) syncDatabases() error { return nil } + +func (c *Cluster) syncLogicalBackupJob() error { + var ( + job *batchv1beta1.CronJob + desiredJob *batchv1beta1.CronJob + err error + ) + c.setProcessName("syncing the logical backup job") + + // sync the job if it exists + + jobName := c.getLogicalBackupJobName() + if job, err = c.KubeClient.CronJobsGetter.CronJobs(c.Namespace).Get(jobName, metav1.GetOptions{}); err == nil { + + desiredJob, err = c.generateLogicalBackupJob() + if err != nil { + return fmt.Errorf("could not generate the desired logical backup job state: %v", err) + } + if match, reason := k8sutil.SameLogicalBackupJob(job, desiredJob); !match { + c.logger.Infof("logical job %q is not in the desired state and needs to be updated", + c.getLogicalBackupJobName(), + ) + if reason != "" { + c.logger.Infof("reason: %s", reason) + } + if err = c.patchLogicalBackupJob(desiredJob); err != nil { + return fmt.Errorf("could not update logical backup job to match desired state: %v", err) + } + c.logger.Info("the logical backup job is synced") + } + return nil + } + if !k8sutil.ResourceNotFound(err) { + return fmt.Errorf("could not get logical backp job: %v", err) + } + + // no existing logical backup job, create new one + c.logger.Info("could not find the cluster's logical backup job") + + if err = c.createLogicalBackupJob(); err == nil { + c.logger.Infof("created missing logical backup job %q", jobName) + } else { + if !k8sutil.ResourceAlreadyExists(err) { + return fmt.Errorf("could not create missing logical backup job: %v", err) + } + c.logger.Infof("logical backup job %q already exists", jobName) + if _, err = c.KubeClient.CronJobsGetter.CronJobs(c.Namespace).Get(jobName, metav1.GetOptions{}); err != nil { + return fmt.Errorf("could not fetch existing logical backup job: %v", err) + } + } + + return nil +} diff --git a/pkg/cluster/util.go b/pkg/cluster/util.go index 93268cb93..5b531cc90 100644 --- a/pkg/cluster/util.go +++ b/pkg/cluster/util.go @@ -12,7 +12,7 @@ import ( "time" "k8s.io/api/apps/v1beta1" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" policybeta1 "k8s.io/api/policy/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index bb4f89918..245754e1c 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -106,5 +106,9 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.ScalyrCPULimit = 
fromCRD.Scalyr.ScalyrCPULimit result.ScalyrMemoryLimit = fromCRD.Scalyr.ScalyrMemoryLimit + result.LogicalBackupSchedule = fromCRD.LogicalBackup.Schedule + result.LogicalBackupDockerImage = fromCRD.LogicalBackup.DockerImage + result.LogicalBackupS3Bucket = fromCRD.LogicalBackup.S3Bucket + return result } diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index b2374e042..0cd662a6e 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -66,12 +66,20 @@ type Scalyr struct { ScalyrMemoryLimit string `name:"scalyr_memory_limit" default:"1Gi"` } +// LogicalBackup +type LogicalBackup struct { + LogicalBackupSchedule string `name:"logical_backup_schedule" default:"30 00 * * *"` + LogicalBackupDockerImage string `name:"logical_backup_docker_image" default:"registry.opensource.zalan.do/acid/logical-backup"` + LogicalBackupS3Bucket string `name:"logical_backup_s3_bucket" default:""` +} + // Config describes operator config type Config struct { CRD Resources Auth Scalyr + LogicalBackup WatchedNamespace string `name:"watched_namespace"` // special values: "*" means 'watch all namespaces', the empty string "" means 'watch a namespace where operator is deployed to' EtcdHost string `name:"etcd_host" default:""` // special values: the empty string "" means Patroni will use k8s as a DCS diff --git a/pkg/util/k8sutil/k8sutil.go b/pkg/util/k8sutil/k8sutil.go index dad6a7b8e..bd10256e0 100644 --- a/pkg/util/k8sutil/k8sutil.go +++ b/pkg/util/k8sutil/k8sutil.go @@ -6,8 +6,11 @@ import ( b64 "encoding/base64" + batchv1beta1 "k8s.io/api/batch/v1beta1" + clientbatchv1beta1 "k8s.io/client-go/kubernetes/typed/batch/v1beta1" + "github.com/zalando/postgres-operator/pkg/util/constants" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" policybeta1 "k8s.io/api/policy/v1beta1" apiextclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" apiextbeta1 "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/typed/apiextensions/v1beta1" @@ -40,6 +43,7 @@ type KubernetesClient struct { rbacv1beta1.RoleBindingsGetter policyv1beta1.PodDisruptionBudgetsGetter apiextbeta1.CustomResourceDefinitionsGetter + clientbatchv1beta1.CronJobsGetter RESTClient rest.Interface AcidV1ClientSet *acidv1client.Clientset @@ -101,6 +105,7 @@ func NewFromConfig(cfg *rest.Config) (KubernetesClient, error) { kubeClient.PodDisruptionBudgetsGetter = client.PolicyV1beta1() kubeClient.RESTClient = client.CoreV1().RESTClient() kubeClient.RoleBindingsGetter = client.RbacV1beta1() + kubeClient.CronJobsGetter = client.BatchV1beta1() apiextClient, err := apiextclient.NewForConfig(cfg) if err != nil { @@ -159,6 +164,28 @@ func SamePDB(cur, new *policybeta1.PodDisruptionBudget) (match bool, reason stri return } +func getJobImage(cronJob *batchv1beta1.CronJob) string { + return cronJob.Spec.JobTemplate.Spec.Template.Spec.Containers[0].Image +} + +// SameLogicalBackupJob compares Specs of logical backup cron jobs +func SameLogicalBackupJob(cur, new *batchv1beta1.CronJob) (match bool, reason string) { + + if cur.Spec.Schedule != new.Spec.Schedule { + return false, fmt.Sprintf("new job's schedule %q doesn't match the current one %q", + new.Spec.Schedule, cur.Spec.Schedule) + } + + newImage := getJobImage(new) + curImage := getJobImage(cur) + if newImage != curImage { + return false, fmt.Sprintf("new job's image %q doesn't match the current one %q", + newImage, curImage) + } + + return true, "" +} + func (c *mockSecret) Get(name string, options metav1.GetOptions) (*v1.Secret, error) { if name != 
"infrastructureroles-test" { return nil, fmt.Errorf("NotFound") diff --git a/run_operator_locally.sh b/run_operator_locally.sh index 2594097b2..ee0768354 100755 --- a/run_operator_locally.sh +++ b/run_operator_locally.sh @@ -82,15 +82,15 @@ function clean_up(){ function start_minikube(){ - echo "==== START MINIKUBE ==== " + echo "==== START MINIKUBE ====" echo "May take a few minutes ..." minikube start kubectl config set-context minikube - echo "==== MINIKUBE STATUS ==== " + echo "==== MINIKUBE STATUS ====" minikube status - + echo "" } @@ -133,7 +133,7 @@ function deploy_self_built_image() { function start_operator(){ - echo "==== START OPERATOR ==== " + echo "==== START OPERATOR ====" echo "Certain operations may be retried multiple times..." # the order of resource initialization is significant