From fbfae0e3df20141cf08491c6c1b04501bc0566e3 Mon Sep 17 00:00:00 2001 From: Jairo Llopis Date: Wed, 6 May 2026 10:45:40 +0100 Subject: [PATCH] feat(logical-backup): add configurable job history limits and TTL Adds three new configuration options for logical backup cronjobs: - logical_backup_successful_jobs_history_limit (default: 3) - logical_backup_failed_jobs_history_limit (default: 3) - logical_backup_ttl_seconds_after_finished (default: 86400) These options control how many completed/failed backup jobs are retained by Kubernetes and when finished jobs are automatically deleted. This prevents accumulation of old backup jobs and pods in namespaces with many PostgreSQL clusters. Also updates the CronJob comparison logic to detect changes in these new fields and trigger reconciliation when needed. Closes zalando/postgres-operator#1092 --- .../crds/operatorconfigurations.yaml | 12 ++++++++ charts/postgres-operator/values.yaml | 6 ++++ .../v1/operator_configuration_type.go | 3 ++ pkg/cluster/cluster.go | 15 ++++++++++ pkg/cluster/cluster_test.go | 11 +++++++- pkg/cluster/k8sres.go | 28 ++++++++++++++++--- pkg/cluster/k8sres_test.go | 26 +++++++++++++++++ pkg/controller/operator_config.go | 3 ++ pkg/util/config/config.go | 3 ++ 9 files changed, 102 insertions(+), 5 deletions(-) diff --git a/charts/postgres-operator/crds/operatorconfigurations.yaml b/charts/postgres-operator/crds/operatorconfigurations.yaml index 80ef38d25..502684cac 100644 --- a/charts/postgres-operator/crds/operatorconfigurations.yaml +++ b/charts/postgres-operator/crds/operatorconfigurations.yaml @@ -561,6 +561,18 @@ spec: default: "30 00 * * *" logical_backup_cronjob_environment_secret: type: string + logical_backup_failed_jobs_history_limit: + type: integer + minimum: 0 + default: 3 + logical_backup_successful_jobs_history_limit: + type: integer + minimum: 0 + default: 3 + logical_backup_ttl_seconds_after_finished: + type: integer + minimum: 0 + default: 86400 debug: type: object properties: diff --git a/charts/postgres-operator/values.yaml b/charts/postgres-operator/values.yaml index a1f4fa94c..4cddbcebd 100644 --- a/charts/postgres-operator/values.yaml +++ b/charts/postgres-operator/values.yaml @@ -399,6 +399,12 @@ configLogicalBackup: logical_backup_schedule: "30 00 * * *" # secret to be used as reference for env variables in cronjob logical_backup_cronjob_environment_secret: "" + # number of successful backup jobs to keep in cronjob history + logical_backup_successful_jobs_history_limit: 3 + # number of failed backup jobs to keep in cronjob history + logical_backup_failed_jobs_history_limit: 3 + # TTL in seconds after which finished backup jobs are automatically deleted + logical_backup_ttl_seconds_after_finished: 86400 # automate creation of human users with teams API service configTeamsApi: diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index 453d618d3..35ffcbf96 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -244,6 +244,9 @@ type OperatorLogicalBackupConfiguration struct { MemoryRequest string `json:"logical_backup_memory_request,omitempty"` CPULimit string `json:"logical_backup_cpu_limit,omitempty"` MemoryLimit string `json:"logical_backup_memory_limit,omitempty"` + SuccessfulJobsHistoryLimit *int32 `json:"logical_backup_successful_jobs_history_limit,omitempty"` + FailedJobsHistoryLimit *int32 `json:"logical_backup_failed_jobs_history_limit,omitempty"` + TTLSecondsAfterFinished *int32 `json:"logical_backup_ttl_seconds_after_finished,omitempty"` } // PatroniConfiguration defines configuration for Patroni diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 8e0b3c79f..743c26c85 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -896,6 +896,21 @@ func (c *Cluster) compareLogicalBackupJob(cur, new *batchv1.CronJob) *compareLog reasons = append(reasons, fmt.Sprintf("logical backup container specs do not match: %v", strings.Join(contReasons, `', '`))) } + if !reflect.DeepEqual(cur.Spec.SuccessfulJobsHistoryLimit, new.Spec.SuccessfulJobsHistoryLimit) { + match = false + reasons = append(reasons, fmt.Sprintf("new job's successfulJobsHistoryLimit %v does not match the current one %v", new.Spec.SuccessfulJobsHistoryLimit, cur.Spec.SuccessfulJobsHistoryLimit)) + } + + if !reflect.DeepEqual(cur.Spec.FailedJobsHistoryLimit, new.Spec.FailedJobsHistoryLimit) { + match = false + reasons = append(reasons, fmt.Sprintf("new job's failedJobsHistoryLimit %v does not match the current one %v", new.Spec.FailedJobsHistoryLimit, cur.Spec.FailedJobsHistoryLimit)) + } + + if !reflect.DeepEqual(cur.Spec.JobTemplate.Spec.TTLSecondsAfterFinished, new.Spec.JobTemplate.Spec.TTLSecondsAfterFinished) { + match = false + reasons = append(reasons, fmt.Sprintf("new job's TTLSecondsAfterFinished %v does not match the current one %v", new.Spec.JobTemplate.Spec.TTLSecondsAfterFinished, cur.Spec.JobTemplate.Spec.TTLSecondsAfterFinished)) + } + return &compareLogicalBackupJobResult{match: match, reasons: reasons, deletedPodAnnotations: deletedPodAnnotations} } diff --git a/pkg/cluster/cluster_test.go b/pkg/cluster/cluster_test.go index 8046943d4..bfa4c9cd8 100644 --- a/pkg/cluster/cluster_test.go +++ b/pkg/cluster/cluster_test.go @@ -1524,12 +1524,21 @@ func TestCompareServices(t *testing.T) { } } +var ( + defaultSuccessfulJobsHistoryLimit = int32(3) + defaultFailedJobsHistoryLimit = int32(3) + defaultTTLSecondsAfterFinished = int32(86400) +) + func newCronJob(image, schedule string, vars []v1.EnvVar, mounts []v1.VolumeMount) *batchv1.CronJob { cron := &batchv1.CronJob{ Spec: batchv1.CronJobSpec{ - Schedule: schedule, + Schedule: schedule, + SuccessfulJobsHistoryLimit: &defaultSuccessfulJobsHistoryLimit, + FailedJobsHistoryLimit: &defaultFailedJobsHistoryLimit, JobTemplate: batchv1.JobTemplateSpec{ Spec: batchv1.JobSpec{ + TTLSecondsAfterFinished: &defaultTTLSecondsAfterFinished, Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 2eb867f06..0452779c4 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -2379,7 +2379,13 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) { // configure a batch job jobSpec := batchv1.JobSpec{ - Template: *podTemplate, + Template: *podTemplate, + TTLSecondsAfterFinished: c.OpConfig.LogicalBackup.LogicalBackupTTLSecondsAfterFinished, + } + + if jobSpec.TTLSecondsAfterFinished == nil { + defaultTTL := int32(86400) + jobSpec.TTLSecondsAfterFinished = &defaultTTL } // configure a cron job @@ -2393,6 +2399,18 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) { schedule = c.OpConfig.LogicalBackupSchedule } + successfulJobsHistoryLimit := c.OpConfig.LogicalBackup.LogicalBackupSuccessfulJobsHistoryLimit + if successfulJobsHistoryLimit == nil { + defaultLimit := int32(3) + successfulJobsHistoryLimit = &defaultLimit + } + + failedJobsHistoryLimit := c.OpConfig.LogicalBackup.LogicalBackupFailedJobsHistoryLimit + if failedJobsHistoryLimit == nil { + defaultLimit := int32(3) + failedJobsHistoryLimit = &defaultLimit + } + cronJob := &batchv1.CronJob{ ObjectMeta: metav1.ObjectMeta{ Name: c.getLogicalBackupJobName(), @@ -2402,9 +2420,11 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) { OwnerReferences: c.ownerReferences(), }, Spec: batchv1.CronJobSpec{ - Schedule: schedule, - JobTemplate: jobTemplateSpec, - ConcurrencyPolicy: batchv1.ForbidConcurrent, + Schedule: schedule, + JobTemplate: jobTemplateSpec, + ConcurrencyPolicy: batchv1.ForbidConcurrent, + SuccessfulJobsHistoryLimit: successfulJobsHistoryLimit, + FailedJobsHistoryLimit: failedJobsHistoryLimit, }, } diff --git a/pkg/cluster/k8sres_test.go b/pkg/cluster/k8sres_test.go index 62481c7e3..d30a0a183 100644 --- a/pkg/cluster/k8sres_test.go +++ b/pkg/cluster/k8sres_test.go @@ -4040,6 +4040,32 @@ func TestGenerateLogicalBackupJob(t *testing.T) { if !reflect.DeepEqual(tt.expectedResources, clusterResources) { t.Errorf("%s - %s: expected resources %#v, got %#v", t.Name(), tt.subTest, tt.expectedResources, clusterResources) } + + expectedSuccessfulJobsHistoryLimit := int32(3) + if cluster.OpConfig.LogicalBackup.LogicalBackupSuccessfulJobsHistoryLimit != nil { + expectedSuccessfulJobsHistoryLimit = *cluster.OpConfig.LogicalBackup.LogicalBackupSuccessfulJobsHistoryLimit + } + if *cronJob.Spec.SuccessfulJobsHistoryLimit != expectedSuccessfulJobsHistoryLimit { + t.Errorf("%s - %s: expected successfulJobsHistoryLimit %d, got %d", t.Name(), tt.subTest, expectedSuccessfulJobsHistoryLimit, *cronJob.Spec.SuccessfulJobsHistoryLimit) + } + + expectedFailedJobsHistoryLimit := int32(3) + if cluster.OpConfig.LogicalBackup.LogicalBackupFailedJobsHistoryLimit != nil { + expectedFailedJobsHistoryLimit = *cluster.OpConfig.LogicalBackup.LogicalBackupFailedJobsHistoryLimit + } + if *cronJob.Spec.FailedJobsHistoryLimit != expectedFailedJobsHistoryLimit { + t.Errorf("%s - %s: expected failedJobsHistoryLimit %d, got %d", t.Name(), tt.subTest, expectedFailedJobsHistoryLimit, *cronJob.Spec.FailedJobsHistoryLimit) + } + + expectedTTL := int32(86400) + if cluster.OpConfig.LogicalBackup.LogicalBackupTTLSecondsAfterFinished != nil { + expectedTTL = *cluster.OpConfig.LogicalBackup.LogicalBackupTTLSecondsAfterFinished + } + if cronJob.Spec.JobTemplate.Spec.TTLSecondsAfterFinished == nil { + t.Errorf("%s - %s: expected TTLSecondsAfterFinished to be set", t.Name(), tt.subTest) + } else if *cronJob.Spec.JobTemplate.Spec.TTLSecondsAfterFinished != expectedTTL { + t.Errorf("%s - %s: expected TTLSecondsAfterFinished %d, got %d", t.Name(), tt.subTest, expectedTTL, *cronJob.Spec.JobTemplate.Spec.TTLSecondsAfterFinished) + } } } diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index 4df8a8bd2..6feda6dd4 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -212,6 +212,9 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.LogicalBackupMemoryRequest = fromCRD.LogicalBackup.MemoryRequest result.LogicalBackupCPULimit = fromCRD.LogicalBackup.CPULimit result.LogicalBackupMemoryLimit = fromCRD.LogicalBackup.MemoryLimit + result.LogicalBackupSuccessfulJobsHistoryLimit = util.CoalesceInt32(fromCRD.LogicalBackup.SuccessfulJobsHistoryLimit, k8sutil.Int32ToPointer(3)) + result.LogicalBackupFailedJobsHistoryLimit = util.CoalesceInt32(fromCRD.LogicalBackup.FailedJobsHistoryLimit, k8sutil.Int32ToPointer(3)) + result.LogicalBackupTTLSecondsAfterFinished = fromCRD.LogicalBackup.TTLSecondsAfterFinished // debug config result.DebugLogging = fromCRD.OperatorDebug.DebugLogging diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 796594a89..c3ec15b28 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -148,6 +148,9 @@ type LogicalBackup struct { LogicalBackupMemoryRequest string `name:"logical_backup_memory_request"` LogicalBackupCPULimit string `name:"logical_backup_cpu_limit"` LogicalBackupMemoryLimit string `name:"logical_backup_memory_limit"` + LogicalBackupSuccessfulJobsHistoryLimit *int32 `name:"logical_backup_successful_jobs_history_limit" default:"3"` + LogicalBackupFailedJobsHistoryLimit *int32 `name:"logical_backup_failed_jobs_history_limit" default:"3"` + LogicalBackupTTLSecondsAfterFinished *int32 `name:"logical_backup_ttl_seconds_after_finished" default:"86400"` } // Operator options for connection pooler