diff --git a/pkg/cluster/majorversionupgrade.go b/pkg/cluster/majorversionupgrade.go index b80cbaa09..6de339062 100644 --- a/pkg/cluster/majorversionupgrade.go +++ b/pkg/cluster/majorversionupgrade.go @@ -243,8 +243,22 @@ func (c *Cluster) majorVersionUpgrade() error { if err = c.criticalOperationLabel(pods, nil); err != nil { return fmt.Errorf("failed to remove critical-operation label: %s", err) } + // Delete the critical-op PDB after the critical operation is complete + if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil { + c.logger.Warningf("failed to delete critical-op PDB: %s", err) + } return nil }() + + // Create the critical-op PDB before starting the critical operation. + // This ensures protection is in place immediately, rather than waiting + // for a sync cycle. The sync function also creates the PDB if it detects + // pods with the critical-operation label, serving as a safety net for + // edge cases like operator restarts during critical operations. + if err = c.createCriticalOpPodDisruptionBudget(); err != nil { + c.logger.Warningf("failed to create critical-op PDB: %s", err) + } + val := "true" if err = c.criticalOperationLabel(pods, &val); err != nil { return fmt.Errorf("failed to assign critical-operation label: %s", err) diff --git a/pkg/cluster/resources.go b/pkg/cluster/resources.go index ed3eb3d75..0cee6960a 100644 --- a/pkg/cluster/resources.go +++ b/pkg/cluster/resources.go @@ -465,20 +465,13 @@ func (c *Cluster) createCriticalOpPodDisruptionBudget() error { } func (c *Cluster) createPodDisruptionBudgets() error { - errors := make([]string, 0) - + // Only create the primary PDB during cluster creation. + // The critical-op PDB is created on-demand during critical operations + // (e.g., major version upgrades) to avoid false alerts when no pods + // have the critical-operation label. err := c.createPrimaryPodDisruptionBudget() if err != nil { - errors = append(errors, fmt.Sprintf("could not create primary pod disruption budget: %v", err)) - } - - err = c.createCriticalOpPodDisruptionBudget() - if err != nil { - errors = append(errors, fmt.Sprintf("could not create pod disruption budget for critical operations: %v", err)) - } - - if len(errors) > 0 { - return fmt.Errorf("%v", strings.Join(errors, `', '`)) + return fmt.Errorf("could not create primary pod disruption budget: %v", err) } return nil } diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index ecf692702..5cb3ef555 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -499,7 +499,34 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error { pdb *policyv1.PodDisruptionBudget err error ) - if pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}); err == nil { + + // Check if any pods have the critical-operation label + hasCriticalOpPods, err := c.hasCriticalOperationPods() + if err != nil { + return fmt.Errorf("could not check for critical operation pods: %v", err) + } + + pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}) + pdbExists := err == nil + + if !pdbExists && !k8sutil.ResourceNotFound(err) { + return fmt.Errorf("could not get pod disruption budget: %v", err) + } + + // If no pods have the critical-operation label, delete the PDB if it exists + if !hasCriticalOpPods { + if pdbExists { + c.CriticalOpPodDisruptionBudget = pdb + c.logger.Infof("no pods with critical-operation label, deleting critical-op PDB") + if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil { + return fmt.Errorf("could not delete pod disruption budget for critical operations: %v", err) + } + } + return nil + } + + // Pods have critical-operation label, ensure PDB exists + if pdbExists { c.CriticalOpPodDisruptionBudget = pdb newPDB := c.generateCriticalOpPodDisruptionBudget() match, reason := c.comparePodDisruptionBudget(pdb, newPDB) @@ -512,13 +539,10 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error { c.CriticalOpPodDisruptionBudget = pdb } return nil + } - } - if !k8sutil.ResourceNotFound(err) { - return fmt.Errorf("could not get pod disruption budget: %v", err) - } // no existing pod disruption budget, create new one - c.logger.Infof("could not find pod disruption budget for critical operations") + c.logger.Infof("pods with critical-operation label found, creating critical-op PDB") if err = c.createCriticalOpPodDisruptionBudget(); err != nil { if !k8sutil.ResourceAlreadyExists(err) { @@ -533,6 +557,21 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error { return nil } +// hasCriticalOperationPods checks if any pods in the cluster have the critical-operation=true label +func (c *Cluster) hasCriticalOperationPods() (bool, error) { + pods, err := c.listPods() + if err != nil { + return false, err + } + + for _, pod := range pods { + if val, ok := pod.Labels["critical-operation"]; ok && val == "true" { + return true, nil + } + } + return false, nil +} + func (c *Cluster) syncPodDisruptionBudgets(isUpdate bool) error { errors := make([]string, 0) diff --git a/pkg/cluster/sync_test.go b/pkg/cluster/sync_test.go index 87e9dc8a5..90151ec4d 100644 --- a/pkg/cluster/sync_test.go +++ b/pkg/cluster/sync_test.go @@ -1018,3 +1018,134 @@ func TestUpdateSecretNameConflict(t *testing.T) { expectedError := fmt.Sprintf("syncing secret %s failed: could not update secret because of user name mismatch", "default/prepared-owner-user.acid-test-cluster.credentials") assert.Contains(t, err.Error(), expectedError) } + +func TestSyncCriticalOpPodDisruptionBudget(t *testing.T) { + testName := "test syncing critical-op PDB on-demand" + clusterName := "acid-test-cluster-pdb" + namespace := "default" + + clientSet := fake.NewSimpleClientset() + acidClientSet := fakeacidv1.NewSimpleClientset() + + client := k8sutil.KubernetesClient{ + PodDisruptionBudgetsGetter: clientSet.PolicyV1(), + PodsGetter: clientSet.CoreV1(), + PostgresqlsGetter: acidClientSet.AcidV1(), + StatefulSetsGetter: clientSet.AppsV1(), + } + + pg := acidv1.Postgresql{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: namespace, + }, + Spec: acidv1.PostgresSpec{ + NumberOfInstances: 3, + Volume: acidv1.Volume{ + Size: "1Gi", + }, + }, + } + + cluster := New( + Config{ + OpConfig: config.Config{ + PDBNameFormat: "postgres-{cluster}-pdb", + EnablePodDisruptionBudget: util.True(), + Resources: config.Resources{ + ClusterLabels: map[string]string{"application": "spilo"}, + ClusterNameLabel: "cluster-name", + PodRoleLabel: "spilo-role", + ResourceCheckInterval: time.Duration(3), + ResourceCheckTimeout: time.Duration(10), + }, + }, + }, client, pg, logger, eventRecorder) + + cluster.Name = clusterName + cluster.Namespace = namespace + + // Create pods without critical-operation label + for i := 0; i < 3; i++ { + pod := v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-%d", clusterName, i), + Namespace: namespace, + Labels: map[string]string{ + "application": "spilo", + "cluster-name": clusterName, + }, + }, + } + _, err := cluster.KubeClient.Pods(namespace).Create(context.TODO(), &pod, metav1.CreateOptions{}) + assert.NoError(t, err) + } + + // Test 1: Sync with no critical-operation labels - PDB should NOT be created + err := cluster.syncCriticalOpPodDisruptionBudget(false) + assert.NoError(t, err) + + _, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}) + assert.Error(t, err, "%s: critical-op PDB should not exist when no pods have critical-operation label", testName) + + // Test 2: Add critical-operation label to pods - PDB should be created + for i := 0; i < 3; i++ { + pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{}) + assert.NoError(t, err) + pod.Labels["critical-operation"] = "true" + _, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}) + assert.NoError(t, err) + } + + err = cluster.syncCriticalOpPodDisruptionBudget(false) + assert.NoError(t, err) + + pdb, err := cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}) + assert.NoError(t, err, "%s: critical-op PDB should exist when pods have critical-operation label", testName) + assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal, "%s: minAvailable should be 3", testName) + + // Test 3: Remove critical-operation label from pods - PDB should be deleted + for i := 0; i < 3; i++ { + pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{}) + assert.NoError(t, err) + delete(pod.Labels, "critical-operation") + _, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}) + assert.NoError(t, err) + } + + err = cluster.syncCriticalOpPodDisruptionBudget(false) + assert.NoError(t, err) + + _, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}) + assert.Error(t, err, "%s: critical-op PDB should be deleted when no pods have critical-operation label", testName) + + // Test 4: Try to delete again when PDB is already deleted - should handle gracefully + err = cluster.syncCriticalOpPodDisruptionBudget(false) + assert.NoError(t, err, "%s: syncing when PDB already deleted should not error", testName) + + // Test 5: Try to create when PDB already exists - should handle gracefully + // First, add labels back to pods + for i := 0; i < 3; i++ { + pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{}) + assert.NoError(t, err) + pod.Labels["critical-operation"] = "true" + _, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{}) + assert.NoError(t, err) + } + + // Create the PDB + err = cluster.syncCriticalOpPodDisruptionBudget(false) + assert.NoError(t, err) + _, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}) + assert.NoError(t, err, "%s: PDB should exist after sync with labeled pods", testName) + + // Now sync again - should handle "already exists" gracefully + cluster.CriticalOpPodDisruptionBudget = nil // Simulate operator restart (in-memory state lost) + err = cluster.syncCriticalOpPodDisruptionBudget(false) + assert.NoError(t, err, "%s: syncing when PDB already exists should not error", testName) + + // Verify PDB still exists and is properly tracked + pdb, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}) + assert.NoError(t, err) + assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal) +}