Alec Thomas 2026-01-27 10:47:15 +01:00 committed by GitHub
commit 42e17e63b0
4 changed files with 199 additions and 21 deletions

View File

@@ -243,8 +243,22 @@ func (c *Cluster) majorVersionUpgrade() error {
 		if err = c.criticalOperationLabel(pods, nil); err != nil {
 			return fmt.Errorf("failed to remove critical-operation label: %s", err)
 		}
+		// Delete the critical-op PDB after the critical operation is complete
+		if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil {
+			c.logger.Warningf("failed to delete critical-op PDB: %s", err)
+		}
 		return nil
 	}()
 
+	// Create the critical-op PDB before starting the critical operation.
+	// This ensures protection is in place immediately, rather than waiting
+	// for a sync cycle. The sync function also creates the PDB if it detects
+	// pods with the critical-operation label, serving as a safety net for
+	// edge cases like operator restarts during critical operations.
+	if err = c.createCriticalOpPodDisruptionBudget(); err != nil {
+		c.logger.Warningf("failed to create critical-op PDB: %s", err)
+	}
+
 	val := "true"
 	if err = c.criticalOperationLabel(pods, &val); err != nil {
 		return fmt.Errorf("failed to assign critical-operation label: %s", err)
View File

@@ -465,20 +465,13 @@ func (c *Cluster) createCriticalOpPodDisruptionBudget() error {
 }
 
 func (c *Cluster) createPodDisruptionBudgets() error {
-	errors := make([]string, 0)
-
+	// Only create the primary PDB during cluster creation.
+	// The critical-op PDB is created on-demand during critical operations
+	// (e.g., major version upgrades) to avoid false alerts when no pods
+	// have the critical-operation label.
 	err := c.createPrimaryPodDisruptionBudget()
 	if err != nil {
-		errors = append(errors, fmt.Sprintf("could not create primary pod disruption budget: %v", err))
-	}
-
-	err = c.createCriticalOpPodDisruptionBudget()
-	if err != nil {
-		errors = append(errors, fmt.Sprintf("could not create pod disruption budget for critical operations: %v", err))
-	}
-
-	if len(errors) > 0 {
-		return fmt.Errorf("%v", strings.Join(errors, `', '`))
+		return fmt.Errorf("could not create primary pod disruption budget: %v", err)
 	}
 	return nil
 }
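For reference, the object created on demand looks roughly like the following. This is a minimal sketch, assuming the selector matches the cluster-name label plus critical-operation=true and that minAvailable equals the instance count (consistent with the test below asserting a minAvailable of 3); the operator's actual generator may add further labels, annotations, and owner references.

package cluster

import (
	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

// criticalOpPDB sketches the on-demand PDB. Field values are assumptions,
// not the operator's exact generator output.
func criticalOpPDB(name, namespace, clusterName string, instances int) *policyv1.PodDisruptionBudget {
	// minAvailable == number of instances means no labeled pod may be
	// voluntarily evicted while the critical operation is running.
	minAvailable := intstr.FromInt(instances)
	return &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
		Spec: policyv1.PodDisruptionBudgetSpec{
			MinAvailable: &minAvailable,
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{
					"cluster-name":       clusterName,
					"critical-operation": "true",
				},
			},
		},
	}
}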

View File

@@ -499,7 +499,34 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
 		pdb *policyv1.PodDisruptionBudget
 		err error
 	)
-	if pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}); err == nil {
+	// Check if any pods have the critical-operation label
+	hasCriticalOpPods, err := c.hasCriticalOperationPods()
+	if err != nil {
+		return fmt.Errorf("could not check for critical operation pods: %v", err)
+	}
+
+	pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	pdbExists := err == nil
+	if !pdbExists && !k8sutil.ResourceNotFound(err) {
+		return fmt.Errorf("could not get pod disruption budget: %v", err)
+	}
+
+	// If no pods have the critical-operation label, delete the PDB if it exists
+	if !hasCriticalOpPods {
+		if pdbExists {
+			c.CriticalOpPodDisruptionBudget = pdb
+			c.logger.Infof("no pods with critical-operation label, deleting critical-op PDB")
+			if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil {
+				return fmt.Errorf("could not delete pod disruption budget for critical operations: %v", err)
+			}
+		}
+		return nil
+	}
+
+	// Pods have critical-operation label, ensure PDB exists
+	if pdbExists {
 		c.CriticalOpPodDisruptionBudget = pdb
 		newPDB := c.generateCriticalOpPodDisruptionBudget()
 		match, reason := c.comparePodDisruptionBudget(pdb, newPDB)
@@ -512,27 +512,40 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
 			c.CriticalOpPodDisruptionBudget = pdb
 		}
 		return nil
 	}
 
-	if !k8sutil.ResourceNotFound(err) {
-		return fmt.Errorf("could not get pod disruption budget: %v", err)
-	}
-
-	// no existing pod disruption budget, create new one
-	c.logger.Infof("could not find pod disruption budget for critical operations")
+	c.logger.Infof("pods with critical-operation label found, creating critical-op PDB")
 	if err = c.createCriticalOpPodDisruptionBudget(); err != nil {
 		if !k8sutil.ResourceAlreadyExists(err) {
 			return fmt.Errorf("could not create pod disruption budget for critical operations: %v", err)
 		}
-		c.logger.Infof("pod disruption budget %q already exists", util.NameFromMeta(pdb.ObjectMeta))
-		if pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}); err != nil {
-			return fmt.Errorf("could not fetch existing %q pod disruption budget", util.NameFromMeta(pdb.ObjectMeta))
+		pdbName := c.criticalOpPodDisruptionBudgetName()
+		c.logger.Infof("pod disruption budget %q already exists", pdbName)
+		if pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), pdbName, metav1.GetOptions{}); err != nil {
+			return fmt.Errorf("could not fetch existing %q pod disruption budget", pdbName)
		}
 	}
 	return nil
 }
+
+// hasCriticalOperationPods checks if any pods in the cluster have the critical-operation=true label
+func (c *Cluster) hasCriticalOperationPods() (bool, error) {
+	pods, err := c.listPods()
+	if err != nil {
+		return false, err
+	}
+	for _, pod := range pods {
+		if val, ok := pod.Labels["critical-operation"]; ok && val == "true" {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
 func (c *Cluster) syncPodDisruptionBudgets(isUpdate bool) error {
 	errors := make([]string, 0)
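
The reworked sync logic reduces to four cases; a compact restatement of what the code above does:

hasCriticalOpPods | pdbExists | action
------------------+-----------+-----------------------------------------------
false             | false     | nothing to do, return nil
false             | true      | delete the stale critical-op PDB
true              | false     | create the PDB (tolerating "already exists")
true              | true      | compare against the generated spec, update on drift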

View File

@@ -1053,3 +1053,134 @@ func TestUpdateSecretNameConflict(t *testing.T) {
 	expectedError := fmt.Sprintf("syncing secret %s failed: could not update secret because of user name mismatch", "default/prepared-owner-user.acid-test-cluster.credentials")
 	assert.Contains(t, err.Error(), expectedError)
 }
+
+func TestSyncCriticalOpPodDisruptionBudget(t *testing.T) {
+	testName := "test syncing critical-op PDB on-demand"
+	clusterName := "acid-test-cluster-pdb"
+	namespace := "default"
+
+	clientSet := fake.NewSimpleClientset()
+	acidClientSet := fakeacidv1.NewSimpleClientset()
+	client := k8sutil.KubernetesClient{
+		PodDisruptionBudgetsGetter: clientSet.PolicyV1(),
+		PodsGetter:                 clientSet.CoreV1(),
+		PostgresqlsGetter:          acidClientSet.AcidV1(),
+		StatefulSetsGetter:         clientSet.AppsV1(),
+	}
+
+	pg := acidv1.Postgresql{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      clusterName,
+			Namespace: namespace,
+		},
+		Spec: acidv1.PostgresSpec{
+			NumberOfInstances: 3,
+			Volume: acidv1.Volume{
+				Size: "1Gi",
+			},
+		},
+	}
+
+	cluster := New(
+		Config{
+			OpConfig: config.Config{
+				PDBNameFormat:             "postgres-{cluster}-pdb",
+				EnablePodDisruptionBudget: util.True(),
+				Resources: config.Resources{
+					ClusterLabels:         map[string]string{"application": "spilo"},
+					ClusterNameLabel:      "cluster-name",
+					PodRoleLabel:          "spilo-role",
+					ResourceCheckInterval: time.Duration(3),
+					ResourceCheckTimeout:  time.Duration(10),
+				},
+			},
+		}, client, pg, logger, eventRecorder)
+	cluster.Name = clusterName
+	cluster.Namespace = namespace
+
+	// Create pods without critical-operation label
+	for i := 0; i < 3; i++ {
+		pod := v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      fmt.Sprintf("%s-%d", clusterName, i),
+				Namespace: namespace,
+				Labels: map[string]string{
+					"application":  "spilo",
+					"cluster-name": clusterName,
+				},
+			},
+		}
+		_, err := cluster.KubeClient.Pods(namespace).Create(context.TODO(), &pod, metav1.CreateOptions{})
+		assert.NoError(t, err)
+	}
+
+	// Test 1: Sync with no critical-operation labels - PDB should NOT be created
+	err := cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.Error(t, err, "%s: critical-op PDB should not exist when no pods have critical-operation label", testName)
+
+	// Test 2: Add critical-operation label to pods - PDB should be created
+	for i := 0; i < 3; i++ {
+		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
+		assert.NoError(t, err)
+		pod.Labels["critical-operation"] = "true"
+		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
+		assert.NoError(t, err)
+	}
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+	pdb, err := cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.NoError(t, err, "%s: critical-op PDB should exist when pods have critical-operation label", testName)
+	assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal, "%s: minAvailable should be 3", testName)
+
+	// Test 3: Remove critical-operation label from pods - PDB should be deleted
+	for i := 0; i < 3; i++ {
+		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
+		assert.NoError(t, err)
+		delete(pod.Labels, "critical-operation")
+		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
+		assert.NoError(t, err)
+	}
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.Error(t, err, "%s: critical-op PDB should be deleted when no pods have critical-operation label", testName)
+
+	// Test 4: Try to delete again when PDB is already deleted - should handle gracefully
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err, "%s: syncing when PDB already deleted should not error", testName)
+
+	// Test 5: Try to create when PDB already exists - should handle gracefully
+	// First, add labels back to pods
+	for i := 0; i < 3; i++ {
+		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
+		assert.NoError(t, err)
+		pod.Labels["critical-operation"] = "true"
+		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
+		assert.NoError(t, err)
+	}
+	// Create the PDB
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.NoError(t, err, "%s: PDB should exist after sync with labeled pods", testName)
+
+	// Now sync again - should handle "already exists" gracefully
+	cluster.CriticalOpPodDisruptionBudget = nil // Simulate operator restart (in-memory state lost)
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err, "%s: syncing when PDB already exists should not error", testName)
+
+	// Verify PDB still exists and is properly tracked
+	pdb, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.NoError(t, err)
+	assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal)
+}
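Assuming the usual repository layout, where these cluster tests live in the pkg/cluster package, the new test can be run on its own with:

go test ./pkg/cluster/... -run TestSyncCriticalOpPodDisruptionBudget -v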