Create critical-op PDB on-demand to avoid false monitoring alerts
The critical-op PodDisruptionBudget was previously created permanently, but its selector (critical-operation=true) matched no pods during normal operation. This caused false alerts in monitoring systems like kube-prometheus-stack, because the PDB expected healthy pods but none matched.

Changes:
- Modified syncCriticalOpPodDisruptionBudget to check if any pods have the critical-operation label before creating/keeping the PDB
- PDB is now created on-demand when pods are labeled (e.g., during major version upgrades) and deleted when labels are removed
- Updated majorVersionUpgrade to explicitly create/delete the PDB around the critical operation for immediate protection
- Removed automatic critical-op PDB creation from initial cluster setup
- Added a test to verify on-demand PDB creation and deletion behavior, including edge cases for idempotent create/delete operations

The explicit PDB creation in majorVersionUpgrade ensures immediate protection before the critical operation starts. The sync function serves as a safety net for edge cases like bootstrap (where Patroni applies labels) or operator restarts during critical operations.

Fixes #3020
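For context, here is a minimal sketch of roughly what the on-demand critical-op PDB object looks like. It assumes only what the commit message and the test below state: a selector on critical-operation=true and minAvailable equal to the number of instances (3 for the 3-instance test cluster). The object name and the construction code are illustrative, not the operator's actual naming scheme or generate function.

package main

import (
	"fmt"

	policyv1 "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	// The PDB only selects pods carrying critical-operation=true and requires
	// all of them to stay available; while no pod has that label, the object
	// simply does not exist anymore, so monitoring sees nothing to alert on.
	minAvailable := intstr.FromInt(3) // mirrors the test: 3 instances -> minAvailable 3
	pdb := &policyv1.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "postgres-acid-test-cluster-critical-op-pdb", // hypothetical name
			Namespace: "default",
		},
		Spec: policyv1.PodDisruptionBudgetSpec{
			MinAvailable: &minAvailable,
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"critical-operation": "true"},
			},
		},
	}
	fmt.Printf("PDB %s/%s: minAvailable=%s, selector=%v\n",
		pdb.Namespace, pdb.Name, minAvailable.String(), pdb.Spec.Selector.MatchLabels)
}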
parent a06f8d796b
commit 513291c58d
@@ -243,8 +243,22 @@ func (c *Cluster) majorVersionUpgrade() error {
 			if err = c.criticalOperationLabel(pods, nil); err != nil {
 				return fmt.Errorf("failed to remove critical-operation label: %s", err)
 			}
+			// Delete the critical-op PDB after the critical operation is complete
+			if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil {
+				c.logger.Warningf("failed to delete critical-op PDB: %s", err)
+			}
 			return nil
 		}()
+
+		// Create the critical-op PDB before starting the critical operation.
+		// This ensures protection is in place immediately, rather than waiting
+		// for a sync cycle. The sync function also creates the PDB if it detects
+		// pods with the critical-operation label, serving as a safety net for
+		// edge cases like operator restarts during critical operations.
+		if err = c.createCriticalOpPodDisruptionBudget(); err != nil {
+			c.logger.Warningf("failed to create critical-op PDB: %s", err)
+		}
+
 		val := "true"
 		if err = c.criticalOperationLabel(pods, &val); err != nil {
 			return fmt.Errorf("failed to assign critical-operation label: %s", err)
@@ -465,20 +465,13 @@ func (c *Cluster) createCriticalOpPodDisruptionBudget() error {
 }
 
 func (c *Cluster) createPodDisruptionBudgets() error {
-	errors := make([]string, 0)
-
+	// Only create the primary PDB during cluster creation.
+	// The critical-op PDB is created on-demand during critical operations
+	// (e.g., major version upgrades) to avoid false alerts when no pods
+	// have the critical-operation label.
 	err := c.createPrimaryPodDisruptionBudget()
 	if err != nil {
-		errors = append(errors, fmt.Sprintf("could not create primary pod disruption budget: %v", err))
-	}
-
-	err = c.createCriticalOpPodDisruptionBudget()
-	if err != nil {
-		errors = append(errors, fmt.Sprintf("could not create pod disruption budget for critical operations: %v", err))
-	}
-
-	if len(errors) > 0 {
-		return fmt.Errorf("%v", strings.Join(errors, `', '`))
+		return fmt.Errorf("could not create primary pod disruption budget: %v", err)
 	}
 	return nil
 }
@@ -499,7 +499,34 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
 		pdb *policyv1.PodDisruptionBudget
 		err error
 	)
-	if pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{}); err == nil {
+
+	// Check if any pods have the critical-operation label
+	hasCriticalOpPods, err := c.hasCriticalOperationPods()
+	if err != nil {
+		return fmt.Errorf("could not check for critical operation pods: %v", err)
+	}
+
+	pdb, err = c.KubeClient.PodDisruptionBudgets(c.Namespace).Get(context.TODO(), c.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	pdbExists := err == nil
+
+	if !pdbExists && !k8sutil.ResourceNotFound(err) {
+		return fmt.Errorf("could not get pod disruption budget: %v", err)
+	}
+
+	// If no pods have the critical-operation label, delete the PDB if it exists
+	if !hasCriticalOpPods {
+		if pdbExists {
+			c.CriticalOpPodDisruptionBudget = pdb
+			c.logger.Infof("no pods with critical-operation label, deleting critical-op PDB")
+			if err = c.deleteCriticalOpPodDisruptionBudget(); err != nil {
+				return fmt.Errorf("could not delete pod disruption budget for critical operations: %v", err)
+			}
+		}
+		return nil
+	}
+
+	// Pods have critical-operation label, ensure PDB exists
+	if pdbExists {
 		c.CriticalOpPodDisruptionBudget = pdb
 		newPDB := c.generateCriticalOpPodDisruptionBudget()
 		match, reason := c.comparePodDisruptionBudget(pdb, newPDB)
@@ -512,13 +539,10 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
 			c.CriticalOpPodDisruptionBudget = pdb
 		}
 		return nil
+	}
 
-	}
-	if !k8sutil.ResourceNotFound(err) {
-		return fmt.Errorf("could not get pod disruption budget: %v", err)
-	}
-	// no existing pod disruption budget, create new one
-	c.logger.Infof("could not find pod disruption budget for critical operations")
+	c.logger.Infof("pods with critical-operation label found, creating critical-op PDB")
 
 	if err = c.createCriticalOpPodDisruptionBudget(); err != nil {
 		if !k8sutil.ResourceAlreadyExists(err) {
@@ -533,6 +557,21 @@ func (c *Cluster) syncCriticalOpPodDisruptionBudget(isUpdate bool) error {
 	return nil
 }
 
+// hasCriticalOperationPods checks if any pods in the cluster have the critical-operation=true label
+func (c *Cluster) hasCriticalOperationPods() (bool, error) {
+	pods, err := c.listPods()
+	if err != nil {
+		return false, err
+	}
+
+	for _, pod := range pods {
+		if val, ok := pod.Labels["critical-operation"]; ok && val == "true" {
+			return true, nil
+		}
+	}
+	return false, nil
+}
+
 func (c *Cluster) syncPodDisruptionBudgets(isUpdate bool) error {
 	errors := make([]string, 0)
 
@@ -1018,3 +1018,134 @@ func TestUpdateSecretNameConflict(t *testing.T) {
 	expectedError := fmt.Sprintf("syncing secret %s failed: could not update secret because of user name mismatch", "default/prepared-owner-user.acid-test-cluster.credentials")
 	assert.Contains(t, err.Error(), expectedError)
 }
+
+func TestSyncCriticalOpPodDisruptionBudget(t *testing.T) {
+	testName := "test syncing critical-op PDB on-demand"
+	clusterName := "acid-test-cluster-pdb"
+	namespace := "default"
+
+	clientSet := fake.NewSimpleClientset()
+	acidClientSet := fakeacidv1.NewSimpleClientset()
+
+	client := k8sutil.KubernetesClient{
+		PodDisruptionBudgetsGetter: clientSet.PolicyV1(),
+		PodsGetter:                 clientSet.CoreV1(),
+		PostgresqlsGetter:          acidClientSet.AcidV1(),
+		StatefulSetsGetter:         clientSet.AppsV1(),
+	}
+
+	pg := acidv1.Postgresql{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      clusterName,
+			Namespace: namespace,
+		},
+		Spec: acidv1.PostgresSpec{
+			NumberOfInstances: 3,
+			Volume: acidv1.Volume{
+				Size: "1Gi",
+			},
+		},
+	}
+
+	cluster := New(
+		Config{
+			OpConfig: config.Config{
+				PDBNameFormat:             "postgres-{cluster}-pdb",
+				EnablePodDisruptionBudget: util.True(),
+				Resources: config.Resources{
+					ClusterLabels:         map[string]string{"application": "spilo"},
+					ClusterNameLabel:      "cluster-name",
+					PodRoleLabel:          "spilo-role",
+					ResourceCheckInterval: time.Duration(3),
+					ResourceCheckTimeout:  time.Duration(10),
+				},
+			},
+		}, client, pg, logger, eventRecorder)
+
+	cluster.Name = clusterName
+	cluster.Namespace = namespace
+
+	// Create pods without critical-operation label
+	for i := 0; i < 3; i++ {
+		pod := v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      fmt.Sprintf("%s-%d", clusterName, i),
+				Namespace: namespace,
+				Labels: map[string]string{
+					"application":  "spilo",
+					"cluster-name": clusterName,
+				},
+			},
+		}
+		_, err := cluster.KubeClient.Pods(namespace).Create(context.TODO(), &pod, metav1.CreateOptions{})
+		assert.NoError(t, err)
+	}
+
+	// Test 1: Sync with no critical-operation labels - PDB should NOT be created
+	err := cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+
+	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.Error(t, err, "%s: critical-op PDB should not exist when no pods have critical-operation label", testName)
+
+	// Test 2: Add critical-operation label to pods - PDB should be created
+	for i := 0; i < 3; i++ {
+		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
+		assert.NoError(t, err)
+		pod.Labels["critical-operation"] = "true"
+		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
+		assert.NoError(t, err)
+	}
+
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+
+	pdb, err := cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.NoError(t, err, "%s: critical-op PDB should exist when pods have critical-operation label", testName)
+	assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal, "%s: minAvailable should be 3", testName)
+
+	// Test 3: Remove critical-operation label from pods - PDB should be deleted
+	for i := 0; i < 3; i++ {
+		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
+		assert.NoError(t, err)
+		delete(pod.Labels, "critical-operation")
+		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
+		assert.NoError(t, err)
+	}
+
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+
+	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.Error(t, err, "%s: critical-op PDB should be deleted when no pods have critical-operation label", testName)
+
+	// Test 4: Try to delete again when PDB is already deleted - should handle gracefully
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err, "%s: syncing when PDB already deleted should not error", testName)
+
+	// Test 5: Try to create when PDB already exists - should handle gracefully
+	// First, add labels back to pods
+	for i := 0; i < 3; i++ {
+		pod, err := cluster.KubeClient.Pods(namespace).Get(context.TODO(), fmt.Sprintf("%s-%d", clusterName, i), metav1.GetOptions{})
+		assert.NoError(t, err)
+		pod.Labels["critical-operation"] = "true"
+		_, err = cluster.KubeClient.Pods(namespace).Update(context.TODO(), pod, metav1.UpdateOptions{})
+		assert.NoError(t, err)
+	}
+
+	// Create the PDB
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err)
+	_, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.NoError(t, err, "%s: PDB should exist after sync with labeled pods", testName)
+
+	// Now sync again - should handle "already exists" gracefully
+	cluster.CriticalOpPodDisruptionBudget = nil // Simulate operator restart (in-memory state lost)
+	err = cluster.syncCriticalOpPodDisruptionBudget(false)
+	assert.NoError(t, err, "%s: syncing when PDB already exists should not error", testName)
+
+	// Verify PDB still exists and is properly tracked
+	pdb, err = cluster.KubeClient.PodDisruptionBudgets(namespace).Get(context.TODO(), cluster.criticalOpPodDisruptionBudgetName(), metav1.GetOptions{})
+	assert.NoError(t, err)
+	assert.Equal(t, int32(3), pdb.Spec.MinAvailable.IntVal)
+}