add fix to recreate non running pods in syncStatefulsets

2026-03-06 11:27:31 -08:00 · 2026-03-06 11:27:31 -08:00 · c5142ee5ec
parent d495825f4b
commit c5142ee5ec
4 changed files with 456 additions and 3 deletions
--- a/pkg/cluster/pod.go
+++ b/pkg/cluster/pod.go
@ -376,6 +376,30 @@ func (c *Cluster) getPatroniMemberData(pod *v1.Pod) (patroni.MemberData, error)
 	return memberData, nil
 }

+// podIsNotRunning returns true if a pod is not in a healthy running state,
+// e.g. stuck in CreateContainerConfigError, CrashLoopBackOff, ImagePullBackOff, etc.
+func podIsNotRunning(pod *v1.Pod) bool {
+	if pod.Status.Phase != v1.PodRunning {
+		return true
+	}
+	for _, cs := range pod.Status.ContainerStatuses {
+		if cs.State.Waiting != nil || cs.State.Terminated != nil {
+			return true
+		}
+	}
+	return false
+}
+
+// allPodsRunning returns true only if every pod in the list is in a healthy running state.
+func (c *Cluster) allPodsRunning(pods []v1.Pod) bool {
+	for i := range pods {
+		if podIsNotRunning(&pods[i]) {
+			return false
+		}
+	}
+	return true
+}
+
 func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
 	stopCh := make(chan struct{})
 	ch := c.registerPodSubscriber(podName)
@ -444,7 +468,8 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
 		// switchover if
 		// 1. we have not observed a new master pod when re-creating former replicas
 		// 2. we know possible switchover targets even when no replicas were recreated
-		if newMasterPod == nil && len(replicas) > 0 {
+		// 3. the master pod is actually running (can't switchover a dead master)
+		if newMasterPod == nil && len(replicas) > 0 && !podIsNotRunning(masterPod) {
 			masterCandidate, err := c.getSwitchoverCandidate(masterPod)
 			if err != nil {
 				// do not recreate master now so it will keep the update flag and switchover will be retried on next sync
@ -455,6 +480,9 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
 			}
 		} else if newMasterPod == nil && len(replicas) == 0 {
 			c.logger.Warningf("cannot perform switch over before re-creating the pod: no replicas")
+		} else if podIsNotRunning(masterPod) {
+			c.logger.Warningf("master pod %q is not running, skipping switchover and recreating directly",
+				util.NameFromMeta(masterPod.ObjectMeta))
 		}
 		c.logger.Infof("recreating old master pod %q", util.NameFromMeta(masterPod.ObjectMeta))

--- a/pkg/cluster/pod_test.go
+++ b/pkg/cluster/pod_test.go
@ -112,3 +112,283 @@ func TestGetSwitchoverCandidate(t *testing.T) {
 		}
 	}
 }
+
+func TestPodIsNotRunning(t *testing.T) {
+	tests := []struct {
+		subtest  string
+		pod      v1.Pod
+		expected bool
+	}{
+		{
+			subtest: "pod running with all containers ready",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Running: &v1.ContainerStateRunning{},
+							},
+						},
+					},
+				},
+			},
+			expected: false,
+		},
+		{
+			subtest: "pod in pending phase",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodPending,
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "pod running but container in CreateContainerConfigError",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Waiting: &v1.ContainerStateWaiting{
+									Reason:  "CreateContainerConfigError",
+									Message: `secret "some-secret" not found`,
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "pod running but container in CrashLoopBackOff",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Waiting: &v1.ContainerStateWaiting{
+									Reason: "CrashLoopBackOff",
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "pod running but container terminated",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Terminated: &v1.ContainerStateTerminated{
+									ExitCode: 137,
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "pod running with mixed container states - one healthy one broken",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Running: &v1.ContainerStateRunning{},
+							},
+						},
+						{
+							State: v1.ContainerState{
+								Waiting: &v1.ContainerStateWaiting{
+									Reason: "CreateContainerConfigError",
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "pod in failed phase",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodFailed,
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "pod running with multiple healthy containers",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Running: &v1.ContainerStateRunning{},
+							},
+						},
+						{
+							State: v1.ContainerState{
+								Running: &v1.ContainerStateRunning{},
+							},
+						},
+					},
+				},
+			},
+			expected: false,
+		},
+		{
+			subtest: "pod running with ImagePullBackOff",
+			pod: v1.Pod{
+				Status: v1.PodStatus{
+					Phase: v1.PodRunning,
+					ContainerStatuses: []v1.ContainerStatus{
+						{
+							State: v1.ContainerState{
+								Waiting: &v1.ContainerStateWaiting{
+									Reason: "ImagePullBackOff",
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.subtest, func(t *testing.T) {
+			result := podIsNotRunning(&tt.pod)
+			if result != tt.expected {
+				t.Errorf("podIsNotRunning() = %v, expected %v", result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestAllPodsRunning(t *testing.T) {
+	client, _ := newFakeK8sSyncClient()
+
+	var cluster = New(
+		Config{
+			OpConfig: config.Config{
+				Resources: config.Resources{
+					ClusterLabels:    map[string]string{"application": "spilo"},
+					ClusterNameLabel: "cluster-name",
+					PodRoleLabel:     "spilo-role",
+				},
+			},
+		}, client, acidv1.Postgresql{}, logger, eventRecorder)
+
+	tests := []struct {
+		subtest  string
+		pods     []v1.Pod
+		expected bool
+	}{
+		{
+			subtest: "all pods running",
+			pods: []v1.Pod{
+				{
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+						},
+					},
+				},
+				{
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+						},
+					},
+				},
+			},
+			expected: true,
+		},
+		{
+			subtest: "one pod not running",
+			pods: []v1.Pod{
+				{
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+						},
+					},
+				},
+				{
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{
+								State: v1.ContainerState{
+									Waiting: &v1.ContainerStateWaiting{
+										Reason: "CreateContainerConfigError",
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: false,
+		},
+		{
+			subtest: "all pods not running",
+			pods: []v1.Pod{
+				{
+					Status: v1.PodStatus{
+						Phase: v1.PodPending,
+					},
+				},
+				{
+					Status: v1.PodStatus{
+						Phase: v1.PodRunning,
+						ContainerStatuses: []v1.ContainerStatus{
+							{
+								State: v1.ContainerState{
+									Waiting: &v1.ContainerStateWaiting{
+										Reason: "CrashLoopBackOff",
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			expected: false,
+		},
+		{
+			subtest:  "empty pod list",
+			pods:     []v1.Pod{},
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.subtest, func(t *testing.T) {
+			result := cluster.allPodsRunning(tt.pods)
+			if result != tt.expected {
+				t.Errorf("allPodsRunning() = %v, expected %v", result, tt.expected)
+			}
+		})
+	}
+}
--- a/pkg/cluster/sync.go
+++ b/pkg/cluster/sync.go
@ -718,14 +718,26 @@ func (c *Cluster) syncStatefulSet() error {
 	if configPatched, restartPrimaryFirst, restartWait, err = c.syncPatroniConfig(pods, c.Spec.Patroni, requiredPgParameters); err != nil {
 		c.logger.Warningf("Patroni config updated? %v - errors during config sync: %v", configPatched, err)
 		postponeReasons = append(postponeReasons, "errors during Patroni config sync")
-		isSafeToRecreatePods = false
+		// Only mark unsafe if all pods are running. If some pods are not running,
+		// Patroni API errors are expected and should not block pod recreation,
+		// which is the only way to fix non-running pods.
+		if c.allPodsRunning(pods) {
+			isSafeToRecreatePods = false
+		} else {
+			c.logger.Warningf("ignoring Patroni config sync errors because some pods are not running")
+		}
 	}

 	// restart Postgres where it is still pending
 	if err = c.restartInstances(pods, restartWait, restartPrimaryFirst); err != nil {
 		c.logger.Errorf("errors while restarting Postgres in pods via Patroni API: %v", err)
 		postponeReasons = append(postponeReasons, "errors while restarting Postgres via Patroni API")
-		isSafeToRecreatePods = false
+		// Same logic: don't let unreachable non-running pods block recreation.
+		if c.allPodsRunning(pods) {
+			isSafeToRecreatePods = false
+		} else {
+			c.logger.Warningf("ignoring Patroni restart errors because some pods are not running")
+		}
 	}

 	// if we get here we also need to re-create the pods (either leftovers from the old
--- a/pkg/cluster/sync_test.go
+++ b/pkg/cluster/sync_test.go
@ -1053,3 +1053,136 @@ func TestUpdateSecretNameConflict(t *testing.T) {
 	expectedError := fmt.Sprintf("syncing secret %s failed: error while checking for password rotation: could not update secret because of user name mismatch", "default/prepared-owner-user.acid-test-cluster.credentials")
 	assert.Contains(t, err.Error(), expectedError)
 }
+
+func TestSyncStatefulSetNonRunningPodsDoNotBlockRecreation(t *testing.T) {
+	testName := "test that non-running pods do not block rolling update"
+	client, _ := newFakeK8sSyncClient()
+	clusterName := "acid-test-cluster"
+	namespace := "default"
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	pg := acidv1.Postgresql{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      clusterName,
+			Namespace: namespace,
+		},
+		Spec: acidv1.PostgresSpec{
+			NumberOfInstances: 1,
+			Volume: acidv1.Volume{
+				Size: "1Gi",
+			},
+		},
+	}
+
+	var cluster = New(
+		Config{
+			OpConfig: config.Config{
+				PatroniAPICheckInterval: time.Duration(1),
+				PatroniAPICheckTimeout:  time.Duration(5),
+				PodManagementPolicy:     "ordered_ready",
+				Resources: config.Resources{
+					ClusterLabels:         map[string]string{"application": "spilo"},
+					ClusterNameLabel:      "cluster-name",
+					DefaultCPURequest:     "300m",
+					DefaultCPULimit:       "300m",
+					DefaultMemoryRequest:  "300Mi",
+					DefaultMemoryLimit:    "300Mi",
+					PodRoleLabel:          "spilo-role",
+					ResourceCheckInterval: time.Duration(3),
+					ResourceCheckTimeout:  time.Duration(10),
+				},
+			},
+		}, client, pg, logger, eventRecorder)
+
+	cluster.Name = clusterName
+	cluster.Namespace = namespace
+
+	// mock Patroni API that always fails (simulates unreachable pod)
+	mockClient := mocks.NewMockHTTPClient(ctrl)
+	mockClient.EXPECT().Do(gomock.Any()).Return(nil, fmt.Errorf("connection refused")).AnyTimes()
+	mockClient.EXPECT().Get(gomock.Any()).Return(nil, fmt.Errorf("connection refused")).AnyTimes()
+	cluster.patroni = patroni.New(patroniLogger, mockClient)
+
+	// test allPodsRunning with non-running pods returns false
+	nonRunningPods := []v1.Pod{
+		{
+			Status: v1.PodStatus{
+				Phase: v1.PodRunning,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Waiting: &v1.ContainerStateWaiting{
+								Reason:  "CreateContainerConfigError",
+								Message: `secret "old-secret" not found`,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if cluster.allPodsRunning(nonRunningPods) {
+		t.Errorf("%s: allPodsRunning should return false for pods in CreateContainerConfigError", testName)
+	}
+
+	// test allPodsRunning with running pods returns true
+	runningPods := []v1.Pod{
+		{
+			Status: v1.PodStatus{
+				Phase: v1.PodRunning,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Running: &v1.ContainerStateRunning{},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if !cluster.allPodsRunning(runningPods) {
+		t.Errorf("%s: allPodsRunning should return true for running pods", testName)
+	}
+
+	// test mixed: 3-node cluster with 1 broken replica
+	mixedPods := []v1.Pod{
+		{
+			Status: v1.PodStatus{
+				Phase: v1.PodRunning,
+				ContainerStatuses: []v1.ContainerStatus{
+					{State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+				},
+			},
+		},
+		{
+			Status: v1.PodStatus{
+				Phase: v1.PodRunning,
+				ContainerStatuses: []v1.ContainerStatus{
+					{State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}},
+				},
+			},
+		},
+		{
+			Status: v1.PodStatus{
+				Phase: v1.PodRunning,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Waiting: &v1.ContainerStateWaiting{
+								Reason: "CreateContainerConfigError",
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if cluster.allPodsRunning(mixedPods) {
+		t.Errorf("%s: allPodsRunning should return false when one pod is in CreateContainerConfigError", testName)
+	}
+}