diff --git a/pkg/cluster/pod.go b/pkg/cluster/pod.go index 098fdc057..582e3cb47 100644 --- a/pkg/cluster/pod.go +++ b/pkg/cluster/pod.go @@ -469,10 +469,24 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e func() (bool, error) { var err error members, err = c.patroni.GetClusterMembers(master) - if err != nil { return false, err } + + // look for SyncStandby candidates (which also implies pod is in running state) + for _, member := range members { + if PostgresRole(member.Role) == SyncStandby { + syncCandidates = append(syncCandidates, member) + } + } + + // if synchronous mode is enabled and no SyncStandy was found + // return false for retry - cannot failover with no sync candidate + if c.Spec.Patroni.SynchronousMode && len(syncCandidates) == 0 { + c.logger.Warnf("no sync standby found - retrying fetching cluster members") + return false, nil + } + return true, nil }, ) @@ -480,28 +494,26 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e return spec.NamespacedName{}, fmt.Errorf("failed to get Patroni cluster members: %s", err) } - for _, member := range members { - if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" { - candidates = append(candidates, member) - if PostgresRole(member.Role) == SyncStandby { - syncCandidates = append(syncCandidates, member) - } - } - } - // pick candidate with lowest lag - // if sync_standby replicas were found assume synchronous_mode is enabled and ignore other candidates list if len(syncCandidates) > 0 { sort.Slice(syncCandidates, func(i, j int) bool { return syncCandidates[i].Lag < syncCandidates[j].Lag }) return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil - } - if len(candidates) > 0 { - sort.Slice(candidates, func(i, j int) bool { - return candidates[i].Lag < candidates[j].Lag - }) - return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil + } else { + // in asynchronous mode find running replicas + for _, member := range members { + if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" { + candidates = append(candidates, member) + } + } + + if len(candidates) > 0 { + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].Lag < candidates[j].Lag + }) + return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil + } } return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found") diff --git a/pkg/cluster/pod_test.go b/pkg/cluster/pod_test.go index 068145312..6a642387e 100644 --- a/pkg/cluster/pod_test.go +++ b/pkg/cluster/pod_test.go @@ -36,30 +36,42 @@ func TestGetSwitchoverCandidate(t *testing.T) { tests := []struct { subtest string clusterJson string + syncModeEnabled bool expectedCandidate spec.NamespacedName expectedError error }{ { subtest: "choose sync_standby over replica", clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "sync_standby", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`, + syncModeEnabled: true, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"}, expectedError: nil, }, + { + subtest: "no running sync_standby available", + clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}]}`, + syncModeEnabled: true, + expectedCandidate: spec.NamespacedName{}, + expectedError: fmt.Errorf("failed to get Patroni cluster members: unexpected end of JSON input"), + }, { subtest: "choose replica with lowest lag", clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`, + syncModeEnabled: false, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"}, expectedError: nil, }, { subtest: "choose first replica when lag is equal evrywhere", clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`, + syncModeEnabled: false, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"}, expectedError: nil, }, { subtest: "no running replica available", clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`, + syncModeEnabled: false, expectedCandidate: spec.NamespacedName{}, expectedError: fmt.Errorf("no switchover candidate found"), }, @@ -81,6 +93,7 @@ func TestGetSwitchoverCandidate(t *testing.T) { cluster.patroni = p mockMasterPod := newMockPod("192.168.100.1") mockMasterPod.Namespace = namespace + cluster.Spec.Patroni.SynchronousMode = tt.syncModeEnabled candidate, err := cluster.getSwitchoverCandidate(mockMasterPod) if err != nil && err.Error() != tt.expectedError.Error() { diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index 2373fd33e..bd31271f4 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -458,7 +458,7 @@ func (c *Cluster) syncPatroniConfig(pods []v1.Pod, requiredPatroniConfig acidv1. // get Postgres config, compare with manifest and update via Patroni PATCH endpoint if it differs for i, pod := range pods { podName := util.NameFromMeta(pods[i].ObjectMeta) - effectivePatroniConfig, effectivePgParameters, err = c.patroni.GetConfig(&pod) + effectivePatroniConfig, effectivePgParameters, err = c.getPatroniConfig(&pod) if err != nil { errors = append(errors, fmt.Sprintf("could not get Postgres config from pod %s: %v", podName, err)) continue