in sync mode select only syncStandby as switchover candidate (#2278)

* in sync mode select only syncStandby as swicthover candidate
* do not exit retry with err
* unit test: use error from reading byte stream twice
This commit is contained in:
Felix Kunde 2023-04-06 12:04:55 +02:00 committed by GitHub
parent 0ac5f58fa9
commit 1105228d3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 43 additions and 18 deletions

View File

@ -469,10 +469,24 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
func() (bool, error) { func() (bool, error) {
var err error var err error
members, err = c.patroni.GetClusterMembers(master) members, err = c.patroni.GetClusterMembers(master)
if err != nil { if err != nil {
return false, err return false, err
} }
// look for SyncStandby candidates (which also implies pod is in running state)
for _, member := range members {
if PostgresRole(member.Role) == SyncStandby {
syncCandidates = append(syncCandidates, member)
}
}
// if synchronous mode is enabled and no SyncStandy was found
// return false for retry - cannot failover with no sync candidate
if c.Spec.Patroni.SynchronousMode && len(syncCandidates) == 0 {
c.logger.Warnf("no sync standby found - retrying fetching cluster members")
return false, nil
}
return true, nil return true, nil
}, },
) )
@ -480,28 +494,26 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
return spec.NamespacedName{}, fmt.Errorf("failed to get Patroni cluster members: %s", err) return spec.NamespacedName{}, fmt.Errorf("failed to get Patroni cluster members: %s", err)
} }
for _, member := range members {
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
candidates = append(candidates, member)
if PostgresRole(member.Role) == SyncStandby {
syncCandidates = append(syncCandidates, member)
}
}
}
// pick candidate with lowest lag // pick candidate with lowest lag
// if sync_standby replicas were found assume synchronous_mode is enabled and ignore other candidates list
if len(syncCandidates) > 0 { if len(syncCandidates) > 0 {
sort.Slice(syncCandidates, func(i, j int) bool { sort.Slice(syncCandidates, func(i, j int) bool {
return syncCandidates[i].Lag < syncCandidates[j].Lag return syncCandidates[i].Lag < syncCandidates[j].Lag
}) })
return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil
} } else {
if len(candidates) > 0 { // in asynchronous mode find running replicas
sort.Slice(candidates, func(i, j int) bool { for _, member := range members {
return candidates[i].Lag < candidates[j].Lag if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
}) candidates = append(candidates, member)
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil }
}
if len(candidates) > 0 {
sort.Slice(candidates, func(i, j int) bool {
return candidates[i].Lag < candidates[j].Lag
})
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
}
} }
return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found") return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found")

View File

@ -36,30 +36,42 @@ func TestGetSwitchoverCandidate(t *testing.T) {
tests := []struct { tests := []struct {
subtest string subtest string
clusterJson string clusterJson string
syncModeEnabled bool
expectedCandidate spec.NamespacedName expectedCandidate spec.NamespacedName
expectedError error expectedError error
}{ }{
{ {
subtest: "choose sync_standby over replica", subtest: "choose sync_standby over replica",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "sync_standby", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`, clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "sync_standby", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`,
syncModeEnabled: true,
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"}, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
expectedError: nil, expectedError: nil,
}, },
{
subtest: "no running sync_standby available",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 0}]}`,
syncModeEnabled: true,
expectedCandidate: spec.NamespacedName{},
expectedError: fmt.Errorf("failed to get Patroni cluster members: unexpected end of JSON input"),
},
{ {
subtest: "choose replica with lowest lag", subtest: "choose replica with lowest lag",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`, clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`,
syncModeEnabled: false,
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"}, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
expectedError: nil, expectedError: nil,
}, },
{ {
subtest: "choose first replica when lag is equal evrywhere", subtest: "choose first replica when lag is equal evrywhere",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`, clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`,
syncModeEnabled: false,
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"}, expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
expectedError: nil, expectedError: nil,
}, },
{ {
subtest: "no running replica available", subtest: "no running replica available",
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`, clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`,
syncModeEnabled: false,
expectedCandidate: spec.NamespacedName{}, expectedCandidate: spec.NamespacedName{},
expectedError: fmt.Errorf("no switchover candidate found"), expectedError: fmt.Errorf("no switchover candidate found"),
}, },
@ -81,6 +93,7 @@ func TestGetSwitchoverCandidate(t *testing.T) {
cluster.patroni = p cluster.patroni = p
mockMasterPod := newMockPod("192.168.100.1") mockMasterPod := newMockPod("192.168.100.1")
mockMasterPod.Namespace = namespace mockMasterPod.Namespace = namespace
cluster.Spec.Patroni.SynchronousMode = tt.syncModeEnabled
candidate, err := cluster.getSwitchoverCandidate(mockMasterPod) candidate, err := cluster.getSwitchoverCandidate(mockMasterPod)
if err != nil && err.Error() != tt.expectedError.Error() { if err != nil && err.Error() != tt.expectedError.Error() {

View File

@ -458,7 +458,7 @@ func (c *Cluster) syncPatroniConfig(pods []v1.Pod, requiredPatroniConfig acidv1.
// get Postgres config, compare with manifest and update via Patroni PATCH endpoint if it differs // get Postgres config, compare with manifest and update via Patroni PATCH endpoint if it differs
for i, pod := range pods { for i, pod := range pods {
podName := util.NameFromMeta(pods[i].ObjectMeta) podName := util.NameFromMeta(pods[i].ObjectMeta)
effectivePatroniConfig, effectivePgParameters, err = c.patroni.GetConfig(&pod) effectivePatroniConfig, effectivePgParameters, err = c.getPatroniConfig(&pod)
if err != nil { if err != nil {
errors = append(errors, fmt.Sprintf("could not get Postgres config from pod %s: %v", podName, err)) errors = append(errors, fmt.Sprintf("could not get Postgres config from pod %s: %v", podName, err))
continue continue