choose switchover candidate based on lowest lag in MB
This commit is contained in:
parent
d784c961b4
commit
4f87238fe0
|
|
@ -4,8 +4,8 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
appsv1 "k8s.io/api/apps/v1"
|
appsv1 "k8s.io/api/apps/v1"
|
||||||
|
|
@ -483,9 +483,8 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
|
||||||
func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, error) {
|
func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, error) {
|
||||||
|
|
||||||
var members []patroni.ClusterMember
|
var members []patroni.ClusterMember
|
||||||
candidates := make([]spec.NamespacedName, 0)
|
candidates := make([]patroni.ClusterMember, 0)
|
||||||
syncCandidates := make([]spec.NamespacedName, 0)
|
syncCandidates := make([]patroni.ClusterMember, 0)
|
||||||
skipReasons := make([]string, 0)
|
|
||||||
|
|
||||||
err := retryutil.Retry(1*time.Second, 5*time.Second,
|
err := retryutil.Retry(1*time.Second, 5*time.Second,
|
||||||
func() (bool, error) {
|
func() (bool, error) {
|
||||||
|
|
@ -503,32 +502,24 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, member := range members {
|
for _, member := range members {
|
||||||
if member.LagInMB > 0 {
|
|
||||||
skipReasons = append(skipReasons, fmt.Sprintf("%s lags behind by %d MB", member.Name, member.LagInMB))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
|
if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
|
||||||
candidates = append(candidates, spec.NamespacedName{Namespace: master.Namespace, Name: member.Name})
|
candidates = append(candidates, member)
|
||||||
if PostgresRole(member.Role) != SyncStandby {
|
if PostgresRole(member.Role) == SyncStandby {
|
||||||
syncCandidates = append(syncCandidates, spec.NamespacedName{Namespace: master.Namespace, Name: member.Name})
|
syncCandidates = append(syncCandidates, member)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(syncCandidates) > 0 {
|
if len(syncCandidates) > 0 {
|
||||||
return candidates[rand.Intn(len(syncCandidates))], nil
|
sort.Slice(syncCandidates, func(i, j int) bool { return syncCandidates[i].LagInMB < syncCandidates[j].LagInMB })
|
||||||
|
return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil
|
||||||
}
|
}
|
||||||
if len(candidates) > 0 {
|
if len(candidates) > 0 {
|
||||||
return candidates[rand.Intn(len(candidates))], nil
|
sort.Slice(candidates, func(i, j int) bool { return candidates[i].LagInMB < candidates[j].LagInMB })
|
||||||
|
return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(skipReasons) > 0 {
|
return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found")
|
||||||
err = fmt.Errorf("no replica suitable for switchover: %s", strings.Join(skipReasons, `','`))
|
|
||||||
} else {
|
|
||||||
err = fmt.Errorf("no replica running")
|
|
||||||
}
|
|
||||||
|
|
||||||
return spec.NamespacedName{}, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cluster) podIsEndOfLife(pod *v1.Pod) (bool, error) {
|
func (c *Cluster) podIsEndOfLife(pod *v1.Pod) (bool, error) {
|
||||||
|
|
|
||||||
|
|
@ -38,16 +38,22 @@ func TestGetSwitchoverCandidate(t *testing.T) {
|
||||||
expectedError: nil,
|
expectedError: nil,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
subtest: "choose replica without a lag",
|
subtest: "choose replica with lowest lag",
|
||||||
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`,
|
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`,
|
||||||
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
|
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
|
||||||
expectedError: nil,
|
expectedError: nil,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
subtest: "no suitable replica available",
|
subtest: "choose first replica when lag is equal everywhere",
|
||||||
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}]}`,
|
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`,
|
||||||
|
expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
subtest: "no running replica available",
|
||||||
|
clusterJson: `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`,
|
||||||
expectedCandidate: spec.NamespacedName{},
|
expectedCandidate: spec.NamespacedName{},
|
||||||
expectedError: fmt.Errorf("no replica suitable for switchover: acid-test-cluster-1 lags behind by 5 MB"),
|
expectedError: fmt.Errorf("no switchover candidate found"),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue