choose switchover candidate based on lowest lag in MB
This commit is contained in:
		
							parent
							
								
									d784c961b4
								
							
						
					
					
						commit
						4f87238fe0
					
				| 
						 | 
					@ -4,8 +4,8 @@ import (
 | 
				
			||||||
	"context"
 | 
						"context"
 | 
				
			||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
	"math/rand"
 | 
						"math/rand"
 | 
				
			||||||
 | 
						"sort"
 | 
				
			||||||
	"strconv"
 | 
						"strconv"
 | 
				
			||||||
	"strings"
 | 
					 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	appsv1 "k8s.io/api/apps/v1"
 | 
						appsv1 "k8s.io/api/apps/v1"
 | 
				
			||||||
| 
						 | 
					@ -483,9 +483,8 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
 | 
				
			||||||
func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, error) {
 | 
					func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, error) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	var members []patroni.ClusterMember
 | 
						var members []patroni.ClusterMember
 | 
				
			||||||
	candidates := make([]spec.NamespacedName, 0)
 | 
						candidates := make([]patroni.ClusterMember, 0)
 | 
				
			||||||
	syncCandidates := make([]spec.NamespacedName, 0)
 | 
						syncCandidates := make([]patroni.ClusterMember, 0)
 | 
				
			||||||
	skipReasons := make([]string, 0)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	err := retryutil.Retry(1*time.Second, 5*time.Second,
 | 
						err := retryutil.Retry(1*time.Second, 5*time.Second,
 | 
				
			||||||
		func() (bool, error) {
 | 
							func() (bool, error) {
 | 
				
			||||||
| 
						 | 
					@ -503,32 +502,24 @@ func (c *Cluster) getSwitchoverCandidate(master *v1.Pod) (spec.NamespacedName, e
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for _, member := range members {
 | 
						for _, member := range members {
 | 
				
			||||||
		if member.LagInMB > 0 {
 | 
					 | 
				
			||||||
			skipReasons = append(skipReasons, fmt.Sprintf("%s lags behind by %d MB", member.Name, member.LagInMB))
 | 
					 | 
				
			||||||
			continue
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
 | 
							if PostgresRole(member.Role) != Leader && PostgresRole(member.Role) != StandbyLeader && member.State == "running" {
 | 
				
			||||||
			candidates = append(candidates, spec.NamespacedName{Namespace: master.Namespace, Name: member.Name})
 | 
								candidates = append(candidates, member)
 | 
				
			||||||
			if PostgresRole(member.Role) != SyncStandby {
 | 
								if PostgresRole(member.Role) == SyncStandby {
 | 
				
			||||||
				syncCandidates = append(syncCandidates, spec.NamespacedName{Namespace: master.Namespace, Name: member.Name})
 | 
									syncCandidates = append(syncCandidates, member)
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if len(syncCandidates) > 0 {
 | 
						if len(syncCandidates) > 0 {
 | 
				
			||||||
		return candidates[rand.Intn(len(syncCandidates))], nil
 | 
							sort.Slice(syncCandidates, func(i, j int) bool { return syncCandidates[i].LagInMB < syncCandidates[j].LagInMB })
 | 
				
			||||||
 | 
							return spec.NamespacedName{Namespace: master.Namespace, Name: syncCandidates[0].Name}, nil
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	if len(candidates) > 0 {
 | 
						if len(candidates) > 0 {
 | 
				
			||||||
		return candidates[rand.Intn(len(candidates))], nil
 | 
							sort.Slice(candidates, func(i, j int) bool { return candidates[i].LagInMB < candidates[j].LagInMB })
 | 
				
			||||||
 | 
							return spec.NamespacedName{Namespace: master.Namespace, Name: candidates[0].Name}, nil
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if len(skipReasons) > 0 {
 | 
						return spec.NamespacedName{}, fmt.Errorf("no switchover candidate found")
 | 
				
			||||||
		err = fmt.Errorf("no replica suitable for switchover: %s", strings.Join(skipReasons, `','`))
 | 
					 | 
				
			||||||
	} else {
 | 
					 | 
				
			||||||
		err = fmt.Errorf("no replica running")
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	return spec.NamespacedName{}, err
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func (c *Cluster) podIsEndOfLife(pod *v1.Pod) (bool, error) {
 | 
					func (c *Cluster) podIsEndOfLife(pod *v1.Pod) (bool, error) {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -38,16 +38,22 @@ func TestGetSwitchoverCandidate(t *testing.T) {
 | 
				
			||||||
			expectedError:     nil,
 | 
								expectedError:     nil,
 | 
				
			||||||
		},
 | 
							},
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
			subtest:           "choose replica without a lag",
 | 
								subtest:           "choose replica with lowest lag",
 | 
				
			||||||
			clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 0}]}`,
 | 
								clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 2}]}`,
 | 
				
			||||||
			expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
 | 
								expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-2"},
 | 
				
			||||||
			expectedError:     nil,
 | 
								expectedError:     nil,
 | 
				
			||||||
		},
 | 
							},
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
			subtest:           "no suitable replica available",
 | 
								subtest:           "choose first replica when lag is equal everywhere",
 | 
				
			||||||
			clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}]}`,
 | 
								clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 1}, {"name": "acid-test-cluster-1", "role": "replica", "state": "running", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 1, "lag": 5}, {"name": "acid-test-cluster-2", "role": "replica", "state": "running", "api_url": "http://192.168.100.3:8008/patroni", "host": "192.168.100.3", "port": 5432, "timeline": 1, "lag": 5}]}`,
 | 
				
			||||||
 | 
								expectedCandidate: spec.NamespacedName{Namespace: namespace, Name: "acid-test-cluster-1"},
 | 
				
			||||||
 | 
								expectedError:     nil,
 | 
				
			||||||
 | 
							},
 | 
				
			||||||
 | 
							{
 | 
				
			||||||
 | 
								subtest:           "no running replica available",
 | 
				
			||||||
 | 
								clusterJson:       `{"members": [{"name": "acid-test-cluster-0", "role": "leader", "state": "running", "api_url": "http://192.168.100.1:8008/patroni", "host": "192.168.100.1", "port": 5432, "timeline": 2}, {"name": "acid-test-cluster-1", "role": "replica", "state": "starting", "api_url": "http://192.168.100.2:8008/patroni", "host": "192.168.100.2", "port": 5432, "timeline": 2}]}`,
 | 
				
			||||||
			expectedCandidate: spec.NamespacedName{},
 | 
								expectedCandidate: spec.NamespacedName{},
 | 
				
			||||||
			expectedError:     fmt.Errorf("no replica suitable for switchover: acid-test-cluster-1 lags behind by 5 MB"),
 | 
								expectedError:     fmt.Errorf("no switchover candidate found"),
 | 
				
			||||||
		},
 | 
							},
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue