BugFix: Switchover (during a Node drain) fails randomly in synchronous mode (#1984)

* Use getSwitchoverCandidate instead of masterCandidate when migrating the master pod to a replica
Ref: #1983

* Remove unused masterCandidate (replaced by getSwitchoverCandidate)
Ref: #1983
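
Why the old helper failed in synchronous mode: masterCandidate picks a replica pod itself and, if no healthy replica on a live node is found, falls back to a random one. With synchronous replication enabled, Patroni will only switch over to the current synchronous standby, so a randomly chosen asynchronous replica makes the switchover fail intermittently. getSwitchoverCandidate instead returns the name of a member Patroni will accept. Below is a minimal, self-contained sketch of that selection idea; the Member type and pickSwitchoverCandidate are illustrative stand-ins, not the operator's actual implementation.

package main

import (
	"errors"
	"fmt"
)

// Member mirrors the fields of a Patroni cluster-member entry that matter
// for candidate selection (illustrative names, not the operator's types).
type Member struct {
	Name  string
	Role  string // "leader", "sync_standby", or "replica"
	State string // e.g. "running"
}

// pickSwitchoverCandidate sketches the selection idea: prefer the current
// synchronous standby, because in synchronous mode Patroni rejects a
// switchover to any other member; otherwise take any running replica.
func pickSwitchoverCandidate(members []Member) (string, error) {
	for _, m := range members {
		if m.Role == "sync_standby" && m.State == "running" {
			return m.Name, nil
		}
	}
	for _, m := range members {
		if m.Role == "replica" && m.State == "running" {
			return m.Name, nil
		}
	}
	return "", errors.New("no switchover candidate found")
}

func main() {
	members := []Member{
		{Name: "demo-cluster-0", Role: "leader", State: "running"},
		{Name: "demo-cluster-1", Role: "replica", State: "running"},
		{Name: "demo-cluster-2", Role: "sync_standby", State: "running"},
	}
	candidate, err := pickSwitchoverCandidate(members)
	if err != nil {
		fmt.Println("no candidate:", err)
		return
	}
	fmt.Println("switchover candidate:", candidate) // prints demo-cluster-2
}

Running the sketch prints demo-cluster-2, the synchronous standby, even though demo-cluster-1 appears first in the member list.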
JBWatenbergScality 2022-08-19 15:14:53 +02:00 committed by GitHub
parent b2642fa2fc
commit b91b69c736
1 changed file with 10 additions and 36 deletions


@@ -3,7 +3,6 @@ package cluster
 import (
 	"context"
 	"fmt"
-	"math/rand"
 	"sort"
 	"strconv"
 	"time"
@@ -212,42 +211,12 @@ func (c *Cluster) movePodFromEndOfLifeNode(pod *v1.Pod) (*v1.Pod, error) {
 	return newPod, nil
 }
 
-func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) {
-	// Wait until at least one replica pod will come up
-	if err := c.waitForAnyReplicaLabelReady(); err != nil {
-		c.logger.Warningf("could not find at least one ready replica: %v", err)
-	}
-
-	replicas, err := c.getRolePods(Replica)
-	if err != nil {
-		return nil, fmt.Errorf("could not get replica pods: %v", err)
-	}
-
-	if len(replicas) == 0 {
-		c.logger.Warningf("no available master candidates, migration will cause longer downtime of Postgres cluster")
-		return nil, nil
-	}
-
-	for i, pod := range replicas {
-		// look for replicas running on live nodes. Ignore errors when querying the nodes.
-		if pod.Spec.NodeName != oldNodeName {
-			eol, err := c.podIsEndOfLife(&pod)
-			if err == nil && !eol {
-				return &replicas[i], nil
-			}
-		}
-	}
-
-	c.logger.Warningf("no available master candidates on live nodes")
-	return &replicas[rand.Intn(len(replicas))], nil
-}
-
 // MigrateMasterPod migrates master pod via failover to a replica
 func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
 	var (
-		masterCandidatePod *v1.Pod
-		err                error
-		eol                bool
+		masterCandidateName spec.NamespacedName
+		err                 error
+		eol                 bool
 	)
 	oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(context.TODO(), podName.Name, metav1.GetOptions{})
@@ -283,13 +252,19 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
 	}
 	// We may not have a cached statefulset if the initial cluster sync has aborted, revert to the spec in that case.
 	if *c.Statefulset.Spec.Replicas > 1 {
-		if masterCandidatePod, err = c.masterCandidate(oldMaster.Spec.NodeName); err != nil {
+		if masterCandidateName, err = c.getSwitchoverCandidate(oldMaster); err != nil {
 			return fmt.Errorf("could not find suitable replica pod as candidate for failover: %v", err)
 		}
 	} else {
 		c.logger.Warningf("migrating single pod cluster %q, this will cause downtime of the Postgres cluster until pod is back", c.clusterName())
 	}
+
+	masterCandidatePod, err := c.KubeClient.Pods(masterCandidateName.Namespace).Get(context.TODO(), masterCandidateName.Name, metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("could not get master candidate pod: %v", err)
+	}
+
 	// there are two cases for each postgres cluster that has its master pod on the node to migrate from:
 	// - the cluster has some replicas - migrate one of those if necessary and failover to it
 	// - there are no replicas - just terminate the master and wait until it respawns
@@ -306,7 +281,6 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
 		return fmt.Errorf("could not move pod: %v", err)
 	}
 
-	masterCandidateName := util.NameFromMeta(masterCandidatePod.ObjectMeta)
 	err = retryutil.Retry(1*time.Minute, 5*time.Minute,
 		func() (bool, error) {
 			err := c.Switchover(oldMaster, masterCandidateName)