BugFix: Switchover (during a Node drain) fails randomly in synchronous mode (#1984)
* Use getSwitchoverCandidate instead of masterCandidate when trying to migrate the master pod to a replica. Ref: #1983
* Remove unused masterCandidate (replaced by getSwitchoverCandidate). Ref: #1983
This commit is contained in:
parent
b2642fa2fc
commit
b91b69c736
|
|
@ -3,7 +3,6 @@ package cluster
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math/rand"
|
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
@ -212,42 +211,12 @@ func (c *Cluster) movePodFromEndOfLifeNode(pod *v1.Pod) (*v1.Pod, error) {
|
||||||
return newPod, nil
|
return newPod, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) {
|
|
||||||
|
|
||||||
// Wait until at least one replica pod will come up
|
|
||||||
if err := c.waitForAnyReplicaLabelReady(); err != nil {
|
|
||||||
c.logger.Warningf("could not find at least one ready replica: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
replicas, err := c.getRolePods(Replica)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("could not get replica pods: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(replicas) == 0 {
|
|
||||||
c.logger.Warningf("no available master candidates, migration will cause longer downtime of Postgres cluster")
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
for i, pod := range replicas {
|
|
||||||
// look for replicas running on live nodes. Ignore errors when querying the nodes.
|
|
||||||
if pod.Spec.NodeName != oldNodeName {
|
|
||||||
eol, err := c.podIsEndOfLife(&pod)
|
|
||||||
if err == nil && !eol {
|
|
||||||
return &replicas[i], nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
c.logger.Warningf("no available master candidates on live nodes")
|
|
||||||
return &replicas[rand.Intn(len(replicas))], nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// MigrateMasterPod migrates master pod via failover to a replica
|
// MigrateMasterPod migrates master pod via failover to a replica
|
||||||
func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
|
func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
|
||||||
var (
|
var (
|
||||||
masterCandidatePod *v1.Pod
|
masterCandidateName spec.NamespacedName
|
||||||
err error
|
err error
|
||||||
eol bool
|
eol bool
|
||||||
)
|
)
|
||||||
|
|
||||||
oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(context.TODO(), podName.Name, metav1.GetOptions{})
|
oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(context.TODO(), podName.Name, metav1.GetOptions{})
|
||||||
|
|
@ -283,13 +252,19 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
|
||||||
}
|
}
|
||||||
// We may not have a cached statefulset if the initial cluster sync has aborted, revert to the spec in that case.
|
// We may not have a cached statefulset if the initial cluster sync has aborted, revert to the spec in that case.
|
||||||
if *c.Statefulset.Spec.Replicas > 1 {
|
if *c.Statefulset.Spec.Replicas > 1 {
|
||||||
if masterCandidatePod, err = c.masterCandidate(oldMaster.Spec.NodeName); err != nil {
|
if masterCandidateName, err = c.getSwitchoverCandidate(oldMaster); err != nil {
|
||||||
return fmt.Errorf("could not find suitable replica pod as candidate for failover: %v", err)
|
return fmt.Errorf("could not find suitable replica pod as candidate for failover: %v", err)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
c.logger.Warningf("migrating single pod cluster %q, this will cause downtime of the Postgres cluster until pod is back", c.clusterName())
|
c.logger.Warningf("migrating single pod cluster %q, this will cause downtime of the Postgres cluster until pod is back", c.clusterName())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
masterCandidatePod, err := c.KubeClient.Pods(masterCandidateName.Namespace).Get(context.TODO(), masterCandidateName.Name, metav1.GetOptions{})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not get master candidate pod: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
// there are two cases for each postgres cluster that has its master pod on the node to migrate from:
|
// there are two cases for each postgres cluster that has its master pod on the node to migrate from:
|
||||||
// - the cluster has some replicas - migrate one of those if necessary and failover to it
|
// - the cluster has some replicas - migrate one of those if necessary and failover to it
|
||||||
// - there are no replicas - just terminate the master and wait until it respawns
|
// - there are no replicas - just terminate the master and wait until it respawns
|
||||||
|
|
@ -306,7 +281,6 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
|
||||||
return fmt.Errorf("could not move pod: %v", err)
|
return fmt.Errorf("could not move pod: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
masterCandidateName := util.NameFromMeta(masterCandidatePod.ObjectMeta)
|
|
||||||
err = retryutil.Retry(1*time.Minute, 5*time.Minute,
|
err = retryutil.Retry(1*time.Minute, 5*time.Minute,
|
||||||
func() (bool, error) {
|
func() (bool, error) {
|
||||||
err := c.Switchover(oldMaster, masterCandidateName)
|
err := c.Switchover(oldMaster, masterCandidateName)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue