perform manual failover on pg cluster rolling upgrade

This commit is contained in:
Murat Kabilov 2017-10-04 16:56:47 +03:00 committed by GitHub
parent 00194d0130
commit 48ec6b35b9
3 changed files with 56 additions and 8 deletions

View File

@ -668,3 +668,37 @@ func (c *Cluster) GetStatus() *spec.ClusterStatus {
Error: c.Error,
}
}
// ManualFailover asks Patroni to fail over from curMaster to the candidate pod
// and then waits for the candidate pod to receive the Master role label.
//
// It returns an error when the Patroni failover call fails or when
// waitForPodLabel reports an error for the candidate pod.
func (c *Cluster) ManualFailover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
	c.logger.Debugf("failing over from %q to %q", curMaster.Name, candidate)

	// Subscribe to the candidate's pod events before triggering the failover,
	// so a role label change that follows the Patroni call right away cannot
	// slip past an as yet unregistered subscriber.
	ch := c.registerPodSubscriber(candidate)

	// The channel has room for exactly one result and is never closed: the
	// waiting goroutine can always deliver its result and exit, even if this
	// function has already returned. Closing the channel from the receiving
	// side could make that late send panic.
	podLabelErr := make(chan error, 1)

	go func() {
		defer c.unregisterPodSubscriber(candidate)

		role := Master
		podLabelErr <- c.waitForPodLabel(ch, &role)
	}()

	if err := c.patroni.Failover(curMaster, candidate.Name); err != nil {
		// The goroutine above keeps running until waitForPodLabel returns
		// (NOTE(review): presumably bounded by the pod label wait timeout —
		// confirm), then drops its result into the buffer and unregisters.
		return fmt.Errorf("could not failover: %v", err)
	}
	c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate)

	if err := <-podLabelErr; err != nil {
		return fmt.Errorf("could not get master pod label: %v", err)
	}

	return nil
}

View File

@ -104,7 +104,7 @@ func (c *Cluster) recreatePod(pod v1.Pod) error {
if err := c.waitForPodDeletion(ch); err != nil {
return err
}
if err := c.waitForPodLabel(ch); err != nil {
if err := c.waitForPodLabel(ch, nil); err != nil {
return err
}
c.logger.Infof("pod %q is ready", podName)
@ -127,6 +127,7 @@ func (c *Cluster) recreatePods() error {
c.logger.Infof("there are %d pods in the cluster to recreate", len(pods.Items))
var masterPod v1.Pod
replicas := make([]spec.NamespacedName, 0)
for _, pod := range pods.Items {
role := c.podSpiloRole(&pod)
@ -138,11 +139,17 @@ func (c *Cluster) recreatePods() error {
if err := c.recreatePod(pod); err != nil {
return fmt.Errorf("could not recreate replica pod %q: %v", util.NameFromMeta(pod.ObjectMeta), err)
}
replicas = append(replicas, util.NameFromMeta(pod.ObjectMeta))
}
if masterPod.Name == "" {
c.logger.Warningln("no master pod in the cluster")
} else {
//TODO: do manual failover
err := c.ManualFailover(&masterPod, masterCandidate(replicas))
if err != nil {
return fmt.Errorf("could not perform manual failover: %v", err)
}
//TODO: specify master, leave new master empty
c.logger.Infof("recreating master pod %q", util.NameFromMeta(masterPod.ObjectMeta))

View File

@ -3,6 +3,7 @@ package cluster
import (
"encoding/json"
"fmt"
"math/rand"
"strings"
"time"
@ -182,15 +183,17 @@ func (c *Cluster) getTeamMembers() ([]string, error) {
return teamInfo.Members, nil
}
func (c *Cluster) waitForPodLabel(podEvents chan spec.PodEvent) error {
func (c *Cluster) waitForPodLabel(podEvents chan spec.PodEvent, role *PostgresRole) error {
for {
select {
case podEvent := <-podEvents:
role := c.podSpiloRole(podEvent.CurPod)
// We cannot assume any role of the newly created pod. Normally, for a multi-pod cluster
// we should observe the 'replica' value, but it could be that some pods are not allowed
// to promote, therefore, the new pod could be a master as well.
if role == constants.PodRoleMaster || role == constants.PodRoleReplica {
podRole := PostgresRole(podEvent.CurPod.Labels[c.OpConfig.PodRoleLabel])
if role == nil {
if podRole == Master || podRole == Replica {
return nil
}
} else if *role == podRole {
return nil
}
case <-time.After(c.OpConfig.PodLabelWaitTimeout):
@ -342,3 +345,7 @@ func (c *Cluster) credentialSecretNameForCluster(username string, clusterName st
// podSpiloRole returns the value of the pod's label whose key is configured
// in OpConfig.PodRoleLabel.
func (c *Cluster) podSpiloRole(pod *v1.Pod) string {
	podLabels := pod.Labels
	roleKey := c.OpConfig.PodRoleLabel

	return podLabels[roleKey]
}
// masterCandidate picks one of the given replicas pseudo-randomly to act as
// the failover target.
//
// When replicas is empty it returns the zero value of spec.NamespacedName
// instead of panicking (rand.Intn panics for a non-positive argument).
// Callers should treat the zero value as "no candidate available".
func masterCandidate(replicas []spec.NamespacedName) spec.NamespacedName {
	if len(replicas) == 0 {
		var none spec.NamespacedName
		return none
	}

	return replicas[rand.Intn(len(replicas))]
}