From 0bef3b325f54ed648b569d09bf33e81fb44eeabb Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Fri, 9 Dec 2022 12:42:10 +0100 Subject: [PATCH] fix migration of single-node clusters (#2134) --- pkg/cluster/pod.go | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pkg/cluster/pod.go b/pkg/cluster/pod.go index 73f077058..098fdc057 100644 --- a/pkg/cluster/pod.go +++ b/pkg/cluster/pod.go @@ -214,19 +214,16 @@ func (c *Cluster) movePodFromEndOfLifeNode(pod *v1.Pod) (*v1.Pod, error) { // MigrateMasterPod migrates master pod via failover to a replica func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error { var ( - masterCandidateName spec.NamespacedName - err error - eol bool + err error + eol bool ) oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(context.TODO(), podName.Name, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("could not get pod: %v", err) + return fmt.Errorf("could not get master pod: %v", err) } c.logger.Infof("starting process to migrate master pod %q", podName) - if eol, err = c.podIsEndOfLife(oldMaster); err != nil { return fmt.Errorf("could not get node %q: %v", oldMaster.Spec.NodeName, err) } @@ -250,21 +247,21 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error { } c.Statefulset = sset } - // We may not have a cached statefulset if the initial cluster sync has aborted, revert to the spec in that case. + // we may not have a cached statefulset if the initial cluster sync has aborted, revert to the spec in that case + masterCandidateName := podName + masterCandidatePod := oldMaster if *c.Statefulset.Spec.Replicas > 1 { if masterCandidateName, err = c.getSwitchoverCandidate(oldMaster); err != nil { return fmt.Errorf("could not find suitable replica pod as candidate for failover: %v", err) } + masterCandidatePod, err = c.KubeClient.Pods(masterCandidateName.Namespace).Get(context.TODO(), masterCandidateName.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("could not get master candidate pod: %v", err) + } } else { c.logger.Warningf("migrating single pod cluster %q, this will cause downtime of the Postgres cluster until pod is back", c.clusterName()) } - masterCandidatePod, err := c.KubeClient.Pods(masterCandidateName.Namespace).Get(context.TODO(), masterCandidateName.Name, metav1.GetOptions{}) - - if err != nil { - return fmt.Errorf("could not get master candidate pod: %v", err) - } - // there are two cases for each postgres cluster that has its master pod on the node to migrate from: // - the cluster has some replicas - migrate one of those if necessary and failover to it // - there are no replicas - just terminate the master and wait until it respawns