From e90a01050c68fef2e7e095b4c7b443a25730c25f Mon Sep 17 00:00:00 2001 From: Oleksii Kliukin Date: Mon, 16 Jul 2018 11:50:35 +0200 Subject: [PATCH] Switchover must wait for the inner goroutine before it returns. (#343) * Switchover must wait for the inner goroutine before it returns. Otherwise, two corner cases may happen: - waitForPodLabel writes to the podLabelErr channel that has been already closed by the outer routine - the outer routine exists and the caller subscribes to the pod the inner goroutine has already subscribed to, resulting in panic. The previous commit https://github.com/zalando-incubator/postgres-operator/commit/fe47f9ebeadd54639913296735158b42d17ee012 that touched that code added the cancellation channel, but didn't bother to actually wait for the goroutine to be cancelled. Per report and review from @valer-cara. Original issue: https://github.com/zalando-incubator/postgres-operator/issues/342 --- pkg/cluster/cluster.go | 43 ++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 2662cd521..1dd5fd6b1 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -867,14 +867,19 @@ func (c *Cluster) GetStatus() *spec.ClusterStatus { } // Switchover does a switchover (via Patroni) to a candidate pod -func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error { +func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) (err error) { + c.logger.Debugf("failing over from %q to %q", curMaster.Name, candidate) + var wg sync.WaitGroup + podLabelErr := make(chan error) stopCh := make(chan struct{}) - defer close(podLabelErr) + + wg.Add(1) go func() { + defer wg.Done() ch := c.registerPodSubscriber(candidate) defer c.unregisterPodSubscriber(candidate) @@ -882,26 +887,32 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) e select { case <-stopCh: - case podLabelErr <- func() error { - _, err := c.waitForPodLabel(ch, stopCh, &role) - return err + case podLabelErr <- func() (err error) { + _, err = c.waitForPodLabel(ch, stopCh, &role) + return }(): } }() - if err := c.patroni.Switchover(curMaster, candidate.Name); err != nil { - close(stopCh) - return fmt.Errorf("could not failover: %v", err) - } - c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate) - - defer close(stopCh) - - if err := <-podLabelErr; err != nil { - return fmt.Errorf("could not get master pod label: %v", err) + if err = c.patroni.Switchover(curMaster, candidate.Name); err == nil { + c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate) + if err = <-podLabelErr; err != nil { + err = fmt.Errorf("could not get master pod label: %v", err) + } + } else { + err = fmt.Errorf("could not failover: %v", err) } - return nil + // signal the role label waiting goroutine to close the shop and go home + close(stopCh) + // wait until the goroutine terminates, since unregisterPodSubscriber + // must be called before the outer return; otherwsise we risk subscribing to the same pod twice. + wg.Wait() + // close the label waiting channel no sooner than the waiting goroutine terminates. + close(podLabelErr) + + return + } // Lock locks the cluster