Switchover must wait for the inner goroutine before it returns. (#343)

* Switchover must wait for the inner goroutine before it returns.

Otherwise, two corner cases may happen:

 - waitForPodLabel writes to the podLabelErr channel that has been
   already closed by the outer routine

 - the outer routine exists and the caller subscribes to the pod
   the inner goroutine has already subscribed to, resulting in panic.

 The previous commit fe47f9ebea
 that touched that code added the cancellation channel, but didn't bother
 to actually wait for the goroutine to be cancelled.

 Per report and review from @valer-cara.
 Original issue: https://github.com/zalando-incubator/postgres-operator/issues/342
This commit is contained in:
Oleksii Kliukin 2018-07-16 11:50:35 +02:00 committed by GitHub
parent b7b950eb28
commit e90a01050c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 27 additions and 16 deletions

View File

@ -867,14 +867,19 @@ func (c *Cluster) GetStatus() *spec.ClusterStatus {
}
// Switchover does a switchover (via Patroni) to a candidate pod
func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) (err error) {
c.logger.Debugf("failing over from %q to %q", curMaster.Name, candidate)
var wg sync.WaitGroup
podLabelErr := make(chan error)
stopCh := make(chan struct{})
defer close(podLabelErr)
wg.Add(1)
go func() {
defer wg.Done()
ch := c.registerPodSubscriber(candidate)
defer c.unregisterPodSubscriber(candidate)
@ -882,26 +887,32 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) e
select {
case <-stopCh:
case podLabelErr <- func() error {
_, err := c.waitForPodLabel(ch, stopCh, &role)
return err
case podLabelErr <- func() (err error) {
_, err = c.waitForPodLabel(ch, stopCh, &role)
return
}():
}
}()
if err := c.patroni.Switchover(curMaster, candidate.Name); err != nil {
close(stopCh)
return fmt.Errorf("could not failover: %v", err)
}
c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate)
defer close(stopCh)
if err := <-podLabelErr; err != nil {
return fmt.Errorf("could not get master pod label: %v", err)
if err = c.patroni.Switchover(curMaster, candidate.Name); err == nil {
c.logger.Debugf("successfully failed over from %q to %q", curMaster.Name, candidate)
if err = <-podLabelErr; err != nil {
err = fmt.Errorf("could not get master pod label: %v", err)
}
} else {
err = fmt.Errorf("could not failover: %v", err)
}
return nil
// signal the role label waiting goroutine to close the shop and go home
close(stopCh)
// wait until the goroutine terminates, since unregisterPodSubscriber
// must be called before the outer return; otherwsise we risk subscribing to the same pod twice.
wg.Wait()
// close the label waiting channel no sooner than the waiting goroutine terminates.
close(podLabelErr)
return
}
// Lock locks the cluster