package cluster import ( "fmt" "math/rand" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/pkg/api/v1" "github.com/zalando-incubator/postgres-operator/pkg/spec" "github.com/zalando-incubator/postgres-operator/pkg/util" ) func (c *Cluster) listPods() ([]v1.Pod, error) { listOptions := metav1.ListOptions{ LabelSelector: c.labelsSet().String(), } pods, err := c.KubeClient.Pods(c.Namespace).List(listOptions) if err != nil { return nil, fmt.Errorf("could not get list of pods: %v", err) } return pods.Items, nil } func (c *Cluster) getRolePods(role PostgresRole) ([]v1.Pod, error) { listOptions := metav1.ListOptions{ LabelSelector: c.roleLabelsSet(role).String(), } pods, err := c.KubeClient.Pods(c.Namespace).List(listOptions) if err != nil { return nil, fmt.Errorf("could not get list of pods: %v", err) } if len(pods.Items) == 0 { return nil, fmt.Errorf("no pods") } if role == Master && len(pods.Items) > 1 { return nil, fmt.Errorf("too many masters") } return pods.Items, nil } func (c *Cluster) deletePods() error { c.logger.Debugln("deleting pods") pods, err := c.listPods() if err != nil { return err } for _, obj := range pods { podName := util.NameFromMeta(obj.ObjectMeta) c.logger.Debugf("deleting pod %q", podName) if err := c.deletePod(podName); err != nil { c.logger.Errorf("could not delete pod %q: %v", podName, err) } else { c.logger.Infof("pod %q has been deleted", podName) } } if len(pods) > 0 { c.logger.Debugln("pods have been deleted") } else { c.logger.Debugln("no pods to delete") } return nil } func (c *Cluster) deletePod(podName spec.NamespacedName) error { c.setProcessName("deleting pod %q", podName) ch := c.registerPodSubscriber(podName) defer c.unregisterPodSubscriber(podName) if err := c.KubeClient.Pods(podName.Namespace).Delete(podName.Name, c.deleteOptions); err != nil { return err } if err := c.waitForPodDeletion(ch); err != nil { return err } return nil } func (c *Cluster) unregisterPodSubscriber(podName spec.NamespacedName) { c.logger.Debugf("unsubscribing from pod %q events", podName) c.podSubscribersMu.Lock() defer c.podSubscribersMu.Unlock() if _, ok := c.podSubscribers[podName]; !ok { panic("subscriber for pod '" + podName.String() + "' is not found") } close(c.podSubscribers[podName]) delete(c.podSubscribers, podName) } func (c *Cluster) registerPodSubscriber(podName spec.NamespacedName) chan spec.PodEvent { c.logger.Debugf("subscribing to pod %q", podName) c.podSubscribersMu.Lock() defer c.podSubscribersMu.Unlock() ch := make(chan spec.PodEvent) if _, ok := c.podSubscribers[podName]; ok { panic("pod '" + podName.String() + "' is already subscribed") } c.podSubscribers[podName] = ch return ch } func (c *Cluster) movePodFromEndOfLifeNode(pod *v1.Pod) (*v1.Pod, error) { podName := util.NameFromMeta(pod.ObjectMeta) if eol, err := c.podIsEndOfLife(pod); err != nil { return nil, fmt.Errorf("could not get node %q: %v", pod.Spec.NodeName, err) } else if !eol { c.logger.Infof("pod %q is already on a live node", podName) return pod, nil } c.setProcessName("moving pod %q out of end-of-life node %q", podName, pod.Spec.NodeName) c.logger.Infof("moving pod %q out of the end-of-life node %q", podName, pod.Spec.NodeName) if err := c.recreatePod(podName); err != nil { return nil, fmt.Errorf("could not move pod: %v", err) } newPod, err := c.KubeClient.Pods(podName.Namespace).Get(podName.Name, metav1.GetOptions{}) if err != nil { return nil, fmt.Errorf("could not get pod: %v", err) } if newPod.Spec.NodeName == pod.Spec.NodeName { return nil, fmt.Errorf("pod %q remained on the same node", podName) } if eol, err := c.podIsEndOfLife(newPod); err != nil { return nil, fmt.Errorf("could not get node %q: %v", pod.Spec.NodeName, err) } else if eol { c.logger.Warningf("pod %q moved to end-of-life node %q", podName, newPod.Spec.NodeName) return newPod, nil } c.logger.Infof("pod %q moved from node %q to node %q", podName, pod.Spec.NodeName, newPod.Spec.NodeName) return newPod, nil } func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) { replicas, err := c.getRolePods(Replica) if err != nil { return nil, fmt.Errorf("could not get replica pods: %v", err) } for i, pod := range replicas { // look for replicas running on live nodes. Ignore errors when querying the nodes. if pod.Spec.NodeName != oldNodeName { eol, err := c.podIsEndOfLife(&pod) if err == nil && !eol { return &replicas[i], nil } } } c.logger.Debug("no available master candidates on live nodes") return &replicas[rand.Intn(len(replicas))], nil } // MigrateMasterPod migrates master pod via failover to a replica func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error { oldMaster, err := c.KubeClient.Pods(podName.Namespace).Get(podName.Name, metav1.GetOptions{}) if err != nil { return fmt.Errorf("could not get pod: %v", err) } c.logger.Infof("migrating master pod %q", podName) if eol, err := c.podIsEndOfLife(oldMaster); err != nil { return fmt.Errorf("could not get node %q: %v", oldMaster.Spec.NodeName, err) } else if !eol { c.logger.Debugf("pod is already on a live node") return nil } if role := PostgresRole(oldMaster.Labels[c.OpConfig.PodRoleLabel]); role != Master { c.logger.Warningf("pod %q is not a master", podName) return nil } masterCandidatePod, err := c.masterCandidate(oldMaster.Spec.NodeName) if err != nil { return fmt.Errorf("could not get new master candidate: %v", err) } pod, err := c.movePodFromEndOfLifeNode(masterCandidatePod) if err != nil { return fmt.Errorf("could not move pod: %v", err) } masterCandidateName := util.NameFromMeta(pod.ObjectMeta) if err := c.ManualFailover(oldMaster, masterCandidateName); err != nil { return fmt.Errorf("could not failover to pod %q: %v", masterCandidateName, err) } _, err = c.movePodFromEndOfLifeNode(oldMaster) if err != nil { return fmt.Errorf("could not move pod: %v", err) } return nil } // MigrateReplicaPod recreates pod on a new node func (c *Cluster) MigrateReplicaPod(podName spec.NamespacedName, fromNodeName string) error { replicaPod, err := c.KubeClient.Pods(podName.Namespace).Get(podName.Name, metav1.GetOptions{}) if err != nil { return fmt.Errorf("could not get pod: %v", err) } c.logger.Infof("migrating replica pod %q", podName) if replicaPod.Spec.NodeName != fromNodeName { c.logger.Infof("pod %q has already migrated to node %q", podName, replicaPod.Spec.NodeName) return nil } if role := PostgresRole(replicaPod.Labels[c.OpConfig.PodRoleLabel]); role != Replica { return fmt.Errorf("pod %q is not a replica", podName) } _, err = c.movePodFromEndOfLifeNode(replicaPod) if err != nil { return fmt.Errorf("could not move pod: %v", err) } return nil } func (c *Cluster) recreatePod(podName spec.NamespacedName) error { ch := c.registerPodSubscriber(podName) defer c.unregisterPodSubscriber(podName) if err := c.KubeClient.Pods(podName.Namespace).Delete(podName.Name, c.deleteOptions); err != nil { return fmt.Errorf("could not delete pod: %v", err) } if err := c.waitForPodDeletion(ch); err != nil { return err } if err := c.waitForPodLabel(ch, nil); err != nil { return err } c.logger.Infof("pod %q has been recreated", podName) return nil } func (c *Cluster) recreatePods() error { c.setProcessName("recreating pods") ls := c.labelsSet() namespace := c.Namespace listOptions := metav1.ListOptions{ LabelSelector: ls.String(), } pods, err := c.KubeClient.Pods(namespace).List(listOptions) if err != nil { return fmt.Errorf("could not get the list of pods: %v", err) } c.logger.Infof("there are %d pods in the cluster to recreate", len(pods.Items)) var masterPod *v1.Pod replicas := make([]spec.NamespacedName, 0) for i, pod := range pods.Items { role := PostgresRole(pod.Labels[c.OpConfig.PodRoleLabel]) if role == Master { masterPod = &pods.Items[i] continue } podName := util.NameFromMeta(pods.Items[i].ObjectMeta) if err := c.recreatePod(podName); err != nil { return fmt.Errorf("could not recreate replica pod %q: %v", util.NameFromMeta(pod.ObjectMeta), err) } replicas = append(replicas, util.NameFromMeta(pod.ObjectMeta)) } if masterPod == nil { c.logger.Warningln("no master pod in the cluster") } else { if len(replicas) > 0 { err := c.ManualFailover(masterPod, masterCandidate(replicas)) if err != nil { return fmt.Errorf("could not perform manual failover: %v", err) } } //TODO: specify master, leave new master empty c.logger.Infof("recreating master pod %q", util.NameFromMeta(masterPod.ObjectMeta)) if err := c.recreatePod(util.NameFromMeta(masterPod.ObjectMeta)); err != nil { return fmt.Errorf("could not recreate master pod %q: %v", util.NameFromMeta(masterPod.ObjectMeta), err) } } return nil } func (c *Cluster) podIsEndOfLife(pod *v1.Pod) (bool, error) { node, err := c.KubeClient.Nodes().Get(pod.Spec.NodeName, metav1.GetOptions{}) if err != nil { return false, err } return node.Spec.Unschedulable || util.MapContains(node.Labels, c.OpConfig.EOLNodeLabel), nil }