Migrate only master pods. Migrate single masters. (#199)
Avoid migrating replica pods, since they will be handled by node draining anyway (the PodDisruptionBudget specifies that only masters are to be kept). Allow migration of single-pod clusters.
This commit is contained in:
parent
bb5ce6cbbe
commit
23011bdf9a
|
|
@ -34,10 +34,6 @@ func (c *Cluster) getRolePods(role PostgresRole) ([]v1.Pod, error) {
|
||||||
return nil, fmt.Errorf("could not get list of pods: %v", err)
|
return nil, fmt.Errorf("could not get list of pods: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(pods.Items) == 0 {
|
|
||||||
return nil, fmt.Errorf("no pods")
|
|
||||||
}
|
|
||||||
|
|
||||||
if role == Master && len(pods.Items) > 1 {
|
if role == Master && len(pods.Items) > 1 {
|
||||||
return nil, fmt.Errorf("too many masters")
|
return nil, fmt.Errorf("too many masters")
|
||||||
}
|
}
|
||||||
|
|
@ -158,6 +154,11 @@ func (c *Cluster) masterCandidate(oldNodeName string) (*v1.Pod, error) {
|
||||||
return nil, fmt.Errorf("could not get replica pods: %v", err)
|
return nil, fmt.Errorf("could not get replica pods: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(replicas) == 0 {
|
||||||
|
c.logger.Warningf("single master pod for cluster %q, migration will cause disruption of the service")
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
for i, pod := range replicas {
|
for i, pod := range replicas {
|
||||||
// look for replicas running on live nodes. Ignore errors when querying the nodes.
|
// look for replicas running on live nodes. Ignore errors when querying the nodes.
|
||||||
if pod.Spec.NodeName != oldNodeName {
|
if pod.Spec.NodeName != oldNodeName {
|
||||||
|
|
@ -198,21 +199,25 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
|
||||||
return fmt.Errorf("could not get new master candidate: %v", err)
|
return fmt.Errorf("could not get new master candidate: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
pod, err := c.movePodFromEndOfLifeNode(masterCandidatePod)
|
// there are two cases for each postgres cluster that has its master pod on the node to migrate from:
|
||||||
if err != nil {
|
// - the cluster has some replicas - migrate one of those if necessary and failover to it
|
||||||
return fmt.Errorf("could not move pod: %v", err)
|
// - there are no replicas - just terminate the master and wait until it respawns
|
||||||
}
|
// in both cases the result is the new master up and running on a new node.
|
||||||
|
if masterCandidatePod != nil {
|
||||||
|
pod, err := c.movePodFromEndOfLifeNode(masterCandidatePod)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("could not move pod: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
masterCandidateName := util.NameFromMeta(pod.ObjectMeta)
|
masterCandidateName := util.NameFromMeta(pod.ObjectMeta)
|
||||||
if err := c.ManualFailover(oldMaster, masterCandidateName); err != nil {
|
if err := c.ManualFailover(oldMaster, masterCandidateName); err != nil {
|
||||||
return fmt.Errorf("could not failover to pod %q: %v", masterCandidateName, err)
|
return fmt.Errorf("could not failover to pod %q: %v", masterCandidateName, err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if _, err = c.movePodFromEndOfLifeNode(oldMaster); err != nil {
|
||||||
|
return fmt.Errorf("could not move pod: %v", err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = c.movePodFromEndOfLifeNode(oldMaster)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not move pod: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -95,6 +95,9 @@ func (c *Cluster) preScaleDown(newStatefulSet *v1beta1.StatefulSet) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("could not get master pod: %v", err)
|
return fmt.Errorf("could not get master pod: %v", err)
|
||||||
}
|
}
|
||||||
|
if len(masterPod) == 0 {
|
||||||
|
return fmt.Errorf("no master pod is running in the cluster")
|
||||||
|
}
|
||||||
|
|
||||||
podNum, err := getPodIndex(masterPod[0].Name)
|
podNum, err := getPodIndex(masterPod[0].Name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ func (c *Controller) nodeAdd(obj interface{}) {
|
||||||
c.logger.Debugf("new node has been added: %q (%s)", util.NameFromMeta(node.ObjectMeta), node.Spec.ProviderID)
|
c.logger.Debugf("new node has been added: %q (%s)", util.NameFromMeta(node.ObjectMeta), node.Spec.ProviderID)
|
||||||
// check if the node became not ready while the operator was down (otherwise we would have caught it in nodeUpdate)
|
// check if the node became not ready while the operator was down (otherwise we would have caught it in nodeUpdate)
|
||||||
if !c.nodeIsReady(node) {
|
if !c.nodeIsReady(node) {
|
||||||
c.movePodsOffNode(node)
|
c.moveMasterPodsOffNode(node)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -64,7 +64,7 @@ func (c *Controller) nodeUpdate(prev, cur interface{}) {
|
||||||
if !c.nodeIsReady(nodePrev) || c.nodeIsReady(nodeCur) {
|
if !c.nodeIsReady(nodePrev) || c.nodeIsReady(nodeCur) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
c.movePodsOffNode(nodeCur)
|
c.moveMasterPodsOffNode(nodeCur)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Controller) nodeIsReady(node *v1.Node) bool {
|
func (c *Controller) nodeIsReady(node *v1.Node) bool {
|
||||||
|
|
@ -72,7 +72,7 @@ func (c *Controller) nodeIsReady(node *v1.Node) bool {
|
||||||
util.MapContains(node.Labels, map[string]string{"master": "true"}))
|
util.MapContains(node.Labels, map[string]string{"master": "true"}))
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Controller) movePodsOffNode(node *v1.Node) {
|
func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
nodeName := util.NameFromMeta(node.ObjectMeta)
|
nodeName := util.NameFromMeta(node.ObjectMeta)
|
||||||
c.logger.Infof("moving pods: node %q became unschedulable and does not have a ready label: %q",
|
c.logger.Infof("moving pods: node %q became unschedulable and does not have a ready label: %q",
|
||||||
nodeName, c.opConfig.NodeReadinessLabel)
|
nodeName, c.opConfig.NodeReadinessLabel)
|
||||||
|
|
@ -95,14 +95,15 @@ func (c *Controller) movePodsOffNode(node *v1.Node) {
|
||||||
|
|
||||||
clusters := make(map[*cluster.Cluster]bool)
|
clusters := make(map[*cluster.Cluster]bool)
|
||||||
masterPods := make(map[*v1.Pod]*cluster.Cluster)
|
masterPods := make(map[*v1.Pod]*cluster.Cluster)
|
||||||
replicaPods := make(map[*v1.Pod]*cluster.Cluster)
|
|
||||||
movedPods := 0
|
movedPods := 0
|
||||||
for _, pod := range nodePods {
|
for _, pod := range nodePods {
|
||||||
podName := util.NameFromMeta(pod.ObjectMeta)
|
podName := util.NameFromMeta(pod.ObjectMeta)
|
||||||
|
|
||||||
role, ok := pod.Labels[c.opConfig.PodRoleLabel]
|
role, ok := pod.Labels[c.opConfig.PodRoleLabel]
|
||||||
if !ok {
|
if !ok || cluster.PostgresRole(role) != cluster.Master {
|
||||||
c.logger.Warningf("could not move pod %q: pod has no role", podName)
|
if !ok {
|
||||||
|
c.logger.Warningf("could not move pod %q: pod has no role", podName)
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -116,17 +117,11 @@ func (c *Controller) movePodsOffNode(node *v1.Node) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
movedPods++
|
|
||||||
|
|
||||||
if !clusters[cl] {
|
if !clusters[cl] {
|
||||||
clusters[cl] = true
|
clusters[cl] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if cluster.PostgresRole(role) == cluster.Master {
|
masterPods[pod] = cl
|
||||||
masterPods[pod] = cl
|
|
||||||
} else {
|
|
||||||
replicaPods[pod] = cl
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for cl := range clusters {
|
for cl := range clusters {
|
||||||
|
|
@ -138,16 +133,8 @@ func (c *Controller) movePodsOffNode(node *v1.Node) {
|
||||||
|
|
||||||
if err := cl.MigrateMasterPod(podName); err != nil {
|
if err := cl.MigrateMasterPod(podName); err != nil {
|
||||||
c.logger.Errorf("could not move master pod %q: %v", podName, err)
|
c.logger.Errorf("could not move master pod %q: %v", podName, err)
|
||||||
movedPods--
|
} else {
|
||||||
}
|
movedPods++
|
||||||
}
|
|
||||||
|
|
||||||
for pod, cl := range replicaPods {
|
|
||||||
podName := util.NameFromMeta(pod.ObjectMeta)
|
|
||||||
|
|
||||||
if err := cl.MigrateReplicaPod(podName, node.Name); err != nil {
|
|
||||||
c.logger.Errorf("could not move replica pod %q: %v", podName, err)
|
|
||||||
movedPods--
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -155,13 +142,13 @@ func (c *Controller) movePodsOffNode(node *v1.Node) {
|
||||||
cl.Unlock()
|
cl.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
totalPods := len(nodePods)
|
totalPods := len(masterPods)
|
||||||
|
|
||||||
c.logger.Infof("%d/%d pods have been moved out from the %q node",
|
c.logger.Infof("%d/%d master pods have been moved out from the %q node",
|
||||||
movedPods, totalPods, nodeName)
|
movedPods, totalPods, nodeName)
|
||||||
|
|
||||||
if leftPods := totalPods - movedPods; leftPods > 0 {
|
if leftPods := totalPods - movedPods; leftPods > 0 {
|
||||||
c.logger.Warnf("could not move %d/%d pods from the %q node",
|
c.logger.Warnf("could not move master %d/%d pods from the %q node",
|
||||||
leftPods, totalPods, nodeName)
|
leftPods, totalPods, nodeName)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue