Unify warnings about unmovable pods (#389)
* Unify warnings about unmovable pods * Log conditions that prevent master pod migration
This commit is contained in:
parent
d6e6b00770
commit
4fa09e0dcb
|
|
@ -7,7 +7,10 @@ import (
|
||||||
"k8s.io/apimachinery/pkg/runtime"
|
"k8s.io/apimachinery/pkg/runtime"
|
||||||
"k8s.io/apimachinery/pkg/watch"
|
"k8s.io/apimachinery/pkg/watch"
|
||||||
|
|
||||||
|
"fmt"
|
||||||
|
|
||||||
"github.com/zalando-incubator/postgres-operator/pkg/cluster"
|
"github.com/zalando-incubator/postgres-operator/pkg/cluster"
|
||||||
|
"github.com/zalando-incubator/postgres-operator/pkg/spec"
|
||||||
"github.com/zalando-incubator/postgres-operator/pkg/util"
|
"github.com/zalando-incubator/postgres-operator/pkg/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -55,15 +58,16 @@ func (c *Controller) nodeUpdate(prev, cur interface{}) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if util.MapContains(nodeCur.Labels, map[string]string{"master": "true"}) {
|
if !c.nodeIsReady(nodePrev) {
|
||||||
|
c.logger.Debugf("The decommissioned node %v should have already triggered master pod migration. Previous k8s-reported state of the node: %v", util.NameFromMeta(nodePrev.ObjectMeta), nodePrev)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// do nothing if the node should have already triggered an update or
|
if c.nodeIsReady(nodeCur) {
|
||||||
// if only one of the label and the unschedulability criteria are met.
|
c.logger.Debugf("The decommissioned node %v become schedulable again. Current k8s-reported state of the node: %v", util.NameFromMeta(nodeCur.ObjectMeta), nodeCur)
|
||||||
if !c.nodeIsReady(nodePrev) || c.nodeIsReady(nodeCur) {
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
c.moveMasterPodsOffNode(nodeCur)
|
c.moveMasterPodsOffNode(nodeCur)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -73,8 +77,9 @@ func (c *Controller) nodeIsReady(node *v1.Node) bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
|
|
||||||
nodeName := util.NameFromMeta(node.ObjectMeta)
|
nodeName := util.NameFromMeta(node.ObjectMeta)
|
||||||
c.logger.Infof("moving pods: node %q became unschedulable and does not have a ready label: %q",
|
c.logger.Infof("moving pods: node %q became unschedulable and does not have a ready label %q",
|
||||||
nodeName, c.opConfig.NodeReadinessLabel)
|
nodeName, c.opConfig.NodeReadinessLabel)
|
||||||
|
|
||||||
opts := metav1.ListOptions{
|
opts := metav1.ListOptions{
|
||||||
|
|
@ -82,7 +87,7 @@ func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
}
|
}
|
||||||
podList, err := c.KubeClient.Pods(c.opConfig.WatchedNamespace).List(opts)
|
podList, err := c.KubeClient.Pods(c.opConfig.WatchedNamespace).List(opts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.logger.Errorf("could not fetch list of the pods: %v", err)
|
c.logger.Errorf("could not fetch the list of Spilo pods: %v", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -93,17 +98,25 @@ func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
movedMasterPods := 0
|
||||||
|
movableMasterPods := make(map[*v1.Pod]*cluster.Cluster)
|
||||||
|
unmovablePods := make(map[spec.NamespacedName]string)
|
||||||
|
|
||||||
clusters := make(map[*cluster.Cluster]bool)
|
clusters := make(map[*cluster.Cluster]bool)
|
||||||
masterPods := make(map[*v1.Pod]*cluster.Cluster)
|
|
||||||
movedPods := 0
|
|
||||||
for _, pod := range nodePods {
|
for _, pod := range nodePods {
|
||||||
|
|
||||||
podName := util.NameFromMeta(pod.ObjectMeta)
|
podName := util.NameFromMeta(pod.ObjectMeta)
|
||||||
|
|
||||||
role, ok := pod.Labels[c.opConfig.PodRoleLabel]
|
role, ok := pod.Labels[c.opConfig.PodRoleLabel]
|
||||||
if !ok || cluster.PostgresRole(role) != cluster.Master {
|
|
||||||
if !ok {
|
if !ok {
|
||||||
c.logger.Warningf("could not move pod %q: pod has no role", podName)
|
// pods with an unknown role cannot be safely moved to another node
|
||||||
|
unmovablePods[podName] = fmt.Sprintf("could not move pod %q from node %q: pod has no role label %q", podName, nodeName, c.opConfig.PodRoleLabel)
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// deployments can transparently re-create replicas so we do not move away such pods
|
||||||
|
if cluster.PostgresRole(role) == cluster.Replica {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -113,7 +126,7 @@ func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
cl, ok := c.clusters[clusterName]
|
cl, ok := c.clusters[clusterName]
|
||||||
c.clustersMu.RUnlock()
|
c.clustersMu.RUnlock()
|
||||||
if !ok {
|
if !ok {
|
||||||
c.logger.Warningf("could not move pod %q: pod does not belong to a known cluster", podName)
|
unmovablePods[podName] = fmt.Sprintf("could not move master pod %q from node %q: pod belongs to an unknown Postgres cluster %q", podName, nodeName, clusterName)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -121,20 +134,20 @@ func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
clusters[cl] = true
|
clusters[cl] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
masterPods[pod] = cl
|
movableMasterPods[pod] = cl
|
||||||
}
|
}
|
||||||
|
|
||||||
for cl := range clusters {
|
for cl := range clusters {
|
||||||
cl.Lock()
|
cl.Lock()
|
||||||
}
|
}
|
||||||
|
|
||||||
for pod, cl := range masterPods {
|
for pod, cl := range movableMasterPods {
|
||||||
podName := util.NameFromMeta(pod.ObjectMeta)
|
|
||||||
|
|
||||||
if err := cl.MigrateMasterPod(podName); err != nil {
|
podName := util.NameFromMeta(pod.ObjectMeta)
|
||||||
c.logger.Errorf("could not move master pod %q: %v", podName, err)
|
if err := cl.MigrateMasterPod(podName); err == nil {
|
||||||
|
movedMasterPods++
|
||||||
} else {
|
} else {
|
||||||
movedPods++
|
unmovablePods[podName] = fmt.Sprintf("could not move master pod %q from node %q: %v", podName, nodeName, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -142,15 +155,16 @@ func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
|
||||||
cl.Unlock()
|
cl.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
totalPods := len(masterPods)
|
if leftPods := len(unmovablePods); leftPods > 0 {
|
||||||
|
c.logger.Warnf("could not move %d master or unknown role pods from the node %q, you may have to delete them manually",
|
||||||
c.logger.Infof("%d/%d master pods have been moved out from the %q node",
|
leftPods, nodeName)
|
||||||
movedPods, totalPods, nodeName)
|
for _, reason := range unmovablePods {
|
||||||
|
c.logger.Warning(reason)
|
||||||
if leftPods := totalPods - movedPods; leftPods > 0 {
|
|
||||||
c.logger.Warnf("could not move master %d/%d pods from the %q node",
|
|
||||||
leftPods, totalPods, nodeName)
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
c.logger.Infof("%d master pods have been moved out from the node %q", movedMasterPods, nodeName)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Controller) nodeDelete(obj interface{}) {
|
func (c *Controller) nodeDelete(obj interface{}) {
|
||||||
|
|
|
||||||
|
|
@ -121,7 +121,7 @@ function deploy_self_built_image() {
|
||||||
# update the tag in the postgres operator conf
|
# update the tag in the postgres operator conf
|
||||||
# since the image with this tag already exists on the machine,
|
# since the image with this tag already exists on the machine,
|
||||||
# docker should not attempt to fetch it from the registry due to imagePullPolicy
|
# docker should not attempt to fetch it from the registry due to imagePullPolicy
|
||||||
sed --expression "s/\(image\:.*\:\).*$/\1$TAG/" manifests/postgres-operator.yaml > "$PATH_TO_LOCAL_OPERATOR_MANIFEST"
|
sed --expression "s/\(image\:.*\:\).*$/\1$TAG/; s/smoke-tested-//" manifests/postgres-operator.yaml > "$PATH_TO_LOCAL_OPERATOR_MANIFEST"
|
||||||
|
|
||||||
retry "kubectl create -f \"$PATH_TO_LOCAL_OPERATOR_MANIFEST\"" "attempt to create $PATH_TO_LOCAL_OPERATOR_MANIFEST resource"
|
retry "kubectl create -f \"$PATH_TO_LOCAL_OPERATOR_MANIFEST\"" "attempt to create $PATH_TO_LOCAL_OPERATOR_MANIFEST resource"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue