add e2e test for node readiness label

Felix Kunde 2020-02-25 17:40:29 +01:00
parent b24da3201c
commit 5774fce104
2 changed files with 70 additions and 15 deletions
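What the new test covers: the operator's node_readiness_label option restricts Postgres pods to nodes that carry a given label (here lifecycle-status=ready); once the option is set, a master running on an unlabeled node has to fail over. Outside the e2e harness the same setup can be reproduced roughly as in the sketch below, using the kubernetes Python client. The ConfigMap name "postgres-operator", the default namespace, and how the operator picks up the changed config are assumptions, not part of this commit.

    # sketch only -- assumes kubeconfig access and an operator ConfigMap named "postgres-operator"
    from kubernetes import client, config

    def require_readiness_label(target_nodes, label='lifecycle-status', value='ready'):
        config.load_kube_config()
        core_v1 = client.CoreV1Api()

        # label the nodes that are allowed to run Postgres pods
        node_patch = {"metadata": {"labels": {label: value}}}
        for node in target_nodes:
            core_v1.patch_node(node, node_patch)

        # require that label via the operator configuration
        config_patch = {"data": {"node_readiness_label": label + ':' + value}}
        core_v1.patch_namespaced_config_map("postgres-operator", "default", config_patch)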


@@ -57,6 +57,7 @@ class EndToEndTestCase(unittest.TestCase):
         k8s.create_with_kubectl("manifests/minimal-postgres-manifest.yaml")
         k8s.wait_for_pod_start('spilo-role=master')
+        k8s.wait_for_pod_start('spilo-role=replica')

     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_enable_load_balancer(self):
@@ -190,6 +191,53 @@ class EndToEndTestCase(unittest.TestCase):
         self.assertEqual(2, k8s.count_pods_with_label(labels))
         self.assert_master_is_unique()

+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_node_readiness_label(self):
+        '''
+           Add the node readiness label to potential failover nodes and require it in the
+           operator config. The master node, which lacks the label, must then fail over.
+        '''
+        k8s = self.k8s
+        cluster_label = 'cluster-name=acid-minimal-cluster'
+        readiness_label = 'lifecycle-status'
+        readiness_value = 'ready'
+
+        # get nodes of master and replica(s) (expected target of new master)
+        current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
+        num_replicas = len(current_replica_nodes)
+        failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes)
+
+        # add node_readiness_label to potential failover nodes
+        patch_readiness_label = {
+            "metadata": {
+                "labels": {
+                    readiness_label: readiness_value
+                }
+            }
+        }
+        for failover_target in failover_targets:
+            k8s.api.core_v1.patch_node(failover_target, patch_readiness_label)
+
+        # define node_readiness_label in config map which should trigger a failover of the master
+        patch_readiness_label_config = {
+            "data": {
+                "node_readiness_label": readiness_label + ':' + readiness_value,
+            }
+        }
+        k8s.update_config(patch_readiness_label_config)
+
+        k8s.wait_for_master_failover(failover_targets)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+
+        new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
+        self.assertNotEqual(current_master_node, new_master_node,
+                            "Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
+        self.assertEqual(num_replicas, len(new_replica_nodes),
+                         "Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
+        self.assert_master_is_unique()
+
+        # patch also master node
+        k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label)

     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_taint_based_eviction(self):
         '''
@@ -199,16 +247,9 @@ class EndToEndTestCase(unittest.TestCase):
         cluster_label = 'cluster-name=acid-minimal-cluster'

         # get nodes of master and replica(s) (expected target of new master)
-        current_master_node, failover_targets = k8s.get_pg_nodes(cluster_label)
-        num_replicas = len(failover_targets)
-
-        # if all pods live on the same node, failover will happen to other worker(s)
-        failover_targets = [x for x in failover_targets if x != current_master_node]
-        if len(failover_targets) == 0:
-            nodes = k8s.api.core_v1.list_node()
-            for n in nodes.items:
-                if "node-role.kubernetes.io/master" not in n.metadata.labels and n.metadata.name != current_master_node:
-                    failover_targets.append(n.metadata.name)
+        current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
+        num_replicas = len(current_replica_nodes)
+        failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes)

         # taint node with postgres=:NoExecute to force failover
         body = {
@@ -346,12 +387,26 @@ class EndToEndTestCase(unittest.TestCase):
         }
         k8s.update_config(unpatch_custom_service_annotations)

+    def get_failover_targets(self, master_node, replica_nodes):
+        '''
+           If all pods live on the same node, failover will happen to other worker(s)
+        '''
+        k8s = self.k8s
+
+        failover_targets = [x for x in replica_nodes if x != master_node]
+        if len(failover_targets) == 0:
+            nodes = k8s.api.core_v1.list_node()
+            for n in nodes.items:
+                if "node-role.kubernetes.io/master" not in n.metadata.labels and n.metadata.name != master_node:
+                    failover_targets.append(n.metadata.name)
+
+        return failover_targets

     def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"):
         '''
            Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
            To be called manually after operations that affect pods
         '''
         k8s = self.k8s
         labels = 'spilo-role=master,cluster-name=' + clusterName
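Note: the new test relies on k8s.wait_for_master_failover from the existing e2e harness, which is not part of this diff. A minimal sketch of what such a helper could look like, assuming the kubernetes Python client and the label conventions used in these tests (timeout and poll interval are illustrative):

    # sketch only -- the real helper lives in the e2e test harness and may differ
    import time
    from kubernetes import client, config

    def wait_for_master_failover(expected_nodes, namespace='default',
                                 labels='spilo-role=master,cluster-name=acid-minimal-cluster',
                                 timeout=120, interval=5):
        config.load_kube_config()
        core_v1 = client.CoreV1Api()
        deadline = time.time() + timeout
        while time.time() < deadline:
            pods = core_v1.list_namespaced_pod(namespace, label_selector=labels).items
            # done once the single master pod is running on one of the expected nodes
            if pods and pods[0].status.phase == 'Running' and pods[0].spec.node_name in expected_nodes:
                return
            time.sleep(interval)
        raise Exception("master pod did not move to one of {}".format(expected_nodes))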


@@ -5,7 +5,7 @@ import (
 	"time"

 	"github.com/zalando/postgres-operator/pkg/util/retryutil"
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -172,19 +172,19 @@ func (c *Controller) nodeDelete(obj interface{}) {
 }

 func (c *Controller) moveMasterPodsOffNode(node *v1.Node) {
+	// retry to move master until configured timeout is reached
 	err := retryutil.Retry(1*time.Minute, c.opConfig.MasterPodMoveTimeout,
 		func() (bool, error) {
 			err := c.attemptToMoveMasterPodsOffNode(node)
 			if err != nil {
-				return false, fmt.Errorf("unable to move master pods off the unschedulable node; will retry after delay of 1 minute")
+				return false, err
 			}
 			return true, nil
 		},
 	)
 	if err != nil {
-		c.logger.Warningf("failed to move master pods from the node %q: timeout of %v minutes expired", node.Name, c.opConfig.MasterPodMoveTimeout)
+		c.logger.Warningf("failed to move master pods from the node %q: %v", node.Name, err)
 	}
 }
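Note on the Go change: retryutil.Retry now receives the underlying error from attemptToMoveMasterPodsOffNode, so the final warning reports the actual cause instead of a generic timeout message. The retry-and-propagate pattern, expressed as a small Python sketch for illustration (retryutil.Retry's real signature lives in the operator's util package and may differ):

    # sketch only -- illustrates the retry pattern, not the operator's implementation
    import time

    def retry(interval_sec, timeout_sec, action):
        '''Call action() until it succeeds or timeout_sec elapses; re-raise the last
        error so the caller can log the real cause rather than a fixed message.'''
        deadline = time.time() + timeout_sec
        last_error = None
        while time.time() < deadline:
            try:
                return action()
            except Exception as e:
                last_error = e
                time.sleep(interval_sec)
        if last_error:
            raise last_error
        raise TimeoutError("timeout of {}s expired".format(timeout_sec))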