include toleration diff in stateful set sync

make taint e2e test more robust

change merge behavior of nodeSelectorTerms for node readiness label
Felix Kunde 2021-12-23 13:47:09 +01:00
parent 95785b813c
commit ced0eae14a
2 changed files with 77 additions and 63 deletions
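The first change listed above ("include toleration diff in stateful set sync") touches the operator's statefulset sync, which is Go code and not part of this two-file diff: tolerations are now considered when diffing the desired against the running statefulset. As a rough illustration only, and not the operator's actual implementation, an order-insensitive comparison of two toleration lists could look like this in Python:

# Illustrative sketch only: the real comparison happens in the operator's Go
# statefulset sync, which is not shown in this diff. Field names follow the
# Kubernetes toleration spec; the example values below are made up.
def tolerations_equal(current, desired):
    """Compare two lists of toleration dicts while ignoring list order."""
    def normalize(tolerations):
        return sorted(
            ({k: t.get(k) for k in ("key", "operator", "value", "effect", "tolerationSeconds")}
             for t in tolerations),
            key=lambda t: (t["key"] or "", t["effect"] or ""),
        )
    return normalize(current) == normalize(desired)

# A toleration added via the operator configuration would now register as a
# diff against a statefulset that does not carry it yet and trigger a sync.
print(tolerations_equal([], [{"key": "postgres", "operator": "Exists", "effect": "NoSchedule"}]))  # False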


@@ -53,7 +53,7 @@ class K8s:
         return master_pod_node, replica_pod_nodes

-    def get_cluster_nodes(self, cluster_labels='cluster-name=acid-minimal-cluster', namespace='default'):
+    def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
         m = []
         r = []
         podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels)
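Only the head of get_cluster_nodes() appears in this hunk; the label selector now also requires application=spilo so pods of other deployments are ignored. The remainder of the helper presumably splits the pods by their spilo-role label and returns the node names per role; the loop below is a sketch of that shape, not code from this commit:

# Sketch: only the signature and the first three lines above are part of this
# commit; the classification loop is an assumption for illustration.
def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
    m = []
    r = []
    podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels)
    for pod in podsList.items:
        role = pod.metadata.labels.get('spilo-role')
        if pod.status.phase == 'Running' and role == 'master':
            m.append(pod.spec.node_name)
        elif pod.status.phase == 'Running' and role == 'replica':
            r.append(pod.spec.node_name)
    return m, r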


@@ -883,9 +883,9 @@ class EndToEndTestCase(unittest.TestCase):
         self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")

         # get nodes of master and replica(s)
-        master_node, replica_nodes = k8s.get_pg_nodes(cluster_label)
-        self.assertNotEqual(master_node, [])
+        master_nodes, replica_nodes = k8s.get_cluster_nodes()
+        self.assertNotEqual(master_nodes, [])
         self.assertNotEqual(replica_nodes, [])

         # label node with environment=postgres
@@ -898,8 +898,8 @@ class EndToEndTestCase(unittest.TestCase):
            }
         }
         try:
-            # patch current master node with the label
-            k8s.api.core_v1.patch_node(master_node, node_label_body)
+            # patch master node with the label
+            k8s.api.core_v1.patch_node(master_nodes[0], node_label_body)

             # add node affinity to cluster
             patch_node_affinity_config = {
@@ -923,7 +923,6 @@ class EndToEndTestCase(unittest.TestCase):
                     }
                 }
             }
             k8s.api.custom_objects_api.patch_namespaced_custom_object(
                 group="acid.zalan.do",
                 version="v1",
@@ -934,14 +933,17 @@ class EndToEndTestCase(unittest.TestCase):
             self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

             # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement
-            k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label)
+            k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+            # master pod needs to be replaced as well to finish the rolling update
+            k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
             k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)

             podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
             for pod in podsList.items:
                 if pod.metadata.labels.get('spilo-role') == 'replica':
-                    self.assertEqual(master_node, pod.spec.node_name,
-                        "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name))
+                    self.assertEqual(master_nodes[0], pod.spec.node_name,
+                        "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_nodes[0], pod.spec.node_name))

                     # check that pod has correct node affinity
                     key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key
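The reworked flow now waits twice with k8s.wait_for_pod_failover(nodes, labels). That helper is defined in the e2e K8s wrapper and is not shown in this diff; presumably it polls until a pod matching the label selector is running on one of the given nodes. A minimal sketch under that assumption (the client is passed explicitly here only to keep the sketch standalone):

import time

# Assumed behaviour of wait_for_pod_failover: poll until a pod matching the
# label selector is Running on one of the target nodes. The real helper lives
# in the e2e K8s wrapper; the retry interval below is made up.
def wait_for_pod_failover(core_v1, target_nodes, labels, namespace='default', interval=5):
    while True:
        pods = core_v1.list_namespaced_pod(namespace, label_selector=labels).items
        if pods and pods[0].status.phase == 'Running' and pods[0].spec.node_name in target_nodes:
            return
        time.sleep(interval)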
@@ -966,7 +968,7 @@ class EndToEndTestCase(unittest.TestCase):
             self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

             # node affinity change should cause another rolling update and relocation of replica
-            k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
+            k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=replica,' + cluster_label)
             k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)

         except timeout_decorator.TimeoutError:
@@ -974,7 +976,6 @@ class EndToEndTestCase(unittest.TestCase):
             raise

     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
-    @unittest.skip("Skipping this test until fixed")
     def test_node_readiness_label(self):
         '''
            Remove node readiness label from master node. This must cause a failover.
@@ -984,12 +985,19 @@ class EndToEndTestCase(unittest.TestCase):
         readiness_label = 'lifecycle-status'
         readiness_value = 'ready'

-        try:
-            # get nodes of master and replica(s) (expected target of new master)
-            current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
-            num_replicas = len(current_replica_nodes)
-            failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes)
+        # verify we are in good state from potential previous tests
+        self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
+        self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
+
+        # get nodes of master and replica(s) (expected target of new master)
+        master_nodes, replica_nodes = k8s.get_cluster_nodes()
+        self.assertNotEqual(master_nodes, [])
+        self.assertNotEqual(replica_nodes, [])
+        num_replicas = len(replica_nodes)
+        failover_targets = self.get_failover_targets(master_nodes[0], replica_nodes)
+
+        try:
             # add node_readiness_label to potential failover nodes
             patch_readiness_label = {
                 "metadata": {
@@ -998,7 +1006,6 @@ class EndToEndTestCase(unittest.TestCase):
                     }
                 }
             }
-            self.assertTrue(len(failover_targets) > 0, "No failover targets available")
             for failover_target in failover_targets:
                 k8s.api.core_v1.patch_node(failover_target, patch_readiness_label)
@@ -1009,19 +1016,31 @@ class EndToEndTestCase(unittest.TestCase):
                 }
             }
             k8s.update_config(patch_readiness_label_config, "setting readiness label")
-            new_master_node, new_replica_nodes = self.assert_failover(
-                current_master_node, num_replicas, failover_targets, cluster_label)
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+            # node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement
+            k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label)
+
+            # the replica however will not start due to a volume node affinity conflict
+            # only if the pvc and pod are deleted it can be scheduled
+            replica = k8s.get_cluster_replica_pod()
+            if replica.status.phase == 'Pending':
+                k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default')
+                k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default')
+                k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)

             # patch also node where master ran before
-            k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label)
-
-            # toggle pod anti affinity to move replica away from master node
-            self.eventuallyTrue(lambda: self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label), "Pods are redistributed")
+            k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label)

         except timeout_decorator.TimeoutError:
             print('Operator log: {}'.format(k8s.get_operator_log()))
             raise

+        # toggle pod anti affinity to move replica away from master node
+        self.eventuallyTrue(lambda: self.assert_distributed_pods(master_nodes), "Pods are redistributed")
+
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_overwrite_pooler_deployment(self):
         k8s = self.k8s
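The readiness-label flow above now calls k8s.get_cluster_replica_pod(), which is not defined anywhere in this diff. Presumably it simply returns the first pod labelled spilo-role=replica of the test cluster, roughly like this sketch:

# Assumption: get_cluster_replica_pod() returns the first replica pod of the
# test cluster; the real helper sits in the e2e K8s wrapper, not in this diff.
def get_cluster_replica_pod(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
    pods = self.api.core_v1.list_namespaced_pod(
        namespace, label_selector='spilo-role=replica,' + labels).items
    return pods[0] if pods else None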
@@ -1426,7 +1445,6 @@ class EndToEndTestCase(unittest.TestCase):
         # get nodes of master and replica(s) (expected target of new master)
         master_nodes, replica_nodes = k8s.get_cluster_nodes()
         self.assertNotEqual(master_nodes, [])
         self.assertNotEqual(replica_nodes, [])
@@ -1441,10 +1459,7 @@ class EndToEndTestCase(unittest.TestCase):
                 ]
             }
         }
         k8s.api.core_v1.patch_node(master_nodes[0], body)
-        self.eventuallyTrue(lambda: k8s.get_cluster_nodes()[0], replica_nodes)
-        self.assertNotEqual(lambda: k8s.get_cluster_nodes()[0], master_nodes)

         # add toleration to pods
         patch_toleration_config = {
@@ -1453,15 +1468,20 @@ class EndToEndTestCase(unittest.TestCase):
             }
         }
-        k8s.update_config(patch_toleration_config, step="allow tainted nodes")
-        self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
-        self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
+        try:
+            k8s.update_config(patch_toleration_config, step="allow tainted nodes")
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"},
+                                 "Operator does not get in sync")
+            self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
+            self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
+        except timeout_decorator.TimeoutError:
+            print('Operator log: {}'.format(k8s.get_operator_log()))
+            raise

         # toggle pod anti affinity to move replica away from master node
-        nm, new_replica_nodes = k8s.get_cluster_nodes()
-        new_master_node = nm[0]
-        self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
+        self.assert_distributed_pods(replica_nodes)

     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_zz_cluster_deletion(self):
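For orientation, the taint test above patches the master's node with a taint (the patch body is only partially visible in the hunk) and then relies on the operator to roll out a matching toleration to the pods. The key, value and effect below are placeholders for illustration, not the values used by the actual test:

# Placeholder taint: key/value/effect are made up for illustration. Patching
# spec.taints is the standard way to taint a node via the Kubernetes API.
taint_patch_body = {
    "spec": {
        "taints": [
            {"key": "postgres", "value": "scheduling", "effect": "NoSchedule"}
        ]
    }
}
# k8s.api.core_v1.patch_node(master_nodes[0], taint_patch_body)

# With the toleration diff now part of the statefulset sync, the operator has
# to render a matching pod toleration, conceptually:
matching_toleration = {
    "key": "postgres",
    "operator": "Equal",
    "value": "scheduling",
    "effect": "NoSchedule"
}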
@@ -1564,23 +1584,6 @@ class EndToEndTestCase(unittest.TestCase):
         return failover_targets

-    def assert_failover(self, current_master_node, num_replicas, failover_targets, cluster_label):
-        '''
-           Check if master is failing over. The replica should move first to be the switchover target
-        '''
-        k8s = self.k8s
-        k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label)
-        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
-
-        new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
-        self.assertNotEqual(current_master_node, new_master_node,
-                            "Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
-        self.assertEqual(num_replicas, len(new_replica_nodes),
-                         "Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
-        self.assert_master_is_unique()
-
-        return new_master_node, new_replica_nodes
-
     def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"):
         '''
            Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
@@ -1592,14 +1595,13 @@ class EndToEndTestCase(unittest.TestCase):
         num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
         self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))

-    def assert_distributed_pods(self, master_node, replica_nodes, cluster_label):
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=acid-minimal-cluster'):
         '''
            Other tests can lead to the situation that master and replica are on the same node.
           Toggle pod anti affinty to distribute pods accross nodes (replica in particular).
         '''
         k8s = self.k8s
-        cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
-        failover_targets = self.get_failover_targets(master_node, replica_nodes)

         # enable pod anti affintiy in config map which should trigger movement of replica
         patch_enable_antiaffinity = {
@@ -1607,18 +1609,30 @@ class EndToEndTestCase(unittest.TestCase):
                 "enable_pod_antiaffinity": "true"
             }
         }
-        k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
-        self.assert_failover(master_node, len(replica_nodes), failover_targets, cluster_label)

-        # now disable pod anti affintiy again which will cause yet another failover
-        patch_disable_antiaffinity = {
-            "data": {
-                "enable_pod_antiaffinity": "false"
-            }
-        }
-        k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
-        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
-        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        try:
+            k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+            k8s.wait_for_pod_failover(target_nodes, 'spilo-role=replica,' + cluster_labels)
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
+
+            # now disable pod anti affintiy again which will cause yet another failover
+            patch_disable_antiaffinity = {
+                "data": {
+                    "enable_pod_antiaffinity": "false"
+                }
+            }
+            k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+            k8s.wait_for_pod_start('spilo-role=master,' + cluster_labels)
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
+        except timeout_decorator.TimeoutError:
+            print('Operator log: {}'.format(k8s.get_operator_log()))
+            raise

         return True

     def list_databases(self, pod_name):
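Finally, the polling assertions used throughout the changed tests (eventuallyEqual, eventuallyTrue) are defined on the test base class and are not part of this diff. Their retry pattern is roughly the following; the retry count and interval here are assumptions, not the actual values:

import time

# Rough sketch of the polling assertions used above; the real helpers are
# defined on EndToEndTestCase and may differ in retries, interval and details.
def eventuallyEqual(self, f, expected, message, retries=60, interval=5):
    for _ in range(retries):
        if f() == expected:
            return
        time.sleep(interval)
    self.assertEqual(f(), expected, message)

def eventuallyTrue(self, f, message, retries=60, interval=5):
    for _ in range(retries):
        if f():
            return
        time.sleep(interval)
    self.assertTrue(f(), message)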