diff --git a/e2e/tests/k8s_api.py b/e2e/tests/k8s_api.py
index 0cb40535c..27135ebff 100644
--- a/e2e/tests/k8s_api.py
+++ b/e2e/tests/k8s_api.py
@@ -280,12 +280,8 @@ class K8s:
             return None
         return pod.items[0].spec.containers[0].image
 
-    def get_cluster_leader_pod(self, pg_cluster_name, namespace='default'):
-        labels = {
-            'application': 'spilo',
-            'cluster-name': pg_cluster_name,
-            'spilo-role': 'master',
-        }
+    def get_cluster_pod(self, role, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
+        labels = labels + ',spilo-role=' + role
 
         pods = self.api.core_v1.list_namespaced_pod(
             namespace, label_selector=to_selector(labels)).items
@@ -293,6 +289,12 @@ class K8s:
         if pods:
             return pods[0]
 
+    def get_cluster_leader_pod(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
+        return self.get_cluster_pod('master', labels, namespace)
+
+    def get_cluster_replica_pod(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
+        return self.get_cluster_pod('replica', labels, namespace)
+
 
 class K8sBase:
     '''
diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py
index 77733e759..4458b6c5f 100644
--- a/e2e/tests/test_e2e.py
+++ b/e2e/tests/test_e2e.py
@@ -334,7 +334,7 @@ class EndToEndTestCase(unittest.TestCase):
         #   credentials.
         dbList = []
 
-        leader = k8s.get_cluster_leader_pod('acid-minimal-cluster')
+        leader = k8s.get_cluster_leader_pod()
         dbListQuery = "select datname from pg_database"
         schemasQuery = """
             select schema_name
@@ -773,7 +773,7 @@ class EndToEndTestCase(unittest.TestCase):
         # get nodes of master and replica(s) (expected target of new master)
         _, replica_nodes = k8s.get_pg_nodes(cluster_label)
 
-        # rolling update annotation 
+        # rolling update annotation
         flag = {
             "metadata": {
                 "annotations": {
@@ -782,25 +782,115 @@ class EndToEndTestCase(unittest.TestCase):
                 }
             }
         }
 
-        podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
-        for pod in podsList.items:
-            # add flag only to the master to make it appear to the operator as a leftover from a rolling update
-            if pod.metadata.labels.get('spilo-role') == 'master':
+        try:
+            podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
+            for pod in podsList.items:
+                # add flag only to the master to make it appear to the operator as a leftover from a rolling update
+                if pod.metadata.labels.get('spilo-role') == 'master':
+                    old_creation_timestamp = pod.metadata.creation_timestamp
+                    k8s.patch_pod(flag, pod.metadata.name, pod.metadata.namespace)
+                # operator will perform a switchover to an existing replica before recreating the master pod
+                else:
+                    switchover_target = pod.metadata.name
+
+            # do not wait until the next sync
+            k8s.delete_operator_pod()
+
+            # operator should now recreate the master pod and do a switchover beforehand
+            k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
+
+            # check if the former replica is now the new master
+            leader = k8s.get_cluster_leader_pod()
+            self.eventuallyEqual(lambda: leader.metadata.name, switchover_target, "Rolling update flag did not trigger switchover")
+
+            # check that the old master has been recreated
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+            replica = k8s.get_cluster_replica_pod()
+            self.eventuallyTrue(lambda: replica.metadata.creation_timestamp > old_creation_timestamp, "Old master pod was not recreated")
+
+
+        except timeout_decorator.TimeoutError:
+            print('Operator log: {}'.format(k8s.get_operator_log()))
+            raise
+
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_rolling_update_label_timeout(self):
+        '''
+        Simulate the case when a replica does not receive its label in time and the rolling update does not finish
+        '''
+        k8s = self.k8s
+        cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
+
+        # verify we are in good state from potential previous tests
+        self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
+
+        # get nodes of master and replica(s) (expected target of new master)
+        _, replica_nodes = k8s.get_pg_nodes(cluster_label)
+
+        # rolling update annotation
+        flag = {
+            "metadata": {
+                "annotations": {
+                    "zalando-postgres-operator-rolling-update-required": "true",
+                }
+            }
+        }
+
+        # make pod_label_wait_timeout so short that the rolling update fails on the first try
+        # temporarily lower the resync interval to simulate that pods get healthy in between SYNCs
+        patch_resync_config = {
+            "data": {
+                "pod_label_wait_timeout": "2s",
+                "resync_period": "20s",
+            }
+        }
+
+        try:
+            # patch both pods for rolling update
+            podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
+            for pod in podsList.items:
                 k8s.patch_pod(flag, pod.metadata.name, pod.metadata.namespace)
-            # operator will perform a switchover to an existing replica before recreating the master pod
-            else:
-                switchover_target = pod.metadata.name
+                if pod.metadata.labels.get('spilo-role') == 'replica':
+                    switchover_target = pod.metadata.name
 
-        # do not wait until the next sync
-        k8s.delete_operator_pod()
+            # update config and restart operator
+            k8s.update_config(patch_resync_config, "update resync interval and pod_label_wait_timeout")
 
-        # operator should now recreate the master pod and do a switchover before
-        k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
-        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+            # operator should now recreate the replica pod first and do a switchover after
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
 
-        # check if the former replica is now the new master
-        leader = k8s.get_cluster_leader_pod('acid-minimal-cluster')
-        self.eventuallyEqual(lambda: leader.metadata.name, switchover_target, "Rolling update flag did not trigger switchover")
+            # pod_label_wait_timeout should have been exceeded, hence the rolling update is continued on the next sync
+            # check if the cluster state is "SyncFailed"
+            self.eventuallyEqual(lambda: k8s.pg_get_status(), "SyncFailed", "Expected SYNC event to fail")
+
+            # wait for next sync, replica should be running normally by now and be ready for switchover
+            time.sleep(10)
+            k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
+
+            # check if the former replica is now the new master
+            leader = k8s.get_cluster_leader_pod()
+            self.eventuallyEqual(lambda: leader.metadata.name, switchover_target, "Rolling update flag did not trigger switchover")
+
+            # wait for the old master to get restarted
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+
+            # status should again be "SyncFailed" but turn into "Running" on the next sync
+            time.sleep(10)
+            self.eventuallyEqual(lambda: k8s.pg_get_status(), "Running", "Expected running cluster after two syncs")
+
+            # revert config changes
+            patch_resync_config = {
+                "data": {
+                    "pod_label_wait_timeout": "10m",
+                    "resync_period": "30m",
+                }
+            }
+            k8s.update_config(patch_resync_config, "revert resync interval and pod_label_wait_timeout")
+
+
+        except timeout_decorator.TimeoutError:
+            print('Operator log: {}'.format(k8s.get_operator_log()))
+            raise
 
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_zz_node_readiness_label(self):