improve e2e test debugging (#1107)

* print operator log in most tests when they time out
Felix Kunde 2020-08-28 14:57:19 +02:00 committed by GitHub
parent 30c86758a3
commit 5e93aabea6
1 changed file with 322 additions and 259 deletions


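Every test touched by this commit follows the same pattern: the test body moves into a try block, and a timeout handler dumps the operator log before re-raising, so a timed-out CI run stays debuggable. A minimal sketch of the pattern, assuming the TEST_TIMEOUT_SEC constant and the k8s helper object defined elsewhere in this file:

import unittest
import timeout_decorator

TEST_TIMEOUT_SEC = 600  # placeholder; the real constant is defined in the test module

class ExampleCase(unittest.TestCase):
    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_example(self):
        k8s = self.k8s
        try:
            pass  # actual test steps go here
        except timeout_decorator.TimeoutError:
            # dump the operator log before failing the test
            print('Operator log: {}'.format(k8s.get_operator_log()))
            raise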
@@ -163,6 +163,7 @@ class EndToEndTestCase(unittest.TestCase):
k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
try:
# enable load balancer services
pg_patch_enable_lbs = {
"spec": {
@@ -203,6 +204,56 @@ class EndToEndTestCase(unittest.TestCase):
self.assertEqual(repl_svc_type, 'ClusterIP',
"Expected ClusterIP service type for replica, found {}".format(repl_svc_type))
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
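The pg_patch_enable_lbs body is cut off by the diff context above; judging by the postgresql CRD schema it toggles the load-balancer flags, roughly like this (a sketch; the values are assumptions):

# assumed shape of the elided patch body; enableMasterLoadBalancer and
# enableReplicaLoadBalancer are spec fields of the postgresql CRD
pg_patch_enable_lbs = {
    "spec": {
        "enableMasterLoadBalancer": True,
        "enableReplicaLoadBalancer": True,
    }
}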
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_infrastructure_roles(self):
'''
Test using external secrets for infrastructure roles
'''
k8s = self.k8s
# update infrastructure roles description
secret_name = "postgresql-infrastructure-roles"
roles = "secretname: postgresql-infrastructure-roles-new, userkey: user, rolekey: memberof, passwordkey: password, defaultrolevalue: robot_zmon"
patch_infrastructure_roles = {
"data": {
"infrastructure_roles_secret_name": secret_name,
"infrastructure_roles_secrets": roles,
},
}
k8s.update_config(patch_infrastructure_roles)
# wait a little before proceeding
time.sleep(30)
try:
# check that new roles are represented in the config by requesting the
# operator configuration via API
operator_pod = k8s.get_operator_pod()
get_config_cmd = "wget --quiet -O - localhost:8080/config"
result = k8s.exec_with_kubectl(operator_pod.metadata.name, get_config_cmd)
roles_dict = (json.loads(result.stdout)
.get("controller", {})
.get("InfrastructureRoles"))
self.assertTrue("robot_zmon_acid_monitoring_new" in roles_dict)
role = roles_dict["robot_zmon_acid_monitoring_new"]
role.pop("Password", None)
self.assertDictEqual(role, {
"Name": "robot_zmon_acid_monitoring_new",
"Flags": None,
"MemberOf": ["robot_zmon"],
"Parameters": None,
"AdminRole": "",
"Origin": 2,
})
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
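The roles value above packs the whole secret description into one comma-separated "key: value" string. Purely for illustration, a hypothetical parser (not part of the test suite or the operator, whose real parser is written in Go) shows how it decomposes:

def parse_role_description(desc):
    # "secretname: x, userkey: user, ..." -> {"secretname": "x", "userkey": "user", ...}
    return dict(
        (key.strip(), value.strip())
        for key, value in (pair.split(":", 1) for pair in desc.split(","))
    )

# e.g. parse_role_description(roles)["defaultrolevalue"] == "robot_zmon"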
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_lazy_spilo_upgrade(self):
'''
@@ -230,6 +281,7 @@ class EndToEndTestCase(unittest.TestCase):
pod0 = 'acid-minimal-cluster-0'
pod1 = 'acid-minimal-cluster-1'
try:
# restart the pod to get a container with the new image
k8s.api.core_v1.delete_namespaced_pod(pod0, 'default')
time.sleep(60)
@@ -237,7 +289,8 @@ class EndToEndTestCase(unittest.TestCase):
# lazy update works if the restarted pod and older pods run different Spilo versions
new_image = k8s.get_effective_pod_image(pod0)
old_image = k8s.get_effective_pod_image(pod1)
self.assertNotEqual(new_image, old_image, "Lazy update failed: pods have the same image {}".format(new_image))
self.assertNotEqual(new_image, old_image,
"Lazy update failed: pods have the same image {}".format(new_image))
# sanity check
assert_msg = "Image {} of a new pod differs from {} in operator conf".format(new_image, conf_image)
@@ -263,6 +316,10 @@ class EndToEndTestCase(unittest.TestCase):
assert_msg = "Disabling lazy upgrade failed: pods still have different images {} and {}".format(image0, image1)
self.assertEqual(image0, image1, assert_msg)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
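get_effective_pod_image is a helper of this suite whose internals are not part of the diff; conceptually it has to report the image a container is actually running, which is exactly what a lazy upgrade changes only after a pod restart. A sketch under that assumption, not the repo's code:

def get_effective_pod_image(k8s, pod_name, namespace="default"):
    # the status side reflects the running container, unlike the pod spec
    pod = k8s.api.core_v1.read_namespaced_pod(pod_name, namespace)
    return pod.status.container_statuses[0].image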
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_logical_backup_cron_job(self):
'''
@@ -287,6 +344,8 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_enable_backup)
try:
k8s.wait_for_logical_backup_job_creation()
jobs = k8s.get_logical_backup_job().items
@@ -327,6 +386,10 @@ class EndToEndTestCase(unittest.TestCase):
self.assertEqual(0, len(jobs),
"Expected 0 logical backup jobs, found {}".format(len(jobs)))
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
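The pg_patch_enable_backup body is likewise elided by the diff context; per the postgresql CRD schema the switch being flipped is spec.enableLogicalBackup (a sketch, value assumed):

pg_patch_enable_backup = {
    "spec": {
        "enableLogicalBackup": True  # assumed; the test later disables it again
    }
}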
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_min_resource_limits(self):
'''
@@ -365,6 +428,8 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_resources)
try:
k8s.wait_for_pod_failover(failover_targets, labels)
k8s.wait_for_pod_start('spilo-role=replica')
@@ -380,6 +445,10 @@ class EndToEndTestCase(unittest.TestCase):
"Expected memory limit {}, found {}"
.format(minMemoryLimit, masterPod.spec.containers[0].resources.limits['memory']))
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
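The minimum limits asserted above come from the operator configuration; the config patch applied earlier in this test presumably sets the documented min_cpu_limit and min_memory_limit options (a sketch, values assumed):

patch_min_resource_limits = {
    "data": {
        "min_cpu_limit": "250m",      # assumed value
        "min_memory_limit": "250Mi",  # assumed value
    }
}
k8s.update_config(patch_min_resource_limits)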
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_multi_namespace_support(self):
'''
@@ -392,10 +461,15 @@ class EndToEndTestCase(unittest.TestCase):
pg_manifest["metadata"]["namespace"] = self.namespace
yaml.dump(pg_manifest, f, Dumper=yaml.Dumper)
try:
k8s.create_with_kubectl("manifests/complete-postgres-manifest.yaml")
k8s.wait_for_pod_start("spilo-role=master", self.namespace)
self.assert_master_is_unique(self.namespace, "acid-test-cluster")
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_node_readiness_label(self):
'''
@@ -406,6 +480,7 @@ class EndToEndTestCase(unittest.TestCase):
readiness_label = 'lifecycle-status'
readiness_value = 'ready'
try:
# get nodes of master and replica(s) (expected target of new master)
current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
num_replicas = len(current_replica_nodes)
@@ -441,6 +516,10 @@ class EndToEndTestCase(unittest.TestCase):
# toggle pod anti affinity to move replica away from master node
self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
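Flagging a node as ready for this test is a plain label patch against the node object; a sketch using the same core_v1 client the suite uses elsewhere (the target node is a placeholder):

# label a replica's node so the operator treats it as ready
patch_readiness_label = {
    "metadata": {
        "labels": {
            readiness_label: readiness_value  # 'lifecycle-status': 'ready'
        }
    }
}
k8s.api.core_v1.patch_node(current_replica_nodes[0], patch_readiness_label)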
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_scaling(self):
'''
@@ -449,6 +528,7 @@ class EndToEndTestCase(unittest.TestCase):
k8s = self.k8s
labels = "application=spilo,cluster-name=acid-minimal-cluster"
try:
k8s.wait_for_pg_to_scale(3)
self.assertEqual(3, k8s.count_pods_with_label(labels))
self.assert_master_is_unique()
@@ -457,6 +537,10 @@ class EndToEndTestCase(unittest.TestCase):
self.assertEqual(2, k8s.count_pods_with_label(labels))
self.assert_master_is_unique()
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
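wait_for_pg_to_scale presumably patches the cluster size and polls until the pod count matches; the underlying patch targets spec.numberOfInstances of the postgresql CRD (field name from the CRD schema; the call shape mirrors the patches used elsewhere in this file):

pg_patch_scale = {"spec": {"numberOfInstances": 3}}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
    "acid.zalan.do", "v1", "default", "postgresqls",
    "acid-minimal-cluster", pg_patch_scale)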
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_service_annotations(self):
'''
@@ -470,6 +554,7 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.update_config(patch_custom_service_annotations)
try:
pg_patch_custom_annotations = {
"spec": {
"serviceAnnotations": {
@@ -492,6 +577,10 @@ class EndToEndTestCase(unittest.TestCase):
self.assertTrue(k8s.check_service_annotations(
"cluster-name=acid-minimal-cluster,spilo-role=replica", annotations))
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# clean up
unpatch_custom_service_annotations = {
"data": {
@@ -515,6 +604,7 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.update_config(patch_sset_propagate_annotations)
try:
pg_crd_annotations = {
"metadata": {
"annotations": {
@@ -534,6 +624,10 @@ class EndToEndTestCase(unittest.TestCase):
}
self.assertTrue(k8s.check_statefulset_annotations(cluster_label, annotations))
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
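check_statefulset_annotations is another suite helper not shown in this diff; conceptually it only needs to list the stateful sets behind the label selector and compare annotations. A sketch, assuming the wrapper exposes an apps_v1 client:

def check_statefulset_annotations(k8s, label_selector, annotations, namespace="default"):
    ssets = k8s.api.apps_v1.list_namespaced_stateful_set(
        namespace, label_selector=label_selector)
    return all(
        sset.metadata.annotations.get(key) == value
        for sset in ssets.items
        for key, value in annotations.items()
    )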
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_taint_based_eviction(self):
'''
@@ -559,6 +653,7 @@ class EndToEndTestCase(unittest.TestCase):
}
}
try:
# patch node and test if master is failing over to one of the expected nodes
k8s.api.core_v1.patch_node(current_master_node, body)
new_master_node, new_replica_nodes = self.assert_failover(
@@ -578,46 +673,9 @@ class EndToEndTestCase(unittest.TestCase):
# toggle pod anti affinity to move replica away from master node
self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_infrastructure_roles(self):
'''
Test using external secrets for infrastructure roles
'''
k8s = self.k8s
# update infrastructure roles description
secret_name = "postgresql-infrastructure-roles"
roles = "secretname: postgresql-infrastructure-roles-new, userkey: user, rolekey: memberof, passwordkey: password, defaultrolevalue: robot_zmon"
patch_infrastructure_roles = {
"data": {
"infrastructure_roles_secret_name": secret_name,
"infrastructure_roles_secrets": roles,
},
}
k8s.update_config(patch_infrastructure_roles)
# wait a little before proceeding
time.sleep(30)
# check that new roles are represented in the config by requesting the
# operator configuration via API
operator_pod = k8s.get_operator_pod()
get_config_cmd = "wget --quiet -O - localhost:8080/config"
result = k8s.exec_with_kubectl(operator_pod.metadata.name, get_config_cmd)
roles_dict = (json.loads(result.stdout)
.get("controller", {})
.get("InfrastructureRoles"))
self.assertTrue("robot_zmon_acid_monitoring_new" in roles_dict)
role = roles_dict["robot_zmon_acid_monitoring_new"]
role.pop("Password", None)
self.assertDictEqual(role, {
"Name": "robot_zmon_acid_monitoring_new",
"Flags": None,
"MemberOf": ["robot_zmon"],
"Parameters": None,
"AdminRole": "",
"Origin": 2,
})
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_x_cluster_deletion(self):
@@ -636,6 +694,7 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.update_config(patch_delete_annotations)
try:
# this delete attempt should be omitted because of missing annotations
k8s.api.custom_objects_api.delete_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster")
@@ -684,6 +743,10 @@ class EndToEndTestCase(unittest.TestCase):
self.assertEqual(0, k8s.count_pdbs_with_label(cluster_label))
self.assertEqual(0, k8s.count_secrets_with_label(cluster_label))
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
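The first delete attempt above is "omitted" because the cluster does not yet carry the annotations named in patch_delete_annotations; the operator's delete protection only honors a deletion once they are present. The unlock step presumably annotates the cluster along these lines (the annotation key names follow the operator's delete-protection documentation and are assumptions here):

from datetime import date

pg_patch_delete_annotations = {
    "metadata": {
        "annotations": {
            "delete-date": date.today().strftime("%Y-%m-%d"),  # assumed key
            "delete-clustername": "acid-minimal-cluster",      # assumed key
        }
    }
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
    "acid.zalan.do", "v1", "default", "postgresqls",
    "acid-minimal-cluster", pg_patch_delete_annotations)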
def get_failover_targets(self, master_node, replica_nodes):
'''
If all pods live on the same node, failover will happen to other worker(s)