* Patroni state function added to the K8s test helper class

* Lazy upgrade test now properly covered with eventuallyEqual checks and by waiting for pod start
* patching the config now updates the deployment by patching an annotation, which makes each change step traceable (see the sketch below)
* run.sh now takes NOCLEANUP to stop the kind cluster from being deleted
* if the kind config is present, run.sh will not install kind
* Fast local e2e execution is now possible once kind is up
Jan Mußler 2020-10-19 23:35:08 +02:00
parent c1ad71668b
commit 966575dd4b
4 changed files with 90 additions and 31 deletions
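A minimal sketch, not part of this commit, of the annotation-based tracing described above; it assumes a reachable cluster, the `kubernetes` Python client, and the `postgres-operator` deployment in the `default` namespace used throughout the diff:

```python
# Sketch only: patching a pod-template annotation makes the Deployment roll
# out a fresh operator pod, and the annotation value records which test step
# triggered the restart.
import time
from kubernetes import client, config

config.load_kube_config()  # assumes a kubeconfig pointing at the kind cluster
apps_v1 = client.AppsV1Api()

step = "patch image and lazy upgrade"  # illustrative step label
body = {"spec": {"template": {"metadata": {"annotations": {
    "step": "{}-{}".format(step, time.time())
}}}}}
apps_v1.patch_namespaced_deployment("postgres-operator", "default", body)
```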

e2e/README.md

@@ -35,6 +35,11 @@ In the e2e folder you can invoke tests either with `make test` or with:
To run both the build and test step you can invoke `make e2e` from the parent
directory.
To run the end-to-end tests and keep the kind state, execute:
```bash
NOCLEANUP=True ./run.sh
```
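With `NOCLEANUP` set, the kind cluster and its kubeconfig under `/tmp` survive the run; the next invocation finds the existing kubeconfig, skips cluster creation and goes straight to the tests.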
## Covered use cases
The current tests are all bundled in [`test_e2e.py`](tests/test_e2e.py):

e2e/run.sh

@@ -58,7 +58,6 @@ function run_tests(){
--mount type=bind,source="$(readlink -f tests)",target=/tests \
--mount type=bind,source="$(readlink -f exec.sh)",target=/exec.sh \
-e OPERATOR_IMAGE="${operator_image}" "${e2e_test_runner_image}"
}
function clean_up(){
@@ -70,11 +69,10 @@ function clean_up(){
function main(){
trap "clean_up" QUIT TERM EXIT
time pull_images
time start_kind
time set_kind_api_server_ip
[[ -z ${NOCLEANUP-} ]] && trap "clean_up" QUIT TERM EXIT
pull_images
[[ ! -f ${kubeconfig_path} ]] && start_kind
set_kind_api_server_ip
run_tests
exit 0
}
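Note the two guards in `main()`: the `clean_up` trap is registered only when `NOCLEANUP` is unset, and `start_kind` runs only when no kubeconfig from a previous run exists; together they keep the kind cluster alive for fast re-runs.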

e2e/run_tests_image.sh (new executable file, 13 additions)

@@ -0,0 +1,13 @@
#!/bin/bash
export cluster_name="postgres-operator-e2e-tests"
export kubeconfig_path="/tmp/kind-config-${cluster_name}"
export operator_image="registry.opensource.zalan.do/acid/postgres-operator:latest"
export e2e_test_runner_image="registry.opensource.zalan.do/acid/postgres-operator-e2e-tests-runner:latest"
docker run -it --entrypoint /bin/bash --network=host -e "TERM=xterm-256color" \
--mount type=bind,source="$(readlink -f ${kubeconfig_path})",target=/root/.kube/config \
--mount type=bind,source="$(readlink -f manifests)",target=/manifests \
--mount type=bind,source="$(readlink -f tests)",target=/tests \
--mount type=bind,source="$(readlink -f exec.sh)",target=/exec.sh \
-e OPERATOR_IMAGE="${operator_image}" "${e2e_test_runner_image}"
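The new script mirrors the `run_tests` step from `run.sh` but starts the runner image with an interactive shell, so tests can be executed repeatedly against the running kind cluster without recreating it.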

e2e/tests/test_e2e.py

@@ -10,6 +10,8 @@ import yaml
from datetime import datetime
from kubernetes import client, config
SPILO_CURRENT = "registry.opensource.zalan.do/acid/spilo-12:1.6-p5"
SPILO_LAZY = "registry.opensource.zalan.do/acid/spilo-cdp-12:1.6-p114"
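# pinned Spilo images: SPILO_CURRENT is the baseline, SPILO_LAZY the newer target used by the lazy upgrade test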
def to_selector(labels):
return ",".join(["=".join(l) for l in labels.items()])
@@ -75,13 +77,19 @@ class EndToEndTestCase(unittest.TestCase):
k8s = cls.k8s = K8s()
# remove existing local storage class and create hostpath class
k8s.api.storage_v1_api.delete_storage_class("standard")
try:
k8s.api.storage_v1_api.delete_storage_class("standard")
except:
print("Storage class has already been remove")
# operator deploys pod service account there on start up
# needed for test_multi_namespace_support()
cls.namespace = "test"
v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.namespace))
k8s.api.core_v1.create_namespace(v1_namespace)
try:
v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.namespace))
k8s.api.core_v1.create_namespace(v1_namespace)
except:
print("Namespace already present")
# submit the most recent operator image built on the Docker host
with open("manifests/postgres-operator.yaml", 'r+') as f:
@@ -313,27 +321,47 @@ class EndToEndTestCase(unittest.TestCase):
k8s = self.k8s
pod0 = 'acid-minimal-cluster-0'
pod1 = 'acid-minimal-cluster-1'
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "Expected 2 running pods")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members(pod0)), 2, "Postgres status did not enter running")
patch_lazy_spilo_upgrade = {
"data": {
"docker_image": SPILO_CURRENT,
"enable_lazy_spilo_upgrade": "false"
}
}
k8s.update_config(patch_lazy_spilo_upgrade, step="Init baseline image version")
self.eventuallyEqual(lambda: k8s.get_statefulset_image(), SPILO_CURRENT, "Statefulset not updated initially")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "Expected 2 running pods")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members(pod0)), 2, "Postgres status did not enter running")
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod0), SPILO_CURRENT, "Rolling upgrade was not executed")
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod1), SPILO_CURRENT, "Rolling upgrade was not executed")
# update docker image in config and enable the lazy upgrade
conf_image = "registry.opensource.zalan.do/acid/spilo-cdp-12:1.6-p114"
conf_image = SPILO_LAZY
patch_lazy_spilo_upgrade = {
"data": {
"docker_image": conf_image,
"enable_lazy_spilo_upgrade": "true"
}
}
k8s.update_config(patch_lazy_spilo_upgrade)
pod0 = 'acid-minimal-cluster-0'
pod1 = 'acid-minimal-cluster-1'
k8s.update_config(patch_lazy_spilo_upgrade, step="patch image and lazy upgrade")
self.eventuallyEqual(lambda: k8s.get_statefulset_image(), conf_image, "Statefulset not updated to next Docker image")
try:
# restart the pod to get a container with the new image
k8s.api.core_v1.delete_namespaced_pod(pod0, 'default')
# verify only pod-0 which was deleted got new image from statefulset
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod0), conf_image, "Delete pod-0 did not get new spilo image")
old_image = k8s.get_effective_pod_image(pod1)
self.assertNotEqual(conf_image, old_image, "pod-1 should not have changed its Docker image to {}".format(old_image))
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod0), conf_image, "Delete pod-0 did not get new spilo image")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "Expected two running pods after lazy rolling upgrade")
self.assertNotEqual(k8s.get_effective_pod_image(pod1), conf_image, "pod-1 should not have changed its Docker image to {}".format(conf_image))
# clean up
unpatch_lazy_spilo_upgrade = {
@@ -341,13 +369,12 @@ class EndToEndTestCase(unittest.TestCase):
"enable_lazy_spilo_upgrade": "false",
}
}
k8s.update_config(unpatch_lazy_spilo_upgrade)
k8s.update_config(unpatch_lazy_spilo_upgrade, step="patch lazy upgrade")
# at this point operator will complete the normal rolling upgrade
# so we additionally test if disabling the lazy upgrade - forcing the normal rolling upgrade - works
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod0), conf_image, "Rolling upgrade was not executed")
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod1), conf_image, "Rolling upgrade was not executed")
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod0), conf_image, "Rolling upgrade was not executed", 50, 3)
self.eventuallyEqual(lambda: k8s.get_effective_pod_image(pod1), conf_image, "Rolling upgrade was not executed", 50, 3)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
@@ -379,7 +406,7 @@ class EndToEndTestCase(unittest.TestCase):
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_enable_backup)
try:
self.eventuallyEqual(lambda: len(k8s.get_logical_backup_job()), 1, "failed to create logical backup job")
self.eventuallyEqual(lambda: len(k8s.get_logical_backup_job().items), 1, "failed to create logical backup job")
job = k8s.get_logical_backup_job().items[0]
self.assertEqual(job.metadata.name, "logical-backup-acid-minimal-cluster",
@@ -396,7 +423,7 @@ class EndToEndTestCase(unittest.TestCase):
"logical_backup_docker_image": image,
}
}
k8s.update_config(patch_logical_backup_image)
k8s.update_config(patch_logical_backup_image, step="patch logical backup image")
def get_docker_image():
jobs = k8s.get_logical_backup_job().items
@@ -414,7 +441,7 @@ class EndToEndTestCase(unittest.TestCase):
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_disable_backup)
self.eventuallyEqual(lambda: len(self.get_logical_backup_job()), 0, "failed to create logical backup job")
self.eventuallyEqual(lambda: len(k8s.get_logical_backup_job().items), 0, "failed to delete logical backup job")
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
@@ -991,6 +1018,10 @@ class K8s:
def count_pdbs_with_label(self, labels, namespace='default'):
return len(self.api.policy_v1_beta1.list_namespaced_pod_disruption_budget(
namespace, label_selector=labels).items)
def count_running_pods(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
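# count only pods whose phase is 'Running', not pods merely scheduled or pending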
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
return len(list(filter(lambda x: x.status.phase == 'Running', pods)))
def wait_for_pod_failover(self, failover_targets, labels, namespace='default'):
pod_phase = 'Failing over'
@@ -1016,19 +1047,18 @@ class K8s:
def wait_for_logical_backup_job_creation(self):
self.wait_for_logical_backup_job(expected_num_of_jobs=1)
def delete_operator_pod(self):
operator_pod = self.api.core_v1.list_namespaced_pod(
'default', label_selector="name=postgres-operator").items[0].metadata.name
self.api.core_v1.delete_namespaced_pod(operator_pod, "default") # restart reloads the conf
def delete_operator_pod(self, step="Delete operator deployment"):
operator_pod = self.api.core_v1.list_namespaced_pod('default', label_selector="name=postgres-operator").items[0].metadata.name
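# patching a pod-template annotation rolls out a fresh operator pod; the step value makes each restart traceable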
self.api.apps_v1.patch_namespaced_deployment("postgres-operator", "default", {"spec": {"template": {"metadata": {"annotations": {"step": "{}-{}".format(step, time.time())}}}}})
self.wait_for_operator_pod_start()
def update_config(self, config_map_patch):
def update_config(self, config_map_patch, step="Updating operator deployment"):
self.api.core_v1.patch_namespaced_config_map("postgres-operator", "default", config_map_patch)
self.delete_operator_pod()
self.delete_operator_pod(step=step)
def create_with_kubectl(self, path):
return subprocess.run(
["kubectl", "create", "-f", path],
["kubectl", "apply", "-f", path],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
@@ -1037,6 +1067,19 @@ class K8s:
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
def get_patroni_state(self, pod):
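# 'patronictl list -f json' inside the pod reports the cluster members as JSON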
return json.loads(self.exec_with_kubectl(pod, "patronictl list -f json").stdout)
def get_patroni_running_members(self, pod):
result = self.get_patroni_state(pod)
return list(filter(lambda x: x["State"] == "running", result))
def get_statefulset_image(self, label_selector="application=spilo,cluster-name=acid-minimal-cluster", namespace='default'):
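# image set in the statefulset's pod template; during a lazy upgrade pods may still run an older image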
ssets = self.api.apps_v1.list_namespaced_stateful_set(namespace, label_selector=label_selector, limit=1)
if len(ssets.items) == 0:
return None
return ssets.items[0].spec.template.spec.containers[0].image
def get_effective_pod_image(self, pod_name, namespace='default'):
'''
Get the Spilo image the pod currently uses. In case of lazy rolling updates