add more docs and e2e test

Felix Kunde 2022-04-01 17:03:40 +02:00
parent bd96226f1a
commit 6f72a3a3a0
4 changed files with 89 additions and 37 deletions

View File

@@ -838,15 +838,15 @@ point you should restore.
 ## Setting up a standby cluster
 
 Standby cluster is a [Patroni feature](https://github.com/zalando/patroni/blob/master/docs/replica_bootstrap.rst#standby-cluster)
-that first clones a database, and keeps replicating changes afterwards. As the
-replication is happening by the means of archived WAL files (stored on S3 or
-the equivalent of other cloud providers), the standby cluster can exist in a
-different location than its source database. Unlike cloning, the PostgreSQL
-version between source and target cluster has to be the same.
+that first clones a database, and keeps replicating changes afterwards. It can
+exist in a different location than its source database, but unlike cloning,
+the PostgreSQL version between source and target cluster has to be the same.
 
 To start a cluster as standby, add the following `standby` section in the YAML
-file. Specify the S3/GS bucket path. Omitting both settings will result in an error
-and no statefulset will be created.
+file. You can stream changes from archived WAL files (AWS S3 or Google Cloud
+Storage) or from a remote primary where you specify the host address and port.
+If you leave out the port, Patroni will use `"5432"`. Only one option can be
+specified in the manifest:
 
 ```yaml
 spec:
@@ -860,32 +860,42 @@ spec:
     gs_wal_path: "gs://<bucketname>/spilo/<source_db_cluster>/<UID>/wal/<PGVERSION>"
 ```
 
-At the moment, the operator only allows to stream from the WAL archive of the
-master. Thus, it is recommended to deploy standby clusters with only [one pod](https://github.com/zalando/postgres-operator/blob/master/manifests/standby-manifest.yaml#L10).
-You can raise the instance count when detaching. Note, that the same pod role
-labels like for normal clusters are used: The standby leader is labeled as
-`master`.
+```yaml
+spec:
+  standby:
+    standby_host: "acid-minimal-cluster.default"
+    standby_port: "5433"
+```
+
+Note that the pods and services use the same role labels as normal clusters:
+the standby leader is labeled as `master`. When using the `standby_host` option
+you have to copy the credentials from the source cluster's secrets to successfully
+bootstrap a standby cluster (see next chapter).
 
 ### Providing credentials of source cluster
 
 A standby cluster is replicating the data (including users and passwords) from
 the source database and is read-only. The system and application users (like
 standby, postgres etc.) all have a password that does not match the credentials
-stored in secrets which are created by the operator. One solution is to create
-secrets beforehand and paste in the credentials of the source cluster.
+stored in secrets which are created by the operator. You have two options:
+
+a. Create secrets manually beforehand and paste in the credentials of the
+   source cluster.
+b. Let the operator create the secrets when it bootstraps the standby cluster,
+   then patch them with the credentials of the source cluster and replace the
+   Spilo pods.
 
 Otherwise, you will see errors in the Postgres logs saying users cannot log in
 and the operator logs will complain about not being able to sync resources.
+If you stream changes from a remote primary you have to align the secrets or
+the standby cluster will not start up.
 
-When you only run a standby leader, you can safely ignore this, as it will be
-sorted out once the cluster is detached from the source. It is also harmless if
-you dont plan it. But, when you created a standby replica, too, fix the
-credentials right away. WAL files will pile up on the standby leader if no
-connection can be established between standby replica(s). You can also edit the
-secrets after their creation. Find them by:
-
-```bash
-kubectl get secrets --all-namespaces | grep <standby-cluster-name>
-```
+If you stream changes from WAL files and you only run a standby leader, you
+can safely ignore the secret mismatch, as it will be sorted out once the
+cluster is detached from the source. It is also harmless if you do not plan to
+add replicas. But when you create a standby replica, too, fix the credentials
+right away, as WAL files will pile up on the standby leader if no connection
+can be established to the standby replica(s).
 
 ### Promote the standby
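
Editor's note on option (a) above: the copy can be scripted. The following is a minimal sketch using the official Kubernetes Python client, mirroring what the new e2e test in this commit does; the cluster names, namespace and the `cluster-name` label key are examples and may need adjusting to your setup.

```python
from kubernetes import client, config

# Sketch only: copy the source cluster's credentials to the secret names the
# operator expects for the standby cluster (example names, adjust as needed).
config.load_kube_config()
core_v1 = client.CoreV1Api()

source, target, namespace = "acid-minimal-cluster", "acid-standby-cluster", "default"
suffix = "credentials.postgresql.acid.zalan.do"

for username in ("postgres", "standby"):
    secret = core_v1.read_namespaced_secret("{}.{}.{}".format(username, source, suffix), namespace)
    # clear server-assigned fields, otherwise the API rejects the create request
    secret.metadata.resource_version = None
    secret.metadata.uid = None
    secret.metadata.name = "{}.{}.{}".format(username, target, suffix)
    secret.metadata.labels["cluster-name"] = target
    core_v1.create_namespaced_secret(namespace, secret)
```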

View File

@@ -321,9 +321,15 @@ class K8s:
     def get_cluster_replica_pod(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
         return self.get_cluster_pod('replica', labels, namespace)
 
-    def get_secret_data(self, username, clustername='acid-minimal-cluster', namespace='default'):
-        return self.api.core_v1.read_namespaced_secret(
-            "{}.{}.credentials.postgresql.acid.zalan.do".format(username.replace("_","-"), clustername), namespace).data
+    def get_secret(self, username, clustername='acid-minimal-cluster', namespace='default'):
+        secret = self.api.core_v1.read_namespaced_secret(
+            "{}.{}.credentials.postgresql.acid.zalan.do".format(username.replace("_","-"), clustername), namespace)
+        secret.metadata.resource_version = None
+        secret.metadata.uid = None
+        return secret
+
+    def create_secret(self, secret, namespace='default'):
+        return self.api.core_v1.create_namespaced_secret(namespace, secret)
 
 class K8sBase:
     '''
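
Side note on the new helpers: `get_secret` returns the full `V1Secret` object rather than only its data, with `resource_version` and `uid` cleared because the Kubernetes API rejects create requests for objects that still carry these server-assigned fields. A hypothetical usage, analogous to the new e2e test below:

```python
# hypothetical usage of the new helpers: re-create the postgres secret of
# acid-minimal-cluster under the standby cluster's secret name
secret = k8s.get_secret('postgres')
secret.metadata.name = 'postgres.acid-standby-cluster.credentials.postgresql.acid.zalan.do'
secret.metadata.labels['cluster-name'] = 'acid-standby-cluster'
k8s.create_secret(secret)
```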

View File

@@ -1319,8 +1319,8 @@ class EndToEndTestCase(unittest.TestCase):
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         # check if next rotation date was set in secret
-        secret_data = k8s.get_secret_data("zalando")
-        next_rotation_timestamp = datetime.strptime(str(base64.b64decode(secret_data["nextRotation"]), 'utf-8'), "%Y-%m-%dT%H:%M:%SZ")
+        zalando_secret = k8s.get_secret("zalando")
+        next_rotation_timestamp = datetime.strptime(str(base64.b64decode(zalando_secret.data["nextRotation"]), 'utf-8'), "%Y-%m-%dT%H:%M:%SZ")
         today90days = today+timedelta(days=90)
         self.assertEqual(today90days, next_rotation_timestamp.date(),
             "Unexpected rotation date in secret of zalando user: expected {}, got {}".format(today90days, next_rotation_timestamp.date()))
@@ -1361,9 +1361,9 @@ class EndToEndTestCase(unittest.TestCase):
             "Operator does not get in sync")
 
         # check if next rotation date and username have been replaced
-        secret_data = k8s.get_secret_data("foo_user")
-        secret_username = str(base64.b64decode(secret_data["username"]), 'utf-8')
-        next_rotation_timestamp = datetime.strptime(str(base64.b64decode(secret_data["nextRotation"]), 'utf-8'), "%Y-%m-%dT%H:%M:%SZ")
+        foo_user_secret = k8s.get_secret("foo_user")
+        secret_username = str(base64.b64decode(foo_user_secret.data["username"]), 'utf-8')
+        next_rotation_timestamp = datetime.strptime(str(base64.b64decode(foo_user_secret.data["nextRotation"]), 'utf-8'), "%Y-%m-%dT%H:%M:%SZ")
         rotation_user = "foo_user"+today.strftime("%y%m%d")
         today30days = today+timedelta(days=30)
@@ -1396,9 +1396,9 @@ class EndToEndTestCase(unittest.TestCase):
             "Operator does not get in sync")
 
         # check if username in foo_user secret is reset
-        secret_data = k8s.get_secret_data("foo_user")
-        secret_username = str(base64.b64decode(secret_data["username"]), 'utf-8')
-        next_rotation_timestamp = str(base64.b64decode(secret_data["nextRotation"]), 'utf-8')
+        foo_user_secret = k8s.get_secret("foo_user")
+        secret_username = str(base64.b64decode(foo_user_secret.data["username"]), 'utf-8')
+        next_rotation_timestamp = str(base64.b64decode(foo_user_secret.data["nextRotation"]), 'utf-8')
         self.assertEqual("foo_user", secret_username,
             "Unexpected username in secret of foo_user: expected {}, got {}".format("foo_user", secret_username))
         self.assertEqual('', next_rotation_timestamp,
@@ -1644,6 +1644,42 @@ class EndToEndTestCase(unittest.TestCase):
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
         self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing")
 
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_standby_cluster(self):
+        '''
+        Create standby cluster streaming from remote primary
+        '''
+        k8s = self.k8s
+        standby_cluster_name = 'acid-standby-cluster'
+        cluster_name_label = 'cluster-name'
+        cluster_label = 'application=spilo,{}={}'.format(cluster_name_label, standby_cluster_name)
+        superuser_name = 'postgres'
+        replication_user = 'standby'
+        secret_suffix = 'credentials.postgresql.acid.zalan.do'
+
+        # copy secrets from remote cluster before operator creates them when bootstrapping the standby cluster
+        postgres_secret = k8s.get_secret(superuser_name)
+        postgres_secret.metadata.name = '{}.{}.{}'.format(superuser_name, standby_cluster_name, secret_suffix)
+        postgres_secret.metadata.labels[cluster_name_label] = standby_cluster_name
+        k8s.create_secret(postgres_secret)
+        standby_secret = k8s.get_secret(replication_user)
+        standby_secret.metadata.name = '{}.{}.{}'.format(replication_user, standby_cluster_name, secret_suffix)
+        standby_secret.metadata.labels[cluster_name_label] = standby_cluster_name
+        k8s.create_secret(standby_secret)
+
+        try:
+            k8s.create_with_kubectl("manifests/standby-manifest.yaml")
+            k8s.wait_for_pod_start("spilo-role=master," + cluster_label)
+
+        except timeout_decorator.TimeoutError:
+            print('Operator log: {}'.format(k8s.get_operator_log()))
+            raise
+        finally:
+            # delete the standby cluster so that the k8s_api.get_operator_state works correctly in subsequent tests
+            k8s.api.custom_objects_api.delete_namespaced_custom_object(
+                "acid.zalan.do", "v1", "default", "postgresqls", "acid-standby-cluster")
+            time.sleep(5)
+
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_taint_based_eviction(self):
         '''
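
Editor's note: the new test only waits for the standby leader pod to appear. If a stronger check is wanted, one could additionally assert that the copied replication secret is readable under the standby cluster's name before the manifest is applied; a hypothetical extension inside `test_standby_cluster`:

```python
# hypothetical extra assertion: the copied replication secret exists under
# the standby cluster's name before the manifest is created
copied = k8s.get_secret(replication_user, standby_cluster_name)
self.assertEqual(
    '{}.{}.{}'.format(replication_user, standby_cluster_name, secret_suffix),
    copied.metadata.name)
```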

View File

@@ -12,6 +12,6 @@
   version: "14"
   # Make this a standby cluster and provide either the s3 bucket path of source cluster or the remote primary host for continuous streaming.
   standby:
-    s3_wal_path: "s3://mybucket/spilo/acid-minimal-cluster/abcd1234-2a4b-4b2a-8c9c-c1234defg567/wal/14/"
-    # standby_host: "acid-minimal-cluster.default"
+    # s3_wal_path: "s3://mybucket/spilo/acid-minimal-cluster/abcd1234-2a4b-4b2a-8c9c-c1234defg567/wal/14/"
+    standby_host: "acid-minimal-cluster.default"
     # standby_port: "5432"