add more docs and e2e test

This commit is contained in:
Felix Kunde 2022-04-01 17:03:40 +02:00
parent bd96226f1a
commit 6f72a3a3a0
4 changed files with 89 additions and 37 deletions

View File

@ -838,15 +838,15 @@ point you should restore.
## Setting up a standby cluster
Standby cluster is a [Patroni feature](https://github.com/zalando/patroni/blob/master/docs/replica_bootstrap.rst#standby-cluster)
that first clones a database, and keeps replicating changes afterwards.
Replication happens either by means of archived WAL files (stored on S3 or the
equivalent of other cloud providers) or by streaming from a remote primary, so
the standby cluster can exist in a different location than its source database.
Unlike cloning, the PostgreSQL version of source and target cluster has to be
the same.
To start a cluster as standby, add the following `standby` section in the YAML
file. You can stream changes from archived WAL files (AWS S3 or Google Cloud
Storage) or from a remote primary where you specify the host address and port.
If you leave out the port, Patroni will use `"5432"`. Only one of these
options can be specified in the manifest:
```yaml
spec:
@ -860,32 +860,42 @@ spec:
    gs_wal_path: "gs://<bucketname>/spilo/<source_db_cluster>/<UID>/wal/<PGVERSION>"
```

```yaml
spec:
  standby:
    standby_host: "acid-minimal-cluster.default"
    standby_port: "5433"
```

Note that the pods and services use the same role labels as for normal clusters:
the standby leader is labeled as `master`. When using the `standby_host` option
you have to copy the credentials from the source cluster's secrets to
successfully bootstrap a standby cluster (see next chapter).
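
For example, assuming the standby cluster is named `acid-standby-cluster` (an
assumption, adjust it to your manifest), you can check which pod currently acts
as the standby leader via its role label:

```bash
# list the standby cluster's pods together with their spilo-role label;
# the standby leader is the pod shown with spilo-role=master
kubectl get pods -l application=spilo,cluster-name=acid-standby-cluster -L spilo-role
```
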
### Providing credentials of source cluster
A standby cluster is replicating the data (including users and passwords) from
the source database and is read-only. The system and application users (like
standby, postgres etc.) all have a password that does not match the credentials
stored in secrets which are created by the operator. You have two options:

a. Create the secrets manually beforehand and paste in the credentials of the
   source cluster.
b. Let the operator create the secrets when it bootstraps the standby cluster.
   Then patch the secrets with the credentials of the source cluster and
   replace the spilo pods so they pick up the new credentials.

Otherwise, you will see errors in the Postgres logs saying users cannot log in
and the operator logs will complain about not being able to sync resources.
If you stream changes from a remote primary, you have to align the secrets or
the standby cluster will not start up.
You can also edit the secrets after their creation. Find them by:

```bash
kubectl get secrets --all-namespaces | grep <standby-cluster-name>
```
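
For option (b), a minimal sketch of aligning the secrets, assuming the source
cluster is `acid-minimal-cluster`, the standby cluster is `acid-standby-cluster`
and both live in the same namespace (adjust names, users and namespaces to your
setup):

```bash
SRC=acid-minimal-cluster
STANDBY=acid-standby-cluster
SUFFIX=credentials.postgresql.acid.zalan.do

for user in postgres standby; do
  # read the base64-encoded password from the source cluster's secret ...
  pw=$(kubectl get secret ${user}.${SRC}.${SUFFIX} -o jsonpath='{.data.password}')
  # ... and patch the operator-created secret of the standby cluster with it
  kubectl patch secret ${user}.${STANDBY}.${SUFFIX} -p "{\"data\":{\"password\":\"${pw}\"}}"
done

# replace the spilo pods so they restart with the patched credentials
kubectl delete pods -l application=spilo,cluster-name=${STANDBY}
```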

If you stream changes from WAL files and you only run a standby leader, you
can safely ignore the secret mismatch, as it will be sorted out once the
cluster is detached from the source. It is also harmless if you do not plan to
detach. But if you also created a standby replica, fix the credentials right
away, because WAL files will pile up on the standby leader if no connection
can be established to the standby replica(s).
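
To verify that standby replicas are connected to the standby leader (and WAL
files are being consumed), you can query `pg_stat_replication` on the leader.
The pod name below is hypothetical; look it up via the `spilo-role=master`
label as shown above:

```bash
# replace acid-standby-cluster-0 with the actual standby leader pod
kubectl exec -it acid-standby-cluster-0 -- \
  psql -U postgres -c "SELECT application_name, state FROM pg_stat_replication;"
```
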
### Promote the standby

View File

@ -321,9 +321,15 @@ class K8s:
    def get_cluster_replica_pod(self, labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
        return self.get_cluster_pod('replica', labels, namespace)

    def get_secret_data(self, username, clustername='acid-minimal-cluster', namespace='default'):
        return self.api.core_v1.read_namespaced_secret(
            "{}.{}.credentials.postgresql.acid.zalan.do".format(username.replace("_","-"), clustername), namespace).data

    def get_secret(self, username, clustername='acid-minimal-cluster', namespace='default'):
        secret = self.api.core_v1.read_namespaced_secret(
            "{}.{}.credentials.postgresql.acid.zalan.do".format(username.replace("_","-"), clustername), namespace)
        # clear server-assigned metadata so the returned secret can be re-created under a new name
        secret.metadata.resource_version = None
        secret.metadata.uid = None
        return secret

    def create_secret(self, secret, namespace='default'):
        return self.api.core_v1.create_namespaced_secret(namespace, secret)
class K8sBase:
'''

View File

@ -1319,8 +1319,8 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# check if next rotation date was set in secret
zalando_secret = k8s.get_secret("zalando")
next_rotation_timestamp = datetime.strptime(str(base64.b64decode(zalando_secret.data["nextRotation"]), 'utf-8'), "%Y-%m-%dT%H:%M:%SZ")
today90days = today+timedelta(days=90)
self.assertEqual(today90days, next_rotation_timestamp.date(),
"Unexpected rotation date in secret of zalando user: expected {}, got {}".format(today90days, next_rotation_timestamp.date()))
@ -1361,9 +1361,9 @@ class EndToEndTestCase(unittest.TestCase):
"Operator does not get in sync")
# check if next rotation date and username have been replaced
foo_user_secret = k8s.get_secret("foo_user")
secret_username = str(base64.b64decode(foo_user_secret.data["username"]), 'utf-8')
next_rotation_timestamp = datetime.strptime(str(base64.b64decode(foo_user_secret.data["nextRotation"]), 'utf-8'), "%Y-%m-%dT%H:%M:%SZ")
rotation_user = "foo_user"+today.strftime("%y%m%d")
today30days = today+timedelta(days=30)
@ -1396,9 +1396,9 @@ class EndToEndTestCase(unittest.TestCase):
"Operator does not get in sync")
# check if username in foo_user secret is reset
foo_user_secret = k8s.get_secret("foo_user")
secret_username = str(base64.b64decode(foo_user_secret.data["username"]), 'utf-8')
next_rotation_timestamp = str(base64.b64decode(foo_user_secret.data["nextRotation"]), 'utf-8')
self.assertEqual("foo_user", secret_username,
"Unexpected username in secret of foo_user: expected {}, got {}".format("foo_user", secret_username))
self.assertEqual('', next_rotation_timestamp,
@ -1644,6 +1644,42 @@ class EndToEndTestCase(unittest.TestCase):
        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
        self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing")

    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_standby_cluster(self):
        '''
        Create standby cluster streaming from remote primary
        '''
        k8s = self.k8s
        standby_cluster_name = 'acid-standby-cluster'
        cluster_name_label = 'cluster-name'
        cluster_label = 'application=spilo,{}={}'.format(cluster_name_label, standby_cluster_name)
        superuser_name = 'postgres'
        replication_user = 'standby'
        secret_suffix = 'credentials.postgresql.acid.zalan.do'

        # copy secrets from remote cluster before operator creates them when bootstrapping the standby cluster
        postgres_secret = k8s.get_secret(superuser_name)
        postgres_secret.metadata.name = '{}.{}.{}'.format(superuser_name, standby_cluster_name, secret_suffix)
        postgres_secret.metadata.labels[cluster_name_label] = standby_cluster_name
        k8s.create_secret(postgres_secret)

        standby_secret = k8s.get_secret(replication_user)
        standby_secret.metadata.name = '{}.{}.{}'.format(replication_user, standby_cluster_name, secret_suffix)
        standby_secret.metadata.labels[cluster_name_label] = standby_cluster_name
        k8s.create_secret(standby_secret)

        try:
            k8s.create_with_kubectl("manifests/standby-manifest.yaml")
            k8s.wait_for_pod_start("spilo-role=master," + cluster_label)
        except timeout_decorator.TimeoutError:
            print('Operator log: {}'.format(k8s.get_operator_log()))
            raise
        finally:
            # delete the standby cluster so that the k8s_api.get_operator_state works correctly in subsequent tests
            k8s.api.custom_objects_api.delete_namespaced_custom_object(
                "acid.zalan.do", "v1", "default", "postgresqls", "acid-standby-cluster")
            time.sleep(5)

    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_taint_based_eviction(self):
        '''

View File

@ -12,6 +12,6 @@ spec:
version: "14"
# Make this a standby cluster and provide either the s3 bucket path of source cluster or the remote primary host for continuous streaming.
  standby:
    # s3_wal_path: "s3://mybucket/spilo/acid-minimal-cluster/abcd1234-2a4b-4b2a-8c9c-c1234defg567/wal/14/"
    standby_host: "acid-minimal-cluster.default"
    # standby_port: "5432"
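
To try the standby manifest out after aligning the secrets, a short sketch
assuming the default namespace and the cluster name `acid-standby-cluster`
used in the e2e test (the pod name is hypothetical):

```bash
kubectl apply -f manifests/standby-manifest.yaml
# once the pod is running, patronictl shows the member roles of the standby cluster
kubectl exec -it acid-standby-cluster-0 -- patronictl list
```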