toleration diff and nodeReadinessLabel merge with manifest matchExpressions (#1729)

* include tolerations in statefulset comparison
* provide alternative merge behavior of nodeSelectorTerms for node readiness label
* add config option to change affinity merge behavior
* reworked e2e tests around node affinity
Felix Kunde 2022-01-27 15:57:24 +01:00 committed by GitHub
parent fe340192ca
commit a78a619e90
17 changed files with 245 additions and 104 deletions

View File

@ -233,6 +233,11 @@ spec:
type: object
additionalProperties:
type: string
node_readiness_label_merge:
type: string
enum:
- "AND"
- "OR"
oauth_token_secret_name:
type: string
default: "postgresql-operator"

View File

@ -132,6 +132,9 @@ configKubernetes:
# node_readiness_label:
# status: ready
# defines how nodeAffinity from the manifest should be merged with node_readiness_label
# node_readiness_label_merge: "OR"
# namespaced name of the secret containing the OAuth2 token to pass to the teams API
# oauth_token_secret_name: postgresql-operator

View File

@ -339,6 +339,81 @@ master pods from being evicted by the K8s runtime. To prevent eviction
completely, specify the toleration by leaving out the `tolerationSeconds` value
(similar to how Kubernetes' own DaemonSets are configured).
## Node readiness labels
The operator can watch certain node labels to detect, for example, the start of a
Kubernetes cluster upgrade procedure and move master pods off the nodes to be
decommissioned. Key-value pairs for these node readiness labels can be
specified in the configuration (option name is in singular form):
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: postgres-operator
data:
node_readiness_label: "status1:ready,status2:ready"
```
```yaml
apiVersion: "acid.zalan.do/v1"
kind: OperatorConfiguration
metadata:
name: postgresql-configuration
configuration:
kubernetes:
node_readiness_label:
status1: ready
status2: ready
```
The operator will create a `nodeAffinity` on the pods. This makes the
`node_readiness_label` option the global configuration for defining node
affinities for all Postgres clusters. You can define both cluster-specific and
global affinity; they will be merged on the pods. If
`node_readiness_label_merge` is set to `"AND"`, the node readiness expressions
end up under the same `matchExpressions` section(s) as the manifest affinity:
```yaml
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: environment
operator: In
values:
- pci
- key: status1
operator: In
values:
- ready
- key: status2
...
```
If `node_readiness_label_merge` is set to `"OR"` (the default), the readiness
label affinity is appended as its own `matchExpressions` block:
```yaml
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: environment
...
- matchExpressions:
- key: storage
...
- matchExpressions:
- key: status1
...
- key: status2
...
```
## Enable pod anti affinity
To ensure Postgres pods are running on different topologies, you can use

View File

@ -344,11 +344,16 @@ configuration they are grouped under the `kubernetes` key.
* **node_readiness_label**
a set of labels that a running and active node should possess to be
considered `ready`. The operator uses values of those labels to detect the
start of the Kubernetes cluster upgrade procedure and move master pods off
the nodes to be decommissioned. When the set is not empty, the operator also
assigns the `Affinity` clause to the Postgres pods to be scheduled only on
`ready` nodes. The default is empty.
considered `ready`. When the set is not empty, the operator assigns the
`nodeAffinity` clause to the Postgres pods to be scheduled only on `ready`
nodes. The default is empty.
* **node_readiness_label_merge**
If a `nodeAffinity` is also specified in the Postgres cluster manifest,
it will be merged with the `node_readiness_label` affinity on the pods.
The merge strategy is configurable and can be either "AND" or "OR".
See the [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes)
for more details. The default is "OR".
* **toleration**
a dictionary that should contain `key`, `operator`, `value` and

View File

@ -671,7 +671,9 @@ configured [default requests](reference/operator_parameters.md#kubernetes-resour
To ensure Postgres pods are running on nodes without any other application pods,
you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
and configure the required toleration in the manifest.
and configure the required toleration in the manifest. Tolerations can also be
defined in the [operator config](administrator.md#use-taints-and-tolerations-for-dedicated-postgresql-nodes)
to apply to all Postgres clusters.
```yaml
spec:
@ -703,6 +705,9 @@ spec:
- pci
```
If you need to define a `nodeAffinity` for all your Postgres clusters, use the
`node_readiness_label` [configuration](administrator.md#node-readiness-labels).
## In-place major version upgrade
Starting with Spilo 13, operator supports in-place major version upgrade to a

View File

@ -53,7 +53,7 @@ class K8s:
return master_pod_node, replica_pod_nodes
def get_cluster_nodes(self, cluster_labels='cluster-name=acid-minimal-cluster', namespace='default'):
def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
m = []
r = []
podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels)

View File

@ -286,7 +286,7 @@ class EndToEndTestCase(unittest.TestCase):
# revert config change
revert_resync = {
"data": {
"resync_period": "30m",
"resync_period": "4m",
},
}
k8s.update_config(revert_resync)
@ -880,12 +880,10 @@ class EndToEndTestCase(unittest.TestCase):
# verify we are in good state from potential previous tests
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
# get nodes of master and replica(s)
master_node, replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(master_node, [])
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
# label node with environment=postgres
@ -898,8 +896,8 @@ class EndToEndTestCase(unittest.TestCase):
}
try:
# patch current master node with the label
k8s.api.core_v1.patch_node(master_node, node_label_body)
# patch master node with the label
k8s.api.core_v1.patch_node(master_nodes[0], node_label_body)
# add node affinity to cluster
patch_node_affinity_config = {
@ -923,7 +921,6 @@ class EndToEndTestCase(unittest.TestCase):
}
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
group="acid.zalan.do",
version="v1",
@ -934,14 +931,17 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement
k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
# next master will be switched over and pod needs to be replaced as well to finish the rolling update
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
for pod in podsList.items:
if pod.metadata.labels.get('spilo-role') == 'replica':
self.assertEqual(master_node, pod.spec.node_name,
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name))
self.assertEqual(master_nodes[0], pod.spec.node_name,
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_nodes[0], pod.spec.node_name))
# check that pod has correct node affinity
key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key
@ -966,15 +966,17 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# node affinity change should cause another rolling update and relocation of replica
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# toggle pod anti affinity to make sure replica and master run on separate nodes
self.assert_distributed_pods(replica_nodes)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
@unittest.skip("Skipping this test until fixed")
def test_node_readiness_label(self):
'''
Remove node readiness label from master node. This must cause a failover.
@ -984,12 +986,15 @@ class EndToEndTestCase(unittest.TestCase):
readiness_label = 'lifecycle-status'
readiness_value = 'ready'
try:
# get nodes of master and replica(s) (expected target of new master)
current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
num_replicas = len(current_replica_nodes)
failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes)
# verify we are in good state from potential previous tests
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
# get nodes of master and replica(s) (expected target of new master)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
try:
# add node_readiness_label to potential failover nodes
patch_readiness_label = {
"metadata": {
@ -998,30 +1003,43 @@ class EndToEndTestCase(unittest.TestCase):
}
}
}
self.assertTrue(len(failover_targets) > 0, "No failover targets available")
for failover_target in failover_targets:
k8s.api.core_v1.patch_node(failover_target, patch_readiness_label)
for replica_node in replica_nodes:
k8s.api.core_v1.patch_node(replica_node, patch_readiness_label)
# define node_readiness_label in config map which should trigger a failover of the master
# define node_readiness_label in config map which should trigger a rolling update
patch_readiness_label_config = {
"data": {
"node_readiness_label": readiness_label + ':' + readiness_value,
"node_readiness_label_merge": "AND",
}
}
k8s.update_config(patch_readiness_label_config, "setting readiness label")
new_master_node, new_replica_nodes = self.assert_failover(
current_master_node, num_replicas, failover_targets, cluster_label)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# first replica will be replaced and get the new affinity
# however, it might not start due to a volume node affinity conflict
# in this case it can only be scheduled after the pvc and the pod are deleted
replica = k8s.get_cluster_replica_pod()
if replica.status.phase == 'Pending':
k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default')
k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default')
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
# next master will be switched over and pod needs to be replaced as well to finish the rolling update
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
# patch also node where master ran before
k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label)
# toggle pod anti affinity to move replica away from master node
self.eventuallyTrue(lambda: self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label), "Pods are redistributed")
k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# toggle pod anti affinity to move replica away from master node
self.assert_distributed_pods(master_nodes)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_overwrite_pooler_deployment(self):
k8s = self.k8s
@ -1309,7 +1327,7 @@ class EndToEndTestCase(unittest.TestCase):
patch_resync_config = {
"data": {
"pod_label_wait_timeout": "10m",
"resync_period": "30m",
"resync_period": "4m",
}
}
k8s.update_config(patch_resync_config, "revert resync interval and pod_label_wait_timeout")
@ -1413,7 +1431,6 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
@unittest.skip("Skipping this test until fixed")
def test_taint_based_eviction(self):
'''
Add taint "postgres=:NoExecute" to node with master. This must cause a failover.
@ -1427,7 +1444,6 @@ class EndToEndTestCase(unittest.TestCase):
# get nodes of master and replica(s) (expected target of new master)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
@ -1442,10 +1458,7 @@ class EndToEndTestCase(unittest.TestCase):
]
}
}
k8s.api.core_v1.patch_node(master_nodes[0], body)
self.eventuallyTrue(lambda: k8s.get_cluster_nodes()[0], replica_nodes)
self.assertNotEqual(lambda: k8s.get_cluster_nodes()[0], master_nodes)
# add toleration to pods
patch_toleration_config = {
@ -1454,15 +1467,20 @@ class EndToEndTestCase(unittest.TestCase):
}
}
k8s.update_config(patch_toleration_config, step="allow tainted nodes")
try:
k8s.update_config(patch_toleration_config, step="allow tainted nodes")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"},
"Operator does not get in sync")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# toggle pod anti affinity to move replica away from master node
nm, new_replica_nodes = k8s.get_cluster_nodes()
new_master_node = nm[0]
self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
self.assert_distributed_pods(master_nodes)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_zz_cluster_deletion(self):
@ -1549,39 +1567,6 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.update_config(patch_delete_annotations)
def get_failover_targets(self, master_node, replica_nodes):
'''
If all pods live on the same node, failover will happen to other worker(s)
'''
k8s = self.k8s
k8s_master_exclusion = 'kubernetes.io/hostname!=postgres-operator-e2e-tests-control-plane'
failover_targets = [x for x in replica_nodes if x != master_node]
if len(failover_targets) == 0:
nodes = k8s.api.core_v1.list_node(label_selector=k8s_master_exclusion)
for n in nodes.items:
if n.metadata.name != master_node:
failover_targets.append(n.metadata.name)
return failover_targets
def assert_failover(self, current_master_node, num_replicas, failover_targets, cluster_label):
'''
Check if master is failing over. The replica should move first to be the switchover target
'''
k8s = self.k8s
k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(current_master_node, new_master_node,
"Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
self.assertEqual(num_replicas, len(new_replica_nodes),
"Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
self.assert_master_is_unique()
return new_master_node, new_replica_nodes
def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"):
'''
Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
@ -1593,14 +1578,23 @@ class EndToEndTestCase(unittest.TestCase):
num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))
def assert_distributed_pods(self, master_node, replica_nodes, cluster_label):
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=acid-minimal-cluster'):
'''
Other tests can lead to the situation that master and replica are on the same node.
Toggle pod anti affinity to distribute pods across nodes (replica in particular).
'''
k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
failover_targets = self.get_failover_targets(master_node, replica_nodes)
cluster_labels = 'application=spilo,cluster-name=acid-minimal-cluster'
# get nodes of master and replica(s)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
# if nodes are different we can quit here
if master_nodes[0] not in replica_nodes:
return True
# enable pod anti affinity in config map which should trigger movement of replica
patch_enable_antiaffinity = {
@ -1608,18 +1602,40 @@ class EndToEndTestCase(unittest.TestCase):
"enable_pod_antiaffinity": "true"
}
}
k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
self.assert_failover(master_node, len(replica_nodes), failover_targets, cluster_label)
# now disable pod anti affinity again which will cause yet another failover
patch_disable_antiaffinity = {
"data": {
"enable_pod_antiaffinity": "false"
try:
k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
k8s.wait_for_running_pods(cluster_labels, 2)
# now disable pod anti affinity again which will cause yet another failover
patch_disable_antiaffinity = {
"data": {
"enable_pod_antiaffinity": "false"
}
}
}
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
k8s.wait_for_running_pods(cluster_labels, 2)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
# if master and replica still share a node, report the distribution as failed
for target_node in target_nodes:
if (target_node not in master_nodes or target_node not in replica_nodes) and master_nodes[0] in replica_nodes:
print('Pods run on the same node')
return False
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
return True
def list_databases(self, pod_name):

View File

@ -86,7 +86,8 @@ data:
# min_cpu_limit: 250m
# min_memory_limit: 250Mi
# minimal_major_version: "9.6"
# node_readiness_label: ""
# node_readiness_label: "status:ready"
# node_readiness_label_merge: "OR"
# oauth_token_secret_name: postgresql-operator
# pam_configuration: |
# https://info.example.com/oauth2/tokeninfo?access_token= uid realm=/employees

View File

@ -228,6 +228,11 @@ spec:
type: object
additionalProperties:
type: string
node_readiness_label_merge:
type: string
enum:
- "AND"
- "OR"
oauth_token_secret_name:
type: string
default: "postgresql-operator"

View File

@ -70,6 +70,7 @@ configuration:
master_pod_move_timeout: 20m
# node_readiness_label:
# status: ready
# node_readiness_label_merge: "OR"
oauth_token_secret_name: postgresql-operator
pdb_name_format: "postgres-{cluster}-pdb"
pod_antiaffinity_topology_key: "kubernetes.io/hostname"

View File

@ -1167,6 +1167,17 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{
},
},
},
"node_readiness_label_merge": {
Type: "string",
Enum: []apiextv1.JSON{
{
Raw: []byte(`"AND"`),
},
{
Raw: []byte(`"OR"`),
},
},
},
"oauth_token_secret_name": {
Type: "string",
},

View File

@ -82,6 +82,7 @@ type KubernetesMetaConfiguration struct {
DeleteAnnotationDateKey string `json:"delete_annotation_date_key,omitempty"`
DeleteAnnotationNameKey string `json:"delete_annotation_name_key,omitempty"`
NodeReadinessLabel map[string]string `json:"node_readiness_label,omitempty"`
NodeReadinessLabelMerge string `json:"node_readiness_label_merge,omitempty"`
CustomPodAnnotations map[string]string `json:"custom_pod_annotations,omitempty"`
// TODO: use a proper toleration structure?
PodToleration map[string]string `json:"toleration,omitempty"`

View File

@ -375,7 +375,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
reasons = append(reasons, "new statefulset's number of replicas does not match the current one")
}
if !reflect.DeepEqual(c.Statefulset.Annotations, statefulSet.Annotations) {
match = false
needsReplace = true
reasons = append(reasons, "new statefulset's annotations do not match the current one")
}
@ -406,6 +405,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod affinity does not match the current one")
}
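// note that only the number of tolerations is compared; a change inside an existing toleration is not detected here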
if len(c.Statefulset.Spec.Template.Spec.Tolerations) != len(statefulSet.Spec.Template.Spec.Tolerations) {
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod tolerations does not match the current one")
}
// Some generated fields like creationTimestamp make it not possible to use DeepCompare on Spec.Template.ObjectMeta
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Labels, statefulSet.Spec.Template.Labels) {
@ -427,13 +431,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
}
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Annotations, statefulSet.Spec.Template.Annotations) {
match = false
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod template metadata annotations does not match the current one")
}
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.SecurityContext, statefulSet.Spec.Template.Spec.SecurityContext) {
match = false
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod template security context in spec does not match the current one")
@ -469,7 +471,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
// we assume any change in priority happens by rolling out a new priority class
// changing the priority value in an existing class is not supported
if c.Statefulset.Spec.Template.Spec.PriorityClassName != statefulSet.Spec.Template.Spec.PriorityClassName {
match = false
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod priority class in spec does not match the current one")

View File

@ -309,7 +309,7 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) (
},
}
nodeAffinity := nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
if c.OpConfig.EnablePodAntiAffinity {
labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels)
podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity)

View File

@ -327,7 +327,7 @@ func generateCapabilities(capabilities []string) *v1.Capabilities {
return nil
}
func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
if len(nodeReadinessLabel) == 0 && nodeAffinity == nil {
return nil
}
@ -352,8 +352,18 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff
},
}
} else {
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm),
if c.OpConfig.NodeReadinessLabelMerge == "OR" {
manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm)
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
NodeSelectorTerms: manifestTerms,
}
} else if c.OpConfig.NodeReadinessLabelMerge == "AND" {
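// "AND": append the readiness expressions to every manifest term; expressions within a term must all match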
for i, nodeSelectorTerm := range nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
manifestExpressions := nodeSelectorTerm.MatchExpressions
manifestExpressions = append(manifestExpressions, matchExpressions...)
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i] = v1.NodeSelectorTerm{MatchExpressions: manifestExpressions}
}
}
}
}
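For illustration, here is a minimal, self-contained sketch of the two merge modes implemented above. It is not the operator's code: the `mergeReadiness` helper and its inputs are made up for this example, and only the `k8s.io/api/core/v1` types are real.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// mergeReadiness appends the readiness expressions either as an additional
// NodeSelectorTerm ("OR") or into every existing term ("AND").
func mergeReadiness(terms []v1.NodeSelectorTerm, readiness []v1.NodeSelectorRequirement, mode string) []v1.NodeSelectorTerm {
	switch mode {
	case "OR":
		// terms are alternatives for the scheduler, so a separate term relaxes the manifest affinity
		return append(terms, v1.NodeSelectorTerm{MatchExpressions: readiness})
	case "AND":
		// expressions within a term must all match, so appending tightens every manifest term
		merged := make([]v1.NodeSelectorTerm, len(terms))
		for i, t := range terms {
			exprs := append([]v1.NodeSelectorRequirement{}, t.MatchExpressions...)
			merged[i] = v1.NodeSelectorTerm{MatchExpressions: append(exprs, readiness...)}
		}
		return merged
	}
	return terms
}

func main() {
	manifest := []v1.NodeSelectorTerm{{
		MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "environment", Operator: v1.NodeSelectorOpIn, Values: []string{"pci"}},
		},
	}}
	readiness := []v1.NodeSelectorRequirement{
		{Key: "status1", Operator: v1.NodeSelectorOpIn, Values: []string{"ready"}},
		{Key: "status2", Operator: v1.NodeSelectorOpIn, Values: []string{"ready"}},
	}

	fmt.Printf("OR merge:  %+v\n", mergeReadiness(manifest, readiness, "OR"))
	fmt.Printf("AND merge: %+v\n", mergeReadiness(manifest, readiness, "AND"))
}
```

With "OR" the result has two selector terms (the manifest term plus the readiness term); with "AND" it has a single term carrying both the manifest and the readiness expressions, matching the YAML examples in the user docs above.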
@ -1260,7 +1270,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
effectiveRunAsUser,
effectiveRunAsGroup,
effectiveFSGroup,
nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity),
c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity),
spec.SchedulerName,
int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
c.OpConfig.PodServiceAccountName,
@ -2010,7 +2020,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) {
nil,
nil,
nil,
nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
nil,
int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
c.OpConfig.PodServiceAccountName,

View File

@ -110,6 +110,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur
result.DeleteAnnotationDateKey = fromCRD.Kubernetes.DeleteAnnotationDateKey
result.DeleteAnnotationNameKey = fromCRD.Kubernetes.DeleteAnnotationNameKey
result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel
result.NodeReadinessLabelMerge = fromCRD.Kubernetes.NodeReadinessLabelMerge
result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName
result.PodManagementPolicy = util.Coalesce(fromCRD.Kubernetes.PodManagementPolicy, "ordered_ready")
result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m")

View File

@ -55,6 +55,7 @@ type Resources struct {
PodEnvironmentConfigMap spec.NamespacedName `name:"pod_environment_configmap"`
PodEnvironmentSecret string `name:"pod_environment_secret"`
NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""`
NodeReadinessLabelMerge string `name:"node_readiness_label_merge" default:"OR"`
MaxInstances int32 `name:"max_instances" default:"-1"`
MinInstances int32 `name:"min_instances" default:"-1"`
ShmVolume *bool `name:"enable_shm_volume" default:"true"`