toleration diff and nodeReadinessLabel merge with manifest matchExpressions (#1729)
* include tolerations in statefulset comparison * provide alternative merge behavior of nodeSelectorTerms for node readiness label * add config option to change affinity merge behavior * reworked e2e tests around node affinity
This commit is contained in:
parent
fe340192ca
commit
a78a619e90
|
|
@ -233,6 +233,11 @@ spec:
|
|||
type: object
|
||||
additionalProperties:
|
||||
type: string
|
||||
node_readiness_label_merge:
|
||||
type: string
|
||||
enum:
|
||||
- "AND"
|
||||
- "OR"
|
||||
oauth_token_secret_name:
|
||||
type: string
|
||||
default: "postgresql-operator"
|
||||
|
|
|
|||
|
|
@ -132,6 +132,9 @@ configKubernetes:
|
|||
# node_readiness_label:
|
||||
# status: ready
|
||||
|
||||
# defines how nodeAffinity from manifest should be merged with node_readiness_label
|
||||
# node_readiness_label_merge: "OR"
|
||||
|
||||
# namespaced name of the secret containing the OAuth2 token to pass to the teams API
|
||||
# oauth_token_secret_name: postgresql-operator
|
||||
|
||||
|
|
|
|||
|
|
@ -339,6 +339,81 @@ master pods from being evicted by the K8s runtime. To prevent eviction
|
|||
completely, specify the toleration by leaving out the `tolerationSeconds` value
|
||||
(similar to how Kubernetes' own DaemonSets are configured)
|
||||
|
||||
## Node readiness labels
|
||||
|
||||
The operator can watch certain node labels to detect, e.g., the start of a
|
||||
Kubernetes cluster upgrade procedure and move master pods off the nodes to be
|
||||
decommissioned. Key-value pairs for these node readiness labels can be
|
||||
specified in the configuration (option name is in singular form):
|
||||
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: postgres-operator
|
||||
data:
|
||||
node_readiness_label: "status1:ready,status2:ready"
|
||||
```
|
||||
|
||||
```yaml
|
||||
apiVersion: "acid.zalan.do/v1"
|
||||
kind: OperatorConfiguration
|
||||
metadata:
|
||||
name: postgresql-configuration
|
||||
configuration:
|
||||
kubernetes:
|
||||
node_readiness_label:
|
||||
status1: ready
|
||||
status2: ready
|
||||
```
|
||||
|
||||
The operator will create a `nodeAffinity` on the pods. This makes the
|
||||
`node_readiness_label` option the global configuration for defining node
|
||||
affinities for all Postgres clusters. You can have both, cluster-specific and
|
||||
global affinity, defined and they will get merged on the pods. If
|
||||
`node_readiness_label_merge` is configured to `"AND"` the node readiness
|
||||
affinity will end up under the same `matchExpressions` section(s) from the
|
||||
manifest affinity.
|
||||
|
||||
```yaml
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: environment
|
||||
operator: In
|
||||
values:
|
||||
- pci
|
||||
- key: status1
|
||||
operator: In
|
||||
values:
|
||||
- ready
|
||||
- key: status2
|
||||
...
|
||||
```
|
||||
|
||||
If `node_readiness_label_merge` is set to `"OR"` (default) the readiness label
|
||||
affinity will be appended with its own expressions block:
|
||||
|
||||
```yaml
|
||||
affinity:
|
||||
nodeAffinity:
|
||||
requiredDuringSchedulingIgnoredDuringExecution:
|
||||
nodeSelectorTerms:
|
||||
- matchExpressions:
|
||||
- key: environment
|
||||
...
|
||||
- matchExpressions:
|
||||
- key: storage
|
||||
...
|
||||
- matchExpressions:
|
||||
- key: status1
|
||||
...
|
||||
- key: status2
|
||||
...
|
||||
```
|
||||
|
||||
## Enable pod anti affinity
|
||||
|
||||
To ensure Postgres pods are running on different topologies, you can use
|
||||
|
|
|
|||
|
|
@ -344,11 +344,16 @@ configuration they are grouped under the `kubernetes` key.
|
|||
|
||||
* **node_readiness_label**
|
||||
a set of labels that a running and active node should possess to be
|
||||
considered `ready`. The operator uses values of those labels to detect the
|
||||
start of the Kubernetes cluster upgrade procedure and move master pods off
|
||||
the nodes to be decommissioned. When the set is not empty, the operator also
|
||||
assigns the `Affinity` clause to the Postgres pods to be scheduled only on
|
||||
`ready` nodes. The default is empty.
|
||||
considered `ready`. When the set is not empty, the operator assigns the
|
||||
`nodeAffinity` clause to the Postgres pods to be scheduled only on `ready`
|
||||
nodes. The default is empty.
|
||||
|
||||
* **node_readiness_label_merge**
|
||||
If a `nodeAffinity` is also specified in the postgres cluster manifest
|
||||
it will get merged with the `node_readiness_label` affinity on the pods.
|
||||
The merge strategy can be configured - it can either be "AND" or "OR".
|
||||
See [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes)
|
||||
for more details. Default is "OR".
|
||||
|
||||
* **toleration**
|
||||
a dictionary that should contain `key`, `operator`, `value` and
|
||||
|
|
|
|||
|
|
@ -671,7 +671,9 @@ configured [default requests](reference/operator_parameters.md#kubernetes-resour
|
|||
|
||||
To ensure Postgres pods are running on nodes without any other application pods,
|
||||
you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
|
||||
and configure the required toleration in the manifest.
|
||||
and configure the required toleration in the manifest. Tolerations can also be
|
||||
defined in the [operator config](administrator.md#use-taints-and-tolerations-for-dedicated-postgresql-nodes)
|
||||
to apply for all Postgres clusters.
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
|
|
@ -703,6 +705,9 @@ spec:
|
|||
- pci
|
||||
```
|
||||
|
||||
If you need to define a `nodeAffinity` for all your Postgres clusters use the
|
||||
`node_readiness_label` [configuration](administrator.md#node-readiness-labels).
|
||||
|
||||
## In-place major version upgrade
|
||||
|
||||
Starting with Spilo 13, operator supports in-place major version upgrade to a
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ class K8s:
|
|||
|
||||
return master_pod_node, replica_pod_nodes
|
||||
|
||||
def get_cluster_nodes(self, cluster_labels='cluster-name=acid-minimal-cluster', namespace='default'):
|
||||
def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
|
||||
m = []
|
||||
r = []
|
||||
podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels)
|
||||
|
|
|
|||
|
|
@ -286,7 +286,7 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
# revert config change
|
||||
revert_resync = {
|
||||
"data": {
|
||||
"resync_period": "30m",
|
||||
"resync_period": "4m",
|
||||
},
|
||||
}
|
||||
k8s.update_config(revert_resync)
|
||||
|
|
@ -880,12 +880,10 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
|
||||
# verify we are in good state from potential previous tests
|
||||
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
|
||||
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
|
||||
|
||||
# get nodes of master and replica(s)
|
||||
master_node, replica_nodes = k8s.get_pg_nodes(cluster_label)
|
||||
|
||||
self.assertNotEqual(master_node, [])
|
||||
master_nodes, replica_nodes = k8s.get_cluster_nodes()
|
||||
self.assertNotEqual(master_nodes, [])
|
||||
self.assertNotEqual(replica_nodes, [])
|
||||
|
||||
# label node with environment=postgres
|
||||
|
|
@ -898,8 +896,8 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
}
|
||||
|
||||
try:
|
||||
# patch current master node with the label
|
||||
k8s.api.core_v1.patch_node(master_node, node_label_body)
|
||||
# patch master node with the label
|
||||
k8s.api.core_v1.patch_node(master_nodes[0], node_label_body)
|
||||
|
||||
# add node affinity to cluster
|
||||
patch_node_affinity_config = {
|
||||
|
|
@ -923,7 +921,6 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
k8s.api.custom_objects_api.patch_namespaced_custom_object(
|
||||
group="acid.zalan.do",
|
||||
version="v1",
|
||||
|
|
@ -934,14 +931,17 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
|
||||
|
||||
# node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement
|
||||
k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label)
|
||||
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
# next master will be switched over and pod needs to be replaced as well to finish the rolling update
|
||||
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
|
||||
podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
|
||||
for pod in podsList.items:
|
||||
if pod.metadata.labels.get('spilo-role') == 'replica':
|
||||
self.assertEqual(master_node, pod.spec.node_name,
|
||||
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name))
|
||||
self.assertEqual(master_nodes[0], pod.spec.node_name,
|
||||
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_nodes[0], pod.spec.node_name))
|
||||
|
||||
# check that pod has correct node affinity
|
||||
key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key
|
||||
|
|
@ -966,15 +966,17 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
|
||||
|
||||
# node affinity change should cause another rolling update and relocation of replica
|
||||
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
|
||||
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=replica,' + cluster_label)
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
|
||||
except timeout_decorator.TimeoutError:
|
||||
print('Operator log: {}'.format(k8s.get_operator_log()))
|
||||
raise
|
||||
|
||||
# toggle pod anti affinity to make sure replica and master run on separate nodes
|
||||
self.assert_distributed_pods(replica_nodes)
|
||||
|
||||
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
|
||||
@unittest.skip("Skipping this test until fixed")
|
||||
def test_node_readiness_label(self):
|
||||
'''
|
||||
Remove node readiness label from master node. This must cause a failover.
|
||||
|
|
@ -984,12 +986,15 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
readiness_label = 'lifecycle-status'
|
||||
readiness_value = 'ready'
|
||||
|
||||
try:
|
||||
# get nodes of master and replica(s) (expected target of new master)
|
||||
current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
|
||||
num_replicas = len(current_replica_nodes)
|
||||
failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes)
|
||||
# verify we are in good state from potential previous tests
|
||||
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
|
||||
|
||||
# get nodes of master and replica(s) (expected target of new master)
|
||||
master_nodes, replica_nodes = k8s.get_cluster_nodes()
|
||||
self.assertNotEqual(master_nodes, [])
|
||||
self.assertNotEqual(replica_nodes, [])
|
||||
|
||||
try:
|
||||
# add node_readiness_label to potential failover nodes
|
||||
patch_readiness_label = {
|
||||
"metadata": {
|
||||
|
|
@ -998,30 +1003,43 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
}
|
||||
}
|
||||
}
|
||||
self.assertTrue(len(failover_targets) > 0, "No failover targets available")
|
||||
for failover_target in failover_targets:
|
||||
k8s.api.core_v1.patch_node(failover_target, patch_readiness_label)
|
||||
for replica_node in replica_nodes:
|
||||
k8s.api.core_v1.patch_node(replica_node, patch_readiness_label)
|
||||
|
||||
# define node_readiness_label in config map which should trigger a failover of the master
|
||||
# define node_readiness_label in config map which should trigger a rolling update
|
||||
patch_readiness_label_config = {
|
||||
"data": {
|
||||
"node_readiness_label": readiness_label + ':' + readiness_value,
|
||||
"node_readiness_label_merge": "AND",
|
||||
}
|
||||
}
|
||||
k8s.update_config(patch_readiness_label_config, "setting readiness label")
|
||||
new_master_node, new_replica_nodes = self.assert_failover(
|
||||
current_master_node, num_replicas, failover_targets, cluster_label)
|
||||
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
|
||||
|
||||
# first replica will be replaced and get the new affinity
|
||||
# however, it might not start due to a volume node affinity conflict
|
||||
# in this case only if the pvc and pod are deleted it can be scheduled
|
||||
replica = k8s.get_cluster_replica_pod()
|
||||
if replica.status.phase == 'Pending':
|
||||
k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default')
|
||||
k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default')
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
|
||||
# next master will be switched over and pod needs to be replaced as well to finish the rolling update
|
||||
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
|
||||
# patch also node where master ran before
|
||||
k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label)
|
||||
|
||||
# toggle pod anti affinity to move replica away from master node
|
||||
self.eventuallyTrue(lambda: self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label), "Pods are redistributed")
|
||||
k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label)
|
||||
|
||||
except timeout_decorator.TimeoutError:
|
||||
print('Operator log: {}'.format(k8s.get_operator_log()))
|
||||
raise
|
||||
|
||||
# toggle pod anti affinity to move replica away from master node
|
||||
self.assert_distributed_pods(master_nodes)
|
||||
|
||||
|
||||
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
|
||||
def test_overwrite_pooler_deployment(self):
|
||||
k8s = self.k8s
|
||||
|
|
@ -1309,7 +1327,7 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
patch_resync_config = {
|
||||
"data": {
|
||||
"pod_label_wait_timeout": "10m",
|
||||
"resync_period": "30m",
|
||||
"resync_period": "4m",
|
||||
}
|
||||
}
|
||||
k8s.update_config(patch_resync_config, "revert resync interval and pod_label_wait_timeout")
|
||||
|
|
@ -1413,7 +1431,6 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing")
|
||||
|
||||
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
|
||||
@unittest.skip("Skipping this test until fixed")
|
||||
def test_taint_based_eviction(self):
|
||||
'''
|
||||
Add taint "postgres=:NoExecute" to node with master. This must cause a failover.
|
||||
|
|
@ -1427,7 +1444,6 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
|
||||
# get nodes of master and replica(s) (expected target of new master)
|
||||
master_nodes, replica_nodes = k8s.get_cluster_nodes()
|
||||
|
||||
self.assertNotEqual(master_nodes, [])
|
||||
self.assertNotEqual(replica_nodes, [])
|
||||
|
||||
|
|
@ -1442,10 +1458,7 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
]
|
||||
}
|
||||
}
|
||||
|
||||
k8s.api.core_v1.patch_node(master_nodes[0], body)
|
||||
self.eventuallyTrue(lambda: k8s.get_cluster_nodes()[0], replica_nodes)
|
||||
self.assertNotEqual(lambda: k8s.get_cluster_nodes()[0], master_nodes)
|
||||
|
||||
# add toleration to pods
|
||||
patch_toleration_config = {
|
||||
|
|
@ -1454,15 +1467,20 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
}
|
||||
}
|
||||
|
||||
k8s.update_config(patch_toleration_config, step="allow tainted nodes")
|
||||
try:
|
||||
k8s.update_config(patch_toleration_config, step="allow tainted nodes")
|
||||
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"},
|
||||
"Operator does not get in sync")
|
||||
|
||||
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
|
||||
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
|
||||
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
|
||||
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
|
||||
|
||||
except timeout_decorator.TimeoutError:
|
||||
print('Operator log: {}'.format(k8s.get_operator_log()))
|
||||
raise
|
||||
|
||||
# toggle pod anti affinity to move replica away from master node
|
||||
nm, new_replica_nodes = k8s.get_cluster_nodes()
|
||||
new_master_node = nm[0]
|
||||
self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
|
||||
self.assert_distributed_pods(master_nodes)
|
||||
|
||||
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
|
||||
def test_zz_cluster_deletion(self):
|
||||
|
|
@ -1549,39 +1567,6 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
}
|
||||
k8s.update_config(patch_delete_annotations)
|
||||
|
||||
def get_failover_targets(self, master_node, replica_nodes):
|
||||
'''
|
||||
If all pods live on the same node, failover will happen to other worker(s)
|
||||
'''
|
||||
k8s = self.k8s
|
||||
k8s_master_exclusion = 'kubernetes.io/hostname!=postgres-operator-e2e-tests-control-plane'
|
||||
|
||||
failover_targets = [x for x in replica_nodes if x != master_node]
|
||||
if len(failover_targets) == 0:
|
||||
nodes = k8s.api.core_v1.list_node(label_selector=k8s_master_exclusion)
|
||||
for n in nodes.items:
|
||||
if n.metadata.name != master_node:
|
||||
failover_targets.append(n.metadata.name)
|
||||
|
||||
return failover_targets
|
||||
|
||||
def assert_failover(self, current_master_node, num_replicas, failover_targets, cluster_label):
|
||||
'''
|
||||
Check if master is failing over. The replica should move first to be the switchover target
|
||||
'''
|
||||
k8s = self.k8s
|
||||
k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label)
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
|
||||
new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
|
||||
self.assertNotEqual(current_master_node, new_master_node,
|
||||
"Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
|
||||
self.assertEqual(num_replicas, len(new_replica_nodes),
|
||||
"Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
|
||||
self.assert_master_is_unique()
|
||||
|
||||
return new_master_node, new_replica_nodes
|
||||
|
||||
def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"):
|
||||
'''
|
||||
Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
|
||||
|
|
@ -1593,14 +1578,23 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
|
||||
self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))
|
||||
|
||||
def assert_distributed_pods(self, master_node, replica_nodes, cluster_label):
|
||||
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
|
||||
def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=acid-minimal-cluster'):
|
||||
'''
|
||||
Other tests can lead to the situation that master and replica are on the same node.
|
||||
Toggle pod anti affinity to distribute pods across nodes (replica in particular).
|
||||
'''
|
||||
k8s = self.k8s
|
||||
cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
|
||||
failover_targets = self.get_failover_targets(master_node, replica_nodes)
|
||||
cluster_labels = 'application=spilo,cluster-name=acid-minimal-cluster'
|
||||
|
||||
# get nodes of master and replica(s)
|
||||
master_nodes, replica_nodes = k8s.get_cluster_nodes()
|
||||
self.assertNotEqual(master_nodes, [])
|
||||
self.assertNotEqual(replica_nodes, [])
|
||||
|
||||
# if nodes are different we can quit here
|
||||
if master_nodes[0] not in replica_nodes:
|
||||
return True
|
||||
|
||||
# enable pod anti affinity in config map which should trigger movement of replica
|
||||
patch_enable_antiaffinity = {
|
||||
|
|
@ -1608,18 +1602,40 @@ class EndToEndTestCase(unittest.TestCase):
|
|||
"enable_pod_antiaffinity": "true"
|
||||
}
|
||||
}
|
||||
k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
|
||||
self.assert_failover(master_node, len(replica_nodes), failover_targets, cluster_label)
|
||||
|
||||
# now disable pod anti affinity again which will cause yet another failover
|
||||
patch_disable_antiaffinity = {
|
||||
"data": {
|
||||
"enable_pod_antiaffinity": "false"
|
||||
try:
|
||||
k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
|
||||
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
|
||||
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
|
||||
k8s.wait_for_running_pods(cluster_labels, 2)
|
||||
|
||||
# now disable pod anti affinity again which will cause yet another failover
|
||||
patch_disable_antiaffinity = {
|
||||
"data": {
|
||||
"enable_pod_antiaffinity": "false"
|
||||
}
|
||||
}
|
||||
}
|
||||
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
|
||||
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
|
||||
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
|
||||
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
|
||||
|
||||
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
|
||||
k8s.wait_for_running_pods(cluster_labels, 2)
|
||||
|
||||
master_nodes, replica_nodes = k8s.get_cluster_nodes()
|
||||
self.assertNotEqual(master_nodes, [])
|
||||
self.assertNotEqual(replica_nodes, [])
|
||||
|
||||
# if nodes are different we can quit here
|
||||
for target_node in target_nodes:
|
||||
if (target_node not in master_nodes or target_node not in replica_nodes) and master_nodes[0] in replica_nodes:
|
||||
print('Pods run on the same node')
|
||||
return False
|
||||
|
||||
except timeout_decorator.TimeoutError:
|
||||
print('Operator log: {}'.format(k8s.get_operator_log()))
|
||||
raise
|
||||
|
||||
return True
|
||||
|
||||
def list_databases(self, pod_name):
|
||||
|
|
|
|||
|
|
@ -86,7 +86,8 @@ data:
|
|||
# min_cpu_limit: 250m
|
||||
# min_memory_limit: 250Mi
|
||||
# minimal_major_version: "9.6"
|
||||
# node_readiness_label: ""
|
||||
# node_readiness_label: "status:ready"
|
||||
# node_readiness_label_merge: "OR"
|
||||
# oauth_token_secret_name: postgresql-operator
|
||||
# pam_configuration: |
|
||||
# https://info.example.com/oauth2/tokeninfo?access_token= uid realm=/employees
|
||||
|
|
|
|||
|
|
@ -228,6 +228,11 @@ spec:
|
|||
type: object
|
||||
additionalProperties:
|
||||
type: string
|
||||
node_readiness_label_merge:
|
||||
type: string
|
||||
enum:
|
||||
- "AND"
|
||||
- "OR"
|
||||
oauth_token_secret_name:
|
||||
type: string
|
||||
default: "postgresql-operator"
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ configuration:
|
|||
master_pod_move_timeout: 20m
|
||||
# node_readiness_label:
|
||||
# status: ready
|
||||
# node_readiness_label_merge: "OR"
|
||||
oauth_token_secret_name: postgresql-operator
|
||||
pdb_name_format: "postgres-{cluster}-pdb"
|
||||
pod_antiaffinity_topology_key: "kubernetes.io/hostname"
|
||||
|
|
|
|||
|
|
@ -1167,6 +1167,17 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{
|
|||
},
|
||||
},
|
||||
},
|
||||
"node_readiness_label_merge": {
|
||||
Type: "string",
|
||||
Enum: []apiextv1.JSON{
|
||||
{
|
||||
Raw: []byte(`"AND"`),
|
||||
},
|
||||
{
|
||||
Raw: []byte(`"OR"`),
|
||||
},
|
||||
},
|
||||
},
|
||||
"oauth_token_secret_name": {
|
||||
Type: "string",
|
||||
},
|
||||
|
|
|
|||
|
|
@ -82,6 +82,7 @@ type KubernetesMetaConfiguration struct {
|
|||
DeleteAnnotationDateKey string `json:"delete_annotation_date_key,omitempty"`
|
||||
DeleteAnnotationNameKey string `json:"delete_annotation_name_key,omitempty"`
|
||||
NodeReadinessLabel map[string]string `json:"node_readiness_label,omitempty"`
|
||||
NodeReadinessLabelMerge string `json:"node_readiness_label_merge,omitempty"`
|
||||
CustomPodAnnotations map[string]string `json:"custom_pod_annotations,omitempty"`
|
||||
// TODO: use a proper toleration structure?
|
||||
PodToleration map[string]string `json:"toleration,omitempty"`
|
||||
|
|
|
|||
|
|
@ -375,7 +375,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
|
|||
reasons = append(reasons, "new statefulset's number of replicas does not match the current one")
|
||||
}
|
||||
if !reflect.DeepEqual(c.Statefulset.Annotations, statefulSet.Annotations) {
|
||||
match = false
|
||||
needsReplace = true
|
||||
reasons = append(reasons, "new statefulset's annotations do not match the current one")
|
||||
}
|
||||
|
|
@ -406,6 +405,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
|
|||
needsRollUpdate = true
|
||||
reasons = append(reasons, "new statefulset's pod affinity does not match the current one")
|
||||
}
|
||||
if len(c.Statefulset.Spec.Template.Spec.Tolerations) != len(statefulSet.Spec.Template.Spec.Tolerations) {
|
||||
needsReplace = true
|
||||
needsRollUpdate = true
|
||||
reasons = append(reasons, "new statefulset's pod tolerations does not match the current one")
|
||||
}
|
||||
|
||||
// Some generated fields like creationTimestamp make it impossible to use reflect.DeepEqual on Spec.Template.ObjectMeta
|
||||
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Labels, statefulSet.Spec.Template.Labels) {
|
||||
|
|
@ -427,13 +431,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
|
|||
}
|
||||
|
||||
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Annotations, statefulSet.Spec.Template.Annotations) {
|
||||
match = false
|
||||
needsReplace = true
|
||||
needsRollUpdate = true
|
||||
reasons = append(reasons, "new statefulset's pod template metadata annotations does not match the current one")
|
||||
}
|
||||
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.SecurityContext, statefulSet.Spec.Template.Spec.SecurityContext) {
|
||||
match = false
|
||||
needsReplace = true
|
||||
needsRollUpdate = true
|
||||
reasons = append(reasons, "new statefulset's pod template security context in spec does not match the current one")
|
||||
|
|
@ -469,7 +471,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
|
|||
// we assume any change in priority happens by rolling out a new priority class
|
||||
// changing the priority value in an existing class is not supported
|
||||
if c.Statefulset.Spec.Template.Spec.PriorityClassName != statefulSet.Spec.Template.Spec.PriorityClassName {
|
||||
match = false
|
||||
needsReplace = true
|
||||
needsRollUpdate = true
|
||||
reasons = append(reasons, "new statefulset's pod priority class in spec does not match the current one")
|
||||
|
|
|
|||
|
|
@ -309,7 +309,7 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) (
|
|||
},
|
||||
}
|
||||
|
||||
nodeAffinity := nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
|
||||
nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
|
||||
if c.OpConfig.EnablePodAntiAffinity {
|
||||
labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels)
|
||||
podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity)
|
||||
|
|
|
|||
|
|
@ -327,7 +327,7 @@ func generateCapabilities(capabilities []string) *v1.Capabilities {
|
|||
return nil
|
||||
}
|
||||
|
||||
func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
|
||||
func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
|
||||
if len(nodeReadinessLabel) == 0 && nodeAffinity == nil {
|
||||
return nil
|
||||
}
|
||||
|
|
@ -352,8 +352,18 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff
|
|||
},
|
||||
}
|
||||
} else {
|
||||
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
|
||||
NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm),
|
||||
if c.OpConfig.NodeReadinessLabelMerge == "OR" {
|
||||
manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
|
||||
manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm)
|
||||
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
|
||||
NodeSelectorTerms: manifestTerms,
|
||||
}
|
||||
} else if c.OpConfig.NodeReadinessLabelMerge == "AND" {
|
||||
for i, nodeSelectorTerm := range nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
|
||||
manifestExpressions := nodeSelectorTerm.MatchExpressions
|
||||
manifestExpressions = append(manifestExpressions, matchExpressions...)
|
||||
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i] = v1.NodeSelectorTerm{MatchExpressions: manifestExpressions}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1260,7 +1270,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
|
|||
effectiveRunAsUser,
|
||||
effectiveRunAsGroup,
|
||||
effectiveFSGroup,
|
||||
nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity),
|
||||
c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity),
|
||||
spec.SchedulerName,
|
||||
int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
|
||||
c.OpConfig.PodServiceAccountName,
|
||||
|
|
@ -2010,7 +2020,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) {
|
|||
nil,
|
||||
nil,
|
||||
nil,
|
||||
nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
|
||||
c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
|
||||
nil,
|
||||
int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
|
||||
c.OpConfig.PodServiceAccountName,
|
||||
|
|
|
|||
|
|
@ -110,6 +110,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur
|
|||
result.DeleteAnnotationDateKey = fromCRD.Kubernetes.DeleteAnnotationDateKey
|
||||
result.DeleteAnnotationNameKey = fromCRD.Kubernetes.DeleteAnnotationNameKey
|
||||
result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel
|
||||
result.NodeReadinessLabelMerge = fromCRD.Kubernetes.NodeReadinessLabelMerge
|
||||
result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName
|
||||
result.PodManagementPolicy = util.Coalesce(fromCRD.Kubernetes.PodManagementPolicy, "ordered_ready")
|
||||
result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m")
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ type Resources struct {
|
|||
PodEnvironmentConfigMap spec.NamespacedName `name:"pod_environment_configmap"`
|
||||
PodEnvironmentSecret string `name:"pod_environment_secret"`
|
||||
NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""`
|
||||
NodeReadinessLabelMerge string `name:"node_readiness_label_merge" default:"OR"`
|
||||
MaxInstances int32 `name:"max_instances" default:"-1"`
|
||||
MinInstances int32 `name:"min_instances" default:"-1"`
|
||||
ShmVolume *bool `name:"enable_shm_volume" default:"true"`
|
||||
|
|
|
|||
Loading…
Reference in New Issue