toleration diff and nodeReadinessLabel merge with manifest matchExpressions (#1729)

* include tolerations in statefulset comparison
* provide alternative merge behavior of nodeSelectorTerms for node readiness label
* add config option to change affinity merge behavior
* reworked e2e tests around node affinity
Felix Kunde 2022-01-27 15:57:24 +01:00 committed by GitHub
parent fe340192ca
commit a78a619e90
17 changed files with 245 additions and 104 deletions

View File

@ -233,6 +233,11 @@ spec:
type: object
additionalProperties:
type: string
node_readiness_label_merge:
type: string
enum:
- "AND"
- "OR"
oauth_token_secret_name:
type: string
default: "postgresql-operator"

View File

@ -132,6 +132,9 @@ configKubernetes:
# node_readiness_label:
# status: ready
# defines how nodeAffinity from the manifest should be merged with node_readiness_label
# node_readiness_label_merge: "OR"
# namespaced name of the secret containing the OAuth2 token to pass to the teams API
# oauth_token_secret_name: postgresql-operator

View File

@ -339,6 +339,81 @@ master pods from being evicted by the K8s runtime. To prevent eviction
completely, specify the toleration by leaving out the `tolerationSeconds` value
(similar to how Kubernetes' own DaemonSets are configured).
## Node readiness labels
The operator can watch certain node labels to detect, for example, the start of a
Kubernetes cluster upgrade procedure and move master pods off the nodes to be
decommissioned. Key-value pairs for these node readiness labels can be
specified in the configuration (option name is in singular form):
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: postgres-operator
data:
node_readiness_label: "status1:ready,status2:ready"
```
```yaml
apiVersion: "acid.zalan.do/v1"
kind: OperatorConfiguration
metadata:
name: postgresql-configuration
configuration:
kubernetes:
node_readiness_label:
status1: ready
status2: ready
```
The operator will create a `nodeAffinity` on the pods. This makes the
`node_readiness_label` option the global configuration for defining node
affinities for all Postgres clusters. You can define both cluster-specific and
global affinity; they will be merged on the pods. If
`node_readiness_label_merge` is set to `"AND"`, the node readiness expressions
end up under the same `matchExpressions` section(s) as the manifest affinity:
```yaml
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: environment
operator: In
values:
- pci
- key: status1
operator: In
values:
- ready
- key: status2
...
```
If `node_readiness_label_merge` is set to `"OR"` (the default), the readiness
label affinity is appended as its own `matchExpressions` block:
```yaml
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: environment
...
- matchExpressions:
- key: storage
...
- matchExpressions:
- key: status1
...
- key: status2
...
```
## Enable pod anti affinity
To ensure Postgres pods are running on different topologies, you can use

View File

@ -344,11 +344,16 @@ configuration they are grouped under the `kubernetes` key.
* **node_readiness_label**
a set of labels that a running and active node should possess to be
considered `ready`. The operator uses values of those labels to detect the
start of the Kubernetes cluster upgrade procedure and move master pods off
the nodes to be decommissioned. When the set is not empty, the operator also
assigns the `Affinity` clause to the Postgres pods to be scheduled only on
`ready` nodes. The default is empty.
considered `ready`. When the set is not empty, the operator assigns the
`nodeAffinity` clause to the Postgres pods to be scheduled only on `ready`
nodes. The default is empty.
* **node_readiness_label_merge**
If a `nodeAffinity` is also specified in the Postgres cluster manifest,
it will be merged with the `node_readiness_label` affinity on the pods.
The merge strategy is configurable and can be either "AND" or "OR".
See the [user docs](../user.md#use-taints-tolerations-and-node-affinity-for-dedicated-postgresql-nodes)
for more details. The default is "OR".
* **toleration**
a dictionary that should contain `key`, `operator`, `value` and

View File

@ -671,7 +671,9 @@ configured [default requests](reference/operator_parameters.md#kubernetes-resour
To ensure Postgres pods are running on nodes without any other application pods,
you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
and configure the required toleration in the manifest.
and configure the required toleration in the manifest. Tolerations can also be
defined in the [operator config](administrator.md#use-taints-and-tolerations-for-dedicated-postgresql-nodes)
to apply to all Postgres clusters.
```yaml
spec:
@ -703,6 +705,9 @@ spec:
- pci
```
If you need to define a `nodeAffinity` for all your Postgres clusters, use the
`node_readiness_label` [configuration](administrator.md#node-readiness-labels).
## In-place major version upgrade
Starting with Spilo 13, operator supports in-place major version upgrade to a

View File

@ -53,7 +53,7 @@ class K8s:
return master_pod_node, replica_pod_nodes
def get_cluster_nodes(self, cluster_labels='cluster-name=acid-minimal-cluster', namespace='default'):
def get_cluster_nodes(self, cluster_labels='application=spilo,cluster-name=acid-minimal-cluster', namespace='default'):
m = []
r = []
podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=cluster_labels)

View File

@ -286,7 +286,7 @@ class EndToEndTestCase(unittest.TestCase):
# revert config change
revert_resync = {
"data": {
"resync_period": "30m",
"resync_period": "4m",
},
}
k8s.update_config(revert_resync)
@ -880,12 +880,10 @@ class EndToEndTestCase(unittest.TestCase):
# verify we are in good state from potential previous tests
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
# get nodes of master and replica(s)
master_node, replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(master_node, [])
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
# label node with environment=postgres
@ -898,8 +896,8 @@ class EndToEndTestCase(unittest.TestCase):
}
try:
# patch current master node with the label
k8s.api.core_v1.patch_node(master_node, node_label_body)
# patch master node with the label
k8s.api.core_v1.patch_node(master_nodes[0], node_label_body)
# add node affinity to cluster
patch_node_affinity_config = {
@ -923,7 +921,6 @@ class EndToEndTestCase(unittest.TestCase):
}
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
group="acid.zalan.do",
version="v1",
@ -934,14 +931,17 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement
k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
# next master will be switched over and pod needs to be replaced as well to finish the rolling update
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
for pod in podsList.items:
if pod.metadata.labels.get('spilo-role') == 'replica':
self.assertEqual(master_node, pod.spec.node_name,
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name))
self.assertEqual(master_nodes[0], pod.spec.node_name,
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_nodes[0], pod.spec.node_name))
# check that pod has correct node affinity
key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key
@ -966,15 +966,17 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# node affinity change should cause another rolling update and relocation of replica
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# toggle pod anti affinity to make sure replica and master run on separate nodes
self.assert_distributed_pods(replica_nodes)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
@unittest.skip("Skipping this test until fixed")
def test_node_readiness_label(self):
'''
Remove node readiness label from master node. This must cause a failover.
@ -984,12 +986,15 @@ class EndToEndTestCase(unittest.TestCase):
readiness_label = 'lifecycle-status'
readiness_value = 'ready'
try:
# get nodes of master and replica(s) (expected target of new master)
current_master_node, current_replica_nodes = k8s.get_pg_nodes(cluster_label)
num_replicas = len(current_replica_nodes)
failover_targets = self.get_failover_targets(current_master_node, current_replica_nodes)
# verify we are in good state from potential previous tests
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
# get nodes of master and replica(s) (expected target of new master)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
try:
# add node_readiness_label to potential failover nodes
patch_readiness_label = {
"metadata": {
@ -998,30 +1003,43 @@ class EndToEndTestCase(unittest.TestCase):
}
}
}
self.assertTrue(len(failover_targets) > 0, "No failover targets available")
for failover_target in failover_targets:
k8s.api.core_v1.patch_node(failover_target, patch_readiness_label)
for replica_node in replica_nodes:
k8s.api.core_v1.patch_node(replica_node, patch_readiness_label)
# define node_readiness_label in config map which should trigger a failover of the master
# define node_readiness_label in config map which should trigger a rolling update
patch_readiness_label_config = {
"data": {
"node_readiness_label": readiness_label + ':' + readiness_value,
"node_readiness_label_merge": "AND",
}
}
k8s.update_config(patch_readiness_label_config, "setting readiness label")
new_master_node, new_replica_nodes = self.assert_failover(
current_master_node, num_replicas, failover_targets, cluster_label)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# first replica will be replaced and get the new affinity
# however, it might not start due to a volume node affinity conflict
# in this case it can only be scheduled after the pvc and the pod are deleted
replica = k8s.get_cluster_replica_pod()
if replica.status.phase == 'Pending':
k8s.api.core_v1.delete_namespaced_persistent_volume_claim('pgdata-' + replica.metadata.name, 'default')
k8s.api.core_v1.delete_namespaced_pod(replica.metadata.name, 'default')
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
# next master will be switched over and pod needs to be replaced as well to finish the rolling update
k8s.wait_for_pod_failover(replica_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
# patch also node where master ran before
k8s.api.core_v1.patch_node(current_master_node, patch_readiness_label)
# toggle pod anti affinity to move replica away from master node
self.eventuallyTrue(lambda: self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label), "Pods are redistributed")
k8s.api.core_v1.patch_node(master_nodes[0], patch_readiness_label)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# toggle pod anti affinity to move replica away from master node
self.assert_distributed_pods(master_nodes)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_overwrite_pooler_deployment(self):
k8s = self.k8s
@ -1309,7 +1327,7 @@ class EndToEndTestCase(unittest.TestCase):
patch_resync_config = {
"data": {
"pod_label_wait_timeout": "10m",
"resync_period": "30m",
"resync_period": "4m",
}
}
k8s.update_config(patch_resync_config, "revert resync interval and pod_label_wait_timeout")
@ -1413,7 +1431,6 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyTrue(lambda: k8s.check_statefulset_annotations(cluster_label, annotations), "Annotations missing")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
@unittest.skip("Skipping this test until fixed")
def test_taint_based_eviction(self):
'''
Add taint "postgres=:NoExecute" to node with master. This must cause a failover.
@ -1427,7 +1444,6 @@ class EndToEndTestCase(unittest.TestCase):
# get nodes of master and replica(s) (expected target of new master)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
@ -1442,10 +1458,7 @@ class EndToEndTestCase(unittest.TestCase):
]
}
}
k8s.api.core_v1.patch_node(master_nodes[0], body)
self.eventuallyTrue(lambda: k8s.get_cluster_nodes()[0], replica_nodes)
self.assertNotEqual(lambda: k8s.get_cluster_nodes()[0], master_nodes)
# add toleration to pods
patch_toleration_config = {
@ -1454,15 +1467,20 @@ class EndToEndTestCase(unittest.TestCase):
}
}
k8s.update_config(patch_toleration_config, step="allow tainted nodes")
try:
k8s.update_config(patch_toleration_config, step="allow tainted nodes")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"},
"Operator does not get in sync")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
# toggle pod anti affinity to move replica away from master node
nm, new_replica_nodes = k8s.get_cluster_nodes()
new_master_node = nm[0]
self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
self.assert_distributed_pods(master_nodes)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_zz_cluster_deletion(self):
@ -1549,39 +1567,6 @@ class EndToEndTestCase(unittest.TestCase):
}
k8s.update_config(patch_delete_annotations)
def get_failover_targets(self, master_node, replica_nodes):
'''
If all pods live on the same node, failover will happen to other worker(s)
'''
k8s = self.k8s
k8s_master_exclusion = 'kubernetes.io/hostname!=postgres-operator-e2e-tests-control-plane'
failover_targets = [x for x in replica_nodes if x != master_node]
if len(failover_targets) == 0:
nodes = k8s.api.core_v1.list_node(label_selector=k8s_master_exclusion)
for n in nodes.items:
if n.metadata.name != master_node:
failover_targets.append(n.metadata.name)
return failover_targets
def assert_failover(self, current_master_node, num_replicas, failover_targets, cluster_label):
'''
Check if master is failing over. The replica should move first to be the switchover target
'''
k8s = self.k8s
k8s.wait_for_pod_failover(failover_targets, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(current_master_node, new_master_node,
"Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
self.assertEqual(num_replicas, len(new_replica_nodes),
"Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
self.assert_master_is_unique()
return new_master_node, new_replica_nodes
def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"):
'''
Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
@ -1593,14 +1578,23 @@ class EndToEndTestCase(unittest.TestCase):
num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))
def assert_distributed_pods(self, master_node, replica_nodes, cluster_label):
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=acid-minimal-cluster'):
'''
Other tests can lead to the situation that master and replica are on the same node.
Toggle pod anti affinity to distribute pods across nodes (replica in particular).
'''
k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
failover_targets = self.get_failover_targets(master_node, replica_nodes)
cluster_labels = 'application=spilo,cluster-name=acid-minimal-cluster'
# get nodes of master and replica(s)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
# if nodes are different we can quit here
if master_nodes[0] not in replica_nodes:
return True
# enable pod anti affinity in config map which should trigger movement of replica
patch_enable_antiaffinity = {
@ -1608,18 +1602,40 @@ class EndToEndTestCase(unittest.TestCase):
"enable_pod_antiaffinity": "true"
}
}
k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
self.assert_failover(master_node, len(replica_nodes), failover_targets, cluster_label)
# now disable pod anti affinity again which will cause yet another failover
patch_disable_antiaffinity = {
"data": {
"enable_pod_antiaffinity": "false"
try:
k8s.update_config(patch_enable_antiaffinity, "enable antiaffinity")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
k8s.wait_for_running_pods(cluster_labels, 2)
# now disable pod anti affinity again which will cause yet another failover
patch_disable_antiaffinity = {
"data": {
"enable_pod_antiaffinity": "false"
}
}
}
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
k8s.wait_for_running_pods(cluster_labels, 2)
master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])
# if master and replica still share a node, report the distribution as failed
for target_node in target_nodes:
if (target_node not in master_nodes or target_node not in replica_nodes) and master_nodes[0] in replica_nodes:
print('Pods run on the same node')
return False
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
return True
def list_databases(self, pod_name):

View File

@ -86,7 +86,8 @@ data:
# min_cpu_limit: 250m
# min_memory_limit: 250Mi
# minimal_major_version: "9.6"
# node_readiness_label: ""
# node_readiness_label: "status:ready"
# node_readiness_label_merge: "OR"
# oauth_token_secret_name: postgresql-operator
# pam_configuration: |
# https://info.example.com/oauth2/tokeninfo?access_token= uid realm=/employees

View File

@ -228,6 +228,11 @@ spec:
type: object
additionalProperties:
type: string
node_readiness_label_merge:
type: string
enum:
- "AND"
- "OR"
oauth_token_secret_name:
type: string
default: "postgresql-operator"

View File

@ -70,6 +70,7 @@ configuration:
master_pod_move_timeout: 20m
# node_readiness_label:
# status: ready
# node_readiness_label_merge: "OR"
oauth_token_secret_name: postgresql-operator
pdb_name_format: "postgres-{cluster}-pdb"
pod_antiaffinity_topology_key: "kubernetes.io/hostname"

View File

@ -1167,6 +1167,17 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{
},
},
},
"node_readiness_label_merge": {
Type: "string",
Enum: []apiextv1.JSON{
{
Raw: []byte(`"AND"`),
},
{
Raw: []byte(`"OR"`),
},
},
},
"oauth_token_secret_name": {
Type: "string",
},

View File

@ -82,6 +82,7 @@ type KubernetesMetaConfiguration struct {
DeleteAnnotationDateKey string `json:"delete_annotation_date_key,omitempty"`
DeleteAnnotationNameKey string `json:"delete_annotation_name_key,omitempty"`
NodeReadinessLabel map[string]string `json:"node_readiness_label,omitempty"`
NodeReadinessLabelMerge string `json:"node_readiness_label_merge,omitempty"`
CustomPodAnnotations map[string]string `json:"custom_pod_annotations,omitempty"`
// TODO: use a proper toleration structure?
PodToleration map[string]string `json:"toleration,omitempty"`

View File

@ -375,7 +375,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
reasons = append(reasons, "new statefulset's number of replicas does not match the current one")
}
if !reflect.DeepEqual(c.Statefulset.Annotations, statefulSet.Annotations) {
match = false
needsReplace = true
reasons = append(reasons, "new statefulset's annotations do not match the current one")
}
@ -406,6 +405,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod affinity does not match the current one")
}
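// note that only the number of tolerations is compared; a change inside an existing toleration is not detected here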
if len(c.Statefulset.Spec.Template.Spec.Tolerations) != len(statefulSet.Spec.Template.Spec.Tolerations) {
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod tolerations does not match the current one")
}
// Some generated fields like creationTimestamp make it not possible to use DeepCompare on Spec.Template.ObjectMeta
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Labels, statefulSet.Spec.Template.Labels) {
@ -427,13 +431,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
}
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Annotations, statefulSet.Spec.Template.Annotations) {
match = false
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod template metadata annotations does not match the current one")
}
if !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.SecurityContext, statefulSet.Spec.Template.Spec.SecurityContext) {
match = false
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod template security context in spec does not match the current one")
@ -469,7 +471,6 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
// we assume any change in priority happens by rolling out a new priority class
// changing the priority value in an existing class is not supported
if c.Statefulset.Spec.Template.Spec.PriorityClassName != statefulSet.Spec.Template.Spec.PriorityClassName {
match = false
needsReplace = true
needsRollUpdate = true
reasons = append(reasons, "new statefulset's pod priority class in spec does not match the current one")

View File

@ -309,7 +309,7 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) (
},
}
nodeAffinity := nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
if c.OpConfig.EnablePodAntiAffinity {
labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels)
podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity)

View File

@ -327,7 +327,7 @@ func generateCapabilities(capabilities []string) *v1.Capabilities {
return nil
}
func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
if len(nodeReadinessLabel) == 0 && nodeAffinity == nil {
return nil
}
@ -352,8 +352,18 @@ func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAff
},
}
} else {
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm),
if c.OpConfig.NodeReadinessLabelMerge == "OR" {
manifestTerms := nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
manifestTerms = append(manifestTerms, nodeReadinessSelectorTerm)
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
NodeSelectorTerms: manifestTerms,
}
} else if c.OpConfig.NodeReadinessLabelMerge == "AND" {
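// "AND": append the readiness expressions to every manifest term; expressions within a term must all match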
for i, nodeSelectorTerm := range nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
manifestExpressions := nodeSelectorTerm.MatchExpressions
manifestExpressions = append(manifestExpressions, matchExpressions...)
nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[i] = v1.NodeSelectorTerm{MatchExpressions: manifestExpressions}
}
}
}
}
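For illustration, here is a minimal, self-contained sketch of the two merge modes implemented above. It is not the operator's code: the `mergeReadiness` helper and its inputs are made up for this example, and only the `k8s.io/api/core/v1` types are real.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// mergeReadiness appends the readiness expressions either as an additional
// NodeSelectorTerm ("OR") or into every existing term ("AND").
func mergeReadiness(terms []v1.NodeSelectorTerm, readiness []v1.NodeSelectorRequirement, mode string) []v1.NodeSelectorTerm {
	switch mode {
	case "OR":
		// terms are alternatives for the scheduler, so a separate term relaxes the manifest affinity
		return append(terms, v1.NodeSelectorTerm{MatchExpressions: readiness})
	case "AND":
		// expressions within a term must all match, so appending tightens every manifest term
		merged := make([]v1.NodeSelectorTerm, len(terms))
		for i, t := range terms {
			exprs := append([]v1.NodeSelectorRequirement{}, t.MatchExpressions...)
			merged[i] = v1.NodeSelectorTerm{MatchExpressions: append(exprs, readiness...)}
		}
		return merged
	}
	return terms
}

func main() {
	manifest := []v1.NodeSelectorTerm{{
		MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "environment", Operator: v1.NodeSelectorOpIn, Values: []string{"pci"}},
		},
	}}
	readiness := []v1.NodeSelectorRequirement{
		{Key: "status1", Operator: v1.NodeSelectorOpIn, Values: []string{"ready"}},
		{Key: "status2", Operator: v1.NodeSelectorOpIn, Values: []string{"ready"}},
	}

	fmt.Printf("OR merge:  %+v\n", mergeReadiness(manifest, readiness, "OR"))
	fmt.Printf("AND merge: %+v\n", mergeReadiness(manifest, readiness, "AND"))
}
```

With "OR" the result has two selector terms (the manifest term plus the readiness term); with "AND" it has a single term carrying both the manifest and the readiness expressions, matching the YAML examples in the user docs above.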
@ -1260,7 +1270,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
effectiveRunAsUser,
effectiveRunAsGroup,
effectiveFSGroup,
nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity),
c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity),
spec.SchedulerName,
int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
c.OpConfig.PodServiceAccountName,
@ -2010,7 +2020,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) {
nil,
nil,
nil,
nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
nil,
int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
c.OpConfig.PodServiceAccountName,

View File

@ -110,6 +110,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur
result.DeleteAnnotationDateKey = fromCRD.Kubernetes.DeleteAnnotationDateKey
result.DeleteAnnotationNameKey = fromCRD.Kubernetes.DeleteAnnotationNameKey
result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel
result.NodeReadinessLabelMerge = fromCRD.Kubernetes.NodeReadinessLabelMerge
result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName
result.PodManagementPolicy = util.Coalesce(fromCRD.Kubernetes.PodManagementPolicy, "ordered_ready")
result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m")

View File

@ -55,6 +55,7 @@ type Resources struct {
PodEnvironmentConfigMap spec.NamespacedName `name:"pod_environment_configmap"`
PodEnvironmentSecret string `name:"pod_environment_secret"`
NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""`
NodeReadinessLabelMerge string `name:"node_readiness_label_merge" default:"OR"`
MaxInstances int32 `name:"max_instances" default:"-1"`
MinInstances int32 `name:"min_instances" default:"-1"`
ShmVolume *bool `name:"enable_shm_volume" default:"true"`