From 77252e316c97ca4c572485d2b0803503ddd675ea Mon Sep 17 00:00:00 2001 From: Pavel Tumik Date: Wed, 16 Dec 2020 05:56:28 -0800 Subject: [PATCH] Add node affinity support (#1166) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding nodeaffinity support alongside node_readiness_label * add documentation for node affinity * add node affinity e2e test * add unit test for node affinity Co-authored-by: Steffen Pøhner Henriksen Co-authored-by: Adrian Astley --- .../postgres-operator/crds/postgresqls.yaml | 91 +++++++++++++++ docs/user.md | 24 +++- e2e/tests/test_e2e.py | 106 ++++++++++++++++++ manifests/complete-postgres-manifest.yaml | 11 ++ manifests/postgresql.crd.yaml | 91 +++++++++++++++ pkg/apis/acid.zalan.do/v1/crds.go | 85 ++++++++++++++ pkg/apis/acid.zalan.do/v1/postgresql_type.go | 1 + .../acid.zalan.do/v1/zz_generated.deepcopy.go | 1 + pkg/cluster/k8sres.go | 46 +++++--- pkg/cluster/k8sres_test.go | 66 +++++++++++ 10 files changed, 505 insertions(+), 17 deletions(-) diff --git a/charts/postgres-operator/crds/postgresqls.yaml b/charts/postgres-operator/crds/postgresqls.yaml index 784bb2a76..13811936d 100644 --- a/charts/postgres-operator/crds/postgresqls.yaml +++ b/charts/postgres-operator/crds/postgresqls.yaml @@ -396,6 +396,97 @@ spec: type: string caSecretName: type: string + nodeAffinity: + type: object + properties: + preferredDuringSchedulingIgnoredDuringExecution: + type: array + items: + type: object + required: + - weight + - preference + properties: + preference: + type: object + properties: + matchExpressions: + type: array + items: + type: object + required: + - key + - operator + properties: + key: + type: string + operator: + type: string + values: + type: array + items: + type: string + matchFields: + type: array + items: + type: object + required: + - key + - operator + properties: + key: + type: string + operator: + type: string + values: + type: array + items: + type: string + weight: + format: int32 + type: integer + requiredDuringSchedulingIgnoredDuringExecution: + type: object + required: + - nodeSelectorTerms + properties: + nodeSelectorTerms: + type: array + items: + type: object + properties: + matchExpressions: + type: array + items: + type: object + required: + - key + - operator + properties: + key: + type: string + operator: + type: string + values: + type: array + items: + type: string + matchFields: + type: array + items: + type: object + required: + - key + - operator + properties: + key: + type: string + operator: + type: string + values: + type: array + items: + type: string tolerations: type: array items: diff --git a/docs/user.md b/docs/user.md index 8723a01e4..7552dca9d 100644 --- a/docs/user.md +++ b/docs/user.md @@ -517,7 +517,7 @@ manifest the operator will raise the limits to the configured minimum values. If no resources are defined in the manifest they will be obtained from the configured [default requests](reference/operator_parameters.md#kubernetes-resource-requests). 
-## Use taints and tolerations for dedicated PostgreSQL nodes
+## Use taints, tolerations and node affinity for dedicated PostgreSQL nodes
 
 To ensure Postgres pods are running on nodes without any other application pods,
 you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
@@ -531,6 +531,28 @@ spec:
     effect: NoSchedule
 ```
 
+If you need the pods to be scheduled on specific nodes, you may use [node affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/)
+to require that a prospective host node carries at least one of a given set of labels. This can be used to
+run clusters on nodes with certain hardware capabilities (e.g. SSD drives), or in certain environments or
+network segments, e.g. for PCI compliance.
+
+```yaml
+apiVersion: "acid.zalan.do/v1"
+kind: postgresql
+metadata:
+  name: acid-minimal-cluster
+spec:
+  teamId: "ACID"
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+        - matchExpressions:
+            - key: environment
+              operator: In
+              values:
+                - pci
+```
+
 ## How to clone an existing PostgreSQL cluster
 
 You can spin up a new cluster as a clone of the existing one, using a `clone`
diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py
index d396da01b..fa889047c 100644
--- a/e2e/tests/test_e2e.py
+++ b/e2e/tests/test_e2e.py
@@ -929,6 +929,112 @@ class EndToEndTestCase(unittest.TestCase):
         new_master_node = nm[0]
         self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
 
+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_node_affinity(self):
+        '''
+           Add label to a node and update postgres cluster spec to deploy only on a node with that label
+        '''
+        k8s = self.k8s
+        cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
+
+        # verify we are in good state from potential previous tests
+        self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
+        self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
+        self.eventuallyEqual(lambda: self.k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+        # get nodes of master and replica(s)
+        master_node, replica_nodes = k8s.get_pg_nodes(cluster_label)
+
+        self.assertNotEqual(master_node, [])
+        self.assertNotEqual(replica_nodes, [])
+
+        # body to label a node with node-affinity-test=postgres
+        node_label_body = {
+            "metadata": {
+                "labels": {
+                    "node-affinity-test": "postgres"
+                }
+            }
+        }
+
+        try:
+            # patch current master node with the label
+            print('patching master node: {}'.format(master_node))
+            k8s.api.core_v1.patch_node(master_node, node_label_body)
+
+            # add node affinity to cluster
+            patch_node_affinity_config = {
+                "spec": {
+                    "nodeAffinity": {
+                        "requiredDuringSchedulingIgnoredDuringExecution": {
+                            "nodeSelectorTerms": [
+                                {
+                                    "matchExpressions": [
+                                        {
+                                            "key": "node-affinity-test",
+                                            "operator": "In",
+                                            "values": [
+                                                "postgres"
+                                            ]
+                                        }
+                                    ]
+                                }
+                            ]
+                        }
+                    }
+                }
+            }
+
+            k8s.api.custom_objects_api.patch_namespaced_custom_object(
+                group="acid.zalan.do",
+                version="v1",
+                namespace="default",
+                plural="postgresqls",
+                name="acid-minimal-cluster",
+                body=patch_node_affinity_config)
+            self.eventuallyEqual(lambda: self.k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+            # the new affinity requirement should force the replica to relocate to the labeled master node
+            k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' 
+ cluster_label)
+            k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+
+            podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
+            for pod in podsList.items:
+                if pod.metadata.labels.get('spilo-role') == 'replica':
+                    self.assertEqual(master_node, pod.spec.node_name,
+                        "Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name))
+
+                    # check that the pod carries the expected node affinity
+                    key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key
+                    value = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].values[0]
+                    self.assertEqual("node-affinity-test", key,
+                        "Sanity check: expected node selector key to be 'node-affinity-test' but got {}".format(key))
+                    self.assertEqual("postgres", value,
+                        "Sanity check: expected node selector value to be 'postgres' but got {}".format(value))
+
+            patch_node_remove_affinity_config = {
+                "spec": {
+                    "nodeAffinity": None
+                }
+            }
+            k8s.api.custom_objects_api.patch_namespaced_custom_object(
+                group="acid.zalan.do",
+                version="v1",
+                namespace="default",
+                plural="postgresqls",
+                name="acid-minimal-cluster",
+                body=patch_node_remove_affinity_config)
+            self.eventuallyEqual(lambda: self.k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+            # with the affinity requirement removed, the replica should be able to move off the master node again
+            nm, new_replica_nodes = k8s.get_cluster_nodes()
+            new_master_node = nm[0]
+            self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
+
+        except timeout_decorator.TimeoutError:
+            print('Operator log: {}'.format(k8s.get_operator_log()))
+            raise
+
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_zzzz_cluster_deletion(self):
         '''
diff --git a/manifests/complete-postgres-manifest.yaml b/manifests/complete-postgres-manifest.yaml
index e6fb9a43c..2555ed450 100644
--- a/manifests/complete-postgres-manifest.yaml
+++ b/manifests/complete-postgres-manifest.yaml
@@ -172,3 +172,14 @@ spec:
 #   When TLS is enabled, also set spiloFSGroup parameter above to the relevant value.
 #   if unknown, set it to 103 which is the usual value in the default spilo images.
 #   In Openshift, there is no need to set spiloFSGroup/spilo_fsgroup.
+
+# Restrict scheduling of the Postgres pods to nodes that carry
+# the label "postgres-operator: enabled".
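+# The nodeAffinity field mirrors the upstream Kubernetes NodeAffinity API, so a soft
+# scheduling preference can be expressed as well. A sketch, assuming a hypothetical
+# "disktype" node label:
+# nodeAffinity:
+#   preferredDuringSchedulingIgnoredDuringExecution:
+#     - weight: 1
+#       preference:
+#         matchExpressions:
+#           - key: disktype
+#             operator: In
+#             values:
+#               - ssd
+# A hard requirement for the "postgres-operator: enabled" label looks like this: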
+# nodeAffinity:
+#   requiredDuringSchedulingIgnoredDuringExecution:
+#     nodeSelectorTerms:
+#       - matchExpressions:
+#           - key: postgres-operator
+#             operator: In
+#             values:
+#               - enabled
diff --git a/manifests/postgresql.crd.yaml b/manifests/postgresql.crd.yaml
index 7836d07e7..d5170e9d4 100644
--- a/manifests/postgresql.crd.yaml
+++ b/manifests/postgresql.crd.yaml
@@ -392,6 +392,97 @@ spec:
                     type: string
                   caSecretName:
                     type: string
+              nodeAffinity:
+                type: object
+                properties:
+                  preferredDuringSchedulingIgnoredDuringExecution:
+                    type: array
+                    items:
+                      type: object
+                      required:
+                        - weight
+                        - preference
+                      properties:
+                        preference:
+                          type: object
+                          properties:
+                            matchExpressions:
+                              type: array
+                              items:
+                                type: object
+                                required:
+                                  - key
+                                  - operator
+                                properties:
+                                  key:
+                                    type: string
+                                  operator:
+                                    type: string
+                                  values:
+                                    type: array
+                                    items:
+                                      type: string
+                            matchFields:
+                              type: array
+                              items:
+                                type: object
+                                required:
+                                  - key
+                                  - operator
+                                properties:
+                                  key:
+                                    type: string
+                                  operator:
+                                    type: string
+                                  values:
+                                    type: array
+                                    items:
+                                      type: string
+                        weight:
+                          format: int32
+                          type: integer
+                  requiredDuringSchedulingIgnoredDuringExecution:
+                    type: object
+                    required:
+                      - nodeSelectorTerms
+                    properties:
+                      nodeSelectorTerms:
+                        type: array
+                        items:
+                          type: object
+                          properties:
+                            matchExpressions:
+                              type: array
+                              items:
+                                type: object
+                                required:
+                                  - key
+                                  - operator
+                                properties:
+                                  key:
+                                    type: string
+                                  operator:
+                                    type: string
+                                  values:
+                                    type: array
+                                    items:
+                                      type: string
+                            matchFields:
+                              type: array
+                              items:
+                                type: object
+                                required:
+                                  - key
+                                  - operator
+                                properties:
+                                  key:
+                                    type: string
+                                  operator:
+                                    type: string
+                                  values:
+                                    type: array
+                                    items:
+                                      type: string
               tolerations:
                 type: array
                 items:
diff --git a/pkg/apis/acid.zalan.do/v1/crds.go b/pkg/apis/acid.zalan.do/v1/crds.go
index 737346f5e..852a2f961 100644
--- a/pkg/apis/acid.zalan.do/v1/crds.go
+++ b/pkg/apis/acid.zalan.do/v1/crds.go
@@ -597,6 +597,91 @@ var PostgresCRDResourceValidation = apiextv1.CustomResourceValidation{
 							},
 						},
 					},
+					"nodeAffinity": {
+						Type: "object",
+						Properties: map[string]apiextv1.JSONSchemaProps{
+							"preferredDuringSchedulingIgnoredDuringExecution": {
+								Type: "array",
+								Items: &apiextv1.JSONSchemaPropsOrArray{
+									Schema: &apiextv1.JSONSchemaProps{
+										Type:     "object",
+										Required: []string{"preference", "weight"},
+										Properties: map[string]apiextv1.JSONSchemaProps{
+											"preference": {
+												Type: "object",
+												Properties: map[string]apiextv1.JSONSchemaProps{
+													"matchExpressions": {
+														Type: "array",
+														Items: &apiextv1.JSONSchemaPropsOrArray{
+															Schema: &apiextv1.JSONSchemaProps{
+																Type: "object",
+																AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
+																	Allows: true,
+																},
+															},
+														},
+													},
+													"matchFields": {
+														Type: "array",
+														Items: &apiextv1.JSONSchemaPropsOrArray{
+															Schema: &apiextv1.JSONSchemaProps{
+																Type: "object",
+																AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
+																	Allows: true,
+																},
+															},
+														},
+													},
+												},
+											},
+											"weight": {
+												Type:   "integer",
+												Format: "int32",
+											},
+										},
+									},
+								},
+							},
+							"requiredDuringSchedulingIgnoredDuringExecution": {
+								Type:     "object",
+								Required: []string{"nodeSelectorTerms"},
+								Properties: map[string]apiextv1.JSONSchemaProps{
+									"nodeSelectorTerms": {
+										Type: "array",
+										Items: &apiextv1.JSONSchemaPropsOrArray{
+											Schema: &apiextv1.JSONSchemaProps{
+												Type: "object",
+												Properties: map[string]apiextv1.JSONSchemaProps{
+													"matchExpressions": {
+														Type: "array",
+														Items: &apiextv1.JSONSchemaPropsOrArray{
+															Schema: &apiextv1.JSONSchemaProps{
+																Type: "object",
+																AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
+ Allows: true, + }, + }, + }, + }, + "matchFields": { + Type: "array", + Items: &apiextv1.JSONSchemaPropsOrArray{ + Schema: &apiextv1.JSONSchemaProps{ + Type: "object", + AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{ + Allows: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, "tolerations": { Type: "array", Items: &apiextv1.JSONSchemaPropsOrArray{ diff --git a/pkg/apis/acid.zalan.do/v1/postgresql_type.go b/pkg/apis/acid.zalan.do/v1/postgresql_type.go index 74a9057a5..0c87f96f8 100644 --- a/pkg/apis/acid.zalan.do/v1/postgresql_type.go +++ b/pkg/apis/acid.zalan.do/v1/postgresql_type.go @@ -61,6 +61,7 @@ type PostgresSpec struct { Databases map[string]string `json:"databases,omitempty"` PreparedDatabases map[string]PreparedDatabase `json:"preparedDatabases,omitempty"` SchedulerName *string `json:"schedulerName,omitempty"` + NodeAffinity v1.NodeAffinity `json:"nodeAffinity,omitempty"` Tolerations []v1.Toleration `json:"tolerations,omitempty"` Sidecars []Sidecar `json:"sidecars,omitempty"` InitContainers []v1.Container `json:"initContainers,omitempty"` diff --git a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go index f04a29490..83cd18c40 100644 --- a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go +++ b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go @@ -633,6 +633,7 @@ func (in *PostgresSpec) DeepCopyInto(out *PostgresSpec) { *out = new(string) **out = **in } + in.NodeAffinity.DeepCopyInto(&out.NodeAffinity) if in.Tolerations != nil { in, out := &in.Tolerations, &out.Tolerations *out = make([]corev1.Toleration, len(*in)) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 9e5ed7050..1650d38d3 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -320,25 +320,39 @@ func getLocalAndBoostrapPostgreSQLParameters(parameters map[string]string) (loca return } -func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { - matchExpressions := make([]v1.NodeSelectorRequirement, 0) - if len(nodeReadinessLabel) == 0 { +func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity { + if len(nodeReadinessLabel) == 0 && nodeAffinity == nil { return nil } - for k, v := range nodeReadinessLabel { - matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{ - Key: k, - Operator: v1.NodeSelectorOpIn, - Values: []string{v}, - }) + nodeAffinityCopy := *&v1.NodeAffinity{} + if nodeAffinity != nil { + nodeAffinityCopy = *nodeAffinity.DeepCopy() + } + if len(nodeReadinessLabel) > 0 { + matchExpressions := make([]v1.NodeSelectorRequirement, 0) + for k, v := range nodeReadinessLabel { + matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{ + Key: k, + Operator: v1.NodeSelectorOpIn, + Values: []string{v}, + }) + } + nodeReadinessSelectorTerm := v1.NodeSelectorTerm{MatchExpressions: matchExpressions} + if nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution == nil { + nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + nodeReadinessSelectorTerm, + }, + } + } else { + nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ + NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm), + } + } } return &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: 
[]v1.NodeSelectorTerm{{MatchExpressions: matchExpressions}},
-			},
-		},
+		NodeAffinity: &nodeAffinityCopy,
 	}
 }
 
@@ -1209,7 +1223,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
 		effectiveRunAsUser,
 		effectiveRunAsGroup,
 		effectiveFSGroup,
-		nodeAffinity(c.OpConfig.NodeReadinessLabel),
+		nodeAffinity(c.OpConfig.NodeReadinessLabel, &spec.NodeAffinity),
 		spec.SchedulerName,
 		int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
 		c.OpConfig.PodServiceAccountName,
@@ -1914,7 +1928,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) {
 		nil,
 		nil,
 		nil,
-		nodeAffinity(c.OpConfig.NodeReadinessLabel),
+		nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
 		nil,
 		int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
 		c.OpConfig.PodServiceAccountName,
diff --git a/pkg/cluster/k8sres_test.go b/pkg/cluster/k8sres_test.go
index 2f2b353ab..8a5103cbe 100644
--- a/pkg/cluster/k8sres_test.go
+++ b/pkg/cluster/k8sres_test.go
@@ -864,6 +864,72 @@ func testEnvs(cluster *Cluster, podSpec *v1.PodTemplateSpec, role PostgresRole)
 	return nil
 }
 
+func TestNodeAffinity(t *testing.T) {
+	var err error
+	var spec acidv1.PostgresSpec
+	var cluster *Cluster
+	var spiloRunAsUser = int64(101)
+	var spiloRunAsGroup = int64(103)
+	var spiloFSGroup = int64(103)
+
+	makeSpec := func(nodeAffinity *v1.NodeAffinity) acidv1.PostgresSpec {
+		return acidv1.PostgresSpec{
+			TeamID: "myapp", NumberOfInstances: 1,
+			Resources: acidv1.Resources{
+				ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
+				ResourceLimits:   acidv1.ResourceDescription{CPU: "1", Memory: "10"},
+			},
+			Volume: acidv1.Volume{
+				Size: "1G",
+			},
+			NodeAffinity: *nodeAffinity,
+		}
+	}
+
+	cluster = New(
+		Config{
+			OpConfig: config.Config{
+				PodManagementPolicy: "ordered_ready",
+				ProtectedRoles:      []string{"admin"},
+				Auth: config.Auth{
+					SuperUsername:       superUserName,
+					ReplicationUsername: replicationUserName,
+				},
+				Resources: config.Resources{
+					SpiloRunAsUser:  &spiloRunAsUser,
+					SpiloRunAsGroup: &spiloRunAsGroup,
+					SpiloFSGroup:    &spiloFSGroup,
+				},
+			},
+		}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)
+
+	nodeAff := &v1.NodeAffinity{
+		RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+			NodeSelectorTerms: []v1.NodeSelectorTerm{
+				v1.NodeSelectorTerm{
+					MatchExpressions: []v1.NodeSelectorRequirement{
+						v1.NodeSelectorRequirement{
+							Key:      "test-label",
+							Operator: v1.NodeSelectorOpIn,
+							Values: []string{
+								"test-value",
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+	spec = makeSpec(nodeAff)
+	s, err := cluster.generateStatefulSet(&spec)
+	if err != nil {
+		t.Fatalf("could not generate statefulset: %v", err)
+	}
+
+	assert.NotNil(t, s.Spec.Template.Spec.Affinity.NodeAffinity, "node affinity in statefulset shouldn't be nil")
+	assert.Equal(t, nodeAff, s.Spec.Template.Spec.Affinity.NodeAffinity, "cluster template has correct node affinity")
+}
+
 func testCustomPodTemplate(cluster *Cluster, podSpec *v1.PodTemplateSpec) error {
 	if podSpec.ObjectMeta.Name != "test-pod-template" {
 		return fmt.Errorf("Custom pod template is not used, current spec %+v",