Add node affinity support (#1166)

* add node affinity support alongside node_readiness_label (see the sketch below)
* add documentation for node affinity
* add node affinity e2e test
* add unit test for node affinity
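
A minimal sketch of how the new field is used in a postgresql manifest (cluster metadata, sizes and the `disktype: ssd` label are illustrative only; the documentation and e2e changes below contain the committed examples):

```yaml
apiVersion: "acid.zalan.do/v1"
kind: postgresql
metadata:
  name: acid-minimal-cluster
spec:
  teamId: "acid"
  numberOfInstances: 2
  postgresql:
    version: "13"
  volume:
    size: 1Gi
  # New field: standard Kubernetes NodeAffinity, copied into the Spilo pod template.
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
      - matchExpressions:
        - key: disktype        # illustrative label key and value
          operator: In
          values:
          - ssd
```

Any `node_readiness_label` configured on the operator is still honored; its requirement is appended to the terms above (see the `nodeAffinity()` change in the k8sres.go hunk below).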

Co-authored-by: Steffen Pøhner Henriksen <str3sses@gmail.com>
Co-authored-by: Adrian Astley <adrian.astley@activision.com>
Pavel Tumik 2020-12-16 05:56:28 -08:00 committed by GitHub
parent f28706e940
commit 77252e316c
10 changed files with 505 additions and 17 deletions


@@ -396,6 +396,97 @@ spec:
                  type: string
                caSecretName:
                  type: string
nodeAffinity:
type: object
properties:
preferredDuringSchedulingIgnoredDuringExecution:
type: array
items:
type: object
required:
- weight
- preference
properties:
preference:
type: object
properties:
matchExpressions:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
matchFields:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
weight:
format: int32
type: integer
requiredDuringSchedulingIgnoredDuringExecution:
type: object
required:
- nodeSelectorTerms
properties:
nodeSelectorTerms:
type: array
items:
type: object
properties:
matchExpressions:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
matchFields:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
            tolerations:
              type: array
              items:


@@ -517,7 +517,7 @@ manifest the operator will raise the limits to the configured minimum values.
If no resources are defined in the manifest they will be obtained from the
configured [default requests](reference/operator_parameters.md#kubernetes-resource-requests).

## Use taints, tolerations and node affinity for dedicated PostgreSQL nodes

To ensure Postgres pods are running on nodes without any other application pods,
you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
@@ -531,6 +531,28 @@ spec:
    effect: NoSchedule
```

If you need the pods to be scheduled on specific nodes, you may use [node affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/)
to specify a set of label(s), of which a prospective host node must have at least one. This can be used,
for example, to run clusters on nodes with certain hardware capabilities (e.g. SSD drives), or to confine
them to certain environments or network segments, e.g. for PCI compliance.

```yaml
apiVersion: "acid.zalan.do/v1"
kind: postgresql
metadata:
name: acid-minimal-cluster
spec:
teamId: "ACID"
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: environment
operator: In
values:
- pci
```
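
If the operator is additionally configured with a `node_readiness_label` (for example
`lifecycle-status: ready`; the label here is only illustrative), this change appends that requirement as an
extra `nodeSelectorTerm`, so the affinity rendered into the pod template would look roughly like the
following sketch. Kubernetes ORs `nodeSelectorTerms`, so a node matching either term is eligible:

```yaml
# Sketch of the resulting pod affinity (illustrative label keys and values).
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
      - matchExpressions:        # from the cluster manifest's nodeAffinity
        - key: environment
          operator: In
          values:
          - pci
      - matchExpressions:        # appended for the configured node_readiness_label
        - key: lifecycle-status
          operator: In
          values:
          - ready
```
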
## How to clone an existing PostgreSQL cluster

You can spin up a new cluster as a clone of the existing one, using a `clone`


@@ -929,6 +929,112 @@ class EndToEndTestCase(unittest.TestCase):
        new_master_node = nm[0]
        self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_node_affinity(self):
'''
Add label to a node and update postgres cluster spec to deploy only on a node with that label
'''
k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-minimal-cluster'
# verify we are in good state from potential previous tests
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
self.eventuallyEqual(lambda: self.k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# get nodes of master and replica(s)
master_node, replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(master_node, [])
self.assertNotEqual(replica_nodes, [])
# label node with environment=postgres
node_label_body = {
"metadata": {
"labels": {
"node-affinity-test": "postgres"
}
}
}
try:
# patch current master node with the label
print('patching master node: {}'.format(master_node))
k8s.api.core_v1.patch_node(master_node, node_label_body)
# add node affinity to cluster
patch_node_affinity_config = {
"spec": {
"nodeAffinity" : {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "node-affinity-test",
"operator": "In",
"values": [
"postgres"
]
}
]
}
]
}
}
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
group="acid.zalan.do",
version="v1",
namespace="default",
plural="postgresqls",
name="acid-minimal-cluster",
body=patch_node_affinity_config)
self.eventuallyEqual(lambda: self.k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# node affinity change should cause replica to relocate from replica node to master node due to node affinity requirement
k8s.wait_for_pod_failover(master_node, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
podsList = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_label)
for pod in podsList.items:
if pod.metadata.labels.get('spilo-role') == 'replica':
self.assertEqual(master_node, pod.spec.node_name,
"Sanity check: expected replica to relocate to master node {}, but found on {}".format(master_node, pod.spec.node_name))
# check that pod has correct node affinity
key = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].key
value = pod.spec.affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms[0].match_expressions[0].values[0]
self.assertEqual("node-affinity-test", key,
"Sanity check: expect node selector key to be equal to 'node-affinity-test' but got {}".format(key))
self.assertEqual("postgres", value,
"Sanity check: expect node selector value to be equal to 'postgres' but got {}".format(value))
patch_node_remove_affinity_config = {
"spec": {
"nodeAffinity" : None
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
group="acid.zalan.do",
version="v1",
namespace="default",
plural="postgresqls",
name="acid-minimal-cluster",
body=patch_node_remove_affinity_config)
self.eventuallyEqual(lambda: self.k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# remove node affinity to move replica away from master node
nm, new_replica_nodes = k8s.get_cluster_nodes()
new_master_node = nm[0]
self.assert_distributed_pods(new_master_node, new_replica_nodes, cluster_label)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_zzzz_cluster_deletion(self):
        '''


@@ -172,3 +172,14 @@ spec:
# When TLS is enabled, also set spiloFSGroup parameter above to the relevant value.
# if unknown, set it to 103 which is the usual value in the default spilo images.
# In Openshift, there is no need to set spiloFSGroup/spilo_fsgroup.
# Add node affinity support: schedule Postgres pods only on nodes that
# have the label "postgres-operator: enabled" set.
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: postgres-operator
# operator: In
# values:
# - enabled


@@ -392,6 +392,97 @@ spec:
                  type: string
                caSecretName:
                  type: string
nodeAffinity:
type: object
properties:
preferredDuringSchedulingIgnoredDuringExecution:
type: array
items:
type: object
required:
- weight
- preference
properties:
preference:
type: object
properties:
matchExpressions:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
matchFields:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
weight:
format: int32
type: integer
requiredDuringSchedulingIgnoredDuringExecution:
type: object
required:
- nodeSelectorTerms
properties:
nodeSelectorTerms:
type: array
items:
type: object
properties:
matchExpressions:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
matchFields:
type: array
items:
type: object
required:
- key
- operator
properties:
key:
type: string
operator:
type: string
values:
type: array
items:
type: string
tolerations: tolerations:
type: array type: array
items: items:


@@ -597,6 +597,91 @@ var PostgresCRDResourceValidation = apiextv1.CustomResourceValidation{
							},
						},
					},
"nodeAffinity": {
Type: "object",
Properties: map[string]apiextv1.JSONSchemaProps{
"preferredDuringSchedulingIgnoredDuringExecution": {
Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
Required: []string{"preference, weight"},
Properties: map[string]apiextv1.JSONSchemaProps{
"preference": {
Type: "object",
Properties: map[string]apiextv1.JSONSchemaProps{
"matchExpressions": {
Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
Allows: true,
},
},
},
},
"matchFields": {
Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
Allows: true,
},
},
},
},
},
},
"weight": {
Type: "integer",
Format: "int32",
},
},
},
},
},
"requiredDuringSchedulingIgnoredDuringExecution": {
Type: "object",
Required: []string{"nodeSelectorTerms"},
Properties: map[string]apiextv1.JSONSchemaProps{
"nodeSelectorTerms": {
Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
Properties: map[string]apiextv1.JSONSchemaProps{
"matchExpressions": {
Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
Allows: true,
},
},
},
},
"matchFields": {
Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
AdditionalProperties: &apiextv1.JSONSchemaPropsOrBool{
Allows: true,
},
},
},
},
},
},
},
},
},
},
},
},
"tolerations": { "tolerations": {
Type: "array", Type: "array",
Items: &apiextv1.JSONSchemaPropsOrArray{ Items: &apiextv1.JSONSchemaPropsOrArray{


@@ -61,6 +61,7 @@ type PostgresSpec struct {
	Databases map[string]string `json:"databases,omitempty"`
	PreparedDatabases map[string]PreparedDatabase `json:"preparedDatabases,omitempty"`
	SchedulerName *string `json:"schedulerName,omitempty"`
NodeAffinity v1.NodeAffinity `json:"nodeAffinity,omitempty"`
	Tolerations []v1.Toleration `json:"tolerations,omitempty"`
	Sidecars []Sidecar `json:"sidecars,omitempty"`
	InitContainers []v1.Container `json:"initContainers,omitempty"`


@@ -633,6 +633,7 @@ func (in *PostgresSpec) DeepCopyInto(out *PostgresSpec) {
		*out = new(string)
		**out = **in
	}
in.NodeAffinity.DeepCopyInto(&out.NodeAffinity)
	if in.Tolerations != nil {
		in, out := &in.Tolerations, &out.Tolerations
		*out = make([]corev1.Toleration, len(*in))


@@ -320,25 +320,39 @@ func getLocalAndBoostrapPostgreSQLParameters(parameters map[string]string) (loca
	return
}

func nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinity *v1.NodeAffinity) *v1.Affinity {
	if len(nodeReadinessLabel) == 0 && nodeAffinity == nil {
		return nil
	}
	nodeAffinityCopy := *&v1.NodeAffinity{}
	if nodeAffinity != nil {
		nodeAffinityCopy = *nodeAffinity.DeepCopy()
	}
	if len(nodeReadinessLabel) > 0 {
		matchExpressions := make([]v1.NodeSelectorRequirement, 0)
		for k, v := range nodeReadinessLabel {
			matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{
				Key:      k,
				Operator: v1.NodeSelectorOpIn,
				Values:   []string{v},
			})
		}
		nodeReadinessSelectorTerm := v1.NodeSelectorTerm{MatchExpressions: matchExpressions}
		if nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution == nil {
			nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
				NodeSelectorTerms: []v1.NodeSelectorTerm{
					nodeReadinessSelectorTerm,
				},
			}
		} else {
			nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{
				NodeSelectorTerms: append(nodeAffinityCopy.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, nodeReadinessSelectorTerm),
			}
		}
	}

	return &v1.Affinity{
		NodeAffinity: &nodeAffinityCopy,
	}
}
@@ -1209,7 +1223,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
		effectiveRunAsUser,
		effectiveRunAsGroup,
		effectiveFSGroup,
		nodeAffinity(c.OpConfig.NodeReadinessLabel, &spec.NodeAffinity),
		spec.SchedulerName,
		int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
		c.OpConfig.PodServiceAccountName,
@@ -1914,7 +1928,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1beta1.CronJob, error) {
		nil,
		nil,
		nil,
		nodeAffinity(c.OpConfig.NodeReadinessLabel, nil),
		nil,
		int64(c.OpConfig.PodTerminateGracePeriod.Seconds()),
		c.OpConfig.PodServiceAccountName,


@@ -864,6 +864,72 @@ func testEnvs(cluster *Cluster, podSpec *v1.PodTemplateSpec, role PostgresRole)
	return nil
}
func TestNodeAffinity(t *testing.T) {
var err error
var spec acidv1.PostgresSpec
var cluster *Cluster
var spiloRunAsUser = int64(101)
var spiloRunAsGroup = int64(103)
var spiloFSGroup = int64(103)
makeSpec := func(nodeAffinity *v1.NodeAffinity) acidv1.PostgresSpec {
return acidv1.PostgresSpec{
TeamID: "myapp", NumberOfInstances: 1,
Resources: acidv1.Resources{
ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
ResourceLimits: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
},
Volume: acidv1.Volume{
Size: "1G",
},
NodeAffinity: *nodeAffinity,
}
}
cluster = New(
Config{
OpConfig: config.Config{
PodManagementPolicy: "ordered_ready",
ProtectedRoles: []string{"admin"},
Auth: config.Auth{
SuperUsername: superUserName,
ReplicationUsername: replicationUserName,
},
Resources: config.Resources{
SpiloRunAsUser: &spiloRunAsUser,
SpiloRunAsGroup: &spiloRunAsGroup,
SpiloFSGroup: &spiloFSGroup,
},
},
}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)
nodeAff := &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
v1.NodeSelectorTerm{
MatchExpressions: []v1.NodeSelectorRequirement{
v1.NodeSelectorRequirement{
Key: "test-label",
Operator: v1.NodeSelectorOpIn,
Values: []string{
"test-value",
},
},
},
},
},
},
}
spec = makeSpec(nodeAff)
s, err := cluster.generateStatefulSet(&spec)
if err != nil {
assert.NoError(t, err)
}
assert.NotNil(t, s.Spec.Template.Spec.Affinity.NodeAffinity, "node affinity in statefulset shouldn't be nil")
assert.Equal(t, s.Spec.Template.Spec.Affinity.NodeAffinity, nodeAff, "cluster template has correct node affinity")
}
func testCustomPodTemplate(cluster *Cluster, podSpec *v1.PodTemplateSpec) error {
	if podSpec.ObjectMeta.Name != "test-pod-template" {
		return fmt.Errorf("Custom pod template is not used, current spec %+v",