From 26a7fdfa9f4b6ff1acbba59108e3c1e159cf4d44 Mon Sep 17 00:00:00 2001 From: "teuto.net Netzdienste GmbH" Date: Thu, 21 Feb 2019 16:37:03 +0100 Subject: [PATCH] Add Pod Anti Affinity (#489) * Add Pod Anti Affinity --- docs/administrator.md | 30 +++++++++++++++++++ docs/reference/operator_parameters.md | 8 +++++ .../v1/operator_configuration_type.go | 2 ++ pkg/cluster/k8sres.go | 30 +++++++++++++++++-- pkg/controller/operator_config.go | 3 ++ pkg/util/config/config.go | 2 ++ 6 files changed, 73 insertions(+), 2 deletions(-) diff --git a/docs/administrator.md b/docs/administrator.md index be53eaf2d..208e9ddb9 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -151,6 +151,36 @@ Postgres pods by default receive tolerations for `unreachable` and `noExecute` t Depending on your setup, you may want to adjust these parameters to prevent master pods from being evicted by the Kubernetes runtime. To prevent eviction completely, specify the toleration by leaving out the `tolerationSeconds` value (similar to how Kubernetes' own DaemonSets are configured) +### Enable pod anti affinity + +To ensure Postgres pods are running on different topologies, you can use [pod anti affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) +and configure the required topology in the operator ConfigMap. + +Enable pod anti affinity by adding the following line to the operator ConfigMap: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-operator +data: + enable_pod_antiaffinity: "true" +``` + +By default the topology key for the pod anti affinity is set to `kubernetes.io/hostname`; +you can set another topology key, e.g. 
`failure-domain.beta.kubernetes.io/zone` by adding the following line +to the operator ConfigMap; see [built-in node labels](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) for available topology keys: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-operator +data: + enable_pod_antiaffinity: "true" + pod_antiaffinity_topology_key: "failure-domain.beta.kubernetes.io/zone" +``` + ### Add cluster-specific labels In some cases, you might want to add `labels` that are specific to a given diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 06a779c1e..69d903427 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -213,6 +213,14 @@ configuration they are grouped under the `kubernetes` key. that should be assigned to the Postgres pods. The priority class itself must be defined in advance. Default is empty (use the default priority class). +* **enable_pod_antiaffinity** + toggles [pod anti affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/) on the Postgres pods, to avoid multiple pods + of the same Postgres cluster in the same topology, e.g. node. The default is `false`. + +* **pod_antiaffinity_topology_key** + overrides the + [topology key](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) + for pod anti affinity. The default is `kubernetes.io/hostname`. 
## Kubernetes resource requests diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index 1b6939dfa..99d79b64b 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -60,6 +60,8 @@ type KubernetesMetaConfiguration struct { // TODO: use namespacedname PodEnvironmentConfigMap string `json:"pod_environment_configmap,omitempty"` PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` + EnablePodAntiAffinity bool `json:"enable_pod_antiaffinity" default:"false"` + PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` } // PostgresPodResourcesDefaults defines the spec of default resources diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index c0ae1648c..9a58f0516 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -290,6 +290,26 @@ func nodeAffinity(nodeReadinessLabel map[string]string) *v1.Affinity { } } +func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity) *v1.Affinity { + // generate pod anti-affinity to avoid multiple pods of the same Postgres cluster in the same topology , e.g. 
node + podAffinity := v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{ + LabelSelector: &metav1.LabelSelector{ + MatchLabels: labels, + }, + TopologyKey: topologyKey, + }}, + }, + } + + if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil { + podAffinity.NodeAffinity = nodeAffinity.NodeAffinity + } + + return &podAffinity +} + func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration { // allow to override tolerations by postgresql manifest if len(*tolerationsSpec) > 0 { @@ -419,6 +439,8 @@ func generatePodTemplate( kubeIAMRole string, priorityClassName string, shmVolume bool, + podAntiAffinity bool, + podAntiAffinityTopologyKey string, ) (*v1.PodTemplateSpec, error) { terminateGracePeriodSeconds := terminateGracePeriod @@ -437,7 +459,9 @@ func generatePodTemplate( addShmVolume(&podSpec) } - if nodeAffinity != nil { + if podAntiAffinity { + podSpec.Affinity = generatePodAffinity(labels, podAntiAffinityTopologyKey, nodeAffinity) + } else if nodeAffinity != nil { podSpec.Affinity = nodeAffinity } @@ -813,7 +837,9 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State c.OpConfig.PodServiceAccountName, c.OpConfig.KubeIAMRole, effectivePodPriorityClassName, - mountShmVolumeNeeded(c.OpConfig, spec)); err != nil { + mountShmVolumeNeeded(c.OpConfig, spec), + c.OpConfig.EnablePodAntiAffinity, + c.OpConfig.PodAntiAffinityTopologyKey); err != nil { return nil, fmt.Errorf("could not generate pod template: %v", err) } diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index 74549cbb8..08df7e97c 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -53,6 +53,9 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel result.PodPriorityClassName = 
fromCRD.Kubernetes.PodPriorityClassName + result.EnablePodAntiAffinity = fromCRD.Kubernetes.EnablePodAntiAffinity; + result.PodAntiAffinityTopologyKey = fromCRD.Kubernetes.PodAntiAffinityTopologyKey; + result.DefaultCPURequest = fromCRD.PostgresPodResources.DefaultCPURequest result.DefaultMemoryRequest = fromCRD.PostgresPodResources.DefaultMemoryRequest result.DefaultCPULimit = fromCRD.PostgresPodResources.DefaultCPULimit diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 371b7cb65..a82f4c17d 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -95,6 +95,8 @@ type Config struct { EnableMasterLoadBalancer bool `name:"enable_master_load_balancer" default:"true"` EnableReplicaLoadBalancer bool `name:"enable_replica_load_balancer" default:"false"` CustomServiceAnnotations map[string]string `name:"custom_service_annotations"` + EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"` + PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` // deprecated and kept for backward compatibility EnableLoadBalancer *bool `name:"enable_load_balancer"` MasterDNSNameFormat StringTemplate `name:"master_dns_name_format" default:"{cluster}.{team}.{hostedzone}"`