diff --git a/README.md b/README.md index 17ca5f8cc..dc7b60db5 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,27 @@ spec: Please be aware that the taint and toleration only ensures that no other pod gets scheduled to a PostgreSQL node but not that PostgreSQL pods are placed on such a node. This can be achieved by setting a node affinity rule in the ConfigMap. +### Using the operator to minimize the number of failovers during the cluster upgrade + +The Postgres operator moves master pods off Kubernetes nodes that are about to be decommissioned. The decommission status of the node is derived +from the presence of the set of labels defined by the `node_readiness_label` parameter. The operator makes sure that the Postgres +master pods are moved away from the node that is pending decommission, but not onto another node that is also +about to be shut down. It achieves that via a combination of several properties set on the Postgres pods: + +* [nodeAffinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity-beta-feature) is configured to avoid scheduling the pod on nodes without all labels from the `node_readiness_label` set. +* [PodDisruptionBudget](https://kubernetes.io/docs/concepts/workloads/pods/disruptions/#how-disruption-budgets-work) is defined to keep the master pods running until they are moved out by the operator. + +The operator starts moving master pods when the node is drained and doesn't have all labels from the `node_readiness_label` set. +By default this parameter is set to an empty string, disabling this feature altogether. It can be set to a string containing one +or more key:value parameters, e.g.: +``` +node_readiness_label: "lifecycle-status:ready,diagnostic-checks:ok" + +``` + +When multiple labels are set, the operator will require all of them to be present on a node (and set to the specified value) in order to consider +it ready. 
+ #### Custom Pod Environment Variables It is possible to configure a config map which is used by the Postgres pods as an additional provider for environment variables. diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index d996a473c..a5f9c33ff 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -39,5 +39,5 @@ data: pod_terminate_grace_period: 5m pdb_name_format: "postgres-{cluster}-pdb" node_eol_label: "lifecycle-status:pending-decommission" - node_readiness_label: "lifecycle-status:ready" + node_readiness_label: "" team_api_role_configuration: "log_statement:all" diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 203786875..1bc18422e 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -238,6 +238,9 @@ PatroniInitDBParams: func (c *Cluster) nodeAffinity() *v1.Affinity { matchExpressions := make([]v1.NodeSelectorRequirement, 0) + if len(c.OpConfig.NodeReadinessLabel) == 0 { + return nil + } for k, v := range c.OpConfig.NodeReadinessLabel { matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{ Key: k, @@ -431,10 +434,13 @@ func (c *Cluster) generatePodTemplate( ServiceAccountName: c.OpConfig.ServiceAccountName, TerminationGracePeriodSeconds: &terminateGracePeriodSeconds, Containers: []v1.Container{container}, - Affinity: c.nodeAffinity(), Tolerations: c.tolerations(tolerationsSpec), } + if affinity := c.nodeAffinity(); affinity != nil { + podSpec.Affinity = affinity + } + if c.OpConfig.ScalyrAPIKey != "" && c.OpConfig.ScalyrImage != "" { podSpec.Containers = append( podSpec.Containers, diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 8cec41417..e0a520b08 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -32,7 +32,7 @@ type Resources struct { DefaultCPULimit string `name:"default_cpu_limit" default:"3"` DefaultMemoryLimit string `name:"default_memory_limit" default:"1Gi"` PodEnvironmentConfigMap string `name:"pod_environment_configmap" 
default:""` - NodeReadinessLabel map[string]string `name:"node_readiness_label" default:"lifecycle-status:ready"` + NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""` MaxInstances int32 `name:"max_instances" default:"-1"` MinInstances int32 `name:"min_instances" default:"-1"` }