Merge fa6dadd691 into 1af4c50ed0

2025-10-21 14:43:16 +03:00 · 2025-10-21 14:43:16 +03:00 · fe0fbcee22
parent 1af4c50ed0 fa6dadd691
commit fe0fbcee22
11 changed files with 268 additions and 32 deletions
--- a/charts/postgres-operator/crds/postgresqls.yaml
+++ b/charts/postgres-operator/crds/postgresqls.yaml
@ -584,6 +584,23 @@ spec:
                        - PreferNoSchedule
                    tolerationSeconds:
                      type: integer
+              topologySpreadConstraints:
+                type: array
+                nullable: true
+                items:
+                  type: object
+                  properties:
+                    maxSkew:
+                      type: integer
+                      format: int32
+                      minimum: 1
+                    topologyKey:
+                      type: string
+                    whenUnsatisfiable:
+                      type: string
+                      enum:
+                        - DoNotSchedule
+                        - ScheduleAnyway
              useLoadBalancer:
                type: boolean
                description: deprecated
--- a/docs/user.md
+++ b/docs/user.md
@ -714,7 +714,7 @@ but Kubernetes will not spin up the pod if the requested HugePages cannot be all
 For more information on HugePages in Kubernetes, see also
 [https://kubernetes.io/docs/tasks/manage-hugepages/scheduling-hugepages/](https://kubernetes.io/docs/tasks/manage-hugepages/scheduling-hugepages/)

-## Use taints, tolerations and node affinity for dedicated PostgreSQL nodes
+## Use taints, tolerations, node affinity and topology spread constraint for dedicated PostgreSQL nodes

 To ensure Postgres pods are running on nodes without any other application pods,
 you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
@ -755,6 +755,23 @@ spec:
 If you need to define a `nodeAffinity` for all your Postgres clusters use the
 `node_readiness_label` [configuration](administrator.md#node-readiness-labels).

+If you need PostgreSQL Pods to run on separate nodes, you can use the
+[topologySpreadConstraints](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/) to control how they are distributed across your cluster.
+This ensures they are spread among failure domains such as
+regions, zones, nodes, or other user-defined topology domains.
+
+```yaml
+apiVersion: "acid.zalan.do/v1"
+kind: postgresql
+metadata:
+  name: acid-minimal-cluster
+spec:
+  topologySpreadConstraints:
+    - maxSkew: 1
+      topologyKey: topology.kubernetes.io/zone
+      whenUnsatisfiable: DoNotSchedule
+```
+
 ## In-place major version upgrade

 Starting with Spilo 13, operator supports in-place major version upgrade to a
--- a/e2e/tests/test_e2e.py
+++ b/e2e/tests/test_e2e.py
@ -1646,7 +1646,6 @@ class EndToEndTestCase(unittest.TestCase):
        # toggle pod anti affinity to move replica away from master node
        self.assert_distributed_pods(master_nodes)

-
    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_overwrite_pooler_deployment(self):
        pooler_name = 'acid-minimal-cluster-pooler'
@ -2385,6 +2384,78 @@ class EndToEndTestCase(unittest.TestCase):
        # toggle pod anti affinity to move replica away from master node
        self.assert_distributed_pods(master_nodes)

+    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
+    def test_topology_spread_constraints(self):
+        '''
+            Enable topologySpreadConstraints for pods
+        '''
+        k8s = self.k8s
+        cluster_labels = "application=spilo,cluster-name=acid-minimal-cluster"
+
+        # Verify we are in good state from potential previous tests
+        self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")
+
+        master_nodes, replica_nodes = k8s.get_cluster_nodes()
+        self.assertNotEqual(master_nodes, [])
+        self.assertNotEqual(replica_nodes, [])
+
+        # Patch label to nodes for topologySpreadConstraints
+        patch_node_label = {
+            "metadata": {
+                "labels": {
+                    "topology.kubernetes.io/zone": "zalando"
+                }
+            }
+        }
+        k8s.api.core_v1.patch_node(master_nodes[0], patch_node_label)
+        k8s.api.core_v1.patch_node(replica_nodes[0], patch_node_label)
+
+        # Patch topologySpreadConstraint and scale-out postgresql pods to postgresqls manifest.
+        patch_topologySpreadConstraint_config = {
+            "spec": {
+                "numberOfInstances": 6,
+                "topologySpreadConstraint": [
+                    {
+                        "maxskew": 1,
+                        "topologyKey": "topology.kubernetes.io/zone",
+                        "whenUnsatisfiable": "DoNotSchedule"
+                    }
+                ]
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default",
+            "postgresqls", "acid-minimal-cluster",
+            patch_topologySpreadConstraint_config)
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+        self.eventuallyEqual(lambda: k8s.count_pods_with_label(cluster_labels), 6, "Postgresql StatefulSet are scale to 6")
+        self.eventuallyEqual(lambda: k8s.count_running_pods(), 6, "All pods are running")
+
+        worker_node_1 = 0
+        worker_node_2 = 0
+        pods = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_labels)
+        for pod in pods.items:
+            if pod.spec.node_name == 'postgres-operator-e2e-tests-worker':
+                worker_node_1 += 1
+            elif pod.spec.node_name == 'postgres-operator-e2e-tests-worker2':
+                worker_node_2 += 1
+
+        self.assertEqual(worker_node_1, worker_node_2)
+        self.assertEqual(worker_node_1, 3)
+        self.assertEqual(worker_node_2, 3)
+
+        # Reset configurations
+        patch_topologySpreadConstraint_config = {
+            "spec": {
+                "numberOfInstances": 2,
+                "topologySpreadConstraint": []
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default",
+            "postgresqls", "acid-minimal-cluster",
+            patch_topologySpreadConstraint_config)
+
    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_zz_cluster_deletion(self):
        '''
@ -2460,7 +2531,7 @@ class EndToEndTestCase(unittest.TestCase):
            self.eventuallyEqual(lambda: k8s.count_deployments_with_label(cluster_label), 0, "Deployments not deleted")
            self.eventuallyEqual(lambda: k8s.count_pdbs_with_label(cluster_label), 0, "Pod disruption budget not deleted")
            self.eventuallyEqual(lambda: k8s.count_secrets_with_label(cluster_label), 8, "Secrets were deleted although disabled in config")
-            self.eventuallyEqual(lambda: k8s.count_pvcs_with_label(cluster_label), 3, "PVCs were deleted although disabled in config")
+            self.eventuallyEqual(lambda: k8s.count_pvcs_with_label(cluster_label), 6, "PVCs were deleted although disabled in config")

        except timeout_decorator.TimeoutError:
            print('Operator log: {}'.format(k8s.get_operator_log()))
--- a/manifests/complete-postgres-manifest.yaml
+++ b/manifests/complete-postgres-manifest.yaml
@ -232,6 +232,12 @@ spec:
 #              values:
 #                - enabled

+# Add topology spread constraints to distribute PostgreSQL pods across all nodes labeled with "topology.kubernetes.io/zone".
+#  topologySpreadConstraints:
+#  - maxSkew: 1
+#    topologyKey: topology.kubernetes.io/zone
+#    whenUnsatisfiable: DoNotSchedule
+
 # Enables change data capture streams for defined database tables
 #  streams:
 #  - applicationId: test-app
--- a/manifests/postgresql.crd.yaml
+++ b/manifests/postgresql.crd.yaml
@ -582,6 +582,22 @@ spec:
                        - PreferNoSchedule
                    tolerationSeconds:
                      type: integer
+              topologySpreadConstraints:
+                type: array
+                nullable: true
+                items:
+                  type: object
+                  properties:
+                    maxSkew:
+                      type: integer
+                      format: int32
+                    topologyKey:
+                      type: string
+                    whenUnsatisfiable:
+                      type: string
+                      enum:
+                        - DoNotSchedule
+                        - ScheduleAnyway
              useLoadBalancer:
                type: boolean
                description: deprecated
--- a/pkg/apis/acid.zalan.do/v1/crds.go
+++ b/pkg/apis/acid.zalan.do/v1/crds.go
@ -111,6 +111,7 @@ var OperatorConfigCRDResourceColumns = []apiextv1.CustomResourceColumnDefinition

 var min0 = 0.0
 var min1 = 1.0
+var minLength1 int64 = 1
 var minDisable = -1.0

 // PostgresCRDResourceValidation to check applied manifest parameters
@ -895,6 +896,34 @@ var PostgresCRDResourceValidation = apiextv1.CustomResourceValidation{
 							},
 						},
 					},
+					"topologySpreadConstraints": {
+						Type:     "array",
+						Nullable: true,
+						Items: &apiextv1.JSONSchemaPropsOrArray{
+							Schema: &apiextv1.JSONSchemaProps{
+								Type: "object",
+								Properties: map[string]apiextv1.JSONSchemaProps{
+									"maxSkew": {
+										Type:    "integer",
+										Format:  "int32",
+										Minimum: &min1,
+									},
+									"topologyKey": {
+										Type:      "string",
+										MinLength: &minLength1,
+									},
+									"whenUnsatisfiable": {
+										Type: "string",
+										Enum: []apiextv1.JSON{
+											{Raw: []byte(`"DoNotSchedule"`)},
+											{Raw: []byte(`"ScheduleAnyway"`)},
+										},
+									},
+								},
+								Required: []string{"maxSkew", "topologyKey", "whenUnsatisfiable"},
+							},
+						},
+					},
 					"useLoadBalancer": {
 						Type:        "boolean",
 						Description: "deprecated",
--- a/pkg/apis/acid.zalan.do/v1/postgresql_type.go
+++ b/pkg/apis/acid.zalan.do/v1/postgresql_type.go
@ -70,6 +70,7 @@ type PostgresSpec struct {
 	PreparedDatabases         map[string]PreparedDatabase   `json:"preparedDatabases,omitempty"`
 	SchedulerName             *string                       `json:"schedulerName,omitempty"`
 	NodeAffinity              *v1.NodeAffinity              `json:"nodeAffinity,omitempty"`
+	TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
 	Tolerations               []v1.Toleration               `json:"tolerations,omitempty"`
 	Sidecars                  []Sidecar                     `json:"sidecars,omitempty"`
 	InitContainers            []v1.Container                `json:"initContainers,omitempty"`
--- a/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go
+++ b/pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go
@ -789,6 +789,13 @@ func (in *PostgresSpec) DeepCopyInto(out *PostgresSpec) {
 		*out = new(corev1.NodeAffinity)
 		(*in).DeepCopyInto(*out)
 	}
+	if in.TopologySpreadConstraints != nil {
+		in, out := &in.TopologySpreadConstraints, &out.TopologySpreadConstraints
+		*out = make([]corev1.TopologySpreadConstraint, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
 	if in.Tolerations != nil {
 		in, out := &in.Tolerations, &out.Tolerations
 		*out = make([]corev1.Toleration, len(*in))
--- a/pkg/cluster/cluster.go
+++ b/pkg/cluster/cluster.go
@ -499,6 +499,11 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa
 		needsRollUpdate = true
 		reasons = append(reasons, "new statefulset's pod affinity does not match the current one")
 	}
+	if !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.TopologySpreadConstraints, statefulSet.Spec.Template.Spec.TopologySpreadConstraints) {
+		needsReplace = true
+		needsRollUpdate = true
+		reasons = append(reasons, "new statefulset's pod topologySpreadConstraints does not match the current one")
+	}
 	if len(c.Statefulset.Spec.Template.Spec.Tolerations) != len(statefulSet.Spec.Template.Spec.Tolerations) {
 		needsReplace = true
 		needsRollUpdate = true
--- a/pkg/cluster/k8sres.go
+++ b/pkg/cluster/k8sres.go
@ -604,6 +604,13 @@ func generatePodAntiAffinity(podAffinityTerm v1.PodAffinityTerm, preferredDuring
 	return podAntiAffinity
 }

+// generateTopologySpreadConstraints returns a copy of topologySpreadConstraints
+// in which every entry's LabelSelector matches the given pod labels, so that
+// the constraint counts the pods created from the generated template.
+//
+// The input slice is never modified: entries are deep-copied before the
+// selector is set, which keeps the caller's spec untouched. Empty input
+// (nil or zero length) yields nil so that "no constraints" always has the same
+// representation when pod specs are compared with reflect.DeepEqual.
+func generateTopologySpreadConstraints(labels labels.Set, topologySpreadConstraints []v1.TopologySpreadConstraint) []v1.TopologySpreadConstraint {
+	if len(topologySpreadConstraints) == 0 {
+		return nil
+	}
+
+	constraints := make([]v1.TopologySpreadConstraint, len(topologySpreadConstraints))
+	// Iterate by index: the value variable of a range clause is a copy, so
+	// assigning to it would silently drop the selector.
+	for i := range topologySpreadConstraints {
+		topologySpreadConstraints[i].DeepCopyInto(&constraints[i])
+		constraints[i].LabelSelector = &metav1.LabelSelector{MatchLabels: labels}
+	}
+	return constraints
+}
+
 func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration {
 	// allow to override tolerations by postgresql manifest
 	if len(*tolerationsSpec) > 0 {
@ -809,6 +816,7 @@ func (c *Cluster) generatePodTemplate(
 	initContainers []v1.Container,
 	sidecarContainers []v1.Container,
 	sharePgSocketWithSidecars *bool,
+	topologySpreadConstraintsSpec []v1.TopologySpreadConstraint,
 	tolerationsSpec *[]v1.Toleration,
 	spiloRunAsUser *int64,
 	spiloRunAsGroup *int64,
@ -878,6 +886,8 @@ func (c *Cluster) generatePodTemplate(
 		podSpec.PriorityClassName = priorityClassName
 	}

+	podSpec.TopologySpreadConstraints = generateTopologySpreadConstraints(labels, topologySpreadConstraintsSpec)
+
 	if sharePgSocketWithSidecars != nil && *sharePgSocketWithSidecars {
 		addVarRunVolume(&podSpec)
 	}
@ -1469,6 +1479,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
 		initContainers,
 		sidecarContainers,
 		c.OpConfig.SharePgSocketWithSidecars,
+		spec.TopologySpreadConstraints,
 		&tolerationSpec,
 		effectiveRunAsUser,
 		effectiveRunAsGroup,
@ -2348,6 +2359,8 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) {

 	tolerationsSpec := tolerations(&spec.Tolerations, c.OpConfig.PodToleration)

+	topologySpreadConstraintsSpec := generateTopologySpreadConstraints(labels, spec.TopologySpreadConstraints)
+
 	// re-use the method that generates DB pod templates
 	if podTemplate, err = c.generatePodTemplate(
 		c.Namespace,
@ -2357,6 +2370,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) {
 		[]v1.Container{},
 		[]v1.Container{},
 		util.False(),
+		topologySpreadConstraintsSpec,
 		&tolerationsSpec,
 		nil,
 		nil,
--- a/pkg/cluster/k8sres_test.go
+++ b/pkg/cluster/k8sres_test.go
@ -3984,3 +3984,56 @@ func TestGenerateCapabilities(t *testing.T) {
 		}
 	}
 }
+
+// TestTopologySpreadConstraints checks that topologySpreadConstraints from the
+// Postgres manifest end up in the pod template of the generated StatefulSet,
+// with a label selector matching the cluster's own pod labels.
+func TestTopologySpreadConstraints(t *testing.T) {
+	clusterName := "acid-test-cluster"
+	namespace := "default"
+
+	cluster := New(
+		Config{
+			OpConfig: config.Config{
+				PodManagementPolicy: "ordered_ready",
+			},
+		}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)
+	cluster.Name = clusterName
+	cluster.Namespace = namespace
+
+	// Build the selector from the cluster under test, after its name is set,
+	// so it reflects the labels this cluster reports for its pods.
+	labelSelector := &metav1.LabelSelector{
+		MatchLabels: cluster.labelsSet(true),
+	}
+
+	pg := acidv1.Postgresql{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      clusterName,
+			Namespace: namespace,
+		},
+		Spec: acidv1.PostgresSpec{
+			NumberOfInstances: 1,
+			Resources: &acidv1.Resources{
+				ResourceRequests: acidv1.ResourceDescription{CPU: k8sutil.StringToPointer("1"), Memory: k8sutil.StringToPointer("10")},
+				ResourceLimits:   acidv1.ResourceDescription{CPU: k8sutil.StringToPointer("1"), Memory: k8sutil.StringToPointer("10")},
+			},
+			Volume: acidv1.Volume{
+				Size: "1G",
+			},
+			TopologySpreadConstraints: []v1.TopologySpreadConstraint{
+				{
+					MaxSkew:           1,
+					TopologyKey:       "topology.kubernetes.io/zone",
+					WhenUnsatisfiable: v1.DoNotSchedule,
+					LabelSelector:     labelSelector,
+				},
+			},
+		},
+	}
+
+	s, err := cluster.generateStatefulSet(&pg.Spec)
+	// Stop here on error: s is not usable and dereferencing it would only
+	// hide the real failure behind a nil-pointer panic.
+	if !assert.NoError(t, err) {
+		return
+	}
+
+	expected := v1.TopologySpreadConstraint{
+		MaxSkew:           int32(1),
+		TopologyKey:       "topology.kubernetes.io/zone",
+		WhenUnsatisfiable: v1.DoNotSchedule,
+		LabelSelector:     labelSelector,
+	}
+	assert.Contains(t, s.Spec.Template.Spec.TopologySpreadConstraints, expected)
+}