From 819e4109597c879e0535ae6449814af85b17fe00 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Tue, 3 Jan 2023 11:34:02 +0100 Subject: [PATCH] refactor podAffinity generation (#2156) --- .../crds/operatorconfigurations.yaml | 6 +- charts/postgres-operator/values.yaml | 4 +- manifests/configmap.yaml | 1 + manifests/operatorconfiguration.crd.yaml | 3 + ...gresql-operator-default-configuration.yaml | 2 + pkg/apis/acid.zalan.do/v1/crds.go | 6 +- .../v1/operator_configuration_type.go | 4 +- pkg/cluster/connection_pooler.go | 3 +- pkg/cluster/k8sres.go | 81 +++++---- pkg/cluster/k8sres_test.go | 157 +++++++++--------- pkg/util/config/config.go | 2 +- 11 files changed, 151 insertions(+), 118 deletions(-) diff --git a/charts/postgres-operator/crds/operatorconfigurations.yaml b/charts/postgres-operator/crds/operatorconfigurations.yaml index 5c5da0474..f1589b92d 100644 --- a/charts/postgres-operator/crds/operatorconfigurations.yaml +++ b/charts/postgres-operator/crds/operatorconfigurations.yaml @@ -278,12 +278,12 @@ spec: pdb_name_format: type: string default: "postgres-{cluster}-pdb" - pod_antiaffinity_topology_key: - type: string - default: "kubernetes.io/hostname" pod_antiaffinity_preferred_during_scheduling: type: boolean default: false + pod_antiaffinity_topology_key: + type: string + default: "kubernetes.io/hostname" pod_environment_configmap: type: string pod_environment_secret: diff --git a/charts/postgres-operator/values.yaml b/charts/postgres-operator/values.yaml index 613ee50d5..20f961ca1 100644 --- a/charts/postgres-operator/values.yaml +++ b/charts/postgres-operator/values.yaml @@ -165,10 +165,10 @@ configKubernetes: # defines the template for PDB (Pod Disruption Budget) names pdb_name_format: "postgres-{cluster}-pdb" + # switches pod anti affinity type to `preferredDuringSchedulingIgnoredDuringExecution` + pod_antiaffinity_preferred_during_scheduling: false # override topology key for pod anti affinity pod_antiaffinity_topology_key: "kubernetes.io/hostname" - # 
switches pod anti affinity type to `preferredDuringSchedulingIgnoredDuringExecution` - # pod_antiaffinity_preferred_during_scheduling: true # namespaced name of the ConfigMap with environment variables to populate on every pod # pod_environment_configmap: "default/my-custom-config" # name of the Secret (in cluster namespace) with environment variables to populate on every pod diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index 5aebddb93..831a00341 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -109,6 +109,7 @@ data: # password_rotation_interval: "90" # password_rotation_user_retention: "180" pdb_name_format: "postgres-{cluster}-pdb" + # pod_antiaffinity_preferred_during_scheduling: "false" # pod_antiaffinity_topology_key: "kubernetes.io/hostname" pod_deletion_wait_timeout: 10m # pod_environment_configmap: "default/my-custom-config" diff --git a/manifests/operatorconfiguration.crd.yaml b/manifests/operatorconfiguration.crd.yaml index 58a27bc54..4e5ab803f 100644 --- a/manifests/operatorconfiguration.crd.yaml +++ b/manifests/operatorconfiguration.crd.yaml @@ -276,6 +276,9 @@ spec: pdb_name_format: type: string default: "postgres-{cluster}-pdb" + pod_antiaffinity_preferred_during_scheduling: + type: boolean + default: false pod_antiaffinity_topology_key: type: string default: "kubernetes.io/hostname" diff --git a/manifests/postgresql-operator-default-configuration.yaml b/manifests/postgresql-operator-default-configuration.yaml index aba77e848..4c7737d13 100644 --- a/manifests/postgresql-operator-default-configuration.yaml +++ b/manifests/postgresql-operator-default-configuration.yaml @@ -84,6 +84,7 @@ configuration: # node_readiness_label_merge: "OR" oauth_token_secret_name: postgresql-operator pdb_name_format: "postgres-{cluster}-pdb" + pod_antiaffinity_preferred_during_scheduling: false pod_antiaffinity_topology_key: "kubernetes.io/hostname" # pod_environment_configmap: "default/my-custom-config" # pod_environment_secret: 
"my-custom-secret" @@ -95,6 +96,7 @@ configuration: # pod_service_account_role_binding_definition: "" pod_terminate_grace_period: 5m secret_name_template: "{username}.{cluster}.credentials.{tprkind}.{tprgroup}" + share_pgsocket_with_sidecars: false spilo_allow_privilege_escalation: true # spilo_runasuser: 101 # spilo_runasgroup: 103 diff --git a/pkg/apis/acid.zalan.do/v1/crds.go b/pkg/apis/acid.zalan.do/v1/crds.go index 5fdb87f21..40a8b380e 100644 --- a/pkg/apis/acid.zalan.do/v1/crds.go +++ b/pkg/apis/acid.zalan.do/v1/crds.go @@ -1372,12 +1372,12 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{ "pdb_name_format": { Type: "string", }, - "pod_antiaffinity_topology_key": { - Type: "string", - }, "pod_antiaffinity_preferred_during_scheduling": { Type: "boolean", }, + "pod_antiaffinity_topology_key": { + Type: "string", + }, "pod_environment_configmap": { Type: "string", }, diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index 1279ba43b..29cddffa5 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -97,10 +97,10 @@ type KubernetesMetaConfiguration struct { PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` MasterPodMoveTimeout Duration `json:"master_pod_move_timeout,omitempty"` EnablePodAntiAffinity bool `json:"enable_pod_antiaffinity,omitempty"` - PodAntiAffinityTopologyKey string `json:"pod_antiaffinity_topology_key,omitempty"` PodAntiAffinityPreferredDuringScheduling bool `json:"pod_antiaffinity_preferred_during_scheduling,omitempty"` + PodAntiAffinityTopologyKey string `json:"pod_antiaffinity_topology_key,omitempty"` PodManagementPolicy string `json:"pod_management_policy,omitempty"` - EnableReadinessProbe bool `json:"enable_readiness_probe,omitempty"` + EnableReadinessProbe bool `json:"enable_readiness_probe,omitempty"` EnableCrossNamespaceSecret bool 
`json:"enable_cross_namespace_secret,omitempty"` } diff --git a/pkg/cluster/connection_pooler.go b/pkg/cluster/connection_pooler.go index fd8b9251c..93bfb8854 100644 --- a/pkg/cluster/connection_pooler.go +++ b/pkg/cluster/connection_pooler.go @@ -354,11 +354,12 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) ( nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity) if c.OpConfig.EnablePodAntiAffinity { labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels) - podTemplate.Spec.Affinity = generatePodAffinity( + podTemplate.Spec.Affinity = podAffinity( labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity, c.OpConfig.PodAntiAffinityPreferredDuringScheduling, + true, ) } else if nodeAffinity != nil { podTemplate.Spec.Affinity = nodeAffinity diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 250d3156b..902e6f873 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -495,8 +495,14 @@ func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinit } } -func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity, preferredDuringScheduling bool) *v1.Affinity { - // generate pod anti-affinity to avoid multiple pods of the same Postgres cluster in the same topology , e.g. 
node +func podAffinity( + labels labels.Set, + topologyKey string, + nodeAffinity *v1.Affinity, + preferredDuringScheduling bool, + anti bool) *v1.Affinity { + + var podAffinity v1.Affinity podAffinityTerm := v1.PodAffinityTerm{ LabelSelector: &metav1.LabelSelector{ @@ -505,17 +511,10 @@ func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1 TopologyKey: topologyKey, } - podAffinity := v1.Affinity{ - PodAntiAffinity: &v1.PodAntiAffinity{}, - } - - if preferredDuringScheduling { - podAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.WeightedPodAffinityTerm{{ - Weight: 1, - PodAffinityTerm: podAffinityTerm, - }} + if anti { + podAffinity.PodAntiAffinity = generatePodAntiAffinity(podAffinityTerm, preferredDuringScheduling) } else { - podAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []v1.PodAffinityTerm{podAffinityTerm} + podAffinity.PodAffinity = generatePodAffinity(podAffinityTerm, preferredDuringScheduling) } if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil { @@ -525,6 +524,36 @@ func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1 return &podAffinity } +func generatePodAffinity(podAffinityTerm v1.PodAffinityTerm, preferredDuringScheduling bool) *v1.PodAffinity { + podAffinity := &v1.PodAffinity{} + + if preferredDuringScheduling { + podAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.WeightedPodAffinityTerm{{ + Weight: 1, + PodAffinityTerm: podAffinityTerm, + }} + } else { + podAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []v1.PodAffinityTerm{podAffinityTerm} + } + + return podAffinity +} + +func generatePodAntiAffinity(podAffinityTerm v1.PodAffinityTerm, preferredDuringScheduling bool) *v1.PodAntiAffinity { + podAntiAffinity := &v1.PodAntiAffinity{} + + if preferredDuringScheduling { + podAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.WeightedPodAffinityTerm{{ + Weight: 1, + PodAffinityTerm: 
podAffinityTerm, + }} + } else { + podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []v1.PodAffinityTerm{podAffinityTerm} + } + + return podAntiAffinity +} + func tolerations(tolerationsSpec *[]v1.Toleration, podToleration map[string]string) []v1.Toleration { // allow to override tolerations by postgresql manifest if len(*tolerationsSpec) > 0 { @@ -778,11 +807,12 @@ func (c *Cluster) generatePodTemplate( } if podAntiAffinity { - podSpec.Affinity = generatePodAffinity( + podSpec.Affinity = podAffinity( labels, podAntiAffinityTopologyKey, nodeAffinity, podAntiAffinityPreferredDuringScheduling, + true, ) } else if nodeAffinity != nil { podSpec.Affinity = nodeAffinity @@ -2100,20 +2130,15 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) { c.OpConfig.ClusterNameLabel: c.Name, "application": "spilo-logical-backup", } - podAffinityTerm := v1.PodAffinityTerm{ - LabelSelector: &metav1.LabelSelector{ - MatchLabels: labels, - }, - TopologyKey: "kubernetes.io/hostname", - } - podAffinity := v1.Affinity{ - PodAffinity: &v1.PodAffinity{ - PreferredDuringSchedulingIgnoredDuringExecution: []v1.WeightedPodAffinityTerm{{ - Weight: 1, - PodAffinityTerm: podAffinityTerm, - }, - }, - }} + + nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil) + podAffinity := podAffinity( + labels, + "kubernetes.io/hostname", + nodeAffinity, + true, + false, + ) annotations := c.generatePodAnnotations(&c.Spec) @@ -2147,7 +2172,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) { } // overwrite specific params of logical backups pods - podTemplate.Spec.Affinity = &podAffinity + podTemplate.Spec.Affinity = podAffinity podTemplate.Spec.RestartPolicy = "Never" // affects containers within a pod // configure a batch job diff --git a/pkg/cluster/k8sres_test.go b/pkg/cluster/k8sres_test.go index 4f62d59be..bfdca7e59 100644 --- a/pkg/cluster/k8sres_test.go +++ b/pkg/cluster/k8sres_test.go @@ -1351,93 +1351,94 @@ func 
TestNodeAffinity(t *testing.T) { assert.Equal(t, s.Spec.Template.Spec.Affinity.NodeAffinity, nodeAff, "cluster template has correct node affinity") } -func TestPodAntiAffinityrRequiredDuringScheduling(t *testing.T) { - var err error - var spiloRunAsUser = int64(101) - var spiloRunAsGroup = int64(103) - var spiloFSGroup = int64(103) +func TestPodAffinity(t *testing.T) { + clusterName := "acid-test-cluster" + namespace := "default" - spec := acidv1.PostgresSpec{ - TeamID: "myapp", NumberOfInstances: 1, - Resources: &acidv1.Resources{ - ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"}, - ResourceLimits: acidv1.ResourceDescription{CPU: "1", Memory: "10"}, + tests := []struct { + subTest string + preferred bool + anti bool + }{ + { + subTest: "generate affinity RequiredDuringSchedulingIgnoredDuringExecution", + preferred: false, + anti: false, }, - Volume: acidv1.Volume{ - Size: "1G", + { + subTest: "generate affinity PreferredDuringSchedulingIgnoredDuringExecution", + preferred: true, + anti: false, + }, + { + subTest: "generate antiAffinity RequiredDuringSchedulingIgnoredDuringExecution", + preferred: false, + anti: true, + }, + { + subTest: "generate antiAffinity PreferredDuringSchedulingIgnoredDuringExecution", + preferred: true, + anti: true, }, } - cluster := New( - Config{ - OpConfig: config.Config{ - PodManagementPolicy: "ordered_ready", - ProtectedRoles: []string{"admin"}, - Auth: config.Auth{ - SuperUsername: superUserName, - ReplicationUsername: replicationUserName, - }, - Resources: config.Resources{ - SpiloRunAsUser: &spiloRunAsUser, - SpiloRunAsGroup: &spiloRunAsGroup, - SpiloFSGroup: &spiloFSGroup, - }, - EnablePodAntiAffinity: true, + pg := acidv1.Postgresql{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName, + Namespace: namespace, + }, + Spec: acidv1.PostgresSpec{ + NumberOfInstances: 1, + Resources: &acidv1.Resources{ + ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"}, + ResourceLimits: 
acidv1.ResourceDescription{CPU: "1", Memory: "10"}, }, - }, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder) - - s, err := cluster.generateStatefulSet(&spec) - if err != nil { - assert.NoError(t, err) - } - - assert.Nil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use preferredDuringScheduling") - assert.NotNil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use requiredDuringScheduling") -} - -func TestPodAntiAffinityPreferredDuringScheduling(t *testing.T) { - var err error - var spiloRunAsUser = int64(101) - var spiloRunAsGroup = int64(103) - var spiloFSGroup = int64(103) - - spec := acidv1.PostgresSpec{ - TeamID: "myapp", NumberOfInstances: 1, - Resources: &acidv1.Resources{ - ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"}, - ResourceLimits: acidv1.ResourceDescription{CPU: "1", Memory: "10"}, - }, - Volume: acidv1.Volume{ - Size: "1G", - }, - } - - cluster := New( - Config{ - OpConfig: config.Config{ - PodManagementPolicy: "ordered_ready", - ProtectedRoles: []string{"admin"}, - Auth: config.Auth{ - SuperUsername: superUserName, - ReplicationUsername: replicationUserName, - }, - Resources: config.Resources{ - SpiloRunAsUser: &spiloRunAsUser, - SpiloRunAsGroup: &spiloRunAsGroup, - SpiloFSGroup: &spiloFSGroup, - }, - EnablePodAntiAffinity: true, - PodAntiAffinityPreferredDuringScheduling: true, + Volume: acidv1.Volume{ + Size: "1G", }, - }, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder) - - s, err := cluster.generateStatefulSet(&spec) - if err != nil { - assert.NoError(t, err) + }, } - assert.NotNil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use preferredDuringScheduling") - assert.Nil(t, 
s.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use requiredDuringScheduling") + for _, tt := range tests { + cluster := New( + Config{ + OpConfig: config.Config{ + EnablePodAntiAffinity: tt.anti, + PodManagementPolicy: "ordered_ready", + ProtectedRoles: []string{"admin"}, + PodAntiAffinityPreferredDuringScheduling: tt.preferred, + Resources: config.Resources{ + ClusterLabels: map[string]string{"application": "spilo"}, + ClusterNameLabel: "cluster-name", + DefaultCPURequest: "300m", + DefaultCPULimit: "300m", + DefaultMemoryRequest: "300Mi", + DefaultMemoryLimit: "300Mi", + PodRoleLabel: "spilo-role", + }, + }, + }, k8sutil.KubernetesClient{}, pg, logger, eventRecorder) + + cluster.Name = clusterName + cluster.Namespace = namespace + + s, err := cluster.generateStatefulSet(&pg.Spec) + if err != nil { + assert.NoError(t, err) + } + + if !tt.anti { + assert.Nil(t, s.Spec.Template.Spec.Affinity, "pod affinity should not be set") + } else { + if tt.preferred { + assert.NotNil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use preferredDuringScheduling") + assert.Nil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use requiredDuringScheduling") + } else { + assert.Nil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use preferredDuringScheduling") + assert.NotNil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use requiredDuringScheduling") + } + } + } } func testDeploymentOwnerReference(cluster *Cluster, deployment *appsv1.Deployment) error { diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 7abb33056..80aa509e3 100644 --- a/pkg/util/config/config.go +++ 
b/pkg/util/config/config.go @@ -202,8 +202,8 @@ type Config struct { CustomServiceAnnotations map[string]string `name:"custom_service_annotations"` CustomPodAnnotations map[string]string `name:"custom_pod_annotations"` EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"` - PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` PodAntiAffinityPreferredDuringScheduling bool `name:"pod_antiaffinity_preferred_during_scheduling" default:"false"` + PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` StorageResizeMode string `name:"storage_resize_mode" default:"pvc"` EnableLoadBalancer *bool `name:"enable_load_balancer"` // deprecated and kept for backward compatibility ExternalTrafficPolicy string `name:"external_traffic_policy" default:"Cluster"`