add preferred during scheduling pod anti affinity (#2048)

* add preferred during scheduling pod anti affinity

Co-authored-by: Felix Kunde <felix-kunde@gmx.de>
This commit is contained in:
Francois Parquet 2023-01-02 18:22:47 +01:00 committed by GitHub
parent 93a253bde1
commit be7b52db92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 215 additions and 89 deletions

View File

@ -281,6 +281,9 @@ spec:
pod_antiaffinity_topology_key: pod_antiaffinity_topology_key:
type: string type: string
default: "kubernetes.io/hostname" default: "kubernetes.io/hostname"
pod_antiaffinity_preferred_during_scheduling:
type: boolean
default: false
pod_environment_configmap: pod_environment_configmap:
type: string type: string
pod_environment_secret: pod_environment_secret:

View File

@ -167,6 +167,8 @@ configKubernetes:
pdb_name_format: "postgres-{cluster}-pdb" pdb_name_format: "postgres-{cluster}-pdb"
# override topology key for pod anti affinity # override topology key for pod anti affinity
pod_antiaffinity_topology_key: "kubernetes.io/hostname" pod_antiaffinity_topology_key: "kubernetes.io/hostname"
# switches pod anti affinity type to `preferredDuringSchedulingIgnoredDuringExecution`
# pod_antiaffinity_preferred_during_scheduling: true
# namespaced name of the ConfigMap with environment variables to populate on every pod # namespaced name of the ConfigMap with environment variables to populate on every pod
# pod_environment_configmap: "default/my-custom-config" # pod_environment_configmap: "default/my-custom-config"
# name of the Secret (in cluster namespace) with environment variables to populate on every pod # name of the Secret (in cluster namespace) with environment variables to populate on every pod

View File

@ -516,6 +516,9 @@ configuration:
enable_pod_antiaffinity: true enable_pod_antiaffinity: true
``` ```
By default the type of pod anti affinity is `requiredDuringSchedulingIgnoredDuringExecution`,
you can switch to `preferredDuringSchedulingIgnoredDuringExecution` by setting `pod_antiaffinity_preferred_during_scheduling: true`.
By default the topology key for the pod anti affinity is set to By default the topology key for the pod anti affinity is set to
`kubernetes.io/hostname`, you can set another topology key e.g. `kubernetes.io/hostname`, you can set another topology key e.g.
`failure-domain.beta.kubernetes.io/zone`. See [built-in node labels](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) for available topology keys. `failure-domain.beta.kubernetes.io/zone`. See [built-in node labels](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) for available topology keys.

View File

@ -1378,6 +1378,9 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{
"pod_antiaffinity_topology_key": { "pod_antiaffinity_topology_key": {
Type: "string", Type: "string",
}, },
"pod_antiaffinity_preferred_during_scheduling": {
Type: "boolean",
},
"pod_environment_configmap": { "pod_environment_configmap": {
Type: "string", Type: "string",
}, },

View File

@ -98,6 +98,7 @@ type KubernetesMetaConfiguration struct {
MasterPodMoveTimeout Duration `json:"master_pod_move_timeout,omitempty"` MasterPodMoveTimeout Duration `json:"master_pod_move_timeout,omitempty"`
EnablePodAntiAffinity bool `json:"enable_pod_antiaffinity,omitempty"` EnablePodAntiAffinity bool `json:"enable_pod_antiaffinity,omitempty"`
PodAntiAffinityTopologyKey string `json:"pod_antiaffinity_topology_key,omitempty"` PodAntiAffinityTopologyKey string `json:"pod_antiaffinity_topology_key,omitempty"`
PodAntiAffinityPreferredDuringScheduling bool `json:"pod_antiaffinity_preferred_during_scheduling,omitempty"`
PodManagementPolicy string `json:"pod_management_policy,omitempty"` PodManagementPolicy string `json:"pod_management_policy,omitempty"`
EnableReadinessProbe bool `json:"enable_readiness_probe,omitempty"` EnableReadinessProbe bool `json:"enable_readiness_probe,omitempty"`
EnableCrossNamespaceSecret bool `json:"enable_cross_namespace_secret,omitempty"` EnableCrossNamespaceSecret bool `json:"enable_cross_namespace_secret,omitempty"`

View File

@ -354,7 +354,12 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) (
nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity) nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
if c.OpConfig.EnablePodAntiAffinity { if c.OpConfig.EnablePodAntiAffinity {
labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels) labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels)
podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity) podTemplate.Spec.Affinity = generatePodAffinity(
labelsSet,
c.OpConfig.PodAntiAffinityTopologyKey,
nodeAffinity,
c.OpConfig.PodAntiAffinityPreferredDuringScheduling,
)
} else if nodeAffinity != nil { } else if nodeAffinity != nil {
podTemplate.Spec.Affinity = nodeAffinity podTemplate.Spec.Affinity = nodeAffinity
} }

View File

@ -495,17 +495,27 @@ func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinit
} }
} }
func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity) *v1.Affinity { func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity, preferredDuringScheduling bool) *v1.Affinity {
// generate pod anti-affinity to avoid multiple pods of the same Postgres cluster in the same topology , e.g. node // generate pod anti-affinity to avoid multiple pods of the same Postgres cluster in the same topology , e.g. node
podAffinity := v1.Affinity{
PodAntiAffinity: &v1.PodAntiAffinity{ podAffinityTerm := v1.PodAffinityTerm{
RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{
LabelSelector: &metav1.LabelSelector{ LabelSelector: &metav1.LabelSelector{
MatchLabels: labels, MatchLabels: labels,
}, },
TopologyKey: topologyKey, TopologyKey: topologyKey,
}}, }
},
podAffinity := v1.Affinity{
PodAntiAffinity: &v1.PodAntiAffinity{},
}
if preferredDuringScheduling {
podAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.WeightedPodAffinityTerm{{
Weight: 1,
PodAffinityTerm: podAffinityTerm,
}}
} else {
podAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []v1.PodAffinityTerm{podAffinityTerm}
} }
if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil { if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil {
@ -727,6 +737,7 @@ func (c *Cluster) generatePodTemplate(
shmVolume *bool, shmVolume *bool,
podAntiAffinity bool, podAntiAffinity bool,
podAntiAffinityTopologyKey string, podAntiAffinityTopologyKey string,
podAntiAffinityPreferredDuringScheduling bool,
additionalSecretMount string, additionalSecretMount string,
additionalSecretMountPath string, additionalSecretMountPath string,
additionalVolumes []acidv1.AdditionalVolume, additionalVolumes []acidv1.AdditionalVolume,
@ -767,7 +778,12 @@ func (c *Cluster) generatePodTemplate(
} }
if podAntiAffinity { if podAntiAffinity {
podSpec.Affinity = generatePodAffinity(labels, podAntiAffinityTopologyKey, nodeAffinity) podSpec.Affinity = generatePodAffinity(
labels,
podAntiAffinityTopologyKey,
nodeAffinity,
podAntiAffinityPreferredDuringScheduling,
)
} else if nodeAffinity != nil { } else if nodeAffinity != nil {
podSpec.Affinity = nodeAffinity podSpec.Affinity = nodeAffinity
} }
@ -1376,6 +1392,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
mountShmVolumeNeeded(c.OpConfig, spec), mountShmVolumeNeeded(c.OpConfig, spec),
c.OpConfig.EnablePodAntiAffinity, c.OpConfig.EnablePodAntiAffinity,
c.OpConfig.PodAntiAffinityTopologyKey, c.OpConfig.PodAntiAffinityTopologyKey,
c.OpConfig.PodAntiAffinityPreferredDuringScheduling,
c.OpConfig.AdditionalSecretMount, c.OpConfig.AdditionalSecretMount,
c.OpConfig.AdditionalSecretMountPath, c.OpConfig.AdditionalSecretMountPath,
additionalVolumes) additionalVolumes)
@ -2122,6 +2139,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) {
util.False(), util.False(),
false, false,
"", "",
false,
c.OpConfig.AdditionalSecretMount, c.OpConfig.AdditionalSecretMount,
c.OpConfig.AdditionalSecretMountPath, c.OpConfig.AdditionalSecretMountPath,
[]acidv1.AdditionalVolume{}); err != nil { []acidv1.AdditionalVolume{}); err != nil {

View File

@ -1360,6 +1360,95 @@ func TestNodeAffinity(t *testing.T) {
assert.Equal(t, s.Spec.Template.Spec.Affinity.NodeAffinity, nodeAff, "cluster template has correct node affinity") assert.Equal(t, s.Spec.Template.Spec.Affinity.NodeAffinity, nodeAff, "cluster template has correct node affinity")
} }
func TestPodAntiAffinityrRequiredDuringScheduling(t *testing.T) {
var err error
var spiloRunAsUser = int64(101)
var spiloRunAsGroup = int64(103)
var spiloFSGroup = int64(103)
spec := acidv1.PostgresSpec{
TeamID: "myapp", NumberOfInstances: 1,
Resources: &acidv1.Resources{
ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
ResourceLimits: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
},
Volume: acidv1.Volume{
Size: "1G",
},
}
cluster := New(
Config{
OpConfig: config.Config{
PodManagementPolicy: "ordered_ready",
ProtectedRoles: []string{"admin"},
Auth: config.Auth{
SuperUsername: superUserName,
ReplicationUsername: replicationUserName,
},
Resources: config.Resources{
SpiloRunAsUser: &spiloRunAsUser,
SpiloRunAsGroup: &spiloRunAsGroup,
SpiloFSGroup: &spiloFSGroup,
},
EnablePodAntiAffinity: true,
},
}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)
s, err := cluster.generateStatefulSet(&spec)
if err != nil {
assert.NoError(t, err)
}
assert.Nil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use preferredDuringScheduling")
assert.NotNil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use requiredDuringScheduling")
}
func TestPodAntiAffinityPreferredDuringScheduling(t *testing.T) {
var err error
var spiloRunAsUser = int64(101)
var spiloRunAsGroup = int64(103)
var spiloFSGroup = int64(103)
spec := acidv1.PostgresSpec{
TeamID: "myapp", NumberOfInstances: 1,
Resources: &acidv1.Resources{
ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
ResourceLimits: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
},
Volume: acidv1.Volume{
Size: "1G",
},
}
cluster := New(
Config{
OpConfig: config.Config{
PodManagementPolicy: "ordered_ready",
ProtectedRoles: []string{"admin"},
Auth: config.Auth{
SuperUsername: superUserName,
ReplicationUsername: replicationUserName,
},
Resources: config.Resources{
SpiloRunAsUser: &spiloRunAsUser,
SpiloRunAsGroup: &spiloRunAsGroup,
SpiloFSGroup: &spiloFSGroup,
},
EnablePodAntiAffinity: true,
PodAntiAffinityPreferredDuringScheduling: true,
},
}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)
s, err := cluster.generateStatefulSet(&spec)
if err != nil {
assert.NoError(t, err)
}
assert.NotNil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use preferredDuringScheduling")
assert.Nil(t, s.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use requiredDuringScheduling")
}
func testDeploymentOwnerReference(cluster *Cluster, deployment *appsv1.Deployment) error { func testDeploymentOwnerReference(cluster *Cluster, deployment *appsv1.Deployment) error {
owner := deployment.ObjectMeta.OwnerReferences[0] owner := deployment.ObjectMeta.OwnerReferences[0]

View File

@ -123,6 +123,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur
result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m") result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m")
result.EnablePodAntiAffinity = fromCRD.Kubernetes.EnablePodAntiAffinity result.EnablePodAntiAffinity = fromCRD.Kubernetes.EnablePodAntiAffinity
result.PodAntiAffinityTopologyKey = util.Coalesce(fromCRD.Kubernetes.PodAntiAffinityTopologyKey, "kubernetes.io/hostname") result.PodAntiAffinityTopologyKey = util.Coalesce(fromCRD.Kubernetes.PodAntiAffinityTopologyKey, "kubernetes.io/hostname")
result.PodAntiAffinityPreferredDuringScheduling = fromCRD.Kubernetes.PodAntiAffinityPreferredDuringScheduling
result.PodToleration = fromCRD.Kubernetes.PodToleration result.PodToleration = fromCRD.Kubernetes.PodToleration
// Postgres Pod resources // Postgres Pod resources

View File

@ -203,6 +203,7 @@ type Config struct {
CustomPodAnnotations map[string]string `name:"custom_pod_annotations"` CustomPodAnnotations map[string]string `name:"custom_pod_annotations"`
EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"` EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"`
PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"` PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"`
PodAntiAffinityPreferredDuringScheduling bool `name:"pod_antiaffinity_preferred_during_scheduling" default:"false"`
StorageResizeMode string `name:"storage_resize_mode" default:"pvc"` StorageResizeMode string `name:"storage_resize_mode" default:"pvc"`
EnableLoadBalancer *bool `name:"enable_load_balancer"` // deprecated and kept for backward compatibility EnableLoadBalancer *bool `name:"enable_load_balancer"` // deprecated and kept for backward compatibility
ExternalTrafficPolicy string `name:"external_traffic_policy" default:"Cluster"` ExternalTrafficPolicy string `name:"external_traffic_policy" default:"Cluster"`
@ -231,7 +232,7 @@ type Config struct {
EnableTeamIdClusternamePrefix bool `name:"enable_team_id_clustername_prefix" default:"false"` EnableTeamIdClusternamePrefix bool `name:"enable_team_id_clustername_prefix" default:"false"`
MajorVersionUpgradeMode string `name:"major_version_upgrade_mode" default:"off"` MajorVersionUpgradeMode string `name:"major_version_upgrade_mode" default:"off"`
MajorVersionUpgradeTeamAllowList []string `name:"major_version_upgrade_team_allow_list" default:""` MajorVersionUpgradeTeamAllowList []string `name:"major_version_upgrade_team_allow_list" default:""`
MinimalMajorVersion string `name:"minimal_major_version" default:"11"` MinimalMajorVersion string `name:"minimal_major_version" default:"9.6"`
TargetMajorVersion string `name:"target_major_version" default:"14"` TargetMajorVersion string `name:"target_major_version" default:"14"`
PatroniAPICheckInterval time.Duration `name:"patroni_api_check_interval" default:"1s"` PatroniAPICheckInterval time.Duration `name:"patroni_api_check_interval" default:"1s"`
PatroniAPICheckTimeout time.Duration `name:"patroni_api_check_timeout" default:"5s"` PatroniAPICheckTimeout time.Duration `name:"patroni_api_check_timeout" default:"5s"`