add preferred during scheduling pod anti affinity (#2048)

* add preferred during scheduling pod anti affinity

Co-authored-by: Felix Kunde <felix-kunde@gmx.de>
This commit is contained in:
Francois Parquet 2023-01-02 18:22:47 +01:00 committed by GitHub
parent 93a253bde1
commit be7b52db92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 215 additions and 89 deletions

View File

@ -281,6 +281,9 @@ spec:
pod_antiaffinity_topology_key:
type: string
default: "kubernetes.io/hostname"
pod_antiaffinity_preferred_during_scheduling:
type: boolean
default: false
pod_environment_configmap:
type: string
pod_environment_secret:

View File

@ -167,6 +167,8 @@ configKubernetes:
pdb_name_format: "postgres-{cluster}-pdb"
# override topology key for pod anti affinity
pod_antiaffinity_topology_key: "kubernetes.io/hostname"
# switches pod anti affinity type to `preferredDuringSchedulingIgnoredDuringExecution`
# pod_antiaffinity_preferred_during_scheduling: true
# namespaced name of the ConfigMap with environment variables to populate on every pod
# pod_environment_configmap: "default/my-custom-config"
# name of the Secret (in cluster namespace) with environment variables to populate on every pod

View File

@ -516,6 +516,9 @@ configuration:
enable_pod_antiaffinity: true
```
By default the type of pod anti affinity is `requiredDuringSchedulingIgnoredDuringExecution`.
You can switch to `preferredDuringSchedulingIgnoredDuringExecution` by setting `pod_antiaffinity_preferred_during_scheduling: true`.
By default the topology key for the pod anti affinity is set to
`kubernetes.io/hostname`. You can set another topology key, e.g.
`failure-domain.beta.kubernetes.io/zone`. See [built-in node labels](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#interlude-built-in-node-labels) for available topology keys.

View File

@ -1378,6 +1378,9 @@ var OperatorConfigCRDResourceValidation = apiextv1.CustomResourceValidation{
"pod_antiaffinity_topology_key": {
Type: "string",
},
"pod_antiaffinity_preferred_during_scheduling": {
Type: "boolean",
},
"pod_environment_configmap": {
Type: "string",
},

View File

@ -98,6 +98,7 @@ type KubernetesMetaConfiguration struct {
MasterPodMoveTimeout Duration `json:"master_pod_move_timeout,omitempty"`
EnablePodAntiAffinity bool `json:"enable_pod_antiaffinity,omitempty"`
PodAntiAffinityTopologyKey string `json:"pod_antiaffinity_topology_key,omitempty"`
PodAntiAffinityPreferredDuringScheduling bool `json:"pod_antiaffinity_preferred_during_scheduling,omitempty"`
PodManagementPolicy string `json:"pod_management_policy,omitempty"`
EnableReadinessProbe bool `json:"enable_readiness_probe,omitempty"`
EnableCrossNamespaceSecret bool `json:"enable_cross_namespace_secret,omitempty"`

View File

@ -354,7 +354,12 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) (
nodeAffinity := c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity)
if c.OpConfig.EnablePodAntiAffinity {
labelsSet := labels.Set(c.connectionPoolerLabels(role, false).MatchLabels)
podTemplate.Spec.Affinity = generatePodAffinity(labelsSet, c.OpConfig.PodAntiAffinityTopologyKey, nodeAffinity)
podTemplate.Spec.Affinity = generatePodAffinity(
labelsSet,
c.OpConfig.PodAntiAffinityTopologyKey,
nodeAffinity,
c.OpConfig.PodAntiAffinityPreferredDuringScheduling,
)
} else if nodeAffinity != nil {
podTemplate.Spec.Affinity = nodeAffinity
}

View File

@ -495,17 +495,27 @@ func (c *Cluster) nodeAffinity(nodeReadinessLabel map[string]string, nodeAffinit
}
}
func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity) *v1.Affinity {
func generatePodAffinity(labels labels.Set, topologyKey string, nodeAffinity *v1.Affinity, preferredDuringScheduling bool) *v1.Affinity {
// generate pod anti-affinity to avoid multiple pods of the same Postgres cluster in the same topology, e.g. node
podAffinity := v1.Affinity{
PodAntiAffinity: &v1.PodAntiAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{
podAffinityTerm := v1.PodAffinityTerm{
LabelSelector: &metav1.LabelSelector{
MatchLabels: labels,
},
TopologyKey: topologyKey,
}},
},
}
podAffinity := v1.Affinity{
PodAntiAffinity: &v1.PodAntiAffinity{},
}
if preferredDuringScheduling {
podAffinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.WeightedPodAffinityTerm{{
Weight: 1,
PodAffinityTerm: podAffinityTerm,
}}
} else {
podAffinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = []v1.PodAffinityTerm{podAffinityTerm}
}
if nodeAffinity != nil && nodeAffinity.NodeAffinity != nil {
@ -727,6 +737,7 @@ func (c *Cluster) generatePodTemplate(
shmVolume *bool,
podAntiAffinity bool,
podAntiAffinityTopologyKey string,
podAntiAffinityPreferredDuringScheduling bool,
additionalSecretMount string,
additionalSecretMountPath string,
additionalVolumes []acidv1.AdditionalVolume,
@ -767,7 +778,12 @@ func (c *Cluster) generatePodTemplate(
}
if podAntiAffinity {
podSpec.Affinity = generatePodAffinity(labels, podAntiAffinityTopologyKey, nodeAffinity)
podSpec.Affinity = generatePodAffinity(
labels,
podAntiAffinityTopologyKey,
nodeAffinity,
podAntiAffinityPreferredDuringScheduling,
)
} else if nodeAffinity != nil {
podSpec.Affinity = nodeAffinity
}
@ -1376,6 +1392,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
mountShmVolumeNeeded(c.OpConfig, spec),
c.OpConfig.EnablePodAntiAffinity,
c.OpConfig.PodAntiAffinityTopologyKey,
c.OpConfig.PodAntiAffinityPreferredDuringScheduling,
c.OpConfig.AdditionalSecretMount,
c.OpConfig.AdditionalSecretMountPath,
additionalVolumes)
@ -2122,6 +2139,7 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) {
util.False(),
false,
"",
false,
c.OpConfig.AdditionalSecretMount,
c.OpConfig.AdditionalSecretMountPath,
[]acidv1.AdditionalVolume{}); err != nil {

View File

@ -1360,6 +1360,95 @@ func TestNodeAffinity(t *testing.T) {
assert.Equal(t, s.Spec.Template.Spec.Affinity.NodeAffinity, nodeAff, "cluster template has correct node affinity")
}
// TestPodAntiAffinityrRequiredDuringScheduling builds a statefulset for a
// cluster whose operator config enables pod anti-affinity but leaves
// PodAntiAffinityPreferredDuringScheduling at its zero value, and checks that
// the resulting pod template carries a requiredDuringScheduling anti-affinity
// term and no preferredDuringScheduling term.
//
// NOTE(review): the function name contains a stray "r" ("AffinityrRequired").
// It is kept unchanged here so that existing `go test -run` filters still match.
func TestPodAntiAffinityrRequiredDuringScheduling(t *testing.T) {
	spiloRunAsUser := int64(101)
	spiloRunAsGroup := int64(103)
	spiloFSGroup := int64(103)

	spec := acidv1.PostgresSpec{
		TeamID: "myapp", NumberOfInstances: 1,
		Resources: &acidv1.Resources{
			ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
			ResourceLimits:   acidv1.ResourceDescription{CPU: "1", Memory: "10"},
		},
		Volume: acidv1.Volume{
			Size: "1G",
		},
	}

	cluster := New(
		Config{
			OpConfig: config.Config{
				PodManagementPolicy: "ordered_ready",
				ProtectedRoles:      []string{"admin"},
				Auth: config.Auth{
					SuperUsername:       superUserName,
					ReplicationUsername: replicationUserName,
				},
				Resources: config.Resources{
					SpiloRunAsUser:  &spiloRunAsUser,
					SpiloRunAsGroup: &spiloRunAsGroup,
					SpiloFSGroup:    &spiloFSGroup,
				},
				EnablePodAntiAffinity: true,
			},
		}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)

	s, err := cluster.generateStatefulSet(&spec)
	if err != nil {
		// Stop here: continuing would dereference a statefulset that must not
		// be relied upon after an error and would panic instead of failing.
		t.Fatalf("could not generate statefulset: %v", err)
	}

	// Guard the pointer chain so a missing affinity is reported as a test
	// failure rather than a nil pointer panic.
	affinity := s.Spec.Template.Spec.Affinity
	if affinity == nil || affinity.PodAntiAffinity == nil {
		t.Fatal("statefulset pod template has no pod anti-affinity")
	}

	assert.Nil(t, affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use preferredDuringScheduling")
	assert.NotNil(t, affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use requiredDuringScheduling")
}
// TestPodAntiAffinityPreferredDuringScheduling builds a statefulset for a
// cluster whose operator config enables pod anti-affinity together with
// PodAntiAffinityPreferredDuringScheduling, and checks that the resulting pod
// template carries a preferredDuringScheduling anti-affinity term and no
// requiredDuringScheduling term.
func TestPodAntiAffinityPreferredDuringScheduling(t *testing.T) {
	spiloRunAsUser := int64(101)
	spiloRunAsGroup := int64(103)
	spiloFSGroup := int64(103)

	spec := acidv1.PostgresSpec{
		TeamID: "myapp", NumberOfInstances: 1,
		Resources: &acidv1.Resources{
			ResourceRequests: acidv1.ResourceDescription{CPU: "1", Memory: "10"},
			ResourceLimits:   acidv1.ResourceDescription{CPU: "1", Memory: "10"},
		},
		Volume: acidv1.Volume{
			Size: "1G",
		},
	}

	cluster := New(
		Config{
			OpConfig: config.Config{
				PodManagementPolicy: "ordered_ready",
				ProtectedRoles:      []string{"admin"},
				Auth: config.Auth{
					SuperUsername:       superUserName,
					ReplicationUsername: replicationUserName,
				},
				Resources: config.Resources{
					SpiloRunAsUser:  &spiloRunAsUser,
					SpiloRunAsGroup: &spiloRunAsGroup,
					SpiloFSGroup:    &spiloFSGroup,
				},
				EnablePodAntiAffinity:                    true,
				PodAntiAffinityPreferredDuringScheduling: true,
			},
		}, k8sutil.KubernetesClient{}, acidv1.Postgresql{}, logger, eventRecorder)

	s, err := cluster.generateStatefulSet(&spec)
	if err != nil {
		// Stop here: continuing would dereference a statefulset that must not
		// be relied upon after an error and would panic instead of failing.
		t.Fatalf("could not generate statefulset: %v", err)
	}

	// Guard the pointer chain so a missing affinity is reported as a test
	// failure rather than a nil pointer panic.
	affinity := s.Spec.Template.Spec.Affinity
	if affinity == nil || affinity.PodAntiAffinity == nil {
		t.Fatal("statefulset pod template has no pod anti-affinity")
	}

	assert.NotNil(t, affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should use preferredDuringScheduling")
	assert.Nil(t, affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution, "pod anti-affinity should not use requiredDuringScheduling")
}
func testDeploymentOwnerReference(cluster *Cluster, deployment *appsv1.Deployment) error {
owner := deployment.ObjectMeta.OwnerReferences[0]

View File

@ -123,6 +123,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur
result.MasterPodMoveTimeout = util.CoalesceDuration(time.Duration(fromCRD.Kubernetes.MasterPodMoveTimeout), "10m")
result.EnablePodAntiAffinity = fromCRD.Kubernetes.EnablePodAntiAffinity
result.PodAntiAffinityTopologyKey = util.Coalesce(fromCRD.Kubernetes.PodAntiAffinityTopologyKey, "kubernetes.io/hostname")
result.PodAntiAffinityPreferredDuringScheduling = fromCRD.Kubernetes.PodAntiAffinityPreferredDuringScheduling
result.PodToleration = fromCRD.Kubernetes.PodToleration
// Postgres Pod resources

View File

@ -203,6 +203,7 @@ type Config struct {
CustomPodAnnotations map[string]string `name:"custom_pod_annotations"`
EnablePodAntiAffinity bool `name:"enable_pod_antiaffinity" default:"false"`
PodAntiAffinityTopologyKey string `name:"pod_antiaffinity_topology_key" default:"kubernetes.io/hostname"`
PodAntiAffinityPreferredDuringScheduling bool `name:"pod_antiaffinity_preferred_during_scheduling" default:"false"`
StorageResizeMode string `name:"storage_resize_mode" default:"pvc"`
EnableLoadBalancer *bool `name:"enable_load_balancer"` // deprecated and kept for backward compatibility
ExternalTrafficPolicy string `name:"external_traffic_policy" default:"Cluster"`
@ -231,7 +232,7 @@ type Config struct {
EnableTeamIdClusternamePrefix bool `name:"enable_team_id_clustername_prefix" default:"false"`
MajorVersionUpgradeMode string `name:"major_version_upgrade_mode" default:"off"`
MajorVersionUpgradeTeamAllowList []string `name:"major_version_upgrade_team_allow_list" default:""`
MinimalMajorVersion string `name:"minimal_major_version" default:"11"`
MinimalMajorVersion string `name:"minimal_major_version" default:"9.6"`
TargetMajorVersion string `name:"target_major_version" default:"14"`
PatroniAPICheckInterval time.Duration `name:"patroni_api_check_interval" default:"1s"`
PatroniAPICheckTimeout time.Duration `name:"patroni_api_check_timeout" default:"5s"`