diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 39320fa76..8e2793023 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -205,6 +205,11 @@ configuration they are grouped under the `kubernetes` key. that should be assigned to the Postgres pods. The priority class itself must be defined in advance. Default is empty (use the default priority class). + * **master_pod_move_timeout** + The period of time to wait for Patroni switchovers from master pods on an unschedulable node + to their respective replicas on healthy nodes to succeed. If master pods still remain on the old node + after this timeout expires, the situation has to be fixed manually. The default is 10 minutes. + ## Kubernetes resource requests diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index 37b174755..f49cbe71d 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -46,6 +46,7 @@ data: pod_label_wait_timeout: 10m ready_wait_interval: 3s ready_wait_timeout: 30s + # master_pod_move_timeout: 10m replication_username: standby resource_check_interval: 3s resource_check_timeout: 10m diff --git a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go index f5aac03b6..d4654d9c8 100644 --- a/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go +++ b/pkg/apis/acid.zalan.do/v1/operator_configuration_type.go @@ -59,6 +59,7 @@ type KubernetesMetaConfiguration struct { // TODO: use namespacedname PodEnvironmentConfigMap string `json:"pod_environment_configmap,omitempty"` PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` + MasterPodMoveTimeout time.Duration `json:"master_pod_move_timeout,omitempty"` } // PostgresPodResourcesDefaults defines the spec of default resources diff --git a/pkg/controller/node.go b/pkg/controller/node.go index 96698f329..1700ca176 100644 --- a/pkg/controller/node.go +++ 
b/pkg/controller/node.go @@ -41,18 +41,21 @@ func (c *Controller) nodeAdd(obj interface{}) { } c.logger.Debugf("new node has been added: %q (%s)", util.NameFromMeta(node.ObjectMeta), node.Spec.ProviderID) + // check if the node became not ready while the operator was down (otherwise we would have caught it in nodeUpdate) if !c.nodeIsReady(node) { - err := retryutil.Retry(2 * time.Minute, 10 * time.Minute, + + err := retryutil.Retry(1 * time.Minute, c.opConfig.MasterPodMoveTimeout, func() (bool, error) { err := c.moveMasterPodsOffNode(node) if err != nil { - return false, fmt.Errorf(("Unable to move master pods off the unschedulable node. Will retry after delay")) + return false, fmt.Errorf("unable to move master pods off the unschedulable node; will retry after delay") } return true, nil } ) + if err != nil { - c.logger.Warning("Unable to move maser pods") + c.logger.Warningf("failed to move master pods from the node %q: timeout expired", node.Name) } } } @@ -174,7 +177,7 @@ func (c *Controller) moveMasterPodsOffNode(node *v1.Node) error { return fmt.Errorf("could not move master %d/%d pods from the %q node", leftPods, totalPods, nodeName) } - + return nil } diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index dae651b1d..e89a782c4 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -50,6 +50,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.ClusterNameLabel = fromCRD.Kubernetes.ClusterNameLabel result.NodeReadinessLabel = fromCRD.Kubernetes.NodeReadinessLabel result.PodPriorityClassName = fromCRD.Kubernetes.PodPriorityClassName + result.MasterPodMoveTimeout = fromCRD.Kubernetes.MasterPodMoveTimeout result.DefaultCPURequest = fromCRD.PostgresPodResources.DefaultCPURequest result.DefaultMemoryRequest = fromCRD.PostgresPodResources.DefaultMemoryRequest diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 31cda4b98..6ff217b6d 100644 
--- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -80,6 +80,7 @@ type Config struct { // value of this string must be valid JSON or YAML; see initPodServiceAccount PodServiceAccountDefinition string `name:"pod_service_account_definition" default:""` PodServiceAccountRoleBindingDefinition string `name:"pod_service_account_role_binding_definition" default:""` + MasterPodMoveTimeout time.Duration `name:"master_pod_move_timeout" default:"10m"` DbHostedZone string `name:"db_hosted_zone" default:"db.example.com"` AWSRegion string `name:"aws_region" default:"eu-central-1"` WALES3Bucket string `name:"wal_s3_bucket"`