diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py
index 194e05daa..04dea18d3 100644
--- a/e2e/tests/test_e2e.py
+++ b/e2e/tests/test_e2e.py
@@ -1066,8 +1066,8 @@ class EndToEndTestCase(unittest.TestCase):
         k8s = self.k8s
         leader = k8s.get_cluster_leader_pod()
         replica = k8s.get_cluster_replica_pod()
-        labels = 'application=spilo,cluster-name=acid-minimal-cluster,spilo-role=master'
-        creationTimestamp = leader.metadata.creation_timestamp
+        masterCreationTimestamp = leader.metadata.creation_timestamp
+        replicaCreationTimestamp = replica.metadata.creation_timestamp
         new_max_connections_value = "50"

         # adjust Postgres config
@@ -1116,8 +1116,18 @@ class EndToEndTestCase(unittest.TestCase):
                                  "synchronous_mode not updated")
                 return True

+            # check if Patroni config has been updated
             self.eventuallyTrue(compare_config, "Postgres config not applied")

+            # make sure that pods were not recreated
+            leader = k8s.get_cluster_leader_pod()
+            replica = k8s.get_cluster_replica_pod()
+            self.assertEqual(masterCreationTimestamp, leader.metadata.creation_timestamp,
+                             "Master pod creation timestamp is updated")
+            self.assertEqual(replicaCreationTimestamp, replica.metadata.creation_timestamp,
+                             "Replica pod creation timestamp is updated")
+
+            # query max_connections setting
             setting_query = """
                SELECT setting
                  FROM pg_settings
@@ -1128,11 +1138,16 @@ class EndToEndTestCase(unittest.TestCase):
             self.eventuallyNotEqual(lambda: self.query_database(replica.metadata.name, "postgres", setting_query)[0], new_max_connections_value,
                 "Expected max_connections not to be updated on replica since Postgres was restarted there first", 10, 5)

-            # make sure that pod wasn't recreated
-            self.assertEqual(creationTimestamp, leader.metadata.creation_timestamp,
-                             "Master pod creation timestamp is updated")
+            # the next sync should restart the replica because it has pending_restart flag set
+            # force next sync by deleting the operator pod
+            k8s.delete_operator_pod()
+            self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+            self.eventuallyEqual(lambda: self.query_database(replica.metadata.name, "postgres", setting_query)[0], new_max_connections_value,
+                "New max_connections setting not applied on replica", 10, 5)

             # decrease max_connections again
+            # this time restart will be correct and new value should appear on both instances
             lower_max_connections_value = "30"
             pg_patch_max_connections = {
                 "spec": {
@@ -1149,12 +1164,14 @@ class EndToEndTestCase(unittest.TestCase):
             self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"},
                                  "Operator does not get in sync")

+            # check Patroni config again
             pg_patch_config["spec"]["postgresql"]["parameters"]["max_connections"] = lower_max_connections_value
             self.eventuallyTrue(compare_config, "Postgres config not applied")

-            self.eventuallyEqual(lambda: self.query_database(leader.metadata.name, "postgres", setting_query)[0], new_max_connections_value,
+            # and query max_connections setting again
+            self.eventuallyEqual(lambda: self.query_database(leader.metadata.name, "postgres", setting_query)[0], lower_max_connections_value,
                 "Previous max_connections setting not applied on master", 10, 5)
-            self.eventuallyEqual(lambda: self.query_database(replica.metadata.name, "postgres", setting_query)[0], new_max_connections_value,
+            self.eventuallyEqual(lambda: self.query_database(replica.metadata.name, "postgres", setting_query)[0], lower_max_connections_value,
                 "Previous max_connections setting not applied on replica", 10, 5)

         except timeout_decorator.TimeoutError:
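The new assertions rely on Patroni's `pending_restart` member flag: right after the config patch the replica may still run with the old `max_connections` and is only restarted on the next operator sync, which the test forces with `delete_operator_pod()`. Below is a minimal, self-contained Go sketch of reading that flag, under the assumption that the pod's Patroni REST API is reachable on its default port 8008; this is illustrative only and not the e2e helper code.

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// patroniMember holds the only field of the GET /patroni response this sketch
// needs; Patroni reports pending_restart once a parameter change requires a
// server restart that has not happened yet.
type patroniMember struct {
	PendingRestart bool `json:"pending_restart"`
}

// pendingRestart queries a member's Patroni REST API and reports whether a
// restart is still pending.
func pendingRestart(podIP string) (bool, error) {
	resp, err := http.Get(fmt.Sprintf("http://%s:8008/patroni", podIP))
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()

	var m patroniMember
	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
		return false, err
	}
	return m.PendingRestart, nil
}

func main() {
	// hypothetical pod IP of a cluster member
	pending, err := pendingRestart("10.2.1.5")
	if err != nil {
		fmt.Println("could not query member data:", err)
		return
	}
	fmt.Println("pending_restart:", pending)
}
```

The test itself observes the same state indirectly, by querying `pg_settings` through `query_database` before and after the forced sync.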
diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go
index a1f197b28..95ce6b091 100644
--- a/pkg/cluster/sync.go
+++ b/pkg/cluster/sync.go
@@ -272,9 +272,8 @@ func (c *Cluster) syncPodDisruptionBudget(isUpdate bool) error {

 func (c *Cluster) syncStatefulSet() error {
 	var (
-		restartTTL              uint32
-		instanceRestartRequired bool
-		restartMasterFirst      bool
+		restartTTL         uint32
+		restartMasterFirst bool
 	)
 	podsToRecreate := make([]v1.Pod, 0)

@@ -414,43 +413,42 @@ func (c *Cluster) syncStatefulSet() error {
 		// empty config probably means cluster is not fully initialized yet, e.g. restoring from backup
 		// do not attempt a restart
 		if !reflect.DeepEqual(patroniConfig, emptyPatroniConfig) || len(pgParameters) > 0 {
-			instanceRestartRequired, restartMasterFirst, err = c.checkAndSetGlobalPostgreSQLConfiguration(&pod, patroniConfig, pgParameters)
+			restartMasterFirst, err = c.checkAndSetGlobalPostgreSQLConfiguration(&pod, patroniConfig, pgParameters)
 			if err != nil {
 				c.logger.Warningf("could not set PostgreSQL configuration options for pod %s: %v", podName, err)
 				continue
 			}
+			// it could take up to LoopWait to apply the config
+			time.Sleep(time.Duration(patroniConfig.LoopWait)*time.Second + time.Second*2)
 			restartTTL = patroniConfig.TTL
 			break
 		}
 	}

-	// if the config update requires a restart, call Patroni restart
-	if instanceRestartRequired {
-		remainingPods := make([]*v1.Pod, 0)
-		skipRole := Master
-		if restartMasterFirst {
-			skipRole = Replica
+	// restart instances if required
+	remainingPods := make([]*v1.Pod, 0)
+	skipRole := Master
+	if restartMasterFirst {
+		skipRole = Replica
+	}
+	for i, pod := range pods {
+		role := PostgresRole(pod.Labels[c.OpConfig.PodRoleLabel])
+		if role == skipRole {
+			remainingPods = append(remainingPods, &pods[i])
+			continue
 		}
-		c.logger.Debug("restarting Postgres server within pods")
-		for i, pod := range pods {
-			role := PostgresRole(pod.Labels[c.OpConfig.PodRoleLabel])
-			if role == skipRole {
-				remainingPods = append(remainingPods, &pods[i])
-				continue
-			}
-			c.restartInstance(&pod)
-			if len(pods) > 1 {
-				time.Sleep(time.Duration(restartTTL) * time.Second)
-			}
+		c.restartInstance(&pod)
+		if len(pods) > 1 {
+			time.Sleep(time.Duration(restartTTL) * time.Second)
 		}
+	}

-		// in most cases only the master should be left to restart
-		if len(remainingPods) > 0 {
-			for _, remainingPod := range remainingPods {
-				c.restartInstance(remainingPod)
-				if len(remainingPods) > 1 {
-					time.Sleep(time.Duration(restartTTL) * time.Second)
-				}
+	// in most cases only the master should be left to restart
+	if len(remainingPods) > 0 {
+		for _, remainingPod := range remainingPods {
+			c.restartInstance(remainingPod)
+			if len(remainingPods) > 1 {
+				time.Sleep(time.Duration(restartTTL) * time.Second)
 			}
 		}
 	}
@@ -472,15 +470,22 @@ func (c *Cluster) restartInstance(pod *v1.Pod) {
 	podName := util.NameFromMeta(pod.ObjectMeta)
 	role := PostgresRole(pod.Labels[c.OpConfig.PodRoleLabel])

-	c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "Update", fmt.Sprintf("restarting Postgres server within %s pod %s", role, pod.Name))
-
-	if err := c.patroni.Restart(pod); err != nil {
-		c.logger.Warningf("could not restart Postgres server within %s pod %s: %v", role, podName, err)
+	// if the config update requires a restart, call Patroni restart
+	memberData, err := c.patroni.GetMemberData(pod)
+	if err != nil {
+		c.logger.Debugf("could not get member data of %s pod %s - skipping possible restart attempt: %v", role, podName, err)
 		return
 	}
-	c.logger.Debugf("Postgres server successfuly restarted in %s pod %s",
-		role, podName)
-	c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "Update", fmt.Sprintf("Postgres server restart done for %s pod %s", role, pod.Name))
+
+	// do restart only when it is pending
+	if memberData.PendingRestart {
+		c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "Update", fmt.Sprintf("restarting Postgres server within %s pod %s", role, pod.Name))
+		if err := c.patroni.Restart(pod); err != nil {
+			c.logger.Warningf("could not restart Postgres server within %s pod %s: %v", role, podName, err)
+			return
+		}
+		c.eventRecorder.Event(c.GetReference(), v1.EventTypeNormal, "Update", fmt.Sprintf("Postgres server restart done for %s pod %s", role, pod.Name))
+	}
 }

 // AnnotationsToPropagate get the annotations to update if required
@@ -517,7 +522,7 @@ func (c *Cluster) AnnotationsToPropagate(annotations map[string]string) map[stri

 // checkAndSetGlobalPostgreSQLConfiguration checks whether cluster-wide API parameters
 // (like max_connections) have changed and if necessary sets it via the Patroni API
-func (c *Cluster) checkAndSetGlobalPostgreSQLConfiguration(pod *v1.Pod, patroniConfig acidv1.Patroni, effectivePgParameters map[string]string) (bool, bool, error) {
+func (c *Cluster) checkAndSetGlobalPostgreSQLConfiguration(pod *v1.Pod, patroniConfig acidv1.Patroni, effectivePgParameters map[string]string) (bool, error) {
 	configToSet := make(map[string]interface{})
 	parametersToSet := make(map[string]string)
 	restartMaster := make([]bool, 0)
@@ -584,7 +589,7 @@ func (c *Cluster) checkAndSetGlobalPostgreSQLConfiguration(pod *v1.Pod, patroniC
 	}

 	if len(configToSet) == 0 {
-		return false, false, nil
+		return false, nil
 	}

 	configToSetJson, err := json.Marshal(configToSet)
@@ -598,10 +603,10 @@ func (c *Cluster) checkAndSetGlobalPostgreSQLConfiguration(pod *v1.Pod, patroniC

 	c.logger.Debugf("patching Postgres config via Patroni API on pod %s with following options: %s", podName, configToSetJson)
 	if err = c.patroni.SetConfig(pod, configToSet); err != nil {
-		return true, requiresMasterRestart, fmt.Errorf("could not patch postgres parameters with a pod %s: %v", podName, err)
+		return requiresMasterRestart, fmt.Errorf("could not patch postgres parameters with a pod %s: %v", podName, err)
 	}

-	return true, requiresMasterRestart, nil
+	return requiresMasterRestart, nil
 }

 func (c *Cluster) syncSecrets() error {
diff --git a/pkg/util/patroni/patroni.go b/pkg/util/patroni/patroni.go
index 6adc0bfbc..4de3b8201 100644
--- a/pkg/util/patroni/patroni.go
+++ b/pkg/util/patroni/patroni.go
@@ -238,16 +238,12 @@ func (p *Patroni) Restart(server *v1.Pod) error {
 	if err != nil {
 		return err
 	}
-	memberData, err := p.GetMemberData(server)
-	if err != nil {
+	if err := p.httpPostOrPatch(http.MethodPost, apiURLString+restartPath, buf); err != nil {
 		return err
 	}
+	p.logger.Infof("Postgres server successfully restarted in pod %s", server.Name)

-	// do restart only when it is pending
-	if !memberData.PendingRestart {
-		return nil
-	}
-	return p.httpPostOrPatch(http.MethodPost, apiURLString+restartPath, buf)
+	return nil
 }

 // GetMemberData read member data from patroni API
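Taken together, a sync now works in two steps: `checkAndSetGlobalPostgreSQLConfiguration` patches the Patroni config, `syncStatefulSet` waits roughly `LoopWait` for Patroni to pick it up, and `restartInstance` restarts only members whose `pending_restart` flag is set, skipping one role on the first pass (the master, unless `restartMasterFirst` is true). The following is a simplified, self-contained model of that ordering, using local stand-in types and a placeholder pause instead of Patroni's TTL; it is a sketch, not the operator's actual code.

```go
package main

import (
	"fmt"
	"time"
)

// member is a stand-in for a Spilo pod plus the pending_restart flag
// reported by Patroni's member endpoint.
type member struct {
	name           string
	role           string // "master" or "replica"
	pendingRestart bool
}

// restartIfPending mirrors the gate added to restartInstance: only members
// that actually report pending_restart get a restart call.
func restartIfPending(m member) {
	if !m.pendingRestart {
		return
	}
	fmt.Printf("restarting Postgres server within %s pod %s\n", m.role, m.name)
}

// restartMembers mirrors the reordered loop in syncStatefulSet: one role is
// skipped on the first pass and handled afterwards, with a pause between
// restarts (the operator uses Patroni's TTL; a short placeholder is used here).
func restartMembers(members []member, restartMasterFirst bool, pause time.Duration) {
	skipRole := "master"
	if restartMasterFirst {
		skipRole = "replica"
	}

	remaining := make([]member, 0)
	for _, m := range members {
		if m.role == skipRole {
			remaining = append(remaining, m)
			continue
		}
		restartIfPending(m)
		if len(members) > 1 {
			time.Sleep(pause)
		}
	}

	// in most cases only the master is left for the second pass
	for _, m := range remaining {
		restartIfPending(m)
		if len(remaining) > 1 {
			time.Sleep(pause)
		}
	}
}

func main() {
	members := []member{
		{name: "acid-minimal-cluster-0", role: "master", pendingRestart: true},
		{name: "acid-minimal-cluster-1", role: "replica", pendingRestart: true},
	}
	// replicas first (restartMasterFirst == false)
	restartMembers(members, false, 100*time.Millisecond)
}
```

Keeping the pending check in `restartInstance` rather than inside `patroni.Restart` leaves `Restart` as a plain API wrapper, which the sketch reflects by separating the gate from the loop that orders the restarts.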