Partially revert #2810 (#2849)

Only schedule switchover for pod migration, consider maintenanceWindows for PGVERSION env change
Polina Bungina 2025-01-23 18:35:33 +03:00 committed by GitHub
parent e04b91d8af
commit b0cfeb30ea
6 changed files with 48 additions and 37 deletions
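
In short: after this partial revert, only master pod migration may schedule a switchover for the next maintenance window; pod recreation during rolling updates switches over immediately, and a pending PGVERSION env change is held back until a sync falls inside a window. A condensed, illustrative sketch of the resulting call pattern (the two wrapper functions are hypothetical; Switchover, MigrateMasterPod and isInMaintenanceWindow are names from the diff below):

// Illustrative wrappers only, condensed from the call sites in this diff;
// retries, candidate selection, and error handling are omitted. Written as
// if inside pkg/cluster, where *Cluster, v1.Pod and spec.NamespacedName
// are already in scope.

// Master pod migration (pkg/cluster/pod.go): may defer the switchover to
// the next maintenance window.
func migrateMasterSketch(c *Cluster, oldMaster *v1.Pod, candidate spec.NamespacedName) error {
    scheduleSwitchover := !isInMaintenanceWindow(c.Spec.MaintenanceWindows)
    return c.Switchover(oldMaster, candidate, scheduleSwitchover)
}

// Pod recreation during a rolling update (pkg/cluster/pod.go): always
// switches over immediately.
func recreateMasterSketch(c *Cluster, masterPod *v1.Pod, candidate spec.NamespacedName) error {
    return c.Switchover(masterPod, candidate, false)
}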

docs/administrator.md

@@ -208,9 +208,6 @@ Note that, changes in `SPILO_CONFIGURATION` env variable under `bootstrap.dcs`
 path are ignored for the diff. They will be applied through Patroni's rest api
 interface, following a restart of all instances.
 
-Rolling update is postponed until the next maintenance window if any is defined
-under the `maintenanceWindows` cluster manifest parameter.
-
 The operator also supports lazy updates of the Spilo image. In this case the
 StatefulSet is only updated, but no rolling update follows. This feature saves
 you a switchover - and hence downtime - when you know pods are re-started later

docs/reference/cluster_manifest.md

@@ -116,7 +116,7 @@ These parameters are grouped directly under the `spec` key in the manifest.
 * **maintenanceWindows**
   a list which defines specific time frames when certain maintenance operations
-  such as automatic major upgrades or rolling updates are allowed. Accepted formats
+  such as automatic major upgrades or master pod migration. Accepted formats
   are "01:00-06:00" for daily maintenance windows or "Sat:00:00-04:00" for specific
   days, with all times in UTC.
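
The operator gates these operations with the `isInMaintenanceWindow` check called throughout the diff below. A minimal, self-contained sketch of how windows in the two accepted formats could be evaluated; the MaintenanceWindow fields here are assumptions, not the operator's actual types:

package main

import (
    "fmt"
    "time"
)

// Assumed shape: the operator parses "01:00-06:00" (daily) and
// "Sat:00:00-04:00" (day-specific) entries into something similar.
type MaintenanceWindow struct {
    Everyday  bool
    Weekday   time.Weekday
    StartTime time.Time // only the clock part (UTC) is meaningful
    EndTime   time.Time
}

func isInMaintenanceWindow(windows []MaintenanceWindow) bool {
    if len(windows) == 0 {
        return true // no windows defined: maintenance is always allowed
    }
    now := time.Now().UTC()
    for _, w := range windows {
        if !w.Everyday && w.Weekday != now.Weekday() {
            continue
        }
        cur := now.Hour()*60 + now.Minute()
        start := w.StartTime.Hour()*60 + w.StartTime.Minute()
        end := w.EndTime.Hour()*60 + w.EndTime.Minute()
        if cur >= start && cur <= end {
            return true
        }
    }
    return false
}

func main() {
    // A daily 01:00-06:00 UTC window, as "01:00-06:00" might be parsed.
    daily := MaintenanceWindow{
        Everyday:  true,
        StartTime: time.Date(0, 1, 1, 1, 0, 0, 0, time.UTC),
        EndTime:   time.Date(0, 1, 1, 6, 0, 0, 0, time.UTC),
    }
    fmt.Println(isInMaintenanceWindow([]MaintenanceWindow{daily}))
}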

e2e/tests/test_e2e.py

@@ -1187,7 +1187,7 @@ class EndToEndTestCase(unittest.TestCase):
         Test major version upgrade: with full upgrade, maintenance window, and annotation
         """
         def check_version():
-            p = k8s.patroni_rest("acid-upgrade-test-0", "")
+            p = k8s.patroni_rest("acid-upgrade-test-0", "") or {}
             version = p.get("server_version", 0) // 10000
             return version
@@ -1237,7 +1237,7 @@ class EndToEndTestCase(unittest.TestCase):
         # should not upgrade because current time is not in maintenanceWindow
         current_time = datetime.now()
         maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
-        pg_patch_version_15 = {
+        pg_patch_version_15_outside_mw = {
             "spec": {
                 "postgresql": {
                     "version": "15"
@@ -1248,7 +1248,7 @@ class EndToEndTestCase(unittest.TestCase):
             }
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_outside_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         # no pod replacement outside of the maintenance window
@@ -1259,12 +1259,12 @@ class EndToEndTestCase(unittest.TestCase):
         second_annotations = get_annotations()
         self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")
 
-        # change the version again to trigger operator sync
+        # change maintenanceWindows to current
         maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
-        pg_patch_version_16 = {
+        pg_patch_version_15_in_mw = {
             "spec": {
                 "postgresql": {
-                    "version": "16"
+                    "version": "15"
                 },
                 "maintenanceWindows": [
                     maintenance_window_current
@@ -1273,13 +1273,13 @@ class EndToEndTestCase(unittest.TestCase):
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
 
-        self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
+        self.eventuallyEqual(check_version, 15, "Version should be upgraded from 14 to 15")
 
         # check if annotation for last upgrade's success is updated after second upgrade
         third_annotations = get_annotations()
@@ -1306,16 +1306,17 @@ class EndToEndTestCase(unittest.TestCase):
         k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
 
-        self.eventuallyEqual(check_version, 16, "Version should not be upgraded because annotation for last upgrade's failure is set")
+        self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")
 
         # change the version back to 15 and should remove failure annotation
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
         self.eventuallyEqual(check_version, 15, "Version should not be upgraded from 15")
 
         fourth_annotations = get_annotations()
         self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")

pkg/cluster/cluster.go

@@ -957,6 +957,11 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
     defer c.mu.Unlock()
 
     c.KubeClient.SetPostgresCRDStatus(c.clusterName(), acidv1.ClusterStatusUpdating)
+    if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
+        // do not apply any major version related changes yet
+        newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
+    }
+
     c.setSpec(newSpec)
 
     defer func() {
@@ -1761,28 +1766,28 @@ func (c *Cluster) GetSwitchoverSchedule() string {
 }
 
 // Switchover does a switchover (via Patroni) to a candidate pod
-func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
+func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName, scheduled bool) error {
     var err error
-    c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
 
-    if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
-        c.logger.Infof("postponing switchover, not in maintenance window")
-        schedule := c.GetSwitchoverSchedule()
-
-        if err := c.patroni.Switchover(curMaster, candidate.Name, schedule); err != nil {
-            return fmt.Errorf("could not schedule switchover: %v", err)
-        }
-        c.logger.Infof("switchover is scheduled at %s", schedule)
-        return nil
-    }
-
-    c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
     stopCh := make(chan struct{})
     ch := c.registerPodSubscriber(candidate)
     defer c.unregisterPodSubscriber(candidate)
     defer close(stopCh)
 
-    if err = c.patroni.Switchover(curMaster, candidate.Name, ""); err == nil {
+    var scheduled_at string
+    if scheduled {
+        scheduled_at = c.GetSwitchoverSchedule()
+    } else {
+        c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
+        c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
+        scheduled_at = ""
+    }
+
+    if err = c.patroni.Switchover(curMaster, candidate.Name, scheduled_at); err == nil {
+        if scheduled {
+            c.logger.Infof("switchover from %q to %q is scheduled at %s", curMaster.Name, candidate, scheduled_at)
+            return nil
+        }
         c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
         c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
         _, err = c.waitForPodLabel(ch, stopCh, nil)
@@ -1790,6 +1795,9 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
             err = fmt.Errorf("could not get master pod label: %v", err)
         }
     } else {
+        if scheduled {
+            return fmt.Errorf("could not schedule switchover: %v", err)
+        }
         err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err)
         c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err)
     }
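
The scheduled branch relies on GetSwitchoverSchedule, whose body is outside this diff. A rough sketch of what such a helper plausibly computes, reusing the assumed MaintenanceWindow type from the earlier sketch; it assumes at least one window is configured (the scheduled path is only taken when windows exist), and the exact timestamp layout the operator emits is not visible here, so RFC 3339 is used as a stand-in:

// Rough sketch, not the operator's actual implementation: pick the nearest
// upcoming window start and render it as a UTC timestamp for Patroni's
// scheduled switchover.
func getSwitchoverScheduleSketch(windows []MaintenanceWindow) string {
    now := time.Now().UTC()
    var best time.Time
    for _, w := range windows {
        // Today's occurrence of the window start, in UTC.
        cand := time.Date(now.Year(), now.Month(), now.Day(),
            w.StartTime.Hour(), w.StartTime.Minute(), 0, 0, time.UTC)
        if w.Everyday {
            if now.After(cand) {
                cand = cand.AddDate(0, 0, 1) // past today's start: use tomorrow
            }
        } else {
            // Advance to the window's weekday (0..6 days ahead).
            cand = cand.AddDate(0, 0, int((w.Weekday-now.Weekday()+7)%7))
            if now.After(cand) {
                cand = cand.AddDate(0, 0, 7) // past this week's start: next week
            }
        }
        if best.IsZero() || cand.Before(best) {
            best = cand
        }
    }
    return best.Format(time.RFC3339)
}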

pkg/cluster/pod.go

@@ -280,11 +280,16 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
         return fmt.Errorf("could not move pod: %v", err)
     }
 
+    scheduleSwitchover := false
+    if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
+        c.logger.Infof("postponing switchover, not in maintenance window")
+        scheduleSwitchover = true
+    }
     err = retryutil.Retry(1*time.Minute, 5*time.Minute,
         func() (bool, error) {
-            err := c.Switchover(oldMaster, masterCandidateName)
+            err := c.Switchover(oldMaster, masterCandidateName, scheduleSwitchover)
             if err != nil {
-                c.logger.Errorf("could not failover to pod %q: %v", masterCandidateName, err)
+                c.logger.Errorf("could not switchover to pod %q: %v", masterCandidateName, err)
                 return false, nil
             }
             return true, nil
@@ -445,7 +450,7 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.NamespacedName) error {
             // do not recreate master now so it will keep the update flag and switchover will be retried on next sync
             return fmt.Errorf("skipping switchover: %v", err)
         }
-        if err := c.Switchover(masterPod, masterCandidate); err != nil {
+        if err := c.Switchover(masterPod, masterCandidate, false); err != nil {
             return fmt.Errorf("could not perform switch over: %v", err)
         }
     } else if newMasterPod == nil && len(replicas) == 0 {

pkg/cluster/sync.go

@@ -97,6 +97,11 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
         }
     }
 
+    if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
+        // do not apply any major version related changes yet
+        newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
+    }
+
     if err = c.syncStatefulSet(); err != nil {
         if !k8sutil.ResourceAlreadyExists(err) {
             err = fmt.Errorf("could not sync statefulsets: %v", err)
@@ -658,11 +663,6 @@ func (c *Cluster) syncStatefulSet() error {
         isSafeToRecreatePods = false
     }
 
-    if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
-        postponeReasons = append(postponeReasons, "not in maintenance window")
-        isSafeToRecreatePods = false
-    }
-
     // if we get here we also need to re-create the pods (either leftovers from the old
     // statefulset or those that got their configuration from the outdated statefulset)
     if len(podsToRecreate) > 0 {
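
A closing note on the version-pinning pattern added to Update() and Sync() above: by copying the running PgVersion into the desired spec outside the window, the generated statefulset keeps its old PGVERSION env value, so the sync sees no diff there and triggers neither a rolling update nor a major upgrade; once a sync runs inside a window, the manifest version flows through untouched. A trivial sketch of the effective value (hypothetical helper, not part of the diff):

// Illustrative helper only: which version the statefulset generation
// effectively sees under the pinning added in Update() and Sync().
func effectivePgVersion(oldVersion, newVersion string, inMaintenanceWindow bool) string {
    if !inMaintenanceWindow {
        return oldVersion // hold back the PGVERSION env change for now
    }
    return newVersion
}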