Only schedule switchover for pod migration, consider maintenanceWindows for PGVERSION env change
parent e04b91d8af
commit b0cfeb30ea
@@ -208,9 +208,6 @@ Note that, changes in `SPILO_CONFIGURATION` env variable under `bootstrap.dcs`
 path are ignored for the diff. They will be applied through Patroni's rest api
 interface, following a restart of all instances.
 
-Rolling update is postponed until the next maintenance window if any is defined
-under the `maintenanceWindows` cluster manifest parameter.
-
 The operator also support lazy updates of the Spilo image. In this case the
 StatefulSet is only updated, but no rolling update follows. This feature saves
 you a switchover - and hence downtime - when you know pods are re-started later
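The lazy update behaviour described in the hunk above boils down to a simple decision: write the new image to the StatefulSet, but skip the rolling restart. A minimal sketch of that decision with made-up names follows; this is an illustration only, not the operator's actual code.

```go
package main

import "fmt"

// Sketch only, with hypothetical names: a changed Spilo image is written to the
// StatefulSet template, but the rolling restart of the pods is skipped, so
// running pods keep the old image until they are restarted for another reason.
func needsPodRecreation(currentImage, desiredImage string, lazyUpdateEnabled bool) bool {
	if currentImage == desiredImage {
		return false // statefulset already up to date, nothing to roll
	}
	if lazyUpdateEnabled {
		return false // only the statefulset is updated; no switchover, no downtime now
	}
	return true // recreate pods so they pick up the new image
}

func main() {
	fmt.Println(needsPodRecreation("spilo:3.2-p1", "spilo:3.3-p1", true))  // false: lazy update
	fmt.Println(needsPodRecreation("spilo:3.2-p1", "spilo:3.3-p1", false)) // true: roll pods now
}
```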
@@ -116,7 +116,7 @@ These parameters are grouped directly under the `spec` key in the manifest.
 
 * **maintenanceWindows**
   a list which defines specific time frames when certain maintenance operations
-  such as automatic major upgrades or rolling updates are allowed. Accepted formats
+  such as automatic major upgrades or master pod migration. Accepted formats
   are "01:00-06:00" for daily maintenance windows or "Sat:00:00-04:00" for specific
   days, with all times in UTC.
 
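For illustration, here is a minimal sketch of the kind of check these window formats imply, assuming windows are either daily ("01:00-06:00") or bound to a weekday ("Sat:00:00-04:00"), all in UTC. The types and helper names below are hypothetical; they are not the operator's actual `isInMaintenanceWindow` implementation.

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

// window is a hypothetical stand-in for the operator's maintenance window type.
type window struct {
	everyday bool
	weekday  time.Weekday
	from, to time.Time // only the clock component matters
}

// parseWindow parses "01:00-06:00" (daily) or "Sat:00:00-04:00" (weekly), in UTC.
func parseWindow(s string) (window, error) {
	w := window{everyday: true}
	days := map[string]time.Weekday{"Sun": time.Sunday, "Mon": time.Monday, "Tue": time.Tuesday,
		"Wed": time.Wednesday, "Thu": time.Thursday, "Fri": time.Friday, "Sat": time.Saturday}
	if d, ok := days[strings.SplitN(s, ":", 2)[0]]; ok {
		w.everyday, w.weekday = false, d
		s = strings.SplitN(s, ":", 2)[1]
	}
	parts := strings.Split(s, "-")
	if len(parts) != 2 {
		return w, fmt.Errorf("expected HH:MM-HH:MM, got %q", s)
	}
	var err error
	if w.from, err = time.Parse("15:04", parts[0]); err != nil {
		return w, err
	}
	w.to, err = time.Parse("15:04", parts[1])
	return w, err
}

// inAnyWindow reports whether now (UTC) falls into at least one window;
// with no windows defined, everything is allowed.
func inAnyWindow(windows []window, now time.Time) bool {
	if len(windows) == 0 {
		return true
	}
	minutes := now.Hour()*60 + now.Minute()
	for _, w := range windows {
		if !w.everyday && w.weekday != now.Weekday() {
			continue
		}
		from := w.from.Hour()*60 + w.from.Minute()
		to := w.to.Hour()*60 + w.to.Minute()
		if minutes >= from && minutes <= to {
			return true
		}
	}
	return false
}

func main() {
	w, _ := parseWindow("01:00-06:00")
	fmt.Println(inAnyWindow([]window{w}, time.Date(2024, 1, 1, 3, 0, 0, 0, time.UTC)))  // true
	fmt.Println(inAnyWindow([]window{w}, time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC))) // false
}
```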
@@ -1187,7 +1187,7 @@ class EndToEndTestCase(unittest.TestCase):
         Test major version upgrade: with full upgrade, maintenance window, and annotation
         """
         def check_version():
-            p = k8s.patroni_rest("acid-upgrade-test-0", "")
+            p = k8s.patroni_rest("acid-upgrade-test-0", "") or {}
             version = p.get("server_version", 0) // 10000
             return version
 
@@ -1237,7 +1237,7 @@ class EndToEndTestCase(unittest.TestCase):
         # should not upgrade because current time is not in maintenanceWindow
         current_time = datetime.now()
         maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
-        pg_patch_version_15 = {
+        pg_patch_version_15_outside_mw = {
             "spec": {
                 "postgresql": {
                     "version": "15"
@@ -1248,7 +1248,7 @@ class EndToEndTestCase(unittest.TestCase):
             }
         }
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_outside_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         # no pod replacement outside of the maintenance window
@@ -1259,12 +1259,12 @@ class EndToEndTestCase(unittest.TestCase):
         second_annotations = get_annotations()
         self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")
 
-        # change the version again to trigger operator sync
+        # change maintenanceWindows to current
         maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
-        pg_patch_version_16 = {
+        pg_patch_version_15_in_mw = {
             "spec": {
                 "postgresql": {
-                    "version": "16"
+                    "version": "15"
                 },
                 "maintenanceWindows": [
                     maintenance_window_current
@@ -1273,13 +1273,13 @@ class EndToEndTestCase(unittest.TestCase):
         }
 
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
-        self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
+        self.eventuallyEqual(check_version, 15, "Version should be upgraded from 14 to 15")
 
         # check if annotation for last upgrade's success is updated after second upgrade
         third_annotations = get_annotations()
@@ -1306,16 +1306,17 @@ class EndToEndTestCase(unittest.TestCase):
         k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
-        self.eventuallyEqual(check_version, 16, "Version should not be upgraded because annotation for last upgrade's failure is set")
+        self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")
 
         # change the version back to 15 and should remove failure annotation
         k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15_in_mw)
         self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
 
         k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
         k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
 
+        self.eventuallyEqual(check_version, 15, "Version should not be upgraded from 15")
         fourth_annotations = get_annotations()
         self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")
 
@@ -957,6 +957,11 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
 	defer c.mu.Unlock()
 
 	c.KubeClient.SetPostgresCRDStatus(c.clusterName(), acidv1.ClusterStatusUpdating)
+
+	if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
+		// do not apply any major version related changes yet
+		newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
+	}
 	c.setSpec(newSpec)
 
 	defer func() {
@@ -1761,28 +1766,28 @@ func (c *Cluster) GetSwitchoverSchedule() string {
 }
 
 // Switchover does a switchover (via Patroni) to a candidate pod
-func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
+func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName, scheduled bool) error {
 	var err error
-	c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
 
-	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
-		c.logger.Infof("postponing switchover, not in maintenance window")
-		schedule := c.GetSwitchoverSchedule()
-
-		if err := c.patroni.Switchover(curMaster, candidate.Name, schedule); err != nil {
-			return fmt.Errorf("could not schedule switchover: %v", err)
-		}
-		c.logger.Infof("switchover is scheduled at %s", schedule)
-		return nil
-	}
-
-	c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
 	stopCh := make(chan struct{})
 	ch := c.registerPodSubscriber(candidate)
 	defer c.unregisterPodSubscriber(candidate)
 	defer close(stopCh)
 
-	if err = c.patroni.Switchover(curMaster, candidate.Name, ""); err == nil {
+	var scheduled_at string
+	if scheduled {
+		scheduled_at = c.GetSwitchoverSchedule()
+	} else {
+		c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
+		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
+		scheduled_at = ""
+	}
+
+	if err = c.patroni.Switchover(curMaster, candidate.Name, scheduled_at); err == nil {
+		if scheduled {
+			c.logger.Infof("switchover from %q to %q is scheduled at %s", curMaster.Name, candidate, scheduled_at)
+			return nil
+		}
 		c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
 		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
 		_, err = c.waitForPodLabel(ch, stopCh, nil)
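The scheduled branch above hands the result of `GetSwitchoverSchedule()` straight to Patroni; the schedule computation itself sits outside this hunk. As a rough, hypothetical sketch of such a computation, assuming a single daily window and an RFC3339 timestamp (the operator's actual format and logic may differ):

```go
package main

import (
	"fmt"
	"time"
)

// nextWindowStart is a hypothetical helper: given the daily window start time
// (clock component only) and the current time, it returns the next occurrence
// of that start as a timestamp a scheduled switchover could be pinned to.
func nextWindowStart(windowStart, now time.Time) string {
	next := time.Date(now.Year(), now.Month(), now.Day(),
		windowStart.Hour(), windowStart.Minute(), 0, 0, time.UTC)
	if !next.After(now) {
		next = next.AddDate(0, 0, 1) // today's window already started or passed
	}
	return next.Format(time.RFC3339)
}

func main() {
	start, _ := time.Parse("15:04", "01:00")
	now := time.Date(2024, 5, 1, 10, 30, 0, 0, time.UTC)
	fmt.Println(nextWindowStart(start, now)) // 2024-05-02T01:00:00Z
}
```

This also matches why the scheduled path in the hunk returns early: Patroni carries out the switchover later on its own, so there is no pod label change to wait for at that point.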
@@ -1790,6 +1795,9 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) error {
 			err = fmt.Errorf("could not get master pod label: %v", err)
 		}
 	} else {
+		if scheduled {
+			return fmt.Errorf("could not schedule switchover: %v", err)
+		}
 		err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err)
 		c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err)
 	}
@@ -280,11 +280,16 @@ func (c *Cluster) MigrateMasterPod(podName spec.NamespacedName) error {
 		return fmt.Errorf("could not move pod: %v", err)
 	}
 
+	scheduleSwitchover := false
+	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
+		c.logger.Infof("postponing switchover, not in maintenance window")
+		scheduleSwitchover = true
+	}
 	err = retryutil.Retry(1*time.Minute, 5*time.Minute,
 		func() (bool, error) {
-			err := c.Switchover(oldMaster, masterCandidateName)
+			err := c.Switchover(oldMaster, masterCandidateName, scheduleSwitchover)
 			if err != nil {
-				c.logger.Errorf("could not failover to pod %q: %v", masterCandidateName, err)
+				c.logger.Errorf("could not switchover to pod %q: %v", masterCandidateName, err)
 				return false, nil
 			}
 			return true, nil
@@ -445,7 +450,7 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.NamespacedName) error {
 			// do not recreate master now so it will keep the update flag and switchover will be retried on next sync
 			return fmt.Errorf("skipping switchover: %v", err)
 		}
-		if err := c.Switchover(masterPod, masterCandidate); err != nil {
+		if err := c.Switchover(masterPod, masterCandidate, false); err != nil {
 			return fmt.Errorf("could not perform switch over: %v", err)
 		}
 	} else if newMasterPod == nil && len(replicas) == 0 {
@@ -97,6 +97,11 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
 		}
 	}
 
+	if !isInMaintenanceWindow(newSpec.Spec.MaintenanceWindows) {
+		// do not apply any major version related changes yet
+		newSpec.Spec.PostgresqlParam.PgVersion = oldSpec.Spec.PostgresqlParam.PgVersion
+	}
+
 	if err = c.syncStatefulSet(); err != nil {
 		if !k8sutil.ResourceAlreadyExists(err) {
 			err = fmt.Errorf("could not sync statefulsets: %v", err)
@@ -658,11 +663,6 @@ func (c *Cluster) syncStatefulSet() error {
 		isSafeToRecreatePods = false
 	}
 
-	if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
-		postponeReasons = append(postponeReasons, "not in maintenance window")
-		isSafeToRecreatePods = false
-	}
-
 	// if we get here we also need to re-create the pods (either leftovers from the old
 	// statefulset or those that got their configuration from the outdated statefulset)
 	if len(podsToRecreate) > 0 {