Implement major upgrade result annotations (#2727)

Co-authored-by: Felix Kunde <felix-kunde@gmx.de>
Co-authored-by: Polina Bungina <27892524+hughcapet@users.noreply.github.com>
This commit is contained in:
Ida Novindasari 2024-08-28 15:26:12 +02:00 committed by GitHub
parent a09b7655c9
commit 2e398120d2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 135 additions and 17 deletions

View File

@ -85,6 +85,12 @@ It is also possible to define `maintenanceWindows` in the Postgres manifest to
better control when such automated upgrades should take place after increasing better control when such automated upgrades should take place after increasing
the version. the version.
### Upgrade annotations
When an upgrade is executed, the operator sets an annotation in the PostgreSQL resource, either `last-major-upgrade-success` if the upgrade succeeds, or `last-major-upgrade-failure` if it fails. The value of the annotation is a timestamp indicating when the upgrade occurred.
If a PostgreSQL resource contains a failure annotation, the operator will not attempt to retry the upgrade during a sync event. To remove the failure annotation, you can revert the PostgreSQL version back to the current version. This action will trigger the removal of the failure annotation.
## Non-default cluster domain ## Non-default cluster domain
If your cluster uses a DNS domain other than the default `cluster.local`, this If your cluster uses a DNS domain other than the default `cluster.local`, this

View File

@ -1185,13 +1185,19 @@ class EndToEndTestCase(unittest.TestCase):
@timeout_decorator.timeout(TEST_TIMEOUT_SEC) @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_major_version_upgrade(self): def test_major_version_upgrade(self):
""" """
Test major version upgrade Test major version upgrade: with full upgrade, maintenance window, and annotation
""" """
def check_version(): def check_version():
p = k8s.patroni_rest("acid-upgrade-test-0", "") p = k8s.patroni_rest("acid-upgrade-test-0", "")
version = p.get("server_version", 0) // 10000 version = p.get("server_version", 0) // 10000
return version return version
def get_annotations():
pg_manifest = k8s.api.custom_objects_api.get_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test")
annotations = pg_manifest["metadata"]["annotations"]
return annotations
k8s = self.k8s k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-upgrade-test' cluster_label = 'application=spilo,cluster-name=acid-upgrade-test'
@ -1209,30 +1215,33 @@ class EndToEndTestCase(unittest.TestCase):
master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label) master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label)
# should upgrade immediately # should upgrade immediately
pg_patch_version_14 = { pg_patch_version_13 = {
"spec": { "spec": {
"postgresql": { "postgresql": {
"version": "14" "version": "13"
} }
} }
} }
k8s.api.custom_objects_api.patch_namespaced_custom_object( k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14) "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_13)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# should have finish failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14") self.eventuallyEqual(check_version, 13, "Version should be upgraded from 12 to 13")
# check if annotation for last upgrade's success is set
annotations = get_annotations()
self.assertIsNotNone(annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set")
# should not upgrade because current time is not in maintenanceWindow # should not upgrade because current time is not in maintenanceWindow
current_time = datetime.now() current_time = datetime.now()
maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}" maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
pg_patch_version_15 = { pg_patch_version_14 = {
"spec": { "spec": {
"postgresql": { "postgresql": {
"version": "15" "version": "14"
}, },
"maintenanceWindows": [ "maintenanceWindows": [
maintenance_window_future maintenance_window_future
@ -1240,21 +1249,23 @@ class EndToEndTestCase(unittest.TestCase):
} }
} }
k8s.api.custom_objects_api.patch_namespaced_custom_object( k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15) "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# should have finish failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label) k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 14, "Version should not be upgraded") self.eventuallyEqual(check_version, 13, "Version should not be upgraded")
second_annotations = get_annotations()
self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")
# change the version again to trigger operator sync # change the version again to trigger operator sync
maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}" maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
pg_patch_version_16 = { pg_patch_version_15 = {
"spec": { "spec": {
"postgresql": { "postgresql": {
"version": "16" "version": "15"
}, },
"maintenanceWindows": [ "maintenanceWindows": [
maintenance_window_current maintenance_window_current
@ -1263,14 +1274,52 @@ class EndToEndTestCase(unittest.TestCase):
} }
k8s.api.custom_objects_api.patch_namespaced_custom_object( k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16) "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# should have finish failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16") self.eventuallyEqual(check_version, 15, "Version should be upgraded from 13 to 15")
# check if annotation for last upgrade's success is updated after second upgrade
third_annotations = get_annotations()
self.assertIsNotNone(third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set")
self.assertNotEqual(annotations.get("last-major-upgrade-success"), third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not updated")
# test upgrade with failed upgrade annotation
pg_patch_version_16 = {
"metadata": {
"annotations": {
"last-major-upgrade-failure": "2024-01-02T15:04:05Z"
},
},
"spec": {
"postgresql": {
"version": "16"
},
},
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")
# change the version back to 15 and should remove failure annotation
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
fourth_annotations = get_annotations()
self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC) @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_persistent_volume_claim_retention_policy(self): def test_persistent_volume_claim_retention_policy(self):

View File

@ -1,12 +1,16 @@
package cluster package cluster
import ( import (
"context"
"encoding/json"
"fmt" "fmt"
"strings" "strings"
"github.com/zalando/postgres-operator/pkg/spec" "github.com/zalando/postgres-operator/pkg/spec"
"github.com/zalando/postgres-operator/pkg/util" "github.com/zalando/postgres-operator/pkg/util"
v1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
) )
// VersionMap Map of version numbers // VersionMap Map of version numbers
@ -18,6 +22,11 @@ var VersionMap = map[string]int{
"16": 160000, "16": 160000,
} }
const (
majorVersionUpgradeSuccessAnnotation = "last-major-upgrade-success"
majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
)
// IsBiggerPostgresVersion Compare two Postgres version numbers // IsBiggerPostgresVersion Compare two Postgres version numbers
func IsBiggerPostgresVersion(old string, new string) bool { func IsBiggerPostgresVersion(old string, new string) bool {
oldN := VersionMap[old] oldN := VersionMap[old]
@ -54,6 +63,47 @@ func (c *Cluster) isUpgradeAllowedForTeam(owningTeam string) bool {
return util.SliceContains(allowedTeams, owningTeam) return util.SliceContains(allowedTeams, owningTeam)
} }
func (c *Cluster) annotatePostgresResource(isSuccess bool) error {
annotations := make(map[string]string)
currentTime := metav1.Now().Format("2006-01-02T15:04:05Z")
if isSuccess {
annotations[majorVersionUpgradeSuccessAnnotation] = currentTime
} else {
annotations[majorVersionUpgradeFailureAnnotation] = currentTime
}
patchData, err := metaAnnotationsPatch(annotations)
if err != nil {
c.logger.Errorf("could not form patch for %s postgresql resource: %v", c.Name, err)
return err
}
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.MergePatchType, patchData, metav1.PatchOptions{})
if err != nil {
c.logger.Errorf("failed to patch annotations to postgresql resource: %v", err)
return err
}
return nil
}
func (c *Cluster) removeFailuresAnnotation() error {
annotationToRemove := []map[string]string{
{
"op": "remove",
"path": fmt.Sprintf("/metadata/annotations/%s", majorVersionUpgradeFailureAnnotation),
},
}
removePatch, err := json.Marshal(annotationToRemove)
if err != nil {
c.logger.Errorf("could not form removal patch for %s postgresql resource: %v", c.Name, err)
return err
}
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.JSONPatchType, removePatch, metav1.PatchOptions{})
if err != nil {
c.logger.Errorf("failed to remove annotations from postgresql resource: %v", err)
return err
}
return nil
}
/* /*
Execute upgrade when mode is set to manual or full or when the owning team is allowed for upgrade (and mode is "off"). Execute upgrade when mode is set to manual or full or when the owning team is allowed for upgrade (and mode is "off").
@ -69,10 +119,19 @@ func (c *Cluster) majorVersionUpgrade() error {
desiredVersion := c.GetDesiredMajorVersionAsInt() desiredVersion := c.GetDesiredMajorVersionAsInt()
if c.currentMajorVersion >= desiredVersion { if c.currentMajorVersion >= desiredVersion {
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { // if failure annotation exists, remove it
c.removeFailuresAnnotation()
c.logger.Infof("removing failure annotation as the cluster is already up to date")
}
c.logger.Infof("cluster version up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion) c.logger.Infof("cluster version up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion)
return nil return nil
} }
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists {
c.logger.Infof("last major upgrade failed, skipping upgrade")
return nil
}
if !isInMainternanceWindow(c.Spec.MaintenanceWindows) { if !isInMainternanceWindow(c.Spec.MaintenanceWindows) {
c.logger.Infof("skipping major version upgrade, not in maintenance window") c.logger.Infof("skipping major version upgrade, not in maintenance window")
return nil return nil
@ -107,6 +166,7 @@ func (c *Cluster) majorVersionUpgrade() error {
return nil return nil
} }
isUpgradeSuccess := true
numberOfPods := len(pods) numberOfPods := len(pods)
if allRunning && masterPod != nil { if allRunning && masterPod != nil {
c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion) c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion)
@ -132,11 +192,14 @@ func (c *Cluster) majorVersionUpgrade() error {
result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand) result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand)
} }
if err != nil { if err != nil {
isUpgradeSuccess = false
c.annotatePostgresResource(isUpgradeSuccess)
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, err) c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, err)
return err return err
} }
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
c.annotatePostgresResource(isUpgradeSuccess)
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion) c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion)
} }
} }