291 lines
10 KiB
Go
291 lines
10 KiB
Go
package cluster
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/Masterminds/semver"
|
|
"github.com/zalando/postgres-operator/pkg/spec"
|
|
"github.com/zalando/postgres-operator/pkg/util"
|
|
v1 "k8s.io/api/core/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
)
|
|
|
|
// VersionMap translates a supported Postgres major version (as written in a
// manifest, e.g. "15") into the integer used for numeric version comparison
// (major version * 10000). Versions that are not listed yield 0 on lookup.
var VersionMap = map[string]int{
	"12": 120000,
	"13": 130000,
	"14": 140000,
	"15": 150000,
	"16": 160000,
	"17": 170000,
}
|
|
|
|
// Annotation keys set on the postgresql resource to record the outcome of a
// major version upgrade attempt. The value stored under each key is a
// timestamp.
const (
	// majorVersionUpgradeSuccessAnnotation marks the last successful upgrade.
	majorVersionUpgradeSuccessAnnotation = "last-major-upgrade-success"
	// majorVersionUpgradeFailureAnnotation marks the last failed upgrade; while
	// it is present, further upgrade attempts are skipped.
	majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
)
|
|
|
|
// IsBiggerPostgresVersion Compare two Postgres version numbers
|
|
func IsBiggerPostgresVersion(old string, new string) bool {
|
|
oldN := VersionMap[old]
|
|
newN := VersionMap[new]
|
|
return newN > oldN
|
|
}
|
|
|
|
// GetDesiredMajorVersionAsInt Convert string to comparable integer of PG version
|
|
func (c *Cluster) GetDesiredMajorVersionAsInt() int {
|
|
return VersionMap[c.GetDesiredMajorVersion()]
|
|
}
|
|
|
|
// GetDesiredMajorVersion returns major version to use, incl. potential auto upgrade
|
|
func (c *Cluster) GetDesiredMajorVersion() string {
|
|
|
|
if c.Config.OpConfig.MajorVersionUpgradeMode == "full" {
|
|
// e.g. current is 13, minimal is 13 allowing 13 to 17 clusters, everything below is upgraded
|
|
if IsBiggerPostgresVersion(c.Spec.PgVersion, c.Config.OpConfig.MinimalMajorVersion) {
|
|
c.logger.Infof("overwriting configured major version %s to %s", c.Spec.PgVersion, c.Config.OpConfig.TargetMajorVersion)
|
|
return c.Config.OpConfig.TargetMajorVersion
|
|
}
|
|
}
|
|
|
|
return c.Spec.PgVersion
|
|
}
|
|
|
|
func (c *Cluster) isUpgradeAllowedForTeam(owningTeam string) bool {
|
|
allowedTeams := c.OpConfig.MajorVersionUpgradeTeamAllowList
|
|
|
|
if len(allowedTeams) == 0 {
|
|
return false
|
|
}
|
|
|
|
return util.SliceContains(allowedTeams, owningTeam)
|
|
}
|
|
|
|
func (c *Cluster) annotatePostgresResource(isSuccess bool) error {
|
|
annotations := make(map[string]string)
|
|
currentTime := metav1.Now().Format("2006-01-02T15:04:05Z")
|
|
if isSuccess {
|
|
annotations[majorVersionUpgradeSuccessAnnotation] = currentTime
|
|
} else {
|
|
annotations[majorVersionUpgradeFailureAnnotation] = currentTime
|
|
}
|
|
patchData, err := metaAnnotationsPatch(annotations)
|
|
if err != nil {
|
|
c.logger.Errorf("could not form patch for %s postgresql resource: %v", c.Name, err)
|
|
return err
|
|
}
|
|
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.MergePatchType, patchData, metav1.PatchOptions{})
|
|
if err != nil {
|
|
c.logger.Errorf("failed to patch annotations to postgresql resource: %v", err)
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *Cluster) removeFailuresAnnotation() error {
|
|
annotationToRemove := []map[string]string{
|
|
{
|
|
"op": "remove",
|
|
"path": fmt.Sprintf("/metadata/annotations/%s", majorVersionUpgradeFailureAnnotation),
|
|
},
|
|
}
|
|
removePatch, err := json.Marshal(annotationToRemove)
|
|
if err != nil {
|
|
c.logger.Errorf("could not form removal patch for %s postgresql resource: %v", c.Name, err)
|
|
return err
|
|
}
|
|
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.JSONPatchType, removePatch, metav1.PatchOptions{})
|
|
if err != nil {
|
|
c.logger.Errorf("failed to remove annotations from postgresql resource: %v", err)
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *Cluster) criticalOperationLabel(pods []v1.Pod, value *string) error {
|
|
metadataReq := map[string]map[string]map[string]*string{"metadata": {"labels": {"critical-operation": value}}}
|
|
|
|
patchReq, err := json.Marshal(metadataReq)
|
|
if err != nil {
|
|
return fmt.Errorf("could not marshal ObjectMeta: %v", err)
|
|
}
|
|
for _, pod := range pods {
|
|
_, err = c.KubeClient.Pods(c.Namespace).Patch(context.TODO(), pod.Name, types.StrategicMergePatchType, patchReq, metav1.PatchOptions{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
/*
|
|
Execute upgrade when mode is set to manual or full or when the owning team is allowed for upgrade (and mode is "off").
|
|
|
|
Manual upgrade means, it is triggered by the user via manifest version change
|
|
Full upgrade means, operator also determines the minimal version used accross all clusters and upgrades violators.
|
|
*/
|
|
func (c *Cluster) majorVersionUpgrade() error {
|
|
|
|
if c.OpConfig.MajorVersionUpgradeMode == "off" && !c.isUpgradeAllowedForTeam(c.Spec.TeamID) {
|
|
return nil
|
|
}
|
|
|
|
desiredVersion := c.GetDesiredMajorVersionAsInt()
|
|
|
|
if c.currentMajorVersion >= desiredVersion {
|
|
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { // if failure annotation exists, remove it
|
|
c.removeFailuresAnnotation()
|
|
c.logger.Infof("removing failure annotation as the cluster is already up to date")
|
|
}
|
|
c.logger.Infof("cluster version up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion)
|
|
return nil
|
|
}
|
|
|
|
pods, err := c.listPods()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
allRunning := true
|
|
isStandbyCluster := false
|
|
|
|
var masterPod *v1.Pod
|
|
|
|
for i, pod := range pods {
|
|
ps, _ := c.patroni.GetMemberData(&pod)
|
|
|
|
if ps.Role == "standby_leader" {
|
|
isStandbyCluster = true
|
|
c.currentMajorVersion = ps.ServerVersion
|
|
break
|
|
}
|
|
|
|
if ps.State != "running" {
|
|
allRunning = false
|
|
c.logger.Infof("identified non running pod, potentially skipping major version upgrade")
|
|
}
|
|
|
|
if ps.Role == "master" || ps.Role == "primary" {
|
|
masterPod = &pods[i]
|
|
c.currentMajorVersion = ps.ServerVersion
|
|
}
|
|
}
|
|
|
|
if masterPod == nil {
|
|
c.logger.Infof("no master in the cluster, skipping major version upgrade")
|
|
return nil
|
|
}
|
|
|
|
// Recheck version with newest data from Patroni
|
|
if c.currentMajorVersion >= desiredVersion {
|
|
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { // if failure annotation exists, remove it
|
|
c.removeFailuresAnnotation()
|
|
c.logger.Infof("removing failure annotation as the cluster is already up to date")
|
|
}
|
|
c.logger.Infof("recheck cluster version is already up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion)
|
|
return nil
|
|
} else if isStandbyCluster {
|
|
c.logger.Warnf("skipping major version upgrade for %s/%s standby cluster. Re-deploy standby cluster with the required Postgres version specified", c.Namespace, c.Name)
|
|
return nil
|
|
}
|
|
|
|
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists {
|
|
c.logger.Infof("last major upgrade failed, skipping upgrade")
|
|
return nil
|
|
}
|
|
|
|
if !isInMaintenanceWindow(c.Spec.MaintenanceWindows) {
|
|
c.logger.Infof("skipping major version upgrade, not in maintenance window")
|
|
return nil
|
|
}
|
|
|
|
members, err := c.patroni.GetClusterMembers(masterPod)
|
|
if err != nil {
|
|
c.logger.Error("could not get cluster members data from Patroni API, skipping major version upgrade")
|
|
return err
|
|
}
|
|
patroniData, err := c.patroni.GetMemberData(masterPod)
|
|
if err != nil {
|
|
c.logger.Error("could not get members data from Patroni API, skipping major version upgrade")
|
|
return err
|
|
}
|
|
patroniVer, err := semver.NewVersion(patroniData.Patroni.Version)
|
|
if err != nil {
|
|
c.logger.Error("error parsing Patroni version")
|
|
patroniVer, _ = semver.NewVersion("3.0.4")
|
|
}
|
|
verConstraint, _ := semver.NewConstraint(">= 3.0.4")
|
|
checkStreaming, _ := verConstraint.Validate(patroniVer)
|
|
|
|
for _, member := range members {
|
|
if PostgresRole(member.Role) == Leader {
|
|
continue
|
|
}
|
|
if checkStreaming && member.State != "streaming" {
|
|
c.logger.Infof("skipping major version upgrade, replica %s is not streaming from primary", member.Name)
|
|
return nil
|
|
}
|
|
if member.Lag > 16*1024*1024 {
|
|
c.logger.Infof("skipping major version upgrade, replication lag on member %s is too high", member.Name)
|
|
return nil
|
|
}
|
|
}
|
|
|
|
isUpgradeSuccess := true
|
|
numberOfPods := len(pods)
|
|
if allRunning && masterPod != nil {
|
|
c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion)
|
|
if c.currentMajorVersion < desiredVersion {
|
|
defer func() error {
|
|
if err = c.criticalOperationLabel(pods, nil); err != nil {
|
|
return fmt.Errorf("failed to remove critical-operation label: %s", err)
|
|
}
|
|
return nil
|
|
}()
|
|
val := "true"
|
|
if err = c.criticalOperationLabel(pods, &val); err != nil {
|
|
return fmt.Errorf("failed to assign critical-operation label: %s", err)
|
|
}
|
|
|
|
podName := &spec.NamespacedName{Namespace: masterPod.Namespace, Name: masterPod.Name}
|
|
c.logger.Infof("triggering major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods)
|
|
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "starting major version upgrade on pod %s of %d pods", masterPod.Name, numberOfPods)
|
|
upgradeCommand := fmt.Sprintf("set -o pipefail && /usr/bin/python3 /scripts/inplace_upgrade.py %d 2>&1 | tee last_upgrade.log", numberOfPods)
|
|
|
|
c.logger.Debug("checking if the spilo image runs with root or non-root (check for user id=0)")
|
|
resultIdCheck, errIdCheck := c.ExecCommand(podName, "/bin/bash", "-c", "/usr/bin/id -u")
|
|
if errIdCheck != nil {
|
|
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "checking user id to run upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, errIdCheck)
|
|
}
|
|
|
|
resultIdCheck = strings.TrimSuffix(resultIdCheck, "\n")
|
|
var result, scriptErrMsg string
|
|
if resultIdCheck != "0" {
|
|
c.logger.Infof("user id was identified as: %s, hence default user is non-root already", resultIdCheck)
|
|
result, err = c.ExecCommand(podName, "/bin/bash", "-c", upgradeCommand)
|
|
scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log")
|
|
} else {
|
|
c.logger.Infof("user id was identified as: %s, using su to reach the postgres user", resultIdCheck)
|
|
result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand)
|
|
scriptErrMsg, _ = c.ExecCommand(podName, "/bin/bash", "-c", "tail -n 1 last_upgrade.log")
|
|
}
|
|
if err != nil {
|
|
isUpgradeSuccess = false
|
|
c.annotatePostgresResource(isUpgradeSuccess)
|
|
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, scriptErrMsg)
|
|
return fmt.Errorf("%s", scriptErrMsg)
|
|
}
|
|
|
|
c.annotatePostgresResource(isUpgradeSuccess)
|
|
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
|
|
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|