Implement per-cluster maintenance window for Postgres automatic upgrade (#2710)

* implement maintenance windows for major version upgrades
* e2e test: fix the major version upgrade test and extend it to cover the maintenance window
* unit test: add table-driven cases for isInMaintenanceWindow
* UI: show the maintenance windows and allow editing them via the UI
Ida Novindasari 2024-08-09 14:07:35 +02:00 committed by GitHub
parent ce15d10aa3
commit e6ae9e3772
10 changed files with 219 additions and 24 deletions


@@ -114,6 +114,12 @@ These parameters are grouped directly under the `spec` key in the manifest.
this parameter. Optional, when empty the load balancer service becomes
inaccessible from outside of the Kubernetes cluster.
* **maintenanceWindows**
a list of time windows during which automatic major version upgrades are
permitted; when set, upgrades are restricted to these periods only. Accepted
formats are "01:00-06:00" for a daily window or "Sat:00:00-04:00" for a
specific weekday, with all times in UTC (see the parsing sketch below).
* **users**
a map of usernames to user flags for the users that should be created in the
cluster by the operator. User flags are a list, allowed elements are

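To make the two accepted window formats concrete, here is a minimal Go sketch of how such strings could be parsed. It is illustrative only and not the operator's actual parser; the `parseMaintenanceWindow` helper and `window` struct are hypothetical names.

```go
// Illustrative sketch only (not the operator's parser): interpreting the two
// accepted maintenance window formats, "01:00-06:00" and "Sat:00:00-04:00".
package main

import (
	"fmt"
	"strings"
	"time"
)

// window holds a parsed maintenance window; Everyday is true for the
// "01:00-06:00" form, otherwise Weekday names the day from "Sat:00:00-04:00".
type window struct {
	Everyday   bool
	Weekday    time.Weekday
	Start, End time.Time
}

var weekdays = map[string]time.Weekday{
	"Sun": time.Sunday, "Mon": time.Monday, "Tue": time.Tuesday, "Wed": time.Wednesday,
	"Thu": time.Thursday, "Fri": time.Friday, "Sat": time.Saturday,
}

func parseMaintenanceWindow(s string) (window, error) {
	w := window{Everyday: true}
	parts := strings.Split(s, ":")
	// "Sat:00:00-04:00" starts with a weekday token, "01:00-06:00" does not.
	if day, ok := weekdays[parts[0]]; ok {
		w.Everyday = false
		w.Weekday = day
		s = strings.TrimPrefix(s, parts[0]+":")
	}
	fromTo := strings.Split(s, "-")
	if len(fromTo) != 2 {
		return w, fmt.Errorf("expected HH:MM-HH:MM, got %q", s)
	}
	var err error
	if w.Start, err = time.Parse("15:04", fromTo[0]); err != nil {
		return w, err
	}
	if w.End, err = time.Parse("15:04", fromTo[1]); err != nil {
		return w, err
	}
	return w, nil
}

func main() {
	for _, s := range []string{"01:00-06:00", "Sat:00:00-04:00"} {
		w, err := parseMaintenanceWindow(s)
		fmt.Println(s, "->", w, err)
	}
}
```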

@@ -218,7 +218,6 @@ class K8s:
pod_phase = 'Failing over'
new_pod_node = ''
pods_with_update_flag = self.count_pods_with_rolling_update_flag(labels, namespace)
while (pod_phase != 'Running') or (new_pod_node not in failover_targets):
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
if pods:
@@ -525,7 +524,6 @@ class K8sBase:
pod_phase = 'Failing over'
new_pod_node = ''
pods_with_update_flag = self.count_pods_with_rolling_update_flag(labels, namespace)
while (pod_phase != 'Running') or (new_pod_node not in failover_targets):
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
if pods:


@@ -14,6 +14,7 @@ from kubernetes.client.rest import ApiException
SPILO_CURRENT = "registry.opensource.zalan.do/acid/spilo-16-e2e:0.1"
SPILO_LAZY = "registry.opensource.zalan.do/acid/spilo-16-e2e:0.2"
SPILO_FULL_IMAGE = "ghcr.io/zalando/spilo-16:3.2-p3"
def to_selector(labels):
@@ -115,6 +116,7 @@ class EndToEndTestCase(unittest.TestCase):
configmap = yaml.safe_load(f)
configmap["data"]["workers"] = "1"
configmap["data"]["docker_image"] = SPILO_CURRENT
configmap["data"]["major_version_upgrade_mode"] = "full"
with open("manifests/configmap.yaml", 'w') as f:
yaml.dump(configmap, f, Dumper=yaml.Dumper)
@@ -1181,31 +1183,94 @@ class EndToEndTestCase(unittest.TestCase):
self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
@unittest.skip("Skipping this test until fixed")
def test_major_version_upgrade(self):
k8s = self.k8s
result = k8s.create_with_kubectl("manifests/minimal-postgres-manifest-12.yaml")
self.eventuallyEqual(lambda: k8s.count_running_pods(labels="application=spilo,cluster-name=acid-upgrade-test"), 2, "No 2 pods running")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
"""
Test major version upgrade
"""
def check_version():
p = k8s.patroni_rest("acid-upgrade-test-0", "")
version = p.get("server_version", 0) // 10000
return version
pg_patch_version = {
k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-upgrade-test'
with open("manifests/minimal-postgres-manifest-12.yaml", 'r+') as f:
upgrade_manifest = yaml.safe_load(f)
upgrade_manifest["spec"]["dockerImage"] = SPILO_FULL_IMAGE
with open("manifests/minimal-postgres-manifest-12.yaml", 'w') as f:
yaml.dump(upgrade_manifest, f, Dumper=yaml.Dumper)
k8s.create_with_kubectl("manifests/minimal-postgres-manifest-12.yaml")
self.eventuallyEqual(lambda: k8s.count_running_pods(labels=cluster_label), 2, "No 2 pods running")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
self.eventuallyEqual(check_version, 12, "Version is not correct")
master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label)
# should upgrade immediately
pg_patch_version_14 = {
"spec": {
"postgres": {
"postgresql": {
"version": "14"
}
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version)
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
def check_version_14():
p = k8s.get_patroni_state("acid-upgrade-test-0")
version = p["server_version"][0:2]
return version
# should have finished failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14")
self.eventuallyEqual(check_version_14, "14", "Version was not upgraded to 14")
# should not upgrade because current time is not in maintenanceWindow
current_time = datetime.now()
maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
pg_patch_version_15 = {
"spec": {
"postgresql": {
"version": "15"
},
"maintenanceWindows": [
maintenance_window_future
]
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# should have finished failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 14, "Version should not be upgraded")
# change the version again to trigger operator sync
maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
pg_patch_version_16 = {
"spec": {
"postgresql": {
"version": "16"
},
"maintenanceWindows": [
maintenance_window_current
]
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# should have finished failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_persistent_volume_claim_retention_policy(self):


@@ -74,6 +74,11 @@ func (c *Cluster) majorVersionUpgrade() error {
return nil
}
if !c.isInMainternanceWindow() {
c.logger.Infof("skipping major version upgrade, not in maintenance window")
return nil
}
pods, err := c.listPods()
if err != nil {
return err


@@ -662,3 +662,24 @@ func parseResourceRequirements(resourcesRequirement v1.ResourceRequirements) (ac
}
return resources, nil
}
func (c *Cluster) isInMainternanceWindow() bool {
if c.Spec.MaintenanceWindows == nil {
return true
}
now := time.Now()
currentDay := now.Weekday()
currentTime := now.Format("15:04")
for _, window := range c.Spec.MaintenanceWindows {
startTime := window.StartTime.Format("15:04")
endTime := window.EndTime.Format("15:04")
if window.Everyday || window.Weekday == currentDay {
if currentTime >= startTime && currentTime <= endTime {
return true
}
}
}
return false
}
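The check above formats every timestamp with Go's "15:04" layout and compares the resulting strings. Zero-padded HH:MM strings sort lexicographically in clock order, so the comparison is correct for windows that do not cross midnight. A small standalone demo of that property (not part of this commit):

```go
// Standalone demo (not operator code): zero-padded "15:04" strings sort
// lexicographically in the same order as the clock, which is what the
// string comparison in isInMainternanceWindow relies on.
package main

import (
	"fmt"
	"time"
)

func main() {
	start := time.Date(0, 1, 1, 1, 0, 0, 0, time.UTC) // 01:00
	end := time.Date(0, 1, 1, 6, 0, 0, 0, time.UTC)   // 06:00
	now := time.Date(0, 1, 1, 5, 30, 0, 0, time.UTC)  // 05:30

	s, e, n := start.Format("15:04"), end.Format("15:04"), now.Format("15:04")
	fmt.Println(n >= s && n <= e) // true: "05:30" lies between "01:00" and "06:00"

	// The comparison is inclusive on both ends and assumes the window does not
	// wrap past midnight (e.g. "22:00-02:00" would never match).
	late := time.Date(0, 1, 1, 23, 0, 0, 0, time.UTC).Format("15:04")
	fmt.Println(late >= s && late <= e) // false: 23:00 is outside 01:00-06:00
}
```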


@@ -27,6 +27,15 @@ import (
var externalAnnotations = map[string]string{"existing": "annotation"}
func mustParseTime(s string) metav1.Time {
v, err := time.Parse("15:04", s)
if err != nil {
panic(err)
}
return metav1.Time{Time: v.UTC()}
}
func newFakeK8sAnnotationsClient() (k8sutil.KubernetesClient, *k8sFake.Clientset) {
clientSet := k8sFake.NewSimpleClientset()
acidClientSet := fakeacidv1.NewSimpleClientset()
@@ -521,3 +530,83 @@ func Test_trimCronjobName(t *testing.T) {
})
}
}
func TestIsInMaintenanceWindow(t *testing.T) {
client, _ := newFakeK8sStreamClient()
var cluster = New(
Config{
OpConfig: config.Config{
PodManagementPolicy: "ordered_ready",
Resources: config.Resources{
ClusterLabels: map[string]string{"application": "spilo"},
ClusterNameLabel: "cluster-name",
DefaultCPURequest: "300m",
DefaultCPULimit: "300m",
DefaultMemoryRequest: "300Mi",
DefaultMemoryLimit: "300Mi",
PodRoleLabel: "spilo-role",
},
},
}, client, pg, logger, eventRecorder)
now := time.Now()
futureTimeStart := now.Add(1 * time.Hour)
futureTimeStartFormatted := futureTimeStart.Format("15:04")
futureTimeEnd := now.Add(2 * time.Hour)
futureTimeEndFormatted := futureTimeEnd.Format("15:04")
tests := []struct {
name string
windows []acidv1.MaintenanceWindow
expected bool
}{
{
name: "no maintenance windows",
windows: nil,
expected: true,
},
{
name: "maintenance windows with everyday",
windows: []acidv1.MaintenanceWindow{
{
Everyday: true,
StartTime: mustParseTime("00:00"),
EndTime: mustParseTime("23:59"),
},
},
expected: true,
},
{
name: "maintenance windows with weekday",
windows: []acidv1.MaintenanceWindow{
{
Weekday: now.Weekday(),
StartTime: mustParseTime("00:00"),
EndTime: mustParseTime("23:59"),
},
},
expected: true,
},
{
name: "maintenance windows with future interval time",
windows: []acidv1.MaintenanceWindow{
{
Weekday: now.Weekday(),
StartTime: mustParseTime(futureTimeStartFormatted),
EndTime: mustParseTime(futureTimeEndFormatted),
},
},
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
cluster.Spec.MaintenanceWindows = tt.windows
if cluster.isInMainternanceWindow() != tt.expected {
t.Errorf("Expected isInMainternanceWindow to return %t", tt.expected)
}
})
}
}


@@ -384,10 +384,6 @@ func (c *Controller) warnOnDeprecatedPostgreSQLSpecParameters(spec *acidv1.Postg
c.logger.Warningf("parameter %q is deprecated. Consider setting %q instead", deprecated, replacement)
}
noeffect := func(param string, explanation string) {
c.logger.Warningf("parameter %q takes no effect. %s", param, explanation)
}
if spec.UseLoadBalancer != nil {
deprecate("useLoadBalancer", "enableMasterLoadBalancer")
}
@@ -395,10 +391,6 @@ func (c *Controller) warnOnDeprecatedPostgreSQLSpecParameters(spec *acidv1.Postg
deprecate("replicaLoadBalancer", "enableReplicaLoadBalancer")
}
if len(spec.MaintenanceWindows) > 0 {
noeffect("maintenanceWindows", "Not implemented.")
}
if (spec.UseLoadBalancer != nil || spec.ReplicaLoadBalancer != nil) &&
(spec.EnableReplicaLoadBalancer != nil || spec.EnableMasterLoadBalancer != nil) {
c.logger.Warnf("both old and new load balancer parameters are present in the manifest, ignoring old ones")


@@ -142,6 +142,7 @@ edit
o.spec.enableReplicaConnectionPooler = i.spec.enableReplicaConnectionPooler || false
o.spec.enableMasterPoolerLoadBalancer = i.spec.enableMasterPoolerLoadBalancer || false
o.spec.enableReplicaPoolerLoadBalancer = i.spec.enableReplicaPoolerLoadBalancer || false
o.spec.maintenanceWindows = i.spec.maintenanceWindows || []
o.spec.volume = {
size: i.spec.volume.size,


@@ -594,6 +594,12 @@ new
{{#if enableReplicaPoolerLoadBalancer}}
enableReplicaPoolerLoadBalancer: true
{{/if}}
{{#if maintenanceWindows}}
maintenanceWindows:
{{#each maintenanceWindows}}
- "{{ this }}"
{{/each}}
{{/if}}
volume:
size: "{{ volumeSize }}Gi"{{#if volumeStorageClass}}
storageClass: "{{ volumeStorageClass }}"{{/if}}{{#if iops}}
@@ -651,6 +657,7 @@ new
enableReplicaConnectionPooler: this.enableReplicaConnectionPooler,
enableMasterPoolerLoadBalancer: this.enableMasterPoolerLoadBalancer,
enableReplicaPoolerLoadBalancer: this.enableReplicaPoolerLoadBalancer,
maintenanceWindows: this.maintenanceWindows,
volumeSize: this.volumeSize,
volumeStorageClass: this.volumeStorageClass,
iops: this.iops,
@@ -727,6 +734,10 @@ new
this.enableReplicaPoolerLoadBalancer = !this.enableReplicaPoolerLoadBalancer
}
this.maintenanceWindows = e => {
this.maintenanceWindows = e.target.value
}
this.volumeChange = e => {
this.volumeSize = +e.target.value
}
@@ -1042,6 +1053,7 @@ new
this.enableReplicaConnectionPooler = false
this.enableMasterPoolerLoadBalancer = false
this.enableReplicaPoolerLoadBalancer = false
this.maintenanceWindows = {}
this.postgresqlVersion = this.postgresqlVersion = (
this.config.postgresql_versions[0]


@@ -465,6 +465,7 @@ def get_postgresqls():
'status': status,
'num_elb': spec.get('enableMasterLoadBalancer', 0) + spec.get('enableReplicaLoadBalancer', 0) + \
spec.get('enableMasterPoolerLoadBalancer', 0) + spec.get('enableReplicaPoolerLoadBalancer', 0),
'maintenance_windows': spec.get('maintenanceWindows', []),
}
for cluster in these(
read_postgresqls(
@@ -566,6 +567,11 @@ def update_postgresql(namespace: str, cluster: str):
return fail('allowedSourceRanges invalid')
spec['allowedSourceRanges'] = postgresql['spec']['allowedSourceRanges']
if 'maintenanceWindows' in postgresql['spec']:
if not isinstance(postgresql['spec']['maintenanceWindows'], list):
return fail('maintenanceWindows invalid')
spec['maintenanceWindows'] = postgresql['spec']['maintenanceWindows']
if 'numberOfInstances' in postgresql['spec']:
if not isinstance(postgresql['spec']['numberOfInstances'], int):
return fail('numberOfInstances invalid')