Implement per-cluster maintenance window for Postgres automatic upgrade (#2710)

* implement maintenance window for major version upgrade 
* e2e test: fix major version upgrade test and extend with the time window
* unit test: add iteration to test isInMaintenanceWindow
* UI: show the window and enable edit via UI
Ida Novindasari 2024-08-09 14:07:35 +02:00 committed by GitHub
parent ce15d10aa3
commit e6ae9e3772
10 changed files with 219 additions and 24 deletions

docs/reference/cluster_manifest.md

@@ -114,6 +114,12 @@ These parameters are grouped directly under the `spec` key in the manifest.
   this parameter. Optional, when empty the load balancer service becomes
   inaccessible from outside of the Kubernetes cluster.
 
+* **maintenanceWindows**
+  a list of time windows during which major version upgrades are permitted;
+  upgrades are restricted to these periods only. Accepted formats are
+  "01:00-06:00" for a daily maintenance window or "Sat:00:00-04:00" for a
+  specific weekday, with all times given in UTC.
+
 * **users**
   a map of usernames to user flags for the users that should be created in the
   cluster by the operator. User flags are a list, allowed elements are
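
The `maintenanceWindows` parameter documented above can be set on an existing cluster with an ordinary patch of the `postgresqls` custom resource, the same call the e2e test below uses. A minimal sketch with the official Python Kubernetes client, assuming a cluster named `acid-upgrade-test` in the `default` namespace:

from kubernetes import client, config

# Sketch: restrict major version upgrades to a daily window plus a
# Saturday-only window. Cluster and namespace names are illustrative.
config.load_kube_config()
patch = {
    "spec": {
        "maintenanceWindows": [
            "01:00-06:00",      # every day, 01:00-06:00 UTC
            "Sat:00:00-04:00",  # Saturdays only, 00:00-04:00 UTC
        ]
    }
}
client.CustomObjectsApi().patch_namespaced_custom_object(
    "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", patch)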

e2e/tests/k8s_api.py

@@ -218,7 +218,6 @@ class K8s:
         pod_phase = 'Failing over'
         new_pod_node = ''
         pods_with_update_flag = self.count_pods_with_rolling_update_flag(labels, namespace)
-
         while (pod_phase != 'Running') or (new_pod_node not in failover_targets):
             pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
             if pods:
@@ -525,7 +524,6 @@ class K8sBase:
         pod_phase = 'Failing over'
         new_pod_node = ''
         pods_with_update_flag = self.count_pods_with_rolling_update_flag(labels, namespace)
-
         while (pod_phase != 'Running') or (new_pod_node not in failover_targets):
             pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
             if pods:

e2e/tests/test_e2e.py

@@ -14,6 +14,7 @@ from kubernetes.client.rest import ApiException
 SPILO_CURRENT = "registry.opensource.zalan.do/acid/spilo-16-e2e:0.1"
 SPILO_LAZY = "registry.opensource.zalan.do/acid/spilo-16-e2e:0.2"
+SPILO_FULL_IMAGE = "ghcr.io/zalando/spilo-16:3.2-p3"
 
 
 def to_selector(labels):
@@ -115,6 +116,7 @@ class EndToEndTestCase(unittest.TestCase):
             configmap = yaml.safe_load(f)
             configmap["data"]["workers"] = "1"
             configmap["data"]["docker_image"] = SPILO_CURRENT
+            configmap["data"]["major_version_upgrade_mode"] = "full"
 
         with open("manifests/configmap.yaml", 'w') as f:
             yaml.dump(configmap, f, Dumper=yaml.Dumper)
@@ -1181,31 +1183,94 @@
         self.eventuallyEqual(lambda: len(k8s.get_patroni_running_members("acid-minimal-cluster-0")), 2, "Postgres status did not enter running")
 
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
-    @unittest.skip("Skipping this test until fixed")
     def test_major_version_upgrade(self):
-        k8s = self.k8s
-        result = k8s.create_with_kubectl("manifests/minimal-postgres-manifest-12.yaml")
-        self.eventuallyEqual(lambda: k8s.count_running_pods(labels="application=spilo,cluster-name=acid-upgrade-test"), 2, "No 2 pods running")
-        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
-
-        pg_patch_version = {
-            "spec": {
-                "postgres": {
-                    "version": "14"
-                }
-            }
-        }
-        k8s.api.custom_objects_api.patch_namespaced_custom_object(
-            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version)
-        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
-
-        def check_version_14():
-            p = k8s.get_patroni_state("acid-upgrade-test-0")
-            version = p["server_version"][0:2]
-            return version
-
-        self.eventuallyEqual(check_version_14, "14", "Version was not upgrade to 14")
+        """
+        Test major version upgrade
+        """
+        def check_version():
+            p = k8s.patroni_rest("acid-upgrade-test-0", "")
+            version = p.get("server_version", 0) // 10000
+            return version
+
+        k8s = self.k8s
+        cluster_label = 'application=spilo,cluster-name=acid-upgrade-test'
+
+        with open("manifests/minimal-postgres-manifest-12.yaml", 'r+') as f:
+            upgrade_manifest = yaml.safe_load(f)
+            upgrade_manifest["spec"]["dockerImage"] = SPILO_FULL_IMAGE
+
+        with open("manifests/minimal-postgres-manifest-12.yaml", 'w') as f:
+            yaml.dump(upgrade_manifest, f, Dumper=yaml.Dumper)
+
+        k8s.create_with_kubectl("manifests/minimal-postgres-manifest-12.yaml")
+        self.eventuallyEqual(lambda: k8s.count_running_pods(labels=cluster_label), 2, "No 2 pods running")
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+        self.eventuallyEqual(check_version, 12, "Version is not correct")
+
+        master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label)
+        # should upgrade immediately
+        pg_patch_version_14 = {
+            "spec": {
+                "postgresql": {
+                    "version": "14"
+                }
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+        # should have finished failover
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14")
+
+        # should not upgrade because current time is not in maintenanceWindow
+        current_time = datetime.now()
+        maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
+        pg_patch_version_15 = {
+            "spec": {
+                "postgresql": {
+                    "version": "15"
+                },
+                "maintenanceWindows": [
+                    maintenance_window_future
+                ]
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+        # should have finished failover
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 14, "Version should not be upgraded")
+
+        # change the version again to trigger operator sync
+        maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
+        pg_patch_version_16 = {
+            "spec": {
+                "postgresql": {
+                    "version": "16"
+                },
+                "maintenanceWindows": [
+                    maintenance_window_current
+                ]
+            }
+        }
+        k8s.api.custom_objects_api.patch_namespaced_custom_object(
+            "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
+        self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
+
+        # should have finished failover
+        k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
+        k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
+        self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")
 
     @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
     def test_persistent_volume_claim_retention_policy(self):
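
The maintenance-window strings in the test above are computed relative to the current time, so the version-15 patch always misses its window and the version-16 patch always hits it. A standalone illustration of the two f-strings, assuming the test runs at exactly 12:00:

from datetime import datetime, timedelta

now = datetime(2024, 8, 9, 12, 0)  # assume "now" is 12:00 for illustration
future = f"{(now + timedelta(minutes=60)).strftime('%H:%M')}-{(now + timedelta(minutes=120)).strftime('%H:%M')}"
current = f"{(now - timedelta(minutes=30)).strftime('%H:%M')}-{(now + timedelta(minutes=30)).strftime('%H:%M')}"
print(future)   # "13:00-14:00" -- excludes now, so the upgrade is deferred
print(current)  # "11:30-12:30" -- includes now, so the upgrade proceeds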

pkg/cluster/majorversionupgrade.go

@@ -74,6 +74,11 @@ func (c *Cluster) majorVersionUpgrade() error {
         return nil
     }
 
+    if !c.isInMaintenanceWindow() {
+        c.logger.Infof("skipping major version upgrade, not in maintenance window")
+        return nil
+    }
+
     pods, err := c.listPods()
     if err != nil {
         return err

pkg/cluster/util.go

@@ -662,3 +662,24 @@ func parseResourceRequirements(resourcesRequirement v1.ResourceRequirements) (acidv1.Resources, error) {
     }
     return resources, nil
 }
+
+func (c *Cluster) isInMaintenanceWindow() bool {
+    if c.Spec.MaintenanceWindows == nil {
+        return true
+    }
+    now := time.Now()
+    currentDay := now.Weekday()
+    currentTime := now.Format("15:04")
+
+    for _, window := range c.Spec.MaintenanceWindows {
+        startTime := window.StartTime.Format("15:04")
+        endTime := window.EndTime.Format("15:04")
+
+        if window.Everyday || window.Weekday == currentDay {
+            if currentTime >= startTime && currentTime <= endTime {
+                return true
+            }
+        }
+    }
+    return false
+}
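
The check above compares zero-padded "15:04"-formatted strings, so lexicographic order coincides with chronological order within a single day; note that a window crossing midnight (e.g. "22:00-02:00") would never match, since its start string sorts after its end string. A minimal Python sketch of the same rule, with hypothetical window tuples standing in for the CRD's MaintenanceWindow type:

from datetime import datetime

# Hypothetical stand-in for the operator's MaintenanceWindow type:
# (everyday, weekday, start, end) with "HH:MM" strings in UTC.
# Python's weekday() uses Monday=0 (Go's time.Weekday uses Sunday=0).
def is_in_maintenance_window(windows, now=None):
    if not windows:
        return True  # no windows configured: upgrades always allowed
    now = now or datetime.utcnow()
    current_day = now.weekday()
    current_time = now.strftime("%H:%M")
    for everyday, weekday, start, end in windows:
        # Zero-padded "HH:MM" strings compare lexicographically in
        # chronological order, so plain string comparison suffices.
        if (everyday or weekday == current_day) and start <= current_time <= end:
            return True
    return False

# Example: a daily window plus a Monday-only window (weekday 0).
windows = [(True, None, "01:00", "06:00"), (False, 0, "09:00", "10:00")]
print(is_in_maintenance_window(windows, datetime(2024, 8, 5, 9, 30)))  # True (Monday)
print(is_in_maintenance_window(windows, datetime(2024, 8, 6, 9, 30)))  # False (Tuesday)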

pkg/cluster/util_test.go

@@ -27,6 +27,15 @@ import (
 
 var externalAnnotations = map[string]string{"existing": "annotation"}
 
+func mustParseTime(s string) metav1.Time {
+    v, err := time.Parse("15:04", s)
+    if err != nil {
+        panic(err)
+    }
+
+    return metav1.Time{Time: v.UTC()}
+}
+
 func newFakeK8sAnnotationsClient() (k8sutil.KubernetesClient, *k8sFake.Clientset) {
     clientSet := k8sFake.NewSimpleClientset()
     acidClientSet := fakeacidv1.NewSimpleClientset()
@@ -521,3 +530,83 @@ func Test_trimCronjobName(t *testing.T) {
         })
     }
 }
+
+func TestIsInMaintenanceWindow(t *testing.T) {
+    client, _ := newFakeK8sStreamClient()
+
+    var cluster = New(
+        Config{
+            OpConfig: config.Config{
+                PodManagementPolicy: "ordered_ready",
+                Resources: config.Resources{
+                    ClusterLabels:        map[string]string{"application": "spilo"},
+                    ClusterNameLabel:     "cluster-name",
+                    DefaultCPURequest:    "300m",
+                    DefaultCPULimit:      "300m",
+                    DefaultMemoryRequest: "300Mi",
+                    DefaultMemoryLimit:   "300Mi",
+                    PodRoleLabel:         "spilo-role",
+                },
+            },
+        }, client, pg, logger, eventRecorder)
+
+    now := time.Now()
+    futureTimeStart := now.Add(1 * time.Hour)
+    futureTimeStartFormatted := futureTimeStart.Format("15:04")
+    futureTimeEnd := now.Add(2 * time.Hour)
+    futureTimeEndFormatted := futureTimeEnd.Format("15:04")
+
+    tests := []struct {
+        name     string
+        windows  []acidv1.MaintenanceWindow
+        expected bool
+    }{
+        {
+            name:     "no maintenance windows",
+            windows:  nil,
+            expected: true,
+        },
+        {
+            name: "maintenance windows with everyday",
+            windows: []acidv1.MaintenanceWindow{
+                {
+                    Everyday:  true,
+                    StartTime: mustParseTime("00:00"),
+                    EndTime:   mustParseTime("23:59"),
+                },
+            },
+            expected: true,
+        },
+        {
+            name: "maintenance windows with weekday",
+            windows: []acidv1.MaintenanceWindow{
+                {
+                    Weekday:   now.Weekday(),
+                    StartTime: mustParseTime("00:00"),
+                    EndTime:   mustParseTime("23:59"),
+                },
+            },
+            expected: true,
+        },
+        {
+            name: "maintenance windows with future interval time",
+            windows: []acidv1.MaintenanceWindow{
+                {
+                    Weekday:   now.Weekday(),
+                    StartTime: mustParseTime(futureTimeStartFormatted),
+                    EndTime:   mustParseTime(futureTimeEndFormatted),
+                },
+            },
+            expected: false,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            cluster.Spec.MaintenanceWindows = tt.windows
+            if cluster.isInMaintenanceWindow() != tt.expected {
+                t.Errorf("Expected isInMaintenanceWindow to return %t", tt.expected)
+            }
+        })
+    }
+}

pkg/controller/postgresql.go

@@ -384,10 +384,6 @@ func (c *Controller) warnOnDeprecatedPostgreSQLSpecParameters(spec *acidv1.PostgresSpec)
         c.logger.Warningf("parameter %q is deprecated. Consider setting %q instead", deprecated, replacement)
     }
 
-    noeffect := func(param string, explanation string) {
-        c.logger.Warningf("parameter %q takes no effect. %s", param, explanation)
-    }
-
     if spec.UseLoadBalancer != nil {
         deprecate("useLoadBalancer", "enableMasterLoadBalancer")
     }
@@ -395,10 +391,6 @@ func (c *Controller) warnOnDeprecatedPostgreSQLSpecParameters(spec *acidv1.PostgresSpec)
         deprecate("replicaLoadBalancer", "enableReplicaLoadBalancer")
     }
 
-    if len(spec.MaintenanceWindows) > 0 {
-        noeffect("maintenanceWindows", "Not implemented.")
-    }
-
     if (spec.UseLoadBalancer != nil || spec.ReplicaLoadBalancer != nil) &&
         (spec.EnableReplicaLoadBalancer != nil || spec.EnableMasterLoadBalancer != nil) {
         c.logger.Warnf("both old and new load balancer parameters are present in the manifest, ignoring old ones")

ui/app/src/edit.tag.pug

@@ -142,6 +142,7 @@ edit
     o.spec.enableReplicaConnectionPooler = i.spec.enableReplicaConnectionPooler || false
     o.spec.enableMasterPoolerLoadBalancer = i.spec.enableMasterPoolerLoadBalancer || false
     o.spec.enableReplicaPoolerLoadBalancer = i.spec.enableReplicaPoolerLoadBalancer || false
+    o.spec.maintenanceWindows = i.spec.maintenanceWindows || []
 
     o.spec.volume = {
       size: i.spec.volume.size,

ui/app/src/new.tag.pug

@@ -594,6 +594,12 @@ new
           {{#if enableReplicaPoolerLoadBalancer}}
           enableReplicaPoolerLoadBalancer: true
           {{/if}}
+          {{#if maintenanceWindows}}
+          maintenanceWindows:
+          {{#each maintenanceWindows}}
+            - "{{ this }}"
+          {{/each}}
+          {{/if}}
           volume:
             size: "{{ volumeSize }}Gi"{{#if volumeStorageClass}}
             storageClass: "{{ volumeStorageClass }}"{{/if}}{{#if iops}}
@@ -651,6 +657,7 @@ new
       enableReplicaConnectionPooler: this.enableReplicaConnectionPooler,
       enableMasterPoolerLoadBalancer: this.enableMasterPoolerLoadBalancer,
       enableReplicaPoolerLoadBalancer: this.enableReplicaPoolerLoadBalancer,
+      maintenanceWindows: this.maintenanceWindows,
      volumeSize: this.volumeSize,
       volumeStorageClass: this.volumeStorageClass,
       iops: this.iops,
@@ -727,6 +734,10 @@ new
       this.enableReplicaPoolerLoadBalancer = !this.enableReplicaPoolerLoadBalancer
     }
 
+    this.maintenanceWindows = e => {
+      this.maintenanceWindows = e.target.value
+    }
+
     this.volumeChange = e => {
       this.volumeSize = +e.target.value
     }
@@ -1042,6 +1053,7 @@ new
     this.enableReplicaConnectionPooler = false
     this.enableMasterPoolerLoadBalancer = false
     this.enableReplicaPoolerLoadBalancer = false
+    this.maintenanceWindows = {}
     this.postgresqlVersion = this.postgresqlVersion = (
       this.config.postgresql_versions[0]

ui/operator_ui/main.py

@@ -465,6 +465,7 @@ def get_postgresqls():
             'status': status,
             'num_elb': spec.get('enableMasterLoadBalancer', 0) + spec.get('enableReplicaLoadBalancer', 0) + \
                 spec.get('enableMasterPoolerLoadBalancer', 0) + spec.get('enableReplicaPoolerLoadBalancer', 0),
+            'maintenance_windows': spec.get('maintenanceWindows', []),
         }
         for cluster in these(
             read_postgresqls(
@@ -566,6 +567,11 @@ def update_postgresql(namespace: str, cluster: str):
             return fail('allowedSourceRanges invalid')
         spec['allowedSourceRanges'] = postgresql['spec']['allowedSourceRanges']
 
+    if 'maintenanceWindows' in postgresql['spec']:
+        if not isinstance(postgresql['spec']['maintenanceWindows'], list):
+            return fail('maintenanceWindows invalid')
+        spec['maintenanceWindows'] = postgresql['spec']['maintenanceWindows']
+
     if 'numberOfInstances' in postgresql['spec']:
         if not isinstance(postgresql['spec']['numberOfInstances'], int):
             return fail('numberOfInstances invalid')
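
The UI endpoint above only checks that maintenanceWindows is a list; the individual window strings are parsed by the operator itself. A hedged sketch of a stricter client-side pre-flight check against the two documented formats — the regex and helper name are illustrative, not part of this PR:

import re

# Illustrative check for the documented window formats:
# "HH:MM-HH:MM" (daily) or "Ddd:HH:MM-HH:MM" (specific weekday), all UTC.
WINDOW_RE = re.compile(
    r'^((Mon|Tue|Wed|Thu|Fri|Sat|Sun):)?'
    r'([01]\d|2[0-3]):[0-5]\d-([01]\d|2[0-3]):[0-5]\d$'
)

def validate_maintenance_windows(windows):
    if not isinstance(windows, list):
        return False
    return all(isinstance(w, str) and WINDOW_RE.match(w) for w in windows)

assert validate_maintenance_windows(["01:00-06:00", "Sat:00:00-04:00"])
assert not validate_maintenance_windows(["Saturday 1am-4am"])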