initial implementation
This commit is contained in:
		
							parent
							
								
									51909204fd
								
							
						
					
					
						commit
						e9486f8325
					
				|  | @ -62,6 +62,8 @@ spec: | |||
|               type: string | ||||
|             enable_crd_validation: | ||||
|               type: boolean | ||||
|             enable_lazy_image_upgrade: | ||||
|               type: boolean | ||||
|             enable_shm_volume: | ||||
|               type: boolean | ||||
|             etcd_host: | ||||
|  |  | |||
|  | @ -429,6 +429,10 @@ from numerous escape characters in the latter log entry, view it in CLI with | |||
| `PodTemplate` used by the operator is yet to be updated with the default values | ||||
| used internally in K8s. | ||||
| 
 | ||||
| The operator also supports lazy updates of the Spilo image. That means the pod template of a  | ||||
| PG cluster's stateful set is updated immediately with the new image, but no rolling update follows. This feature saves you  | ||||
| some downtime when you know the pods will be restarted after the update anyway, for instance due to node rotation. | ||||
| 
 | ||||
| ## Logical backups | ||||
| 
 | ||||
| The operator can manage K8s cron jobs to run logical backups of Postgres | ||||
|  |  | |||
|  | @ -75,6 +75,10 @@ Those are top-level keys, containing both leaf keys and groups. | |||
|   [OpenAPI v3 schema validation](https://kubernetes.io/docs/tasks/access-kubernetes-api/custom-resources/custom-resource-definitions/#validation) | ||||
|   The default is `true`. | ||||
| 
 | ||||
| * **enable_lazy_image_upgrade** | ||||
|   Instructs the operator to update only the statefulset with the new image, without immediately performing a rolling update. The assumption is that the pods will be restarted later with the new image, for example due to node rotation. | ||||
|   The default is `false`. | ||||
| 
 | ||||
| * **etcd_host** | ||||
|   Etcd connection string for Patroni defined as `host:port`. Not required when | ||||
|   Patroni native Kubernetes support is used. The default is empty (use | ||||
|  |  | |||
|  | @ -346,6 +346,61 @@ class EndToEndTestCase(unittest.TestCase): | |||
|         } | ||||
|         k8s.update_config(unpatch_custom_service_annotations) | ||||
| 
 | ||||
|     @timeout_decorator.timeout(TEST_TIMEOUT_SEC) | ||||
|     def test_lazy_image_update(self): | ||||
|         ''' | ||||
|         Test lazy update for the Spilo image: the operator changes a stateful set but lets pods run with the old image  | ||||
|         until they are recreated for reasons other than the operator's activity. That works because the operator uses  | ||||
|         the "onDelete" pod update policy for stateful sets. | ||||
| 
 | ||||
|         The test covers: | ||||
|         1) enabling lazy upgrade in existing operator deployment | ||||
|         2) forcing the normal rolling upgrade by changing the operator configmap and restarting its pod | ||||
|         ''' | ||||
| 
 | ||||
|         k8s = self.k8s | ||||
|         pod0 = "acid-minimal-cluster-0" | ||||
|         pod1 = "acid-minimal-cluster-1" | ||||
| 
 | ||||
|         # enable lazy update | ||||
|         patch_lazy_image_upgrade = { | ||||
|             "data": { | ||||
|                 "enable_lazy_image_upgrade": "true", | ||||
|                 "docker_image": "registry.opensource.zalan.do/acid/spilo-cdp-12:1.6-p16" | ||||
|             } | ||||
|         } | ||||
|         k8s.update_config(patch_lazy_image_upgrade) | ||||
| 
 | ||||
|         # wait for sts update | ||||
|         time.sleep(60) | ||||
| 
 | ||||
|         # restart the pod to get a container with the new image  | ||||
|         k8s.api.core_v1.delete_namespaced_pod(pod0, "default") | ||||
|         time.sleep(60) | ||||
| 
 | ||||
|         # lazy update works if the restarted pod and older pods have different Spilo versions | ||||
|         # i.e. the update did not immediately affect all pods | ||||
|         new_image = k8s.get_effective_pod_image(pod0) | ||||
|         old_image = k8s.get_effective_pod_image(pod1) | ||||
|         self.assertNotEqual(old_image, new_image, "Lazy updated failed: pods have the same image {}".format(new_image)) | ||||
| 
 | ||||
|         # clean up | ||||
|         unpatch_lazy_image_upgrade = { | ||||
|             "data": { | ||||
|                 "enable_lazy_image_upgrade": "false", | ||||
|             } | ||||
|         } | ||||
|         k8s.update_config(unpatch_lazy_image_upgrade) | ||||
| 
 | ||||
|         # at this point the operator will complete the normal rolling update, | ||||
|         # so we additionally test that disabling the lazy update (forcing the normal rolling update) works | ||||
|         time.sleep(60) | ||||
| 
 | ||||
|         image0 = k8s.get_effective_pod_image(pod0) | ||||
|         image1 = k8s.get_effective_pod_image(pod1) | ||||
| 
 | ||||
|         self.assertEqual(image0, image1, "Disabling lazy updated failed: pods still have different images {} and {}".format(image0, image1)) | ||||
| 
 | ||||
|     def assert_master_is_unique(self, namespace='default', clusterName="acid-minimal-cluster"): | ||||
|         ''' | ||||
|            Check that there is a single pod in the k8s cluster with the label "spilo-role=master" | ||||
|  | @ -481,6 +536,14 @@ class K8s: | |||
|     def create_with_kubectl(self, path): | ||||
|         subprocess.run(["kubectl", "create", "-f", path]) | ||||
| 
 | ||||
|     def get_effective_pod_image(self, pod_name, namespace = 'default'): | ||||
|         ''' | ||||
|         Get the Spilo image the pod currently uses. In case of lazy rolling updates  | ||||
|         it may differ from the one specified in the stateful set. | ||||
|         ''' | ||||
|         pod = self.api.core_v1.list_namespaced_pod( | ||||
|             namespace, label_selector="statefulset.kubernetes.io/pod-name=" + pod_name) | ||||
|         return pod.items[0].spec.containers[0].image | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     unittest.main() | ||||
|  |  | |||
|  | @ -24,6 +24,7 @@ data: | |||
|   # enable_crd_validation: "true" | ||||
|   # enable_database_access: "true" | ||||
|   # enable_init_containers: "true" | ||||
|   # enable_lazy_image_upgrade: "true" | ||||
|   enable_master_load_balancer: "false" | ||||
|   # enable_pod_antiaffinity: "false" | ||||
|   # enable_pod_disruption_budget: "true" | ||||
|  |  | |||
|  | @ -38,6 +38,8 @@ spec: | |||
|               type: string | ||||
|             enable_crd_validation: | ||||
|               type: boolean | ||||
|             enable_lazy_image_upgrade: | ||||
|               type: boolean | ||||
|             enable_shm_volume: | ||||
|               type: boolean | ||||
|             etcd_host: | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ metadata: | |||
|   name: postgresql-operator-default-configuration | ||||
| configuration: | ||||
|   # enable_crd_validation: true | ||||
|   # enable_lazy_image_upgrade: true | ||||
|   etcd_host: "" | ||||
|   docker_image: registry.opensource.zalan.do/acid/spilo-12:1.6-p2 | ||||
|   # enable_shm_volume: true | ||||
|  |  | |||
|  | @ -409,6 +409,14 @@ func (c *Cluster) compareStatefulSetWith(statefulSet *appsv1.StatefulSet) *compa | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// lazy Spilo update: modify the image in the statefulset itself but let its pods run with the old image
 | ||||
| 	// until they are re-created for other reasons, for example node rotation
 | ||||
| 	if c.OpConfig.EnableLazyImageUpgrade && !reflect.DeepEqual(c.Statefulset.Spec.Template.Spec.Containers[0].Image, statefulSet.Spec.Template.Spec.Containers[0].Image) { | ||||
| 		needsReplace = true | ||||
| 		needsRollUpdate = false | ||||
| 		reasons = append(reasons, "lazy Spilo update: new statefulset's pod image doesn't match the current one") | ||||
| 	} | ||||
| 
 | ||||
| 	if needsRollUpdate || needsReplace { | ||||
| 		match = false | ||||
| 	} | ||||
|  | @ -440,8 +448,6 @@ func (c *Cluster) compareContainers(description string, setA, setB []v1.Containe | |||
| 	checks := []containerCheck{ | ||||
| 		newCheck("new statefulset %s's %s (index %d) name doesn't match the current one", | ||||
| 			func(a, b v1.Container) bool { return a.Name != b.Name }), | ||||
| 		newCheck("new statefulset %s's %s (index %d) image doesn't match the current one", | ||||
| 			func(a, b v1.Container) bool { return a.Image != b.Image }), | ||||
| 		newCheck("new statefulset %s's %s (index %d) ports don't match the current one", | ||||
| 			func(a, b v1.Container) bool { return !reflect.DeepEqual(a.Ports, b.Ports) }), | ||||
| 		newCheck("new statefulset %s's %s (index %d) resources don't match the current ones", | ||||
|  | @ -452,6 +458,11 @@ func (c *Cluster) compareContainers(description string, setA, setB []v1.Containe | |||
| 			func(a, b v1.Container) bool { return !reflect.DeepEqual(a.EnvFrom, b.EnvFrom) }), | ||||
| 	} | ||||
| 
 | ||||
| 	if !c.OpConfig.EnableLazyImageUpgrade { | ||||
| 		checks = append(checks, newCheck("new statefulset %s's %s (index %d) image doesn't match the current one", | ||||
| 			func(a, b v1.Container) bool { return a.Image != b.Image })) | ||||
| 	} | ||||
| 
 | ||||
| 	for index, containerA := range setA { | ||||
| 		containerB := setB[index] | ||||
| 		for _, check := range checks { | ||||
|  |  | |||
|  | @ -3,17 +3,17 @@ package cluster | |||
| import ( | ||||
| 	"fmt" | ||||
| 
 | ||||
| 	batchv1beta1 "k8s.io/api/batch/v1beta1" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	policybeta1 "k8s.io/api/policy/v1beta1" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| 
 | ||||
| 	acidv1 "github.com/zalando/postgres-operator/pkg/apis/acid.zalan.do/v1" | ||||
| 	"github.com/zalando/postgres-operator/pkg/spec" | ||||
| 	"github.com/zalando/postgres-operator/pkg/util" | ||||
| 	"github.com/zalando/postgres-operator/pkg/util/constants" | ||||
| 	"github.com/zalando/postgres-operator/pkg/util/k8sutil" | ||||
| 	"github.com/zalando/postgres-operator/pkg/util/volumes" | ||||
| 	appsv1 "k8s.io/api/apps/v1" | ||||
| 	batchv1beta1 "k8s.io/api/batch/v1beta1" | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	policybeta1 "k8s.io/api/policy/v1beta1" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| ) | ||||
| 
 | ||||
| // Sync syncs the cluster, making sure the actual Kubernetes objects correspond to what is defined in the manifest.
 | ||||
|  | @ -244,6 +244,32 @@ func (c *Cluster) syncPodDisruptionBudget(isUpdate bool) error { | |||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func (c *Cluster) mustUpdatePodsAfterLazyUpdate(desiredSset *appsv1.StatefulSet) (bool, error) { | ||||
| 
 | ||||
| 	if c.OpConfig.EnableLazyImageUpgrade { | ||||
| 		return false, nil | ||||
| 	} | ||||
| 
 | ||||
| 	pods, err := c.listPods() | ||||
| 	if err != nil { | ||||
| 		return false, fmt.Errorf("could not list pods of the statefulset: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	for _, pod := range pods { | ||||
| 
 | ||||
| 		effectivePodImage := pod.Spec.Containers[0].Image | ||||
| 		ssImage := desiredSset.Spec.Template.Spec.Containers[0].Image | ||||
| 
 | ||||
| 		if ssImage != effectivePodImage { | ||||
| 			c.logger.Infof("not all pods were re-started when the lazy upgrade was enabled; forcing the rolling upgrade now") | ||||
| 			return true, nil | ||||
| 		} | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	return false, nil | ||||
| } | ||||
| 
 | ||||
| func (c *Cluster) syncStatefulSet() error { | ||||
| 	var ( | ||||
| 		podsRollingUpdateRequired bool | ||||
|  | @ -310,6 +336,19 @@ func (c *Cluster) syncStatefulSet() error { | |||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		if !podsRollingUpdateRequired { | ||||
| 			// even if the desired and actual statefulsets match,
 | ||||
| 			// some pods may still be out of date when
 | ||||
| 			//  (a) the lazy update was just disabled
 | ||||
| 			// and
 | ||||
| 			//  (b) some of the pods were not restarted while the lazy update was still in place
 | ||||
| 			podsRollingUpdateRequired, err = c.mustUpdatePodsAfterLazyUpdate(desiredSS) | ||||
| 			if err != nil { | ||||
| 				return fmt.Errorf("could not list pods of the statefulset: %v", err) | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	// Apply special PostgreSQL parameters that can only be set via the Patroni API.
 | ||||
|  |  | |||
|  | @ -137,6 +137,7 @@ type Config struct { | |||
| 	ProtectedRoles            []string          `name:"protected_role_names" default:"admin"` | ||||
| 	PostgresSuperuserTeams    []string          `name:"postgres_superuser_teams" default:""` | ||||
| 	SetMemoryRequestToLimit   bool              `name:"set_memory_request_to_limit" default:"false"` | ||||
| 	EnableLazyImageUpgrade    bool              `name:"enable_lazy_image_upgrade" default:"false"` | ||||
| } | ||||
| 
 | ||||
| // MustMarshal marshals the config or panics
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue