package cluster

import (
	"context"
	"fmt"
	"strconv"
	"strings"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/zalando/postgres-operator/pkg/spec"
	"github.com/zalando/postgres-operator/pkg/util/constants"
	"github.com/zalando/postgres-operator/pkg/util/filesystems"
	"github.com/zalando/postgres-operator/pkg/util/k8sutil"
	"github.com/zalando/postgres-operator/pkg/util/volumes"
)

func (c *Cluster) syncVolumes() error {
	c.logger.Debugf("syncing volumes using %q storage resize mode", c.OpConfig.StorageResizeMode)
	var err error

	// check the quantity string once, and do not bother with it anywhere else
	_, err = resource.ParseQuantity(c.Spec.Volume.Size)
	if err != nil {
		return fmt.Errorf("could not parse volume size from the manifest: %v", err)
	}

	if c.OpConfig.StorageResizeMode == "mixed" {
		// mixed mode uses the AWS API to adjust size, throughput, and iops, and triggers a PVC change for the file system resize;
		// in case of errors we proceed to let K8s do its work, favoring the disk space increase over other adjustments
		err = c.populateVolumeMetaData()
		if err != nil {
			c.logger.Errorf("populating EBS meta data failed, skipping potential adjustments: %v", err)
		} else {
			err = c.syncUnderlyingEBSVolume()
			if err != nil {
				c.logger.Errorf("errors occurred during EBS volume adjustments: %v", err)
			}
		}
	}

	if err = c.syncVolumeClaims(); err != nil {
		err = fmt.Errorf("could not sync persistent volume claims: %v", err)
		return err
	}

	if c.OpConfig.StorageResizeMode == "ebs" {
		// potentially enlarge volumes before changing the statefulset. By doing that
		// in this order we make sure the operator is not stuck waiting for a pod that
		// cannot start because it ran out of disk space.
		// TODO: handle the case of the cluster that is downsized and enlarged again
		// (there will be a volume from the old pod for which we can't act before the
		// statefulset modification is concluded)
		if err = c.syncEbsVolumes(); err != nil {
			err = fmt.Errorf("could not sync persistent volumes: %v", err)
			return err
		}
	}

	return nil
}

func (c *Cluster) syncUnderlyingEBSVolume() error {
	c.logger.Debug("starting to sync EBS volumes: type, iops, throughput, and size")

	var (
		err     error
		newSize resource.Quantity
	)

	targetValue := c.Spec.Volume
	if newSize, err = resource.ParseQuantity(targetValue.Size); err != nil {
		return fmt.Errorf("could not parse volume size: %v", err)
	}
	targetSize := quantityToGigabyte(newSize)

	awsGp3 := aws.String("gp3")
	awsIo2 := aws.String("io2")

	errors := make([]string, 0)
	for _, volume := range c.EBSVolumes {
		var modifyIops *int64
		var modifyThroughput *int64
		var modifySize *int64
		var modifyType *string

		// 3000 iops and 125 MB/s throughput are the gp3 baseline; lower values need no modification
		if targetValue.Iops != nil && *targetValue.Iops >= int64(3000) {
			if volume.Iops != *targetValue.Iops {
				modifyIops = targetValue.Iops
			}
		}

		if targetValue.Throughput != nil && *targetValue.Throughput >= int64(125) {
			if volume.Throughput != *targetValue.Throughput {
				modifyThroughput = targetValue.Throughput
			}
		}

		if targetSize > volume.Size {
			modifySize = &targetSize
		}

		if modifyIops != nil || modifyThroughput != nil || modifySize != nil {
			if modifyIops != nil || modifyThroughput != nil {
				// we default to gp3 if iops and throughput are configured
				modifyType = awsGp3
				if targetValue.VolumeType == "io2" {
					modifyType = awsIo2
				}
			} else if targetValue.VolumeType == "gp3" && volume.VolumeType != "gp3" {
				modifyType = awsGp3
			} else {
				// do not touch the type
				modifyType = nil
			}

			err = c.VolumeResizer.ModifyVolume(volume.VolumeID, modifyType, modifySize, modifyIops, modifyThroughput)
			if err != nil {
				errors = append(errors, fmt.Sprintf("modify failed: %v, showing current EBS values: volume-id=%s size=%d iops=%d throughput=%d",
					err, volume.VolumeID, volume.Size, volume.Iops, volume.Throughput))
			}
		}
	}

	if len(errors) > 0 {
		for _, s := range errors {
			c.logger.Warningf("%s", s)
		}
	}

	return nil
}

func (c *Cluster) populateVolumeMetaData() error {
	c.logger.Debug("starting to read EBS meta data")

	pvs, err := c.listPersistentVolumes()
	if err != nil {
		return fmt.Errorf("could not list persistent volumes: %v", err)
	}
	if len(pvs) == 0 {
		c.EBSVolumes = make(map[string]volumes.VolumeProperties)
		return fmt.Errorf("no persistent volumes found")
	}
	c.logger.Debugf("found %d persistent volumes, number of known EBS volumes: %d", len(pvs), len(c.EBSVolumes))

	volumeIds := []string{}
	var volumeID string
	for _, pv := range pvs {
		volumeID, err = c.VolumeResizer.GetProviderVolumeID(pv)
		if err != nil {
			continue
		}
		volumeIds = append(volumeIds, volumeID)
	}

	currentVolumes, err := c.VolumeResizer.DescribeVolumes(volumeIds)
	if err != nil {
		return err
	}

	if len(currentVolumes) != len(c.EBSVolumes) && len(c.EBSVolumes) > 0 {
		c.logger.Infof("number of discovered EBS volumes (%d) differs from the already known volumes (%d)", len(currentVolumes), len(c.EBSVolumes))
	}

	// reset the map, the operator is not responsible for dangling EBS volumes
	c.EBSVolumes = make(map[string]volumes.VolumeProperties)
	for _, volume := range currentVolumes {
		c.EBSVolumes[volume.VolumeID] = volume
	}

	return nil
}

// syncVolumeClaims reads all persistent volume claims and checks that their size matches the one declared in the statefulset.
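// When storage_resize_mode is "off" or "ebs" the size comparison is skipped and only
// annotations are kept in sync; shrinking a claim is never attempted.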
func (c *Cluster) syncVolumeClaims() error {
	c.setProcessName("syncing volume claims")

	ignoreResize := false
	if c.OpConfig.StorageResizeMode == "off" || c.OpConfig.StorageResizeMode == "ebs" {
		ignoreResize = true
		c.logger.Debugf("Storage resize mode is set to %q. Skipping volume size sync of persistent volume claims.", c.OpConfig.StorageResizeMode)
	}

	newSize, err := resource.ParseQuantity(c.Spec.Volume.Size)
	if err != nil {
		return fmt.Errorf("could not parse volume size from the manifest: %v", err)
	}
	manifestSize := quantityToGigabyte(newSize)

	pvcs, err := c.listPersistentVolumeClaims()
	if err != nil {
		return fmt.Errorf("could not list persistent volume claims: %v", err)
	}
	for _, pvc := range pvcs {
		pvc := pvc // copy the loop variable; before Go 1.22, storing &pvc would otherwise alias a single iteration variable
		c.VolumeClaims[pvc.UID] = &pvc
		needsUpdate := false
		currentSize := quantityToGigabyte(pvc.Spec.Resources.Requests[v1.ResourceStorage])
		if !ignoreResize && currentSize != manifestSize {
			if currentSize < manifestSize {
				pvc.Spec.Resources.Requests[v1.ResourceStorage] = newSize
				needsUpdate = true
				c.logger.Infof("persistent volume claim for volume %q needs to be resized", pvc.Name)
			} else {
				c.logger.Warningf("cannot shrink persistent volume")
			}
		}

		if needsUpdate {
			c.logger.Infof("updating persistent volume claim definition for volume %q", pvc.Name)
			updatedPvc, err := c.KubeClient.PersistentVolumeClaims(pvc.Namespace).Update(context.TODO(), &pvc, metav1.UpdateOptions{})
			if err != nil {
				return fmt.Errorf("could not update persistent volume claim: %v", err)
			}
			c.VolumeClaims[pvc.UID] = updatedPvc
			c.logger.Infof("successfully updated persistent volume claim %q", pvc.Name)
		} else {
			c.logger.Debugf("volume claim for volume %q does not require updates", pvc.Name)
		}

		newAnnotations := c.annotationsSet(nil)
		if changed, _ := c.compareAnnotations(pvc.Annotations, newAnnotations, nil); changed {
			patchData, err := metaAnnotationsPatch(newAnnotations)
			if err != nil {
				return fmt.Errorf("could not form patch for the persistent volume claim for volume %q: %v", pvc.Name, err)
			}
			patchedPvc, err := c.KubeClient.PersistentVolumeClaims(pvc.Namespace).Patch(context.TODO(), pvc.Name, types.MergePatchType, []byte(patchData), metav1.PatchOptions{})
			if err != nil {
				return fmt.Errorf("could not patch annotations of the persistent volume claim for volume %q: %v", pvc.Name, err)
			}
			c.VolumeClaims[pvc.UID] = patchedPvc
		}
	}

	c.logger.Debug("volume claims have been synced successfully")

	return nil
}

// syncEbsVolumes reads all persistent volumes and checks that their size matches the one declared in the statefulset.
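// If any volume size differs from the manifest, resizeVolumes is called to grow the
// EBS volumes and the filesystems on them; shrinking is not supported.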
func (c *Cluster) syncEbsVolumes() error {
	c.setProcessName("syncing EBS volumes")

	act, err := c.volumesNeedResizing()
	if err != nil {
		return fmt.Errorf("could not compare size of the volumes: %v", err)
	}
	if !act {
		return nil
	}

	if err := c.resizeVolumes(); err != nil {
		return fmt.Errorf("could not sync volumes: %v", err)
	}

	c.logger.Debug("volumes have been synced successfully")

	return nil
}

func (c *Cluster) listPersistentVolumeClaims() ([]v1.PersistentVolumeClaim, error) {
	ns := c.Namespace
	listOptions := metav1.ListOptions{
		LabelSelector: c.labelsSet(false).String(),
	}

	pvcs, err := c.KubeClient.PersistentVolumeClaims(ns).List(context.TODO(), listOptions)
	if err != nil {
		return nil, fmt.Errorf("could not list persistent volume claims: %v", err)
	}
	return pvcs.Items, nil
}

func (c *Cluster) deletePersistentVolumeClaims() error {
	c.setProcessName("deleting persistent volume claims")
	errors := make([]string, 0)

	for uid := range c.VolumeClaims {
		err := c.deletePersistentVolumeClaim(uid)
		if err != nil {
			errors = append(errors, fmt.Sprintf("%v", err))
		}
	}

	if len(errors) > 0 {
		c.logger.Warningf("could not delete all persistent volume claims: %v", strings.Join(errors, `', '`))
	}

	return nil
}

func (c *Cluster) deletePersistentVolumeClaim(uid types.UID) error {
	c.setProcessName("deleting persistent volume claim")

	pvc := c.VolumeClaims[uid]
	c.logger.Debugf("deleting persistent volume claim %q", pvc.Name)
	err := c.KubeClient.PersistentVolumeClaims(pvc.Namespace).Delete(context.TODO(), pvc.Name, c.deleteOptions)
	if k8sutil.ResourceNotFound(err) {
		c.logger.Debugf("persistent volume claim %q has already been deleted", pvc.Name)
	} else if err != nil {
		return fmt.Errorf("could not delete persistent volume claim %q: %v", pvc.Name, err)
	}
	c.logger.Infof("persistent volume claim %q has been deleted", pvc.Name)
	delete(c.VolumeClaims, uid)

	return nil
}

func (c *Cluster) listPersistentVolumes() ([]*v1.PersistentVolume, error) {
	result := make([]*v1.PersistentVolume, 0)

	pvcs, err := c.listPersistentVolumeClaims()
	if err != nil {
		return nil, fmt.Errorf("could not list cluster's persistent volume claims: %v", err)
	}

	pods, err := c.listPods()
	if err != nil {
		return nil, fmt.Errorf("could not get list of running pods for resizing persistent volumes: %v", err)
	}

	lastPodIndex := len(pods) - 1

	for _, pvc := range pvcs {
		// the claim name ends with the pod's ordinal; skip claims left over from pods
		// that no longer exist after a scale-down
		lastDash := strings.LastIndex(pvc.Name, "-")
		if lastDash > 0 && lastDash < len(pvc.Name)-1 {
			pvcNumber, err := strconv.Atoi(pvc.Name[lastDash+1:])
			if err != nil {
				return nil, fmt.Errorf("could not convert last part of the persistent volume claim name %q to a number", pvc.Name)
			}
			if pvcNumber > lastPodIndex {
				c.logger.Debugf("skipping persistent volume %q corresponding to a non-running pod", pvc.Name)
				continue
			}
		}
		pv, err := c.KubeClient.PersistentVolumes().Get(context.TODO(), pvc.Spec.VolumeName, metav1.GetOptions{})
		if err != nil {
			return nil, fmt.Errorf("could not get PersistentVolume: %v", err)
		}
		result = append(result, pv)
	}

	return result, nil
}

// resizeVolumes resizes persistent volumes compatible with the given resizer interface
func (c *Cluster) resizeVolumes() error {
	if c.VolumeResizer == nil {
		return fmt.Errorf("no volume resizer set for EBS volume handling")
	}

	c.setProcessName("resizing EBS volumes")

	newQuantity, err := resource.ParseQuantity(c.Spec.Volume.Size)
	if err != nil {
		return fmt.Errorf("could not parse volume size: %v", err)
	}

	newSize := quantityToGigabyte(newQuantity)
	resizer := c.VolumeResizer
	var totalIncompatible int

	pvs, err := c.listPersistentVolumes()
	if err != nil {
return fmt.Errorf("could not list persistent volumes: %v", err) } for _, pv := range pvs { volumeSize := quantityToGigabyte(pv.Spec.Capacity[v1.ResourceStorage]) if volumeSize >= newSize { if volumeSize > newSize { c.logger.Warningf("cannot shrink persistent volume") } continue } compatible := false if !resizer.VolumeBelongsToProvider(pv) { continue } compatible = true if !resizer.IsConnectedToProvider() { err := resizer.ConnectToProvider() if err != nil { return fmt.Errorf("could not connect to the volume provider: %v", err) } defer func() { if err := resizer.DisconnectFromProvider(); err != nil { c.logger.Errorf("%v", err) } }() } awsVolumeID, err := resizer.GetProviderVolumeID(pv) if err != nil { return err } c.logger.Infof("updating persistent volume %q to %d", pv.Name, newSize) if err := resizer.ResizeVolume(awsVolumeID, newSize); err != nil { return fmt.Errorf("could not resize EBS volume %q: %v", awsVolumeID, err) } c.logger.Infof("resizing the filesystem on the volume %q", pv.Name) podName := getPodNameFromPersistentVolume(pv) if err := c.resizePostgresFilesystem(podName, []filesystems.FilesystemResizer{&filesystems.Ext234Resize{}}); err != nil { return fmt.Errorf("could not resize the filesystem on pod %q: %v", podName, err) } c.logger.Infof("filesystem resize successful on volume %q", pv.Name) pv.Spec.Capacity[v1.ResourceStorage] = newQuantity c.logger.Infof("updating persistent volume definition for volume %q", pv.Name) if _, err := c.KubeClient.PersistentVolumes().Update(context.TODO(), pv, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("could not update persistent volume: %q", err) } c.logger.Infof("successfully updated persistent volume %q", pv.Name) if !compatible { c.logger.Warningf("volume %q is incompatible with all available resizing providers, consider switching storage_resize_mode to pvc or off", pv.Name) totalIncompatible++ } } if totalIncompatible > 0 { return fmt.Errorf("could not resize EBS volumes: some persistent volumes are not compatible with existing resizing providers") } return nil } func (c *Cluster) volumesNeedResizing() (bool, error) { newQuantity, _ := resource.ParseQuantity(c.Spec.Volume.Size) newSize := quantityToGigabyte(newQuantity) vols, err := c.listPersistentVolumes() if err != nil { return false, err } for _, pv := range vols { currentSize := quantityToGigabyte(pv.Spec.Capacity[v1.ResourceStorage]) if currentSize != newSize { return true, nil } } return false, nil } // getPodNameFromPersistentVolume returns a pod name that it extracts from the volume claim ref. 
func getPodNameFromPersistentVolume(pv *v1.PersistentVolume) *spec.NamespacedName {
	namespace := pv.Spec.ClaimRef.Namespace
	name := pv.Spec.ClaimRef.Name[len(constants.DataVolumeName)+1:]

	return &spec.NamespacedName{Namespace: namespace, Name: name}
}

// quantityToGigabyte converts a resource quantity to whole gigabytes (as defined by
// constants.Gigabyte), truncating any remainder.
func quantityToGigabyte(q resource.Quantity) int64 {
	return q.ScaledValue(0) / (1 * constants.Gigabyte)
}

func (c *Cluster) executeEBSMigration() error {
	pvs, err := c.listPersistentVolumes()
	if err != nil {
		return fmt.Errorf("could not list persistent volumes: %v", err)
	}
	if len(pvs) == 0 {
		c.logger.Warningf("no persistent volumes found - skipping EBS migration")
		return nil
	}
	c.logger.Debugf("found %d volumes, number of known EBS volumes: %d", len(pvs), len(c.EBSVolumes))

	if len(pvs) == len(c.EBSVolumes) {
		hasGp2 := false
		for _, v := range c.EBSVolumes {
			if v.VolumeType == "gp2" {
				hasGp2 = true
			}
		}

		if !hasGp2 {
			c.logger.Debugf("no EBS gp2 volumes left to migrate")
			return nil
		}
	}

	// gp3 baseline values applied during the migration
	var i3000 int64 = 3000
	var i125 int64 = 125

	for _, volume := range c.EBSVolumes {
		if volume.VolumeType == "gp2" && volume.Size < c.OpConfig.EnableEBSGp3MigrationMaxSize {
			c.logger.Infof("migrating EBS volume %s (%d) to gp3", volume.VolumeID, volume.Size)
			err = c.VolumeResizer.ModifyVolume(volume.VolumeID, aws.String("gp3"), &volume.Size, &i3000, &i125)
			if err != nil {
				c.logger.Warningf("modifying volume %s failed: %v", volume.VolumeID, err)
			}
		} else {
			c.logger.Debugf("skipping gp3 migration for EBS volume %s (%d)", volume.VolumeID, volume.Size)
		}
		c.EBSVolumes[volume.VolumeID] = volume
	}

	return nil
}
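// Illustrative note (a sketch, not part of the operator): the conversion done by
// quantityToGigabyte truncates sub-gigabyte remainders, so manifest sizes should be
// whole gigabytes. Assuming constants.Gigabyte equals 1Gi (2^30 bytes):
//
//	q, _ := resource.ParseQuantity("500Gi")
//	_ = quantityToGigabyte(q) // 500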