From bbf94324138580ec5d500614ef19907388bd9a15 Mon Sep 17 00:00:00 2001 From: DDD <58938832+dandeandean@users.noreply.github.com> Date: Fri, 17 Apr 2026 02:59:08 -0400 Subject: [PATCH 1/9] Wasm target updates (#3068) * Updates Needed for WASM Target * switch to regular (instead of local) build flags * update codegen to match other scripts --------- Co-authored-by: Felix Kunde --- Makefile | 3 +++ go.mod | 4 ++-- go.sum | 4 ++++ hack/update-codegen.sh | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 02c9c73f5..b96d71939 100644 --- a/Makefile +++ b/Makefile @@ -78,6 +78,9 @@ $(GENERATED_CRDS): $(GENERATED) local: ${SOURCES} $(GENERATED_CRDS) CGO_ENABLED=${CGO_ENABLED} go build -o build/${BINARY} $(LOCAL_BUILD_FLAGS) -ldflags "$(LDFLAGS)" $(SOURCES) +wasm: ${SOURCES} $(GENERATED_CRDS) + GOOS=wasip1 GOARCH=wasm CGO_ENABLED=${CGO_ENABLED} go build -o build/${BINARY}.wasm ${BUILD_FLAGS} -ldflags "$(LDFLAGS)" $(SOURCES) + linux: ${SOURCES} $(GENERATED_CRDS) GOOS=linux GOARCH=amd64 CGO_ENABLED=${CGO_ENABLED} go build -o build/linux/${BINARY} ${BUILD_FLAGS} -ldflags "$(LDFLAGS)" $(SOURCES) diff --git a/go.mod b/go.mod index a25723a44..e0e0b1956 100644 --- a/go.mod +++ b/go.mod @@ -6,11 +6,11 @@ require ( github.com/Masterminds/semver v1.5.0 github.com/aws/aws-sdk-go v1.55.8 github.com/golang/mock v1.6.0 - github.com/lib/pq v1.10.9 + github.com/lib/pq v1.11.2 github.com/motomux/pretty v0.0.0-20161209205251-b2aad2c9a95d github.com/pkg/errors v0.9.1 github.com/r3labs/diff v1.1.0 - github.com/sirupsen/logrus v1.9.3 + github.com/sirupsen/logrus v1.9.4 github.com/stretchr/testify v1.11.1 golang.org/x/crypto v0.45.0 gopkg.in/yaml.v2 v2.4.0 diff --git a/go.sum b/go.sum index 463b37211..5b70c6899 100644 --- a/go.sum +++ b/go.sum @@ -73,6 +73,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= +github.com/lib/pq v1.11.2 h1:x6gxUeu39V0BHZiugWe8LXZYZ+Utk7hSJGThs8sdzfs= +github.com/lib/pq v1.11.2/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= @@ -113,6 +115,8 @@ github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index 9d43bc512..1363c2786 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2017 The Kubernetes 
Authors. # From 085a1a91e6c7f062b848078888d3f93b63914926 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:56:32 +0200 Subject: [PATCH 2/9] Bump werkzeug from 3.1.5 to 3.1.6 in /ui (#3076) Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.1.5 to 3.1.6. - [Release notes](https://github.com/pallets/werkzeug/releases) - [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/werkzeug/compare/3.1.5...3.1.6) --- updated-dependencies: - dependency-name: werkzeug dependency-version: 3.1.6 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- ui/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/requirements.txt b/ui/requirements.txt index 2e43ccb0e..ace18641d 100644 --- a/ui/requirements.txt +++ b/ui/requirements.txt @@ -11,4 +11,4 @@ kubernetes==11.0.0 python-json-logger==2.0.7 requests==2.32.4 stups-tokens>=1.1.19 -werkzeug==3.1.5 +werkzeug==3.1.6 From 0ba2147d733ed4c6cf4bc5d85232d1bddf0720d6 Mon Sep 17 00:00:00 2001 From: Zadkiel AHARONIAN Date: Thu, 23 Apr 2026 17:47:12 +0200 Subject: [PATCH 3/9] fix(logical-backup): wait for PG connectivity before running backup (#3069) * fix(logical-backup): wait for PG connectivity before running backup The backup script connects to the target PostgreSQL pod immediately after resolving its IP via the Kubernetes API. When NetworkPolicy is enforced via iptables, a newly-created pod's IP may not yet be present in the destination node's ingress allow lists, causing cross-node connections to be rejected until the next policy sync. This adds a pg_isready retry loop before the dump starts, with configurable retries and delay via LOGICAL_BACKUP_CONNECT_RETRIES (default: 10) and LOGICAL_BACKUP_CONNECT_RETRY_DELAY (default: 2s). Signed-off-by: Zadkiel AHARONIAN * docs: document LOGICAL_BACKUP_CONNECT_RETRIES and RETRY_DELAY env vars Document the new environment variables that control the pg_isready retry loop added in the previous commit. These are passed via the existing logical_backup_cronjob_environment_secret mechanism. Signed-off-by: Zadkiel AHARONIAN --------- Signed-off-by: Zadkiel AHARONIAN Co-authored-by: Ida Novindasari --- docs/reference/operator_parameters.md | 13 +++++++++++++ logical-backup/dump.sh | 21 +++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 6dd775069..83f693acc 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -900,6 +900,19 @@ grouped under the `logical_backup` key. * **logical_backup_cronjob_environment_secret** Reference to a Kubernetes secret, which keys will be added as environment variables to the cronjob. Default: "" +The following environment variables can be passed to the logical backup +cronjob via `logical_backup_cronjob_environment_secret` to control +connectivity checks before the backup starts: + +* **LOGICAL_BACKUP_CONNECT_RETRIES** + Number of times to retry connecting to the target PostgreSQL pod before + giving up. This is useful when NetworkPolicy enforcement introduces a + short delay before a newly-created pod's IP is allowed through ingress + rules on the destination node. Default: "10" + +* **LOGICAL_BACKUP_CONNECT_RETRY_DELAY** + Delay in seconds between connectivity retries. 
Default: "2" + ## Debugging the operator Options to aid debugging of the operator itself. Grouped under the `debug` key. diff --git a/logical-backup/dump.sh b/logical-backup/dump.sh index a250670a6..7833de399 100755 --- a/logical-backup/dump.sh +++ b/logical-backup/dump.sh @@ -183,6 +183,25 @@ function get_master_pod { get_pods "labelSelector=${CLUSTER_NAME_LABEL}%3D${SCOPE},spilo-role%3Dmaster" | tee | head -n 1 } +# Wait for TCP connectivity to the target PostgreSQL pod. +# When NetworkPolicy is enforced via iptables, a newly-created pod's IP may not +# yet be present in the destination node's ingress allow lists, causing +# cross-node connections to be rejected until the next policy sync. +function wait_for_pg { + local retries=${LOGICAL_BACKUP_CONNECT_RETRIES:-10} + local delay=${LOGICAL_BACKUP_CONNECT_RETRY_DELAY:-2} + local i + for (( i=1; i<=retries; i++ )); do + if "$PG_BIN"/pg_isready -h "$PGHOST" -p "${PGPORT:-5432}" -q 2>/dev/null; then + return 0 + fi + echo "waiting for $PGHOST:${PGPORT:-5432} to become reachable (attempt $i/$retries)..." + sleep "$delay" + done + echo "ERROR: $PGHOST:${PGPORT:-5432} not reachable after $((retries * delay))s" + return 1 +} + CURRENT_NODENAME=$(get_current_pod | jq .items[].spec.nodeName --raw-output) export CURRENT_NODENAME @@ -197,6 +216,8 @@ for search in "${search_strategy[@]}"; do done +wait_for_pg + set -x if [ "$LOGICAL_BACKUP_PROVIDER" == "az" ]; then dump | compress > /tmp/azure-backup.sql.gz From 030c24f64e0504f6050245879fec7ad13d5282d8 Mon Sep 17 00:00:00 2001 From: Sai Asish Y Date: Thu, 23 Apr 2026 08:47:51 -0700 Subject: [PATCH 4/9] ui: honor AWS_ENDPOINT in read_basebackups S3 list/get (#3079) read_stored_clusters and read_versions build their S3 clients with endpoint_url=AWS_ENDPOINT, but read_basebackups used a bare client('s3') for both the list_objects_v2 paginator and the per-key get_object call. On MinIO / S3-compatible backends the list+get requests go to the default AWS endpoint, so the Backups tab renders cluster/version prefixes (picked up by the correctly-configured read_stored_clusters) but then returns empty base backup details (silently no hits against the real backend) (#3078). Build s3_client once per call with endpoint_url=AWS_ENDPOINT and reuse it for both the paginator and get_object. No behaviour change when AWS_ENDPOINT is unset; boto3 defaults to the AWS endpoint either way. Fixes #3078 Signed-off-by: SAY-5 Co-authored-by: SAY-5 Co-authored-by: Ida Novindasari --- ui/operator_ui/spiloutils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ui/operator_ui/spiloutils.py b/ui/operator_ui/spiloutils.py index 6a2f03bb2..8d2b73967 100644 --- a/ui/operator_ui/spiloutils.py +++ b/ui/operator_ui/spiloutils.py @@ -321,11 +321,18 @@ def read_basebackups( suffix = '' if uid == 'base' else '/' + uid backups = [] + # Reuse a single S3 client configured with AWS_ENDPOINT so MinIO / + # other S3-compatible backends are hit for list+get calls too. The + # previous plain client('s3') fell back to the default AWS endpoint + # and returned empty data against a custom endpoint; read_stored_clusters + # and read_versions already pass endpoint_url=AWS_ENDPOINT (#3078). 
+ s3_client = client('s3', endpoint_url=AWS_ENDPOINT) + for vp in postgresql_versions: backup_prefix = f'{prefix}{pg_cluster}{suffix}/wal/{vp}/basebackups_005/' logger.info(f"{bucket}/{backup_prefix}") - paginator = client('s3').get_paginator('list_objects_v2') + paginator = s3_client.get_paginator('list_objects_v2') pages = paginator.paginate(Bucket=bucket, Prefix=backup_prefix) for page in pages: @@ -334,7 +341,7 @@ def read_basebackups( if not key.endswith("backup_stop_sentinel.json"): continue - response = client('s3').get_object(Bucket=bucket, Key=key) + response = s3_client.get_object(Bucket=bucket, Key=key) backup_info = loads(response["Body"].read().decode("utf-8")) last_modified = response["LastModified"].astimezone(timezone.utc).isoformat() From 27c969d14bda464c7a5e4000fc3a0cd35e6ab9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20M=C3=A5rtensson?= Date: Fri, 24 Apr 2026 11:06:30 +0200 Subject: [PATCH 5/9] Set securityContext for backup container (#2117) Co-authored-by: Felix Kunde --- pkg/cluster/k8sres.go | 45 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 724986dbc..7d51951ff 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -818,9 +818,6 @@ func (c *Cluster) generatePodTemplate( sidecarContainers []v1.Container, sharePgSocketWithSidecars *bool, tolerationsSpec *[]v1.Toleration, - spiloRunAsUser *int64, - spiloRunAsGroup *int64, - spiloFSGroup *int64, nodeAffinity *v1.Affinity, schedulerName *string, terminateGracePeriod int64, @@ -839,18 +836,22 @@ func (c *Cluster) generatePodTemplate( terminateGracePeriodSeconds := terminateGracePeriod containers := []v1.Container{*spiloContainer} containers = append(containers, sidecarContainers...) 
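The hunk below replaces the pass-through spiloRunAsUser/spiloRunAsGroup/spiloFSGroup parameters with lookups done directly against the operator config and the cluster spec. The precedence is unchanged: the operator configuration supplies the default, and a value set in the cluster manifest wins. A minimal sketch of that pointer-override pattern (hypothetical helper name, not part of the operator):

    // resolveID returns the manifest override when set, otherwise the
    // operator-config default; if both are nil, the corresponding
    // PodSecurityContext field is simply left unset.
    func resolveID(override, def *int64) *int64 {
        if override != nil {
            return override
        }
        return def
    }
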
- securityContext := v1.PodSecurityContext{} - - if spiloRunAsUser != nil { - securityContext.RunAsUser = spiloRunAsUser + securityContext := v1.PodSecurityContext{ + RunAsUser: c.OpConfig.Resources.SpiloRunAsUser, + RunAsGroup: c.OpConfig.Resources.SpiloRunAsGroup, + FSGroup: c.OpConfig.Resources.SpiloFSGroup, } - if spiloRunAsGroup != nil { - securityContext.RunAsGroup = spiloRunAsGroup + if c.Spec.SpiloRunAsUser != nil { + securityContext.RunAsUser = c.Spec.SpiloRunAsUser } - if spiloFSGroup != nil { - securityContext.FSGroup = spiloFSGroup + if c.Spec.SpiloRunAsGroup != nil { + securityContext.RunAsGroup = c.Spec.SpiloRunAsGroup + } + + if c.Spec.SpiloFSGroup != nil { + securityContext.FSGroup = c.Spec.SpiloFSGroup } podSpec := v1.PodSpec{ @@ -1352,22 +1353,6 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef // pickup the docker image for the spilo container effectiveDockerImage := util.Coalesce(spec.DockerImage, c.OpConfig.DockerImage) - // determine the User, Group and FSGroup for the spilo pod - effectiveRunAsUser := c.OpConfig.Resources.SpiloRunAsUser - if spec.SpiloRunAsUser != nil { - effectiveRunAsUser = spec.SpiloRunAsUser - } - - effectiveRunAsGroup := c.OpConfig.Resources.SpiloRunAsGroup - if spec.SpiloRunAsGroup != nil { - effectiveRunAsGroup = spec.SpiloRunAsGroup - } - - effectiveFSGroup := c.OpConfig.Resources.SpiloFSGroup - if spec.SpiloFSGroup != nil { - effectiveFSGroup = spec.SpiloFSGroup - } - volumeMounts := generateVolumeMounts(spec.Volume) // configure TLS with a custom secret volume @@ -1485,9 +1470,6 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef sidecarContainers, c.OpConfig.SharePgSocketWithSidecars, &tolerationSpec, - effectiveRunAsUser, - effectiveRunAsGroup, - effectiveFSGroup, c.nodeAffinity(c.OpConfig.NodeReadinessLabel, spec.NodeAffinity), spec.SchedulerName, int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), @@ -2379,9 +2361,6 @@ func (c *Cluster) generateLogicalBackupJob() (*batchv1.CronJob, error) { []v1.Container{}, util.False(), &tolerationsSpec, - nil, - nil, - nil, c.nodeAffinity(c.OpConfig.NodeReadinessLabel, nil), nil, int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), From 0ac28e3aad33bf0ad0c0ed26e799e712d623f3ce Mon Sep 17 00:00:00 2001 From: Polina Bungina <27892524+hughcapet@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:23:54 +0200 Subject: [PATCH 6/9] Do not set aws-load-balancer-connection-idle-timeout by default (#3054) Co-authored-by: Felix Kunde --- docs/administrator.md | 6 ++-- e2e/tests/test_e2e.py | 2 -- pkg/cluster/cluster_test.go | 46 ++++++++----------------------- pkg/cluster/connection_pooler.go | 4 --- pkg/cluster/k8sres.go | 5 ---- pkg/util/constants/annotations.go | 2 -- 6 files changed, 14 insertions(+), 51 deletions(-) diff --git a/docs/administrator.md b/docs/administrator.md index b7880b183..e854775ce 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -891,15 +891,13 @@ cluster manifest. In the case any of these variables are omitted from the manifest, the operator configuration settings `enable_master_load_balancer` and `enable_replica_load_balancer` apply. Note that the operator settings affect all Postgresql services running in all namespaces watched by the operator. 
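The next hunk trims the default load balancer annotations down to the external-dns hostname. For orientation, a sketch in Go of how operator-level and per-cluster service annotations combine, with the hostname pinned last (hypothetical function; the operator's actual logic lives in generateServiceAnnotations in pkg/cluster/k8sres.go and may differ in detail):

    // mergeServiceAnnotations applies operator annotations first, lets
    // per-cluster service annotations override them, and finally sets
    // the external-dns hostname, which is not customizable.
    func mergeServiceAnnotations(operatorAnn, serviceAnn map[string]string, dnsName string) map[string]string {
        merged := make(map[string]string, len(operatorAnn)+len(serviceAnn)+1)
        for k, v := range operatorAnn {
            merged[k] = v
        }
        for k, v := range serviceAnn {
            merged[k] = v
        }
        merged["external-dns.alpha.kubernetes.io/hostname"] = dnsName
        return merged
    }
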
-If load balancing is enabled two default annotations will be applied to its -services: +If load balancing is enabled the following default annotation will be applied to +its services: - `external-dns.alpha.kubernetes.io/hostname` with the value defined by the operator configs `master_dns_name_format` and `replica_dns_name_format`. This value can't be overwritten. If any changing in its value is needed, it MUST be done changing the DNS format operator config parameters; and -- `service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout` with - a default value of "3600". There are multiple options to specify service annotations that will be merged with each other and override in the following order (where latter take diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 70145f3e4..8cadb98a7 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -724,14 +724,12 @@ class EndToEndTestCase(unittest.TestCase): master_annotations = { "external-dns.alpha.kubernetes.io/hostname": "acid-minimal-cluster-pooler.default.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", } self.eventuallyTrue(lambda: k8s.check_service_annotations( master_pooler_label+","+pooler_label, master_annotations), "Wrong annotations") replica_annotations = { "external-dns.alpha.kubernetes.io/hostname": "acid-minimal-cluster-pooler-repl.default.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", } self.eventuallyTrue(lambda: k8s.check_service_annotations( replica_pooler_label+","+pooler_label, replica_annotations), "Wrong annotations") diff --git a/pkg/cluster/cluster_test.go b/pkg/cluster/cluster_test.go index c7181dbbc..8046943d4 100644 --- a/pkg/cluster/cluster_test.go +++ b/pkg/cluster/cluster_test.go @@ -680,8 +680,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: make(map[string]string), serviceAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", }, }, { @@ -702,8 +701,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: make(map[string]string), serviceAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", }, }, { @@ -714,8 +712,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: make(map[string]string), serviceAnnotations: map[string]string{"foo": "bar"}, expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", "foo": "bar", }, }, @@ -737,8 +734,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: map[string]string{"foo": "bar"}, serviceAnnotations: make(map[string]string), expect: map[string]string{ - 
"external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", "foo": "bar", }, }, @@ -780,8 +776,7 @@ func TestServiceAnnotations(t *testing.T) { "external-dns.alpha.kubernetes.io/hostname": "wrong.external-dns-name.example.com", }, expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", }, }, { @@ -792,8 +787,7 @@ func TestServiceAnnotations(t *testing.T) { serviceAnnotations: make(map[string]string), operatorAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg.test.db.example.com,test-stg.acid.db.example.com", }, }, { @@ -835,8 +829,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: make(map[string]string), serviceAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", }, }, { @@ -857,8 +850,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: make(map[string]string), serviceAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", }, }, { @@ -869,8 +861,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: make(map[string]string), serviceAnnotations: map[string]string{"foo": "bar"}, expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", "foo": "bar", }, }, @@ -892,8 +883,7 @@ func TestServiceAnnotations(t *testing.T) { operatorAnnotations: map[string]string{"foo": "bar"}, serviceAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", "foo": "bar", }, }, @@ -935,8 +925,7 @@ func TestServiceAnnotations(t *testing.T) { "external-dns.alpha.kubernetes.io/hostname": 
"wrong.external-dns-name.example.com", }, expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", }, }, { @@ -947,8 +936,7 @@ func TestServiceAnnotations(t *testing.T) { serviceAnnotations: make(map[string]string), operatorAnnotations: make(map[string]string), expect: map[string]string{ - "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", - "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600", + "external-dns.alpha.kubernetes.io/hostname": "acid-test-stg-repl.test.db.example.com,test-stg-repl.acid.db.example.com", }, }, { @@ -1377,7 +1365,6 @@ func TestCompareServices(t *testing.T) { serviceWithOwnerReference := newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeClusterIP, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1406,7 +1393,6 @@ func TestCompareServices(t *testing.T) { current: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeClusterIP, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1414,7 +1400,6 @@ func TestCompareServices(t *testing.T) { new: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeClusterIP, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1426,7 +1411,6 @@ func TestCompareServices(t *testing.T) { current: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeClusterIP, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1434,7 +1418,6 @@ func TestCompareServices(t *testing.T) { new: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeLoadBalancer, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1447,7 +1430,6 @@ func TestCompareServices(t *testing.T) { current: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeLoadBalancer, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1455,7 +1437,6 @@ func TestCompareServices(t *testing.T) { new: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeLoadBalancer, []string{"185.249.56.0/22"}, @@ -1468,7 +1449,6 @@ func TestCompareServices(t *testing.T) { current: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeLoadBalancer, []string{"128.141.0.0/16", "137.138.0.0/16"}, @@ -1476,7 +1456,6 @@ func TestCompareServices(t *testing.T) { new: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: 
"clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeLoadBalancer, []string{}, @@ -1489,7 +1468,6 @@ func TestCompareServices(t *testing.T) { current: newService( map[string]string{ constants.ZalandoDNSNameAnnotation: "clstr.acid.zalan.do", - constants.ElbTimeoutAnnotationName: constants.ElbTimeoutAnnotationValue, }, v1.ServiceTypeClusterIP, []string{"128.141.0.0/16", "137.138.0.0/16"}, diff --git a/pkg/cluster/connection_pooler.go b/pkg/cluster/connection_pooler.go index ac4ce67d8..e70eac56e 100644 --- a/pkg/cluster/connection_pooler.go +++ b/pkg/cluster/connection_pooler.go @@ -533,10 +533,6 @@ func (c *Cluster) generatePoolerServiceAnnotations(role PostgresRole, spec *acid annotations := c.getCustomServiceAnnotations(role, spec) if c.shouldCreateLoadBalancerForPoolerService(role, spec) { - // set ELB Timeout annotation with default value - if _, ok := annotations[constants.ElbTimeoutAnnotationName]; !ok { - annotations[constants.ElbTimeoutAnnotationName] = constants.ElbTimeoutAnnotationValue - } // -repl suffix will be added by replicaDNSName clusterNameWithPoolerSuffix := c.connectionPoolerName(Master) if role == Master { diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index 7d51951ff..2eb867f06 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -2029,11 +2029,6 @@ func (c *Cluster) generateServiceAnnotations(role PostgresRole, spec *acidv1.Pos if c.shouldCreateLoadBalancerForService(role, spec) { dnsName := c.dnsName(role) - // Just set ELB Timeout annotation with default value, if it does not - // have a custom value - if _, ok := annotations[constants.ElbTimeoutAnnotationName]; !ok { - annotations[constants.ElbTimeoutAnnotationName] = constants.ElbTimeoutAnnotationValue - } // External DNS name annotation is not customizable annotations[constants.ZalandoDNSNameAnnotation] = dnsName } diff --git a/pkg/util/constants/annotations.go b/pkg/util/constants/annotations.go index fc5a84fa5..0330ddcb8 100644 --- a/pkg/util/constants/annotations.go +++ b/pkg/util/constants/annotations.go @@ -3,8 +3,6 @@ package constants // Names and values in Kubernetes annotation for services, statefulsets and volumes const ( ZalandoDNSNameAnnotation = "external-dns.alpha.kubernetes.io/hostname" - ElbTimeoutAnnotationName = "service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout" - ElbTimeoutAnnotationValue = "3600" KubeIAmAnnotation = "iam.amazonaws.com/role" VolumeStorateProvisionerAnnotation = "pv.kubernetes.io/provisioned-by" PostgresqlControllerAnnotationKey = "acid.zalan.do/controller" From 688bbf1b9e4fd99edc637d2a0de9f7a507b3b1e6 Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Tue, 28 Apr 2026 10:17:28 +0200 Subject: [PATCH 7/9] update standby check in pooler code (#3088) --- pkg/cluster/connection_pooler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cluster/connection_pooler.go b/pkg/cluster/connection_pooler.go index e70eac56e..336ffd4d9 100644 --- a/pkg/cluster/connection_pooler.go +++ b/pkg/cluster/connection_pooler.go @@ -908,7 +908,7 @@ func (c *Cluster) syncConnectionPooler(oldSpec, newSpec *acidv1.Postgresql, Look // in this case also do not forget to install lookup function // skip installation in standby clusters, since they are read-only - if !c.ConnectionPooler[role].LookupFunction && c.Spec.StandbyCluster == nil { + if !c.ConnectionPooler[role].LookupFunction && !isStandbyCluster(&newSpec.Spec) { connectionPooler := c.Spec.ConnectionPooler specSchema := "" 
specUser := "" From 97f4de7cc04289284308ad94b3c1fbca721b5f99 Mon Sep 17 00:00:00 2001 From: annielzy <148128409+annielzy@users.noreply.github.com> Date: Tue, 28 Apr 2026 03:08:34 -0700 Subject: [PATCH 8/9] Fix rolling update deadlock when pods are stuck in non-running state (#3051) * add fix to recreate non running pods in syncStatefulsets * remove TestSyncStatefulSetNonRunningPodsDoNotBlockRecreatio * revert pod_test * pod without status --------- Co-authored-by: Felix Kunde Co-authored-by: Ida Novindasari --- pkg/cluster/pod.go | 36 ++++- pkg/cluster/pod_test.go | 300 ++++++++++++++++++++++++++++++++++++++++ pkg/cluster/sync.go | 16 ++- 3 files changed, 349 insertions(+), 3 deletions(-) diff --git a/pkg/cluster/pod.go b/pkg/cluster/pod.go index 959bacb54..6658ba414 100644 --- a/pkg/cluster/pod.go +++ b/pkg/cluster/pod.go @@ -376,6 +376,36 @@ func (c *Cluster) getPatroniMemberData(pod *v1.Pod) (patroni.MemberData, error) return memberData, nil } +// podIsNotRunning returns true if a pod is known to be in a non-running state, +// e.g. stuck in CreateContainerConfigError, CrashLoopBackOff, ImagePullBackOff, etc. +// Pods with no status information are not considered non-running, as they may +// simply not have reported status yet. +func podIsNotRunning(pod *v1.Pod) bool { + if pod.Status.Phase == "" { + // No status reported yet — don't treat as non-running + return false + } + if pod.Status.Phase != v1.PodRunning { + return true + } + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting != nil || cs.State.Terminated != nil { + return true + } + } + return false +} + +// allPodsRunning returns true only if every pod in the list is in a healthy running state. +func (c *Cluster) allPodsRunning(pods []v1.Pod) bool { + for i := range pods { + if podIsNotRunning(&pods[i]) { + return false + } + } + return true +} + func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) { stopCh := make(chan struct{}) ch := c.registerPodSubscriber(podName) @@ -444,7 +474,8 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp // switchover if // 1. we have not observed a new master pod when re-creating former replicas // 2. we know possible switchover targets even when no replicas were recreated - if newMasterPod == nil && len(replicas) > 0 { + // 3. 
the master pod is actually running (can't switchover a dead master) + if newMasterPod == nil && len(replicas) > 0 && !podIsNotRunning(masterPod) { masterCandidate, err := c.getSwitchoverCandidate(masterPod) if err != nil { // do not recreate master now so it will keep the update flag and switchover will be retried on next sync @@ -455,6 +486,9 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp } } else if newMasterPod == nil && len(replicas) == 0 { c.logger.Warningf("cannot perform switch over before re-creating the pod: no replicas") + } else if podIsNotRunning(masterPod) { + c.logger.Warningf("master pod %q is not running, skipping switchover and recreating directly", + util.NameFromMeta(masterPod.ObjectMeta)) } c.logger.Infof("recreating old master pod %q", util.NameFromMeta(masterPod.ObjectMeta)) diff --git a/pkg/cluster/pod_test.go b/pkg/cluster/pod_test.go index 6816b4d7a..6ab3f9207 100644 --- a/pkg/cluster/pod_test.go +++ b/pkg/cluster/pod_test.go @@ -15,6 +15,7 @@ import ( "github.com/zalando/postgres-operator/pkg/util/config" "github.com/zalando/postgres-operator/pkg/util/k8sutil" "github.com/zalando/postgres-operator/pkg/util/patroni" + v1 "k8s.io/api/core/v1" ) func TestGetSwitchoverCandidate(t *testing.T) { @@ -112,3 +113,302 @@ func TestGetSwitchoverCandidate(t *testing.T) { } } } + +func TestPodIsNotRunning(t *testing.T) { + tests := []struct { + subtest string + pod v1.Pod + expected bool + }{ + { + subtest: "pod with no status reported yet", + pod: v1.Pod{ + Status: v1.PodStatus{}, + }, + expected: false, + }, + { + subtest: "pod running with all containers ready", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + }, + }, + }, + expected: false, + }, + { + subtest: "pod in pending phase", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodPending, + }, + }, + expected: true, + }, + { + subtest: "pod running but container in CreateContainerConfigError", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{ + Reason: "CreateContainerConfigError", + Message: `secret "some-secret" not found`, + }, + }, + }, + }, + }, + }, + expected: true, + }, + { + subtest: "pod running but container in CrashLoopBackOff", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{ + Reason: "CrashLoopBackOff", + }, + }, + }, + }, + }, + }, + expected: true, + }, + { + subtest: "pod running but container terminated", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 137, + }, + }, + }, + }, + }, + }, + expected: true, + }, + { + subtest: "pod running with mixed container states - one healthy one broken", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + { + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{ + Reason: "CreateContainerConfigError", + }, + }, + }, + }, + }, + }, + expected: true, + }, + { + subtest: "pod in failed phase", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodFailed, 
+ }, + }, + expected: true, + }, + { + subtest: "pod running with multiple healthy containers", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + { + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{}, + }, + }, + }, + }, + }, + expected: false, + }, + { + subtest: "pod running with ImagePullBackOff", + pod: v1.Pod{ + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{ + Reason: "ImagePullBackOff", + }, + }, + }, + }, + }, + }, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.subtest, func(t *testing.T) { + result := podIsNotRunning(&tt.pod) + if result != tt.expected { + t.Errorf("podIsNotRunning() = %v, expected %v", result, tt.expected) + } + }) + } +} + +func TestAllPodsRunning(t *testing.T) { + client, _ := newFakeK8sSyncClient() + + var cluster = New( + Config{ + OpConfig: config.Config{ + Resources: config.Resources{ + ClusterLabels: map[string]string{"application": "spilo"}, + ClusterNameLabel: "cluster-name", + PodRoleLabel: "spilo-role", + }, + }, + }, client, acidv1.Postgresql{}, logger, eventRecorder) + + tests := []struct { + subtest string + pods []v1.Pod + expected bool + }{ + { + subtest: "all pods running", + pods: []v1.Pod{ + { + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}}, + }, + }, + }, + { + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}}, + }, + }, + }, + }, + expected: true, + }, + { + subtest: "one pod not running", + pods: []v1.Pod{ + { + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + {State: v1.ContainerState{Running: &v1.ContainerStateRunning{}}}, + }, + }, + }, + { + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{ + Reason: "CreateContainerConfigError", + }, + }, + }, + }, + }, + }, + }, + expected: false, + }, + { + subtest: "all pods not running", + pods: []v1.Pod{ + { + Status: v1.PodStatus{ + Phase: v1.PodPending, + }, + }, + { + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + { + State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{ + Reason: "CrashLoopBackOff", + }, + }, + }, + }, + }, + }, + }, + expected: false, + }, + { + subtest: "empty pod list", + pods: []v1.Pod{}, + expected: true, + }, + { + subtest: "pods with no status reported yet", + pods: []v1.Pod{ + { + Status: v1.PodStatus{}, + }, + { + Status: v1.PodStatus{}, + }, + }, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.subtest, func(t *testing.T) { + result := cluster.allPodsRunning(tt.pods) + if result != tt.expected { + t.Errorf("allPodsRunning() = %v, expected %v", result, tt.expected) + } + }) + } +} diff --git a/pkg/cluster/sync.go b/pkg/cluster/sync.go index 3fa9e9783..7c478477a 100644 --- a/pkg/cluster/sync.go +++ b/pkg/cluster/sync.go @@ -719,14 +719,26 @@ func (c *Cluster) syncStatefulSet() error { if configPatched, restartPrimaryFirst, restartWait, err = c.syncPatroniConfig(pods, c.Spec.Patroni, requiredPgParameters); err != nil { c.logger.Warningf("Patroni config 
updated? %v - errors during config sync: %v", configPatched, err) postponeReasons = append(postponeReasons, "errors during Patroni config sync") - isSafeToRecreatePods = false + // Only mark unsafe if all pods are running. If some pods are not running, + // Patroni API errors are expected and should not block pod recreation, + // which is the only way to fix non-running pods. + if c.allPodsRunning(pods) { + isSafeToRecreatePods = false + } else { + c.logger.Warningf("ignoring Patroni config sync errors because some pods are not running") + } } // restart Postgres where it is still pending if err = c.restartInstances(pods, restartWait, restartPrimaryFirst); err != nil { c.logger.Errorf("errors while restarting Postgres in pods via Patroni API: %v", err) postponeReasons = append(postponeReasons, "errors while restarting Postgres via Patroni API") - isSafeToRecreatePods = false + // Same logic: don't let unreachable non-running pods block recreation. + if c.allPodsRunning(pods) { + isSafeToRecreatePods = false + } else { + c.logger.Warningf("ignoring Patroni restart errors because some pods are not running") + } } // if we get here we also need to re-create the pods (either leftovers from the old From e1713705f4d9b50a6914c9e8841013c25f8de6bf Mon Sep 17 00:00:00 2001 From: Felix Kunde Date: Tue, 28 Apr 2026 13:34:36 +0200 Subject: [PATCH 9/9] build multi-arch pooler image (#3077) * build multi-arch pooler image * add pooler build step in delivery.yaml and bump pooler version * pull from docker hub not zalando registry * add pooler step to ghcr workflow * pass infra roles to auth file via pooler entrypoint * introduce extra pooler secret for mounting auth_file * use pgbouncer as image name and push to ghcr on next merge * build with latest pgbouncer * integrate new image in e2e process and update pooler image default * update pooler build dependencies * build pooler image for e2e test * more Makefile and e2e run script tweaking --------- Co-authored-by: Ida Novindasari --- .github/workflows/publish_ghcr_image.yaml | 15 +++ Makefile | 8 +- .../crds/operatorconfigurations.yaml | 2 +- charts/postgres-operator/values.yaml | 2 +- delivery.yaml | 27 ++++ docs/reference/operator_parameters.md | 2 +- e2e/exec_into_env.sh | 4 +- e2e/run.sh | 46 ++++--- e2e/tests/test_e2e.py | 11 +- go.sum | 6 - manifests/configmap.yaml | 2 +- manifests/minimal-fake-pooler-deployment.yaml | 2 +- manifests/operatorconfiguration.crd.yaml | 2 +- ...gresql-operator-default-configuration.yaml | 2 +- pkg/cluster/connection_pooler.go | 119 +++++++++++++++++- pkg/cluster/connection_pooler_test.go | 5 + pkg/cluster/k8sres_test.go | 1 + pkg/controller/operator_config.go | 2 +- pkg/util/config/config.go | 2 +- pooler/Dockerfile | 54 ++++++++ pooler/entrypoint.sh | 19 +++ pooler/pgbouncer.ini.tmpl | 70 +++++++++++ 22 files changed, 363 insertions(+), 40 deletions(-) create mode 100644 pooler/Dockerfile create mode 100755 pooler/entrypoint.sh create mode 100644 pooler/pgbouncer.ini.tmpl diff --git a/.github/workflows/publish_ghcr_image.yaml b/.github/workflows/publish_ghcr_image.yaml index 3cead3503..2425e39b3 100644 --- a/.github/workflows/publish_ghcr_image.yaml +++ b/.github/workflows/publish_ghcr_image.yaml @@ -34,6 +34,12 @@ jobs: OPERATOR_IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${GITHUB_REF/refs\/tags\//}" echo "OPERATOR_IMAGE=$OPERATOR_IMAGE" >> $GITHUB_OUTPUT + - name: Define pooler image name + id: image_pooler + run: | + POOLER_IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/pgbouncer:${GITHUB_REF/refs\/tags\//}"
+ echo "POOLER_IMAGE=$POOLER_IMAGE" >> $GITHUB_OUTPUT + - name: Define UI image name id: image_ui run: | @@ -69,6 +75,15 @@ jobs: tags: "${{ steps.image.outputs.OPERATOR_IMAGE }}" platforms: linux/amd64,linux/arm64 + - name: Build and push multiarch pooler image to ghcr + uses: docker/build-push-action@v3 + with: + context: pooler + push: true + build-args: BASE_IMAGE=alpine:3.22 + tags: "${{ steps.image_pooler.outputs.POOLER_IMAGE }}" + platforms: linux/amd64,linux/arm64 + - name: Build and push multiarch ui image to ghcr uses: docker/build-push-action@v3 with: diff --git a/Makefile b/Makefile index b96d71939..c1becbc99 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: clean local test linux macos mocks docker push e2e +.PHONY: clean local test linux macos mocks docker pooler push e2e BINARY ?= postgres-operator BUILD_FLAGS ?= -v @@ -49,6 +49,7 @@ endif PATH := $(GOPATH)/bin:$(PATH) SHELL := env PATH="$(PATH)" $(SHELL) IMAGE_TAG := $(IMAGE):$(TAG)$(CDP_TAG)$(DEBUG_FRESH)$(DEBUG_POSTFIX) +POOLER_TAG := $(IMAGE)/pgbouncer:$(TAG)$(CDP_TAG)$(DEBUG_FRESH)$(DEBUG_POSTFIX) default: local @@ -95,6 +96,9 @@ docker: $(GENERATED_CRDS) ${DOCKERDIR}/${DOCKERFILE} echo "git describe $(shell git describe --tags --always --dirty)" docker build --rm -t "$(IMAGE_TAG)" -f "${DOCKERDIR}/${DOCKERFILE}" --build-arg VERSION="${VERSION}" --build-arg BASE_IMAGE="${BASE_IMAGE}" . +pooler: + cd pooler; docker build --rm -t "$(POOLER_TAG)" --build-arg VERSION="${VERSION}" --build-arg BASE_IMAGE="${BASE_IMAGE}" . + indocker-race: docker run --rm -v "${GOPATH}":"${GOPATH}" -e GOPATH="${GOPATH}" -e RACE=1 -w ${PWD} golang:1.25.3 bash -c "make linux" @@ -113,5 +117,5 @@ test: mocks $(GENERATED) $(GENERATED_CRDS) codegen: $(GENERATED) -e2e: docker # build operator image to be tested +e2e: docker pooler # build operator and pooler images to be tested cd e2e; make e2etest diff --git a/charts/postgres-operator/crds/operatorconfigurations.yaml b/charts/postgres-operator/crds/operatorconfigurations.yaml index cb4b7a335..80ef38d25 100644 --- a/charts/postgres-operator/crds/operatorconfigurations.yaml +++ b/charts/postgres-operator/crds/operatorconfigurations.yaml @@ -672,7 +672,7 @@ spec: default: "pooler" connection_pooler_image: type: string - default: "registry.opensource.zalan.do/acid/pgbouncer:master-32" + default: "ghcr.io/zalando/postgres-operator/pgbouncer:latest" connection_pooler_max_db_connections: type: integer default: 60 diff --git a/charts/postgres-operator/values.yaml b/charts/postgres-operator/values.yaml index dfec76b6b..a1f4fa94c 100644 --- a/charts/postgres-operator/values.yaml +++ b/charts/postgres-operator/values.yaml @@ -443,7 +443,7 @@ configConnectionPooler: # db user for pooler to use connection_pooler_user: "pooler" # docker image - connection_pooler_image: "registry.opensource.zalan.do/acid/pgbouncer:master-32" + connection_pooler_image: "ghcr.io/zalando/postgres-operator/pgbouncer:latest" # max db connections the pooler should hold connection_pooler_max_db_connections: 60 # default pooling mode diff --git a/delivery.yaml b/delivery.yaml index 933e72733..ac1ed90c6 100644 --- a/delivery.yaml +++ b/delivery.yaml @@ -42,6 +42,33 @@ pipeline: -f docker/Dockerfile \ --push . 
+ - id: build-pooler + env: + <<: *BUILD_ENV + type: script + vm_config: + type: linux + + commands: + - desc: Build image + cmd: | + cd pooler + if [ -z ${CDP_SOURCE_BRANCH} ]; then + IMAGE=${MULTI_ARCH_REGISTRY}/pgbouncer + else + IMAGE=${MULTI_ARCH_REGISTRY}/pgbouncer-test + fi + + docker buildx create --config /etc/cdp-buildkitd.toml --driver-opt network=host --bootstrap --use + docker buildx build --platform "linux/amd64,linux/arm64" \ + --build-arg BASE_IMAGE="${ALPINE_BASE_IMAGE}" \ + -t "${IMAGE}:${CDP_BUILD_VERSION}" \ + --push . + + if [ -z ${CDP_SOURCE_BRANCH} ]; then + cdp-promote-image ${IMAGE}:${CDP_BUILD_VERSION} + fi + - id: build-operator-ui env: <<: *BUILD_ENV diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 83f693acc..bd2ca7cf6 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -1075,7 +1075,7 @@ operator being able to provide some reasonable defaults. * **connection_pooler_image** Docker image to use for connection pooler deployment. - Default: "registry.opensource.zalan.do/acid/pgbouncer" + Default: "ghcr.io/zalando/postgres-operator/pgbouncer:latest" * **connection_pooler_max_db_connections** How many connections the pooler can max hold. This value is divided among the diff --git a/e2e/exec_into_env.sh b/e2e/exec_into_env.sh index a46efecbd..4d017bc35 100755 --- a/e2e/exec_into_env.sh +++ b/e2e/exec_into_env.sh @@ -3,6 +3,7 @@ export cluster_name="postgres-operator-e2e-tests" export kubeconfig_path="/tmp/kind-config-${cluster_name}" export operator_image="ghcr.io/zalando/postgres-operator:latest" +export pooler_image="ghcr.io/zalando/postgres-operator/pgbouncer:latest" export e2e_test_runner_image="ghcr.io/zalando/postgres-operator-e2e-tests-runner:latest" docker run -it --entrypoint /bin/bash --network=host -e "TERM=xterm-256color" \ @@ -11,4 +12,5 @@ docker run -it --entrypoint /bin/bash --network=host -e "TERM=xterm-256color" \ --mount type=bind,source="$(readlink -f tests)",target=/tests \ --mount type=bind,source="$(readlink -f exec.sh)",target=/exec.sh \ --mount type=bind,source="$(readlink -f scripts)",target=/scripts \ - -e OPERATOR_IMAGE="${operator_image}" "${e2e_test_runner_image}" + -e OPERATOR_IMAGE="${operator_image}" -e POOLER_IMAGE="${pooler_image}" \ + "${e2e_test_runner_image}" diff --git a/e2e/run.sh b/e2e/run.sh index f74158240..c5daf81f6 100755 --- a/e2e/run.sh +++ b/e2e/run.sh @@ -26,17 +26,33 @@ echo "Kubeconfig path: ${kubeconfig_path}" function pull_images(){ operator_tag=$(git describe --tags --always --dirty) - image_name="ghcr.io/zalando/postgres-operator:${operator_tag}" - if [[ -z $(docker images -q "${image_name}") ]] - then - if ! docker pull "${image_name}" - then - echo "Failed to pull operator image: ${image_name}" - exit 1 + components=("postgres-operator" "pooler") + image_urls=("ghcr.io/zalando/postgres-operator:${operator_tag}" "ghcr.io/zalando/postgres-operator/pgbouncer:${operator_tag}") + + for i in "${!components[@]}"; do + component="${components[$i]}" + image="${image_urls[$i]}" + + if [[ -z $(docker images -q "$image") ]]; then + echo "Pulling $component image: $image" + if ! 
docker pull "$image"; then + echo "Failed to pull $component image: $image" + exit 1 + fi + else + echo "$component image already exists: $image" fi - fi - operator_image="${image_name}" - echo "Using operator image: ${operator_image}" + + # Set variables for later use + if [[ "$component" == "postgres-operator" ]]; then + operator_image="$image" + elif [[ "$component" == "pooler" ]]; then + pooler_image="$image" + fi + done + + echo "Using operator image: $operator_image" + echo "Using pooler image: $pooler_image" } function start_kind(){ @@ -55,10 +71,11 @@ function start_kind(){ kind load docker-image "${spilo_image}" --name ${cluster_name} } -function load_operator_image() { - echo "Loading operator image" +function load_operator_images() { + echo "Loading operator images" export KUBECONFIG="${kubeconfig_path}" kind load docker-image "${operator_image}" --name ${cluster_name} + kind load docker-image "${pooler_image}" --name ${cluster_name} } function set_kind_api_server_ip(){ @@ -85,7 +102,8 @@ function run_tests(){ --mount type=bind,source="$(readlink -f tests)",target=/tests \ --mount type=bind,source="$(readlink -f exec.sh)",target=/exec.sh \ --mount type=bind,source="$(readlink -f scripts)",target=/scripts \ - -e OPERATOR_IMAGE="${operator_image}" "${e2e_test_runner_image}" ${E2E_TEST_CASE-} $@ + -e OPERATOR_IMAGE="${operator_image}" -e POOLER_IMAGE="${pooler_image}" \ + "${e2e_test_runner_image}" ${E2E_TEST_CASE-} $@ } function cleanup(){ @@ -100,7 +118,7 @@ function main(){ [[ -z ${NOCLEANUP-} ]] && trap "cleanup" QUIT TERM EXIT pull_images [[ ! -f ${kubeconfig_path} ]] && start_kind - load_operator_image + load_operator_images set_kind_api_server_ip generate_certificate diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 8cadb98a7..159c3cc79 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -116,6 +116,7 @@ class EndToEndTestCase(unittest.TestCase): configmap["data"]["workers"] = "1" configmap["data"]["docker_image"] = SPILO_CURRENT configmap["data"]["major_version_upgrade_mode"] = "full" + configmap["data"]["connection_pooler_image"] = os.environ['POOLER_IMAGE'] with open("manifests/configmap.yaml", 'w') as f: yaml.dump(configmap, f, Dumper=yaml.Dumper) @@ -698,7 +699,7 @@ class EndToEndTestCase(unittest.TestCase): self.eventuallyEqual(lambda: k8s.count_running_pods(master_pooler_label), 2, "No pooler pods found") self.eventuallyEqual(lambda: k8s.count_running_pods(replica_pooler_label), 2, "No pooler replica pods found") self.eventuallyEqual(lambda: k8s.count_services_with_label(pooler_label), 2, "No pooler service found") - self.eventuallyEqual(lambda: k8s.count_secrets_with_label(pooler_label), 1, "Pooler secret not created") + self.eventuallyEqual(lambda: k8s.count_secrets_with_label(pooler_label), 3, "Not all pooler secrets found") # TLS still enabled so check existing env variables and volume mounts self.eventuallyEqual(lambda: k8s.count_pods_with_env_variable("CONNECTION_POOLER_CLIENT_TLS_CRT", pooler_label), 4, "TLS env variable CONNECTION_POOLER_CLIENT_TLS_CRT missing in pooler pods") @@ -756,7 +757,7 @@ class EndToEndTestCase(unittest.TestCase): self.eventuallyEqual(lambda: k8s.count_services_with_label(pooler_label), 1, "No pooler service found") self.eventuallyEqual(lambda: k8s.count_secrets_with_label(pooler_label), - 1, "Secret not created") + 2, "Not all pooler secrets created") # Turn off only replica connection pooler k8s.api.custom_objects_api.patch_namespaced_custom_object( @@ -784,7 +785,7 @@ class 
EndToEndTestCase(unittest.TestCase): 'ClusterIP', "Expected LoadBalancer service type for master, found {}") self.eventuallyEqual(lambda: k8s.count_secrets_with_label(pooler_label), - 1, "Secret not created") + 2, "Not all pooler secrets created") # scale up connection pooler deployment k8s.api.custom_objects_api.patch_namespaced_custom_object( @@ -819,8 +820,8 @@ class EndToEndTestCase(unittest.TestCase): 0, "Pooler pods not scaled down") self.eventuallyEqual(lambda: k8s.count_services_with_label(pooler_label), 0, "Pooler service not removed") - self.eventuallyEqual(lambda: k8s.count_secrets_with_label('application=spilo,cluster-name=acid-minimal-cluster'), - 4, "Secrets not deleted") + self.eventuallyEqual(lambda: k8s.count_secrets_with_label(pooler_label), + 0, "Not all pooler secrets deleted") # Verify that all the databases have pooler schema installed. # Do this via psql, since otherwise we need to deal with diff --git a/go.sum b/go.sum index 5b70c6899..a1fa39389 100644 --- a/go.sum +++ b/go.sum @@ -71,8 +71,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= -github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/lib/pq v1.11.2 h1:x6gxUeu39V0BHZiugWe8LXZYZ+Utk7hSJGThs8sdzfs= github.com/lib/pq v1.11.2/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= @@ -113,8 +111,6 @@ github.com/r3labs/diff v1.1.0/go.mod h1:7WjXasNzi0vJetRcB/RqNl5dlIsmXcTTLmF5IoH6 github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= @@ -126,7 +122,6 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= @@ -170,7 +165,6 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= diff --git a/manifests/configmap.yaml b/manifests/configmap.yaml index 571a4171b..1096e0265 100644 --- a/manifests/configmap.yaml +++ b/manifests/configmap.yaml @@ -17,7 +17,7 @@ data: connection_pooler_default_cpu_request: "500m" connection_pooler_default_memory_limit: 100Mi connection_pooler_default_memory_request: 100Mi - connection_pooler_image: "registry.opensource.zalan.do/acid/pgbouncer:master-32" + connection_pooler_image: "ghcr.io/zalando/postgres-operator/pgbouncer:latest" connection_pooler_max_db_connections: "60" connection_pooler_mode: "transaction" connection_pooler_number_of_instances: "2" diff --git a/manifests/minimal-fake-pooler-deployment.yaml b/manifests/minimal-fake-pooler-deployment.yaml index 59a32ad0b..13f2cc68f 100644 --- a/manifests/minimal-fake-pooler-deployment.yaml +++ b/manifests/minimal-fake-pooler-deployment.yaml @@ -23,7 +23,7 @@ spec: serviceAccountName: postgres-operator containers: - name: postgres-operator - image: registry.opensource.zalan.do/acid/pgbouncer:master-32 + image: ghcr.io/zalando/postgres-operator/pgbouncer:latest imagePullPolicy: IfNotPresent resources: requests: diff --git a/manifests/operatorconfiguration.crd.yaml b/manifests/operatorconfiguration.crd.yaml index 3be545b65..b5044b467 100644 --- a/manifests/operatorconfiguration.crd.yaml +++ b/manifests/operatorconfiguration.crd.yaml @@ -670,7 +670,7 @@ spec: default: "pooler" connection_pooler_image: type: string - default: "registry.opensource.zalan.do/acid/pgbouncer:master-32" + default: "ghcr.io/zalando/postgres-operator/pgbouncer:latest" connection_pooler_max_db_connections: type: integer default: 60 diff --git a/manifests/postgresql-operator-default-configuration.yaml b/manifests/postgresql-operator-default-configuration.yaml index 1c6a0e34a..13dfd6977 100644 --- a/manifests/postgresql-operator-default-configuration.yaml +++ b/manifests/postgresql-operator-default-configuration.yaml @@ -218,7 +218,7 @@ configuration: connection_pooler_default_cpu_request: "500m" connection_pooler_default_memory_limit: 100Mi connection_pooler_default_memory_request: 100Mi - connection_pooler_image: "registry.opensource.zalan.do/acid/pgbouncer:master-32" + connection_pooler_image: "ghcr.io/zalando/postgres-operator/pgbouncer:latest" # connection_pooler_max_db_connections: 60 connection_pooler_mode: "transaction" connection_pooler_number_of_instances: 2 diff --git a/pkg/cluster/connection_pooler.go b/pkg/cluster/connection_pooler.go index 336ffd4d9..97f7a3076 100644 --- a/pkg/cluster/connection_pooler.go +++ b/pkg/cluster/connection_pooler.go @@ -31,6 +31,7 @@ var poolerRunAsGroup = int64(101) // ConnectionPoolerObjects K8s objects that are belong to connection pooler type ConnectionPoolerObjects struct { + AuthSecret *v1.Secret Deployment *appsv1.Deployment Service *v1.Service Name string @@ -167,6 +168,38 @@ func (c *Cluster) createConnectionPooler(LookupFunction InstallFunction) (SyncRe return reason, nil } +func (c *Cluster) generateUserlist() string { + var sb strings.Builder + + 
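// The pooler's own admin user (from systemUsers) is written first; infrastructure roles that carry a non-empty password follow. +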
poolerAdminUser := c.systemUsers[constants.ConnectionPoolerUserKeyName]
+	// entries are written verbatim as "name" "password" pairs; note that
+	// embedded double quotes in passwords are not escaped here
+	fmt.Fprintf(&sb, "\"%s\" \"%s\"\n", poolerAdminUser.Name, poolerAdminUser.Password)
+
+	for roleName, infraRole := range c.InfrastructureRoles {
+		if infraRole.Password != "" {
+			fmt.Fprintf(&sb, "\"%s\" \"%s\"\n", roleName, infraRole.Password)
+		}
+	}
+
+	return sb.String()
+}
+
+func (c *Cluster) generateConnectionPoolerAuthSecret(connectionPooler *ConnectionPoolerObjects) *v1.Secret {
+	return &v1.Secret{
+		ObjectMeta: metav1.ObjectMeta{
+			Labels:          c.connectionPoolerLabels(connectionPooler.Role, true).MatchLabels,
+			Name:            fmt.Sprintf("%s-userlist", connectionPooler.Name),
+			Namespace:       connectionPooler.Namespace,
+			Annotations:     c.annotationsSet(nil),
+			OwnerReferences: c.ownerReferences(),
+		},
+		Type: v1.SecretTypeOpaque,
+		// StringData accepts plain strings; Kubernetes base64-encodes them
+		// into the Data field on write.
+		StringData: map[string]string{
+			"userlist.txt": c.generateUserlist(),
+		},
+	}
+}
+
 // Generate pool size related environment variables.
 //
 // MAX_DB_CONN would specify the global maximum for connections to a target
@@ -320,6 +353,18 @@ func (c *Cluster) generateConnectionPoolerPodTemplate(role PostgresRole) (
 	}
 
 	envVars = append(envVars, c.getConnectionPoolerEnvVars()...)
 
+	infraRolesList := make([]string, 0)
+	for infraRoleName := range c.InfrastructureRoles {
+		infraRolesList = append(infraRolesList, infraRoleName)
+	}
+
+	if len(infraRolesList) > 0 {
+		envVars = append(envVars, v1.EnvVar{
+			Name:  "INFRASTRUCTURE_ROLES",
+			Value: strings.Join(infraRolesList, ","),
+		})
+	}
+
 	poolerContainer := v1.Container{
 		Name:            connectionPoolerContainer,
 		Image:           effectiveDockerImage,
@@ -343,12 +388,29 @@
 		},
 	}
 
+	var poolerVolumes []v1.Volume
+	var volumeMounts []v1.VolumeMount
+
+	// mount secret volume with userlist.txt for pgBouncer to authenticate users
+	poolerVolumes = append(poolerVolumes, v1.Volume{
+		Name: fmt.Sprintf("%s-userlist-volume", c.connectionPoolerName(role)),
+		VolumeSource: v1.VolumeSource{
+			Secret: &v1.SecretVolumeSource{
+				SecretName: fmt.Sprintf("%s-userlist", c.connectionPoolerName(role)),
+			},
+		},
+	})
+	volumeMounts = append(volumeMounts, v1.VolumeMount{
+		Name:      fmt.Sprintf("%s-userlist-volume", c.connectionPoolerName(role)),
+		MountPath: "/etc/pgbouncer/userlist.txt",
+		SubPath:   "userlist.txt",
+		ReadOnly:  true,
+	})
+
 	// If the cluster has custom TLS certificates configured, we do the following:
 	// 1. Add environment variables to tell pgBouncer where to find the TLS certificates
 	// 2. Reference the secret in a volume
 	// 3. Mount the volume to the container at /tls
-	var poolerVolumes []v1.Volume
-	var volumeMounts []v1.VolumeMount
 	if spec.TLS != nil && spec.TLS.SecretName != "" {
 		getPoolerTLSEnv := func(k string) string {
 			keyName := ""
@@ -635,12 +697,31 @@ func (c *Cluster) deleteConnectionPooler(role PostgresRole) (err error) {
 		c.logger.Infof("connection pooler service %s has been deleted for role %s", service.Name, role)
 	}
 
+	// Repeat the same for the auth secret
+	authSecret := c.ConnectionPooler[role].AuthSecret
+	if authSecret == nil {
+		c.logger.Debug("no connection pooler auth secret to delete")
+	} else {
+		err := c.KubeClient.
+			Secrets(c.Namespace).
+ Delete(context.TODO(), authSecret.Name, metav1.DeleteOptions{}) + + if k8sutil.ResourceNotFound(err) { + c.logger.Debugf("connection pooler auth secret %s for role %s has already been deleted", authSecret.Name, role) + } else if err != nil { + return fmt.Errorf("could not delete connection pooler auth secret: %v", err) + } + + c.logger.Infof("connection pooler auth secret %s has been deleted for role %s", authSecret.Name, role) + } + + c.ConnectionPooler[role].AuthSecret = nil c.ConnectionPooler[role].Deployment = nil c.ConnectionPooler[role].Service = nil return nil } -// delete connection pooler +// delete connection pooler secret func (c *Cluster) deleteConnectionPoolerSecret() (err error) { // Repeat the same for the secret object secretName := c.credentialSecretName(c.OpConfig.ConnectionPooler.User) @@ -656,6 +737,7 @@ func (c *Cluster) deleteConnectionPoolerSecret() (err error) { return fmt.Errorf("could not delete pooler secret: %v", err) } } + return nil } @@ -971,11 +1053,42 @@ func (c *Cluster) syncConnectionPoolerWorker(oldSpec, newSpec *acidv1.Postgresql pods []v1.Pod service *v1.Service newService *v1.Service + authSecret *v1.Secret + newAuthSecret *v1.Secret err error ) updatedPodAnnotations := map[string]*string{} syncReason := make([]string, 0) + + // create extra secret for connection pooler authentication + newAuthSecret = c.generateConnectionPoolerAuthSecret(c.ConnectionPooler[role]) + if authSecret, err = c.KubeClient.Secrets(c.Namespace).Get(context.TODO(), fmt.Sprintf("%s-userlist", c.connectionPoolerName(role)), metav1.GetOptions{}); err == nil { + c.ConnectionPooler[role].AuthSecret = authSecret + // make sure existing annotations are preserved + newAuthSecret.Annotations = c.annotationsSet(authSecret.Annotations) + authSecret, err = c.KubeClient.Secrets(authSecret.Namespace).Update(context.TODO(), newAuthSecret, metav1.UpdateOptions{}) + if err != nil { + return NoSync, fmt.Errorf("could not update connection pooler auth secret: %v", err) + } + c.ConnectionPooler[role].AuthSecret = authSecret + } else if !k8sutil.ResourceNotFound(err) { + return NoSync, fmt.Errorf("could not get auth secret for connection pooler to sync: %v", err) + } + + if k8sutil.ResourceNotFound(err) { + c.logger.Warningf("auth secret %s for connection pooler is not found, create it", fmt.Sprintf("%s-userlist", c.connectionPoolerName(role))) + authSecret, err = c.KubeClient. + Secrets(newAuthSecret.Namespace). + Create(context.TODO(), newAuthSecret, metav1.CreateOptions{}) + + if err != nil { + return NoSync, err + } + c.ConnectionPooler[role].AuthSecret = authSecret + } + + // next the pooler deployment deployment, err = c.KubeClient. Deployments(c.Namespace). 
Get(context.TODO(), c.connectionPoolerName(role), metav1.GetOptions{}) diff --git a/pkg/cluster/connection_pooler_test.go b/pkg/cluster/connection_pooler_test.go index 78d1c2527..23213520f 100644 --- a/pkg/cluster/connection_pooler_test.go +++ b/pkg/cluster/connection_pooler_test.go @@ -30,6 +30,7 @@ func newFakeK8sPoolerTestClient() (k8sutil.KubernetesClient, *fake.Clientset) { StatefulSetsGetter: clientSet.AppsV1(), DeploymentsGetter: clientSet.AppsV1(), ServicesGetter: clientSet.CoreV1(), + SecretsGetter: clientSet.CoreV1(), }, clientSet } @@ -803,6 +804,7 @@ func TestConnectionPoolerDeploymentSpec(t *testing.T) { } cluster.ConnectionPooler = map[PostgresRole]*ConnectionPoolerObjects{ Master: { + AuthSecret: nil, Deployment: nil, Service: nil, LookupFunction: true, @@ -1019,6 +1021,7 @@ func TestPoolerTLS(t *testing.T) { // create pooler resources cluster.ConnectionPooler = map[PostgresRole]*ConnectionPoolerObjects{} cluster.ConnectionPooler[Master] = &ConnectionPoolerObjects{ + AuthSecret: nil, Deployment: nil, Service: nil, Name: cluster.connectionPoolerName(Master), @@ -1089,12 +1092,14 @@ func TestConnectionPoolerServiceSpec(t *testing.T) { } cluster.ConnectionPooler = map[PostgresRole]*ConnectionPoolerObjects{ Master: { + AuthSecret: nil, Deployment: nil, Service: nil, LookupFunction: false, Role: Master, }, Replica: { + AuthSecret: nil, Deployment: nil, Service: nil, LookupFunction: false, diff --git a/pkg/cluster/k8sres_test.go b/pkg/cluster/k8sres_test.go index 04f6476a6..62481c7e3 100644 --- a/pkg/cluster/k8sres_test.go +++ b/pkg/cluster/k8sres_test.go @@ -2967,6 +2967,7 @@ func newLBFakeClient() (k8sutil.KubernetesClient, *fake.Clientset) { DeploymentsGetter: clientSet.AppsV1(), PodsGetter: clientSet.CoreV1(), ServicesGetter: clientSet.CoreV1(), + SecretsGetter: clientSet.CoreV1(), }, clientSet } diff --git a/pkg/controller/operator_config.go b/pkg/controller/operator_config.go index 0a458618b..4df8a8bd2 100644 --- a/pkg/controller/operator_config.go +++ b/pkg/controller/operator_config.go @@ -275,7 +275,7 @@ func (c *Controller) importConfigurationFromCRD(fromCRD *acidv1.OperatorConfigur result.ConnectionPooler.Image = util.Coalesce( fromCRD.ConnectionPooler.Image, - "registry.opensource.zalan.do/acid/pgbouncer") + "ghcr.io/zalando/postgres-operator/pgbouncer:latest") result.ConnectionPooler.Mode = util.Coalesce( fromCRD.ConnectionPooler.Mode, diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 914d7a180..796594a89 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -155,7 +155,7 @@ type ConnectionPooler struct { NumberOfInstances *int32 `name:"connection_pooler_number_of_instances" default:"2"` Schema string `name:"connection_pooler_schema" default:"pooler"` User string `name:"connection_pooler_user" default:"pooler"` - Image string `name:"connection_pooler_image" default:"registry.opensource.zalan.do/acid/pgbouncer"` + Image string `name:"connection_pooler_image" default:"ghcr.io/zalando/postgres-operator/pgbouncer:latest"` Mode string `name:"connection_pooler_mode" default:"transaction"` MaxDBConnections *int32 `name:"connection_pooler_max_db_connections" default:"60"` ConnectionPoolerDefaultCPURequest string `name:"connection_pooler_default_cpu_request"` diff --git a/pooler/Dockerfile b/pooler/Dockerfile new file mode 100644 index 000000000..e7836e0f2 --- /dev/null +++ b/pooler/Dockerfile @@ -0,0 +1,54 @@ +ARG BASE_IMAGE=alpine:3.22 +FROM ${BASE_IMAGE} AS build_stage + +RUN apk add -U --no-cache \ + autoconf \ + automake \ + curl 
\
+    gcc \
+    libc-dev \
+    libevent \
+    libevent-dev \
+    libtool \
+    make \
+    openssl-dev \
+    pkgconfig \
+    git
+
+WORKDIR /src
+
+# clone with full history so the latest release tag can be resolved below;
+# a --depth 1 clone would not contain the tags
+RUN git clone --single-branch https://github.com/pgbouncer/pgbouncer.git . && \
+    git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+
+RUN git submodule init && git submodule update
+
+RUN ./autogen.sh && \
+    ./configure --prefix=/pgbouncer --with-libevent=/usr/lib && \
+    sed -i '/dist_man_MANS/d' Makefile && \
+    make && \
+    make install
+
+FROM ${BASE_IMAGE}
+
+RUN apk -U upgrade --no-cache \
+    && apk --no-cache add bash c-ares ca-certificates gettext libevent openssl postgresql-client
+
+RUN addgroup -g 101 -S pgbouncer && \
+    adduser -u 100 -S pgbouncer -G pgbouncer && \
+    mkdir -p /etc/pgbouncer /var/log/pgbouncer /var/run/pgbouncer /etc/ssl/certs
+
+COPY --from=build_stage /pgbouncer/bin/pgbouncer /bin/pgbouncer
+COPY pgbouncer.ini.tmpl /etc/pgbouncer/
+COPY entrypoint.sh /entrypoint.sh
+
+RUN chown -R pgbouncer:pgbouncer \
+    /var/log/pgbouncer \
+    /var/run/pgbouncer \
+    /etc/pgbouncer \
+    /etc/ssl/certs \
+    && chmod +x /entrypoint.sh
+
+USER pgbouncer:pgbouncer
+WORKDIR /etc/pgbouncer
+
+ENTRYPOINT ["/bin/sh", "/entrypoint.sh"]
diff --git a/pooler/entrypoint.sh b/pooler/entrypoint.sh
new file mode 100755
index 000000000..326d63fbe
--- /dev/null
+++ b/pooler/entrypoint.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+set -ex
+
+if [ -z "${CONNECTION_POOLER_CLIENT_TLS_CRT}" ]; then
+    openssl req -nodes -new -x509 -subj /CN=spilo.dummy.org \
+        -keyout /etc/ssl/certs/pgbouncer.key \
+        -out /etc/ssl/certs/pgbouncer.crt
+else
+    ln -s "${CONNECTION_POOLER_CLIENT_TLS_CRT}" /etc/ssl/certs/pgbouncer.crt
+    ln -s "${CONNECTION_POOLER_CLIENT_TLS_KEY}" /etc/ssl/certs/pgbouncer.key
+    if [ -n "${CONNECTION_POOLER_CLIENT_CA_FILE}" ]; then
+        ln -s "${CONNECTION_POOLER_CLIENT_CA_FILE}" /etc/ssl/certs/ca.crt
+    fi
+fi
+
+envsubst < /etc/pgbouncer/pgbouncer.ini.tmpl > /etc/pgbouncer/pgbouncer.ini
+
+exec /bin/pgbouncer /etc/pgbouncer/pgbouncer.ini
diff --git a/pooler/pgbouncer.ini.tmpl b/pooler/pgbouncer.ini.tmpl
new file mode 100644
index 000000000..c26cf1453
--- /dev/null
+++ b/pooler/pgbouncer.ini.tmpl
@@ -0,0 +1,70 @@
+# vim: set ft=dosini:
+
+[databases]
+* = host=$PGHOST port=$PGPORT auth_user=$PGUSER
+postgres = host=$PGHOST port=$PGPORT auth_user=$PGUSER
+
+[pgbouncer]
+pool_mode = $CONNECTION_POOLER_MODE
+listen_port = $CONNECTION_POOLER_PORT
+listen_addr = *
+admin_users = $PGUSER
+stats_users = $INFRASTRUCTURE_ROLES
+auth_dbname = postgres
+auth_file = /etc/pgbouncer/userlist.txt
+auth_query = SELECT * FROM $PGSCHEMA.user_lookup($1)
+auth_type = md5
+logfile = /var/log/pgbouncer/pgbouncer.log
+pidfile = /var/run/pgbouncer/pgbouncer.pid
+
+server_tls_sslmode = require
+server_tls_ca_file = /etc/ssl/certs/pgbouncer.crt
+server_tls_protocols = secure
+client_tls_sslmode = require
+client_tls_key_file = /etc/ssl/certs/pgbouncer.key
+client_tls_cert_file = /etc/ssl/certs/pgbouncer.crt
+
+log_connections = 0
+log_disconnections = 0
+
+# Number of prepared statements to cache on a server connection (a zero value
+# disables support for prepared statements).
+max_prepared_statements = 200
+
+# How many server connections to allow per user/database pair.
+default_pool_size = $CONNECTION_POOLER_DEFAULT_SIZE
+
+# Add more server connections to the pool if it drops below this number.
+# Improves behavior when the usual load comes back suddenly after a period
+# of total inactivity.
+#
+# NOTE: This value is per pool, i.e. per (db, user) pair, not a global one.
+# This means that, at a higher level, it has to be derived from the maximum
+# allowed database connections and the number of databases and users. If that
+# is not taken into account, then with too many users or databases PgBouncer
+# will thrash, constantly opening and evicting connections (see the worked
+# example at the end of this file). For now, disable it.
+#
+# min_pool_size = $CONNECTION_POOLER_MIN_SIZE
+
+# How many additional connections to allow to a pool.
+reserve_pool_size = $CONNECTION_POOLER_RESERVE_SIZE
+
+# Maximum number of client connections allowed.
+max_client_conn = $CONNECTION_POOLER_MAX_CLIENT_CONN
+
+# Do not allow more than this many connections per database (regardless of
+# pool, i.e. user).
+max_db_connections = $CONNECTION_POOLER_MAX_DB_CONN
+
+# If a client has been in "idle in transaction" state longer than this, it
+# will be disconnected. [seconds]
+idle_transaction_timeout = 600
+
+# If login fails, because of a failure from connect() or authentication, the
+# pooler waits this long before retrying to connect. Default is 15. [seconds]
+server_login_retry = 5
+
+# Ignore extra parameters in the startup packet. By default only 'database'
+# and 'user' are allowed; all others raise an error. This is needed to
+# tolerate overenthusiastic JDBC drivers that unconditionally set
+# 'extra_float_digits=2' in the startup packet.
+ignore_startup_parameters = extra_float_digits,options
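+
+# Worked example for the min_pool_size note above (illustrative numbers, not
+# values shipped by the operator): with min_pool_size = 5, 10 databases and
+# 4 roles, PgBouncer maintains up to 10 * 4 = 40 (db, user) pools and would
+# try to keep 40 * 5 = 200 server connections open even when idle, which can
+# easily exceed max_db_connections or the Postgres max_connections limit.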
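+
+# For reference, the auth_file above is mounted from the operator-generated
+# "<pooler name>-userlist" secret and contains one quoted "name" "password"
+# pair per line, for example (illustrative values only):
+#
+#   "pooler" "md54b19dca27dcbbecb30a69254a4f5e9a5"
+#   "robot_zmon" "infra-role-password"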