From 0ba2147d733ed4c6cf4bc5d85232d1bddf0720d6 Mon Sep 17 00:00:00 2001
From: Zadkiel AHARONIAN
Date: Thu, 23 Apr 2026 17:47:12 +0200
Subject: [PATCH] fix(logical-backup): wait for PG connectivity before running backup (#3069)

* fix(logical-backup): wait for PG connectivity before running backup

The backup script connects to the target PostgreSQL pod immediately after
resolving its IP via the Kubernetes API. When NetworkPolicy is enforced via
iptables, a newly-created pod's IP may not yet be present in the destination
node's ingress allow lists, causing cross-node connections to be rejected
until the next policy sync.

This adds a pg_isready retry loop before the dump starts, with configurable
retries and delay via LOGICAL_BACKUP_CONNECT_RETRIES (default: 10) and
LOGICAL_BACKUP_CONNECT_RETRY_DELAY (default: 2s).

Signed-off-by: Zadkiel AHARONIAN

* docs: document LOGICAL_BACKUP_CONNECT_RETRIES and RETRY_DELAY env vars

Document the new environment variables that control the pg_isready retry
loop added in the previous commit. These are passed via the existing
logical_backup_cronjob_environment_secret mechanism.

Signed-off-by: Zadkiel AHARONIAN

---------

Signed-off-by: Zadkiel AHARONIAN
Co-authored-by: Ida Novindasari
---
 docs/reference/operator_parameters.md | 13 +++++++++++++
 logical-backup/dump.sh                | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md
index 6dd775069..83f693acc 100644
--- a/docs/reference/operator_parameters.md
+++ b/docs/reference/operator_parameters.md
@@ -900,6 +900,19 @@ grouped under the `logical_backup` key.
 * **logical_backup_cronjob_environment_secret**
   Reference to a Kubernetes secret, which keys will be added as environment variables to the cronjob. Default: ""
 
+The following environment variables can be passed to the logical backup
+cronjob via `logical_backup_cronjob_environment_secret` to control
+connectivity checks before the backup starts:
+
+* **LOGICAL_BACKUP_CONNECT_RETRIES**
+  Number of times to retry connecting to the target PostgreSQL pod before
+  giving up. This is useful when NetworkPolicy enforcement introduces a
+  short delay before a newly-created pod's IP is allowed through ingress
+  rules on the destination node. Default: "10"
+
+* **LOGICAL_BACKUP_CONNECT_RETRY_DELAY**
+  Delay in seconds between connectivity retries. Default: "2"
+
 ## Debugging the operator
 
 Options to aid debugging of the operator itself. Grouped under the `debug` key.
diff --git a/logical-backup/dump.sh b/logical-backup/dump.sh
index a250670a6..7833de399 100755
--- a/logical-backup/dump.sh
+++ b/logical-backup/dump.sh
@@ -183,6 +183,36 @@ function get_master_pod {
     get_pods "labelSelector=${CLUSTER_NAME_LABEL}%3D${SCOPE},spilo-role%3Dmaster" | tee | head -n 1
 }
 
+# Wait until the target PostgreSQL pod accepts connections.
+# When NetworkPolicy is enforced via iptables, a newly-created pod's IP may not
+# yet be present in the destination node's ingress allow lists, causing
+# cross-node connections to be rejected until the next policy sync.
+#
+# Environment: LOGICAL_BACKUP_CONNECT_RETRIES (attempts, default 10),
+# LOGICAL_BACKUP_CONNECT_RETRY_DELAY (pause between attempts, handed to sleep
+# unchanged, default 2), PG_BIN, PGHOST and PGPORT (default 5432).
+# Returns 0 as soon as pg_isready succeeds, 1 once every attempt has failed.
+function wait_for_pg {
+    local retries=${LOGICAL_BACKUP_CONNECT_RETRIES:-10}
+    local delay=${LOGICAL_BACKUP_CONNECT_RETRY_DELAY:-2}
+    local port=${PGPORT:-5432}
+    local i
+    for (( i=1; i<=retries; i++ )); do
+        if "$PG_BIN"/pg_isready -h "$PGHOST" -p "$port" -q 2>/dev/null; then
+            return 0
+        fi
+        echo "waiting for $PGHOST:$port to become reachable (attempt $i/$retries)..."
+        # Sleeping after the final attempt would only postpone the failure.
+        if (( i < retries )); then
+            sleep "$delay"
+        fi
+    done
+    # Report attempts, not $((retries * delay)): the delay goes to sleep as-is
+    # and may be fractional (e.g. 0.5), which bash integer arithmetic rejects.
+    echo "ERROR: $PGHOST:$port not reachable after $retries attempts" >&2
+    return 1
+}
+
 CURRENT_NODENAME=$(get_current_pod | jq .items[].spec.nodeName --raw-output)
 export CURRENT_NODENAME
 
@@ -197,6 +227,8 @@ for search in "${search_strategy[@]}"; do
 
 done
 
+wait_for_pg
+
 set -x
 if [ "$LOGICAL_BACKUP_PROVIDER" == "az" ]; then
     dump | compress > /tmp/azure-backup.sql.gz