diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 6dd775069..83f693acc 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -900,6 +900,19 @@ grouped under the `logical_backup` key. * **logical_backup_cronjob_environment_secret** Reference to a Kubernetes secret, which keys will be added as environment variables to the cronjob. Default: "" +The following environment variables can be passed to the logical backup +cronjob via `logical_backup_cronjob_environment_secret` to control +connectivity checks before the backup starts: + +* **LOGICAL_BACKUP_CONNECT_RETRIES** + Number of times to retry connecting to the target PostgreSQL pod before + giving up. This is useful when NetworkPolicy enforcement introduces a + short delay before a newly-created pod's IP is allowed through ingress + rules on the destination node. Default: "10" + +* **LOGICAL_BACKUP_CONNECT_RETRY_DELAY** + Delay in seconds between connectivity retries. Default: "2" + ## Debugging the operator Options to aid debugging of the operator itself. Grouped under the `debug` key. diff --git a/logical-backup/dump.sh b/logical-backup/dump.sh index a250670a6..7833de399 100755 --- a/logical-backup/dump.sh +++ b/logical-backup/dump.sh @@ -183,6 +183,25 @@ function get_master_pod { get_pods "labelSelector=${CLUSTER_NAME_LABEL}%3D${SCOPE},spilo-role%3Dmaster" | tee | head -n 1 } +# Wait for TCP connectivity to the target PostgreSQL pod. +# When NetworkPolicy is enforced via iptables, a newly-created pod's IP may not +# yet be present in the destination node's ingress allow lists, causing +# cross-node connections to be rejected until the next policy sync. 
+function wait_for_pg { + local retries=${LOGICAL_BACKUP_CONNECT_RETRIES:-10} + local delay=${LOGICAL_BACKUP_CONNECT_RETRY_DELAY:-2} + local i + for (( i=1; i<=retries; i++ )); do + if "$PG_BIN"/pg_isready -h "$PGHOST" -p "${PGPORT:-5432}" -q 2>/dev/null; then + return 0 + fi + echo "waiting for $PGHOST:${PGPORT:-5432} to become reachable (attempt $i/$retries)..." + if (( i < retries )); then sleep "$delay"; fi # no pause after the final attempt + done + echo "ERROR: $PGHOST:${PGPORT:-5432} not reachable after $retries attempts" >&2 # report attempts, not seconds: delay may be non-integer, which $(( )) cannot evaluate + return 1 +} + CURRENT_NODENAME=$(get_current_pod | jq .items[].spec.nodeName --raw-output) export CURRENT_NODENAME @@ -197,6 +216,8 @@ for search in "${search_strategy[@]}"; do done +wait_for_pg + set -x if [ "$LOGICAL_BACKUP_PROVIDER" == "az" ]; then dump | compress > /tmp/azure-backup.sql.gz diff --git a/ui/operator_ui/spiloutils.py b/ui/operator_ui/spiloutils.py index 6a2f03bb2..8d2b73967 100644 --- a/ui/operator_ui/spiloutils.py +++ b/ui/operator_ui/spiloutils.py @@ -321,11 +321,18 @@ def read_basebackups( suffix = '' if uid == 'base' else '/' + uid backups = [] + # Reuse a single S3 client configured with AWS_ENDPOINT so MinIO / + # other S3-compatible backends are hit for list+get calls too. The + # previous plain client('s3') fell back to the default AWS endpoint + # and returned empty data against a custom endpoint; read_stored_clusters + # and read_versions already pass endpoint_url=AWS_ENDPOINT (#3078).
+ s3_client = client('s3', endpoint_url=AWS_ENDPOINT) + for vp in postgresql_versions: backup_prefix = f'{prefix}{pg_cluster}{suffix}/wal/{vp}/basebackups_005/' logger.info(f"{bucket}/{backup_prefix}") - paginator = client('s3').get_paginator('list_objects_v2') + paginator = s3_client.get_paginator('list_objects_v2') pages = paginator.paginate(Bucket=bucket, Prefix=backup_prefix) for page in pages: @@ -334,7 +341,7 @@ def read_basebackups( if not key.endswith("backup_stop_sentinel.json"): continue - response = client('s3').get_object(Bucket=bucket, Key=key) + response = s3_client.get_object(Bucket=bucket, Key=key) backup_info = loads(response["Body"].read().decode("utf-8")) last_modified = response["LastModified"].astimezone(timezone.utc).isoformat() diff --git a/ui/requirements.txt b/ui/requirements.txt index 2e43ccb0e..ace18641d 100644 --- a/ui/requirements.txt +++ b/ui/requirements.txt @@ -11,4 +11,4 @@ kubernetes==11.0.0 python-json-logger==2.0.7 requests==2.32.4 stups-tokens>=1.1.19 -werkzeug==3.1.5 +werkzeug==3.1.6