feat(backup): logs everything to stdout/err, implement lock file for both backup/restore (#1023)

This commit is contained in:
Luigi Operoso 2024-06-25 23:29:35 +02:00 committed by GitHub
parent 5ef6c730de
commit b722ef11ae
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 127 additions and 43 deletions

View File

@ -1 +1 @@
v0.3.0
v0.4.0

View File

@ -1,39 +1,63 @@
#!/usr/bin/env bash
set -eo pipefail
source "$(dirname "$0")/utils.sh"
[[ ! $# -eq 1 ]] && echo "Usage: $0 backup_number" && exit 1;
[[ -z "${BACKUP_DIR}" ]] && echo "Required 'BACKUP_DIR' env not set" && exit 1;
[[ -z "${JENKINS_HOME}" ]] && echo "Required 'JENKINS_HOME' env not set" && exit 1;
[[ ! $# -eq 1 ]] && _log "ERROR" "Usage: $0 BACKUP_NUMBER" && exit 1
[[ -z "${BACKUP_DIR}" ]] && _log "ERROR" "Required 'BACKUP_DIR' env not set" && exit 1
[[ -z "${JENKINS_HOME}" ]] && _log "ERROR" "Required 'JENKINS_HOME' env not set" && exit 1
BACKUP_RETRY_COUNT=${BACKUP_RETRY_COUNT:-3}
BACKUP_RETRY_INTERVAL=${BACKUP_RETRY_INTERVAL:-60}
BACKUP_NUMBER=$1
TRAP_FILE="/tmp/_backup_${BACKUP_NUMBER}_is_running"
# --> Check if another backup process is running (operator restart/crash)
for ((i=0; i<BACKUP_RETRY_COUNT; i++)); do
[[ ! -f "${TRAP_FILE}" ]] && _log "INFO" "[backup] no other backup process are running" && break
_log "INFO" "[backup] backup is already running. Waiting for ${BACKUP_RETRY_INTERVAL} seconds..."
sleep "${BACKUP_RETRY_INTERVAL}"
done
[[ -f "${TRAP_FILE}" ]] && { _log "ERROR" "[backup] backup is still running after waiting ${BACKUP_RETRY_COUNT} time ${BACKUP_RETRY_INTERVAL}s. Exiting."; exit 1; }
# --< Done
_log "INFO" "[backup] running backup ${BACKUP_NUMBER}"
touch "${TRAP_FILE}"
# create temp dir on the same filesystem with a BACKUP_DIR to be able use atomic mv enstead of copy
BACKUP_TMP_DIR=$(mktemp -d --tmpdir=${BACKUP_DIR})
trap "test -d "${BACKUP_TMP_DIR}" && rm -fr "${BACKUP_TMP_DIR}"" EXIT SIGINT SIGTERM
BACKUP_TMP_DIR=$(mktemp -d --tmpdir="${BACKUP_DIR}")
backup_number=$1
echo "Running backup"
_clean(){
test -d "${BACKUP_TMP_DIR}" && rm -fr "${BACKUP_TMP_DIR}"
test -f "${TRAP_FILE}" && rm -f "${TRAP_FILE}"
}
_trap(){
_clean
_log "ERROR" "[backup] something wrong happened, check the logs"
}
trap '_trap' SIGQUIT SIGINT SIGTERM
# config.xml in a job directory is a config file that shouldn't be backed up
# config.xml in child directories is state that should. For example-
# branches/myorg/branches/myrepo/branches/master/config.xml should be retained while
# branches/myorg/config.xml should not
tar --zstd -C "${JENKINS_HOME}" -cf "${BACKUP_TMP_DIR}/${backup_number}.tar.zstd" \
tar --zstd -C "${JENKINS_HOME}" -cf "${BACKUP_TMP_DIR}/${BACKUP_NUMBER}.tar.zstd" \
--exclude jobs/*/workspace* \
--no-wildcards-match-slash --anchored \
--ignore-failed-read \
--exclude jobs/*/config.xml -c jobs || ret=$?
if [[ "$ret" -eq 0 ]]; then
echo "Backup was completed without warnings"
_log "INFO" "[backup] backup ${BACKUP_NUMBER} was completed without warnings"
elif [[ "$ret" -eq 1 ]]; then
echo "Backup was completed with some warnings"
_log "INFO" "[backup] backup ${BACKUP_NUMBER} was completed with some warnings"
fi
# atomically create a backup file
mv "${BACKUP_TMP_DIR}/${backup_number}.tar.zstd" "${BACKUP_DIR}/${backup_number}.tar.zstd"
mv "${BACKUP_TMP_DIR}/${BACKUP_NUMBER}.tar.zstd" "${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd"
rm -rf "${BACKUP_TMP_DIR}"
[[ ! -s ${BACKUP_DIR}/${backup_number}.tar.zstd ]] && echo "backup file '${BACKUP_DIR}/${backup_number}.tar.zstd' is empty" && exit 1;
_log "INFO" "[backup] cleaning ${BACKUP_TMP_DIR} and trap file ${TRAP_FILE}"
_clean
[[ ! -s ${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd ]] && _log "ERROR" "[backup] file '${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd' is empty" && exit 1
echo Done
_log "INFO" "[backup] ${BACKUP_NUMBER} done"
exit 0

View File

@ -1,30 +1,26 @@
#!/usr/bin/env bash
set -eo pipefail
source "$(dirname "$0")/utils.sh"
is_backup_not_exist() {
local backup_dir="$1"
# Save the current value of 'set -e'
local previous_e
previous_e=$(set +e; :; echo $?)
# Temporarily turn off 'set -e'
set +e
# Run ls command to check if any files matching the pattern exist
ls "${backup_dir}"/*.tar.* 1> /dev/null 2>&1
# Store the exit status of the ls command
local ls_exit_status=$?
# Restore the previous value of 'set -e'
[ "$previous_e" = "0" ] && set -e
# Return true if ls command succeeded (no files found), otherwise return false
[ $ls_exit_status -ne 0 ]
}
[[ -z "${BACKUP_DIR}" ]] && { echo "Required 'BACKUP_DIR' env not set"; exit 1; }
[[ -z "${BACKUP_DIR}" ]] && { _log "ERROR" "Required 'BACKUP_DIR' env not set"; exit 1; }
# Check if we have any backup
if is_backup_not_exist "${BACKUP_DIR}"; then

View File

@ -1,29 +1,47 @@
#!/usr/bin/env bash
set -eo pipefail
source "$(dirname "$0")/utils.sh"
[[ ! $# -eq 1 ]] && echo "Usage: $0 backup_number" && exit 1
[[ -z "${BACKUP_DIR}" ]] && echo "Required 'BACKUP_DIR' env not set" && exit 1;
[[ -z "${JENKINS_HOME}" ]] && echo "Required 'JENKINS_HOME' env not set" && exit 1;
[[ ! $# -eq 1 ]] && _log "ERROR" "Usage: $0 <backup number>" && exit 1
[[ -z "${BACKUP_DIR}" ]] && _log "ERROR" "Required 'BACKUP_DIR' env not set" && exit 1
[[ -z "${JENKINS_HOME}" ]] && _log "ERROR" "Required 'JENKINS_HOME' env not set" && exit 1
BACKUP_NUMBER=$1
RESTORE_RETRY_COUNT=${RESTORE_RETRY_COUNT:-10}
RESTORE_RETRY_INTERVAL=${RESTORE_RETRY_INTERVAL:-10}
backup_number=$1
backup_file="${BACKUP_DIR}/${backup_number}"
echo "Running restore backup with backup number #${backup_number}"
# --> Check if another restore process is running (operator restart/crash)
TRAP_FILE="/tmp/_restore_${BACKUP_NUMBER}_is_running"
trap "rm -f ${TRAP_FILE}" SIGINT SIGTERM
if [[ -f "$backup_file.tar.gz" ]]; then
echo "Old format tar.gz found, restoring it"
for ((i=0; i<RESTORE_RETRY_COUNT; i++)); do
[[ ! -f "${TRAP_FILE}" ]] && _log "INFO" "[restore] no other process are running, restoring" && break
_log "INFO" "[restore] is already running. Waiting for ${RESTORE_RETRY_INTERVAL} seconds..."
sleep "${RESTORE_RETRY_INTERVAL}"
done
[[ -f "${TRAP_FILE}" ]] && { _log "ERROR" "[restore] is still running after waiting ${RESTORE_RETRY_COUNT} time ${RESTORE_RETRY_INTERVAL}s. Exiting."; exit 1; }
# --< Done
_log "INFO" "[restore] restore backup with backup number #${BACKUP_NUMBER}"
touch "${TRAP_FILE}"
BACKUP_FILE="${BACKUP_DIR}/${BACKUP_NUMBER}"
if [[ -f "$BACKUP_FILE.tar.gz" ]]; then
_log "INFO" "[restore] old format tar.gz found, restoring it"
OPTS=""
EXT="tar.gz"
elif [[ -f "$backup_file.tar.zstd" ]]; then
echo "Backup file found, proceeding"
elif [[ -f "$BACKUP_FILE.tar.zstd" ]]; then
_log "INFO" "[restore] Backup file found, proceeding"
OPTS="--zstd"
EXT="tar.zstd"
else
echo "ERR: Backup file not found: $backup_file"
_log "ERROR" "[restore] backup file not found: $BACKUP_FILE"
exit 1
fi
tar $OPTS -C "${JENKINS_HOME}" -xf "${BACKUP_DIR}/${backup_number}.${EXT}"
tar $OPTS -C "${JENKINS_HOME}" -xf "${BACKUP_DIR}/${BACKUP_NUMBER}.${EXT}"
echo Done
_log "INFO" "[restore] deleting lock file ${TRAP_FILE}"
test -f "${TRAP_FILE}" && rm -f "${TRAP_FILE}"
_log "INFO" "[restore] restoring ${BACKUP_NUMBER} Done"
exit 0

View File

@ -1,6 +1,7 @@
#!/usr/bin/env bash
set -eo pipefail
source "$(dirname "$0")/utils.sh"
# Use 60 as default in case BACKUP_CLEANUP_INTERVAL did not set
BACKUP_CLEANUP_INTERVAL=${BACKUP_CLEANUP_INTERVAL:=60}
@ -8,7 +9,7 @@ BACKUP_CLEANUP_INTERVAL=${BACKUP_CLEANUP_INTERVAL:=60}
# Ensure required environment variables are set
check_env_var() {
if [[ -z "${!1}" ]]; then
echo "Required '$1' environment variable is not set"
_log "ERROR" "Required '$1' environment variable is not set"
exit 1
fi
}
@ -41,7 +42,7 @@ find_exceeding_backups() {
local backup_count="$2"
# Check if we have any backup
if is_backup_not_exist "${backup_dir}"; then
echo "backups not found in ${backup_dir}" >&2
_log "ERROR" "[run] backups not found in ${backup_dir}"
return
fi
find "${backup_dir}"/*.tar.zstd -maxdepth 0 -exec basename {} \; | sort -gr | tail -n +$((backup_count +1))
@ -51,9 +52,9 @@ check_env_var "BACKUP_DIR"
check_env_var "JENKINS_HOME"
if [[ -z "${BACKUP_COUNT}" ]]; then
echo "ATTENTION! No BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script"
_log "WARNING" "[run] no BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script"
else
echo "Retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds"
_log "INFO" "[run] retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds"
fi
while true;
@ -62,7 +63,7 @@ do
if [[ -n "${BACKUP_COUNT}" ]]; then
exceeding_backups=$(find_exceeding_backups "${BACKUP_DIR}" "${BACKUP_COUNT}")
if [[ -n "$exceeding_backups" ]]; then
echo "Removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')"
_log "INFO" "[run] removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')"
echo "$exceeding_backups" | while read -r file; do
rm "${BACKUP_DIR}/${file}"
done

14
backup/pvc/bin/utils.sh Normal file
View File

@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Common utils
_log() {
local level="$1"
local message="$2"
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
if [[ "$level" =~ ^(ERROR|ERR|error|err)$ ]]; then
echo "${timestamp} - ${level} - ${message}" > /proc/1/fd/2
else
echo "${timestamp} - ${level} - ${message}" > /proc/1/fd/1
echo "${timestamp} - ${level} - ${message}" >&2
fi
}

View File

@ -30,7 +30,7 @@ Kubernetes native operator which fully manages Jenkins on Kubernetes
| jenkins.backup.env[2].name | string | `"BACKUP_COUNT"` | |
| jenkins.backup.env[2].value | string | `"3"` | |
| jenkins.backup.getLatestAction[0] | string | `"/home/user/bin/get-latest.sh"` | |
| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6"` | |
| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1"` | |
| jenkins.backup.interval | int | `30` | |
| jenkins.backup.makeBackupBeforePodDeletion | bool | `true` | |
| jenkins.backup.pvc.className | string | `""` | |

View File

@ -214,7 +214,7 @@ jenkins:
# image used by backup feature
# By default using prebuilt backup PVC image
image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6
image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1
# containerName is backup container name
containerName: backup
@ -262,6 +262,11 @@ jenkins:
# BACKUP_DIR - path for storing backup files (default: "/backup")
# JENKINS_HOME - path to jenkins home (default: "/jenkins-home")
# BACKUP_COUNT - define how much recent backups will be kept
# Optional in case you want to modify the backup and restore retry logic
# BACKUP_RETRY_COUNT
# BACKUP_RETRY_INTERVAL
# RESTORE_RETRY_COUNT
# RESTORE_RETRY_INTERVAL
env:
- name: BACKUP_DIR
value: /backup
@ -269,6 +274,15 @@ jenkins:
value: /jenkins-home
- name: BACKUP_COUNT
value: "3" # keep only the 3 most recent backups
#- name: BACKUP_RETRY_COUNT
# value: "3"
#- name: BACKUP_RETRY_INTERVAL
# value: "60"
#- name: RESTORE_RETRY_COUNT
# value: "10"
#- name: RESTORE_RETRY_INTERVAL
# value: "10"
# volumeMounts holds the mount points for volumes
volumeMounts:

View File

@ -4,6 +4,7 @@ let
devShellPackages = [
hugo_099_pkgs.hugo #hugo pre-v100
pkgs.nodejs_21 #Node 1.21
pkgs.helm-docs
];
baseUrl = ((builtins.fromTOML (builtins.readFile ../website/config.toml)).baseURL);
in

View File

@ -2,7 +2,7 @@
title: "Configuring backup and restore"
linkTitle: "Configuring backup and restore"
weight: 5
date: 2023-01-08
date: 2024-06-25
description: >
Prevent loss of job history
---
@ -115,3 +115,19 @@ spec:
command:
- /home/user/bin/get-latest.sh # this command is invoked on "backup" container to get last backup number before pod deletion; not having it in the CR may cause loss of data
```
#### Customizing pvc backup behaviour
To prevent situations where the operator crashes or gets killed during a backup and restore process, a retry logic has been implemented.
This logic can be customized by adjusting the following environment variables:
* **Backup**: total time wait until giving up by default: 180s
* `BACKUP_RETRY_COUNT`: by default is `3`
* `BACKUP_RETRY_INTERVAL`: by default is `60`
* **Restore**: total time wait until giving up by default: 100s
* `RESTORE_RETRY_COUNT`: by default is `10`
* `RESTORE_RETRY_INTERVAL`: by default is `10`
You can adjust the retry logic based on the size of your backup and the duration of the restore process.