feat(backup): logs everything to stdout/err, implement lock file for both backup/restore (#1023)
This commit is contained in:
parent
5ef6c730de
commit
b722ef11ae
|
|
@ -1 +1 @@
|
|||
v0.3.0
|
||||
v0.4.0
|
||||
|
|
|
|||
|
|
@ -1,39 +1,63 @@
|
|||
#!/usr/bin/env bash
# Create backup number $1 of ${JENKINS_HOME}/jobs inside ${BACKUP_DIR}.
#
# Usage:        $0 BACKUP_NUMBER
# Required env: BACKUP_DIR, JENKINS_HOME
# Optional env: BACKUP_RETRY_COUNT    (default 3)  - how many times to wait for a running backup
#               BACKUP_RETRY_INTERVAL (default 60) - seconds to sleep between two checks

set -eo pipefail
source "$(dirname "$0")/utils.sh"

# Validate the input with explicit if-blocks: in a `test && _log && exit 1`
# chain a failing _log skips the exit and the script carries on with bad input.
if [[ $# -ne 1 ]]; then
    _log "ERROR" "Usage: $0 BACKUP_NUMBER"
    exit 1
fi
if [[ -z "${BACKUP_DIR}" ]]; then
    _log "ERROR" "Required 'BACKUP_DIR' env not set"
    exit 1
fi
if [[ -z "${JENKINS_HOME}" ]]; then
    _log "ERROR" "Required 'JENKINS_HOME' env not set"
    exit 1
fi

BACKUP_RETRY_COUNT=${BACKUP_RETRY_COUNT:-3}
BACKUP_RETRY_INTERVAL=${BACKUP_RETRY_INTERVAL:-60}
BACKUP_NUMBER=$1
# Lock file: its presence means a backup with this number is in progress.
TRAP_FILE="/tmp/_backup_${BACKUP_NUMBER}_is_running"

# --> Check if another backup process is running (operator restart/crash)
for ((i=0; i<BACKUP_RETRY_COUNT; i++)); do
    if [[ ! -f "${TRAP_FILE}" ]]; then
        _log "INFO" "[backup] no other backup process are running"
        break
    fi
    _log "INFO" "[backup] backup is already running. Waiting for ${BACKUP_RETRY_INTERVAL} seconds..."
    sleep "${BACKUP_RETRY_INTERVAL}"
done
if [[ -f "${TRAP_FILE}" ]]; then
    _log "ERROR" "[backup] backup is still running after waiting ${BACKUP_RETRY_COUNT} time ${BACKUP_RETRY_INTERVAL}s. Exiting."
    exit 1
fi
# --< Done

_log "INFO" "[backup] running backup ${BACKUP_NUMBER}"
touch "${TRAP_FILE}"

# Create the temp dir on the same filesystem as BACKUP_DIR so that the final
# step can be an atomic mv instead of a copy.
# The cleanup traps are installed further down, so release the lock by hand
# if mktemp fails; otherwise later runs would wait on a stale lock file.
if ! BACKUP_TMP_DIR=$(mktemp -d --tmpdir="${BACKUP_DIR}"); then
    rm -f -- "${TRAP_FILE}"
    _log "ERROR" "[backup] cannot create a temporary directory in ${BACKUP_DIR}"
    exit 1
fi
|
||||
# Remove the temporary backup directory and the lock file, if they exist.
# Globals: BACKUP_TMP_DIR (read), TRAP_FILE (read)
# Returns: 0 when there is nothing left to remove, otherwise the status of rm.
#          The function is safe to call more than once: a missing directory or
#          lock file is not an error, so it cannot abort a caller under `set -e`.
_clean() {
    if [[ -n "${BACKUP_TMP_DIR:-}" && -d "${BACKUP_TMP_DIR}" ]]; then
        rm -rf -- "${BACKUP_TMP_DIR}"
    fi
    if [[ -n "${TRAP_FILE:-}" && -f "${TRAP_FILE}" ]]; then
        rm -f -- "${TRAP_FILE}"
    fi
}
|
||||
|
||||
# Signal handler: clean up, report the failure and stop the script.
# Without the explicit exit bash would resume the script after the handler
# returns and keep working with a temp dir that has just been removed.
# Calls: _clean, _log
# Exits: always with status 1
_trap() {
    # A cleanup problem must not prevent the error from being logged.
    _clean || true
    _log "ERROR" "[backup] something wrong happened, check the logs" || true
    exit 1
}
|
||||
|
||||
# _trap handles interruptions; the EXIT trap releases the lock file and the
# temp dir on every other exit path (for example a command failing under
# `set -e`), so a failed backup cannot leave a stale lock behind.
trap '_trap' SIGQUIT SIGINT SIGTERM
trap '_clean || true' EXIT

# config.xml in a job directory is a config file that shouldn't be backed up
# config.xml in child directories is state that should. For example-
# branches/myorg/branches/myrepo/branches/master/config.xml should be retained while
# branches/myorg/config.xml should not
#
# The exclude patterns are quoted so that tar receives them verbatim instead
# of the shell expanding them against the current directory.
ret=0
tar --zstd -C "${JENKINS_HOME}" -cf "${BACKUP_TMP_DIR}/${BACKUP_NUMBER}.tar.zstd" \
    --exclude 'jobs/*/workspace*' \
    --no-wildcards-match-slash --anchored \
    --ignore-failed-read \
    --exclude 'jobs/*/config.xml' -c jobs || ret=$?

if [[ "$ret" -eq 0 ]]; then
    _log "INFO" "[backup] backup ${BACKUP_NUMBER} was completed without warnings"
elif [[ "$ret" -eq 1 ]]; then
    _log "INFO" "[backup] backup ${BACKUP_NUMBER} was completed with some warnings"
else
    # Any other status is still tolerated (the archive is published below),
    # but it is no longer silent.
    _log "WARNING" "[backup] tar exited with status ${ret}, backup ${BACKUP_NUMBER} may be incomplete"
fi

# atomically create a backup file
mv "${BACKUP_TMP_DIR}/${BACKUP_NUMBER}.tar.zstd" "${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd"

rm -rf "${BACKUP_TMP_DIR}"
_log "INFO" "[backup] cleaning ${BACKUP_TMP_DIR} and trap file ${TRAP_FILE}"
# The archive is already in place; a cleanup hiccup must not abort the script here.
_clean || true

if [[ ! -s "${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd" ]]; then
    _log "ERROR" "[backup] file '${BACKUP_DIR}/${BACKUP_NUMBER}.tar.zstd' is empty"
    exit 1
fi

_log "INFO" "[backup] ${BACKUP_NUMBER} done"
exit 0
|
||||
|
|
|
|||
|
|
@ -1,30 +1,26 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -eo pipefail
|
||||
source "$(dirname "$0")/utils.sh"
|
||||
|
||||
# Tell whether a directory contains no backup archive.
# Arguments: $1 - directory to inspect
# Returns:   0 when nothing matches "$1"/*.tar.* (this includes a missing
#            directory), 1 when at least one matching entry exists.
# The check uses a plain glob, so no external command is run and the caller's
# shell options (in particular `set -e`) are left exactly as they were.
is_backup_not_exist() {
    local backup_dir="$1"
    local candidate

    for candidate in "${backup_dir}"/*.tar.*; do
        # An unmatched glob is passed through literally, so make sure the
        # entry really exists (-L also accepts a dangling symlink).
        if [[ -e "${candidate}" || -L "${candidate}" ]]; then
            return 1
        fi
    done
    return 0
}
|
||||
|
||||
[[ -z "${BACKUP_DIR}" ]] && { echo "Required 'BACKUP_DIR' env not set"; exit 1; }
|
||||
[[ -z "${BACKUP_DIR}" ]] && { _log "ERROR" "Required 'BACKUP_DIR' env not set"; exit 1; }
|
||||
|
||||
# Check if we have any backup
|
||||
if is_backup_not_exist "${BACKUP_DIR}"; then
|
||||
|
|
|
|||
|
|
@ -1,29 +1,47 @@
|
|||
#!/usr/bin/env bash
# Restore backup number $1 from ${BACKUP_DIR} into ${JENKINS_HOME}.
#
# Usage:        $0 <backup number>
# Required env: BACKUP_DIR, JENKINS_HOME
# Optional env: RESTORE_RETRY_COUNT    (default 10) - how many times to wait for a running restore
#               RESTORE_RETRY_INTERVAL (default 10) - seconds to sleep between two checks

set -eo pipefail
source "$(dirname "$0")/utils.sh"

# Validate the input with explicit if-blocks: in a `test && _log && exit 1`
# chain a failing _log skips the exit and the script carries on with bad input.
if [[ $# -ne 1 ]]; then
    _log "ERROR" "Usage: $0 <backup number>"
    exit 1
fi
if [[ -z "${BACKUP_DIR}" ]]; then
    _log "ERROR" "Required 'BACKUP_DIR' env not set"
    exit 1
fi
if [[ -z "${JENKINS_HOME}" ]]; then
    _log "ERROR" "Required 'JENKINS_HOME' env not set"
    exit 1
fi

BACKUP_NUMBER=$1
RESTORE_RETRY_COUNT=${RESTORE_RETRY_COUNT:-10}
RESTORE_RETRY_INTERVAL=${RESTORE_RETRY_INTERVAL:-10}

# --> Check if another restore process is running (operator restart/crash)
# Lock file: its presence means a restore of this backup number is in progress.
TRAP_FILE="/tmp/_restore_${BACKUP_NUMBER}_is_running"

for ((i=0; i<RESTORE_RETRY_COUNT; i++)); do
    if [[ ! -f "${TRAP_FILE}" ]]; then
        _log "INFO" "[restore] no other process are running, restoring"
        break
    fi
    _log "INFO" "[restore] is already running. Waiting for ${RESTORE_RETRY_INTERVAL} seconds..."
    sleep "${RESTORE_RETRY_INTERVAL}"
done
if [[ -f "${TRAP_FILE}" ]]; then
    _log "ERROR" "[restore] is still running after waiting ${RESTORE_RETRY_COUNT} time ${RESTORE_RETRY_INTERVAL}s. Exiting."
    exit 1
fi
# --< Done

_log "INFO" "[restore] restore backup with backup number #${BACKUP_NUMBER}"
touch "${TRAP_FILE}"
# The lock is ours only from this point on, so the traps are installed here
# and not earlier: an interrupted wait must never delete the lock of the
# process it was waiting for. The EXIT trap releases the lock on every exit
# path (missing backup file, tar failure under `set -e`, normal end); the
# signal traps turn an interruption into a regular exit so it fires as well.
trap 'rm -f -- "${TRAP_FILE}"' EXIT
trap 'exit 1' SIGINT SIGTERM

BACKUP_FILE="${BACKUP_DIR}/${BACKUP_NUMBER}"

# Pick the tar options and the file extension that match the archive found.
TAR_OPTS=()
if [[ -f "$BACKUP_FILE.tar.gz" ]]; then
    _log "INFO" "[restore] old format tar.gz found, restoring it"
    EXT="tar.gz"
elif [[ -f "$BACKUP_FILE.tar.zstd" ]]; then
    _log "INFO" "[restore] Backup file found, proceeding"
    TAR_OPTS=(--zstd)
    EXT="tar.zstd"
else
    _log "ERROR" "[restore] backup file not found: $BACKUP_FILE"
    exit 1
fi

tar "${TAR_OPTS[@]}" -C "${JENKINS_HOME}" -xf "${BACKUP_DIR}/${BACKUP_NUMBER}.${EXT}"

_log "INFO" "[restore] deleting lock file ${TRAP_FILE}"
rm -f -- "${TRAP_FILE}"
_log "INFO" "[restore] restoring ${BACKUP_NUMBER} Done"
exit 0
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -eo pipefail
|
||||
source "$(dirname "$0")/utils.sh"
|
||||
|
||||
# Use 60 as the default when BACKUP_CLEANUP_INTERVAL is not set
|
||||
BACKUP_CLEANUP_INTERVAL=${BACKUP_CLEANUP_INTERVAL:=60}
|
||||
|
|
@ -8,7 +9,7 @@ BACKUP_CLEANUP_INTERVAL=${BACKUP_CLEANUP_INTERVAL:=60}
|
|||
# Ensure required environment variables are set
|
||||
# Exit with status 1, after reporting it, when the environment variable whose
# name is given in $1 is unset or empty.
# ${!1} is bash indirect expansion: it reads the variable named by $1.
# NOTE(review): both an echo and a _log line carry the same message here;
# presumably they are the pre- and post-change lines of the diff - confirm.
check_env_var() {
|
||||
if [[ -z "${!1}" ]]; then
|
||||
echo "Required '$1' environment variable is not set"
|
||||
_log "ERROR" "Required '$1' environment variable is not set"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
|
@ -41,7 +42,7 @@ find_exceeding_backups() {
|
|||
local backup_count="$2"
|
||||
# Check if we have any backup
|
||||
if is_backup_not_exist "${backup_dir}"; then
|
||||
echo "backups not found in ${backup_dir}" >&2
|
||||
_log "ERROR" "[run] backups not found in ${backup_dir}"
|
||||
return
|
||||
fi
|
||||
find "${backup_dir}"/*.tar.zstd -maxdepth 0 -exec basename {} \; | sort -gr | tail -n +$((backup_count +1))
|
||||
|
|
@ -51,9 +52,9 @@ check_env_var "BACKUP_DIR"
|
|||
check_env_var "JENKINS_HOME"
|
||||
|
||||
if [[ -z "${BACKUP_COUNT}" ]]; then
|
||||
echo "ATTENTION! No BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script"
|
||||
_log "WARNING" "[run] no BACKUP_COUNT set, it means you MUST delete old backups manually or by custom script"
|
||||
else
|
||||
echo "Retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds"
|
||||
_log "INFO" "[run] retaining only the ${BACKUP_COUNT} most recent backups, cleanup occurs every ${BACKUP_CLEANUP_INTERVAL} seconds"
|
||||
fi
|
||||
|
||||
while true;
|
||||
|
|
@ -62,7 +63,7 @@ do
|
|||
if [[ -n "${BACKUP_COUNT}" ]]; then
|
||||
exceeding_backups=$(find_exceeding_backups "${BACKUP_DIR}" "${BACKUP_COUNT}")
|
||||
if [[ -n "$exceeding_backups" ]]; then
|
||||
echo "Removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')"
|
||||
_log "INFO" "[run] removing backups: $(echo "$exceeding_backups" | tr '\n' ', ' | sed 's/,$//')"
|
||||
echo "$exceeding_backups" | while read -r file; do
|
||||
rm "${BACKUP_DIR}/${file}"
|
||||
done
|
||||
|
|
|
|||
|
|
@ -0,0 +1,14 @@
|
|||
#!/usr/bin/env bash
|
||||
# Common utils
|
||||
|
||||
# Write a timestamped log line: "<YYYY-mm-dd HH:MM:SS> - <level> - <message>".
# Arguments: $1 - level; ERROR, ERR, error and err are treated as errors
#            $2 - message text
# Outputs:   error levels go to the stderr of PID 1 (/proc/1/fd/2), falling
#            back to this process' stderr when that file cannot be written;
#            every other level goes to the stdout of PID 1 (/proc/1/fd/1) on
#            a best-effort basis and, always, to this process' stderr.
#            Nothing is ever written to this process' stdout, so the function
#            can be used inside command substitutions.
# Returns:   0 unless writing to stderr itself fails. A missing or read-only
#            /proc/1/fd/N must not make logging fail: callers use
#            `_log ... && exit 1` guards and run under `set -e`.
_log() {
    local level="$1"
    local message="$2"
    local timestamp line
    # Declared separately from the assignment so the status of date is not masked.
    timestamp=$(date '+%Y-%m-%d %H:%M:%S') || timestamp="-"
    line="${timestamp} - ${level} - ${message}"
    if [[ "$level" =~ ^(ERROR|ERR|error|err)$ ]]; then
        { printf '%s\n' "${line}" > /proc/1/fd/2; } 2>/dev/null \
            || printf '%s\n' "${line}" >&2
    else
        { printf '%s\n' "${line}" > /proc/1/fd/1; } 2>/dev/null || true
        printf '%s\n' "${line}" >&2
    fi
}
|
||||
|
|
@ -30,7 +30,7 @@ Kubernetes native operator which fully manages Jenkins on Kubernetes
|
|||
| jenkins.backup.env[2].name | string | `"BACKUP_COUNT"` | |
|
||||
| jenkins.backup.env[2].value | string | `"3"` | |
|
||||
| jenkins.backup.getLatestAction[0] | string | `"/home/user/bin/get-latest.sh"` | |
|
||||
| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6"` | |
|
||||
| jenkins.backup.image | string | `"quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1"` | |
|
||||
| jenkins.backup.interval | int | `30` | |
|
||||
| jenkins.backup.makeBackupBeforePodDeletion | bool | `true` | |
|
||||
| jenkins.backup.pvc.className | string | `""` | |
|
||||
|
|
|
|||
|
|
@ -214,7 +214,7 @@ jenkins:
|
|||
|
||||
# image used by backup feature
|
||||
# By default using prebuilt backup PVC image
|
||||
image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.2.6
|
||||
image: quay.io/jenkins-kubernetes-operator/backup-pvc:v0.4.1
|
||||
|
||||
# containerName is backup container name
|
||||
containerName: backup
|
||||
|
|
@ -262,6 +262,11 @@ jenkins:
|
|||
# BACKUP_DIR - path for storing backup files (default: "/backup")
|
||||
# JENKINS_HOME - path to jenkins home (default: "/jenkins-home")
|
||||
# BACKUP_COUNT - defines how many of the most recent backups are kept
|
||||
# Optional in case you want to modify the backup and restore retry logic
|
||||
# BACKUP_RETRY_COUNT
|
||||
# BACKUP_RETRY_INTERVAL
|
||||
# RESTORE_RETRY_COUNT
|
||||
# RESTORE_RETRY_INTERVAL
|
||||
env:
|
||||
- name: BACKUP_DIR
|
||||
value: /backup
|
||||
|
|
@ -269,6 +274,15 @@ jenkins:
|
|||
value: /jenkins-home
|
||||
- name: BACKUP_COUNT
|
||||
value: "3" # keep only the 3 most recent backups
|
||||
#- name: BACKUP_RETRY_COUNT
|
||||
# value: "3"
|
||||
#- name: BACKUP_RETRY_INTERVAL
|
||||
# value: "60"
|
||||
#- name: RESTORE_RETRY_COUNT
|
||||
# value: "10"
|
||||
#- name: RESTORE_RETRY_INTERVAL
|
||||
# value: "10"
|
||||
|
||||
|
||||
# volumeMounts holds the mount points for volumes
|
||||
volumeMounts:
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ let
|
|||
devShellPackages = [
|
||||
hugo_099_pkgs.hugo #hugo pre-v100
|
||||
pkgs.nodejs_21 #Node.js 21
|
||||
pkgs.helm-docs
|
||||
];
|
||||
baseUrl = ((builtins.fromTOML (builtins.readFile ../website/config.toml)).baseURL);
|
||||
in
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
title: "Configuring backup and restore"
|
||||
linkTitle: "Configuring backup and restore"
|
||||
weight: 5
|
||||
date: 2023-01-08
|
||||
date: 2024-06-25
|
||||
description: >
|
||||
Prevent loss of job history
|
||||
---
|
||||
|
|
@ -115,3 +115,19 @@ spec:
|
|||
command:
|
||||
- /home/user/bin/get-latest.sh # this command is invoked on "backup" container to get last backup number before pod deletion; not having it in the CR may cause loss of data
|
||||
```
|
||||
|
||||
#### Customizing pvc backup behaviour
|
||||
|
||||
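To cope with situations where the operator crashes or gets killed during a backup or restore, the backup and restore scripts use a lock file together with retry logic: a new run waits for a run that is still in progress before it starts, and gives up after a configurable number of attempts.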
|
||||
|
||||
This logic can be customized by adjusting the following environment variables:
|
||||
|
||||
* **Backup**: by default waits up to 180s in total (3 attempts, 60s apart) before giving up
|
||||
* `BACKUP_RETRY_COUNT`: by default is `3`
|
||||
* `BACKUP_RETRY_INTERVAL`: by default is `60`
|
||||
|
||||
* **Restore**: by default waits up to 100s in total (10 attempts, 10s apart) before giving up
|
||||
* `RESTORE_RETRY_COUNT`: by default is `10`
|
||||
* `RESTORE_RETRY_INTERVAL`: by default is `10`
|
||||
|
||||
You can adjust the retry logic based on the size of your backup and the duration of the restore process.
|
||||
|
|
|
|||
Loading…
Reference in New Issue