From 071898c96bcfb482730290293dc591e010ec7595 Mon Sep 17 00:00:00 2001
From: Richard Fussenegger <Fleshgrinder@users.noreply.github.com>
Date: Thu, 23 Jun 2022 02:37:01 +0200
Subject: [PATCH] Overhauled `startup.sh` Script (#1454)

This overhaul turns it into a shellcheck valid script with explicit error handling for all possible situations I could think of. This change takes https://github.com/actions-runner-controller/actions-runner-controller/pull/1409 into account and things can be merged in any order. There are a few important changes here to the logic:

- The wait logic for checking if docker comes up was fundamentally flawed because it checks for the PID. Docker will always come up and thus become visible in the process list, just to immediately die when it encounters an issue, after which supervisor starts it again. This means that our check so far is flaky due to the `sleep 1` it might encounter a PID, or it might not, and the existence of the PID does not mean anything. The `docker ps` check we have in the `entrypoint.sh` script does not suffer from this as it checks for a feature of docker and not a PID. I thus entirely removed the PID check, and instead I am handing things over to our `entrypoint.sh` script by setting the environment variables correctly.
- This change has an influence on the `docker0` interface MTU configuration, because the interface might or might not exist after we started docker. Hence, I changed this to a time boxed loop that tries for one minute to set up the interface's MTU. In case the command fails we log an error and continue with the run.
- I changed the entire MTU handling by validating its value before configuring it, logging an error and continuing without if it is set incorrectly. This ensures that we are not going to send our users on a bug hunt.
- The way we started supervisord did not make much sense to me. It sends itself into the background automatically, there is no need for us to do so with Bash.

The decision to not fail on errors but continue is a deliberate choice, because I believe that running a build is more important than having a perfectly configured system. However, this strategy might also hide issues for all users who are not properly checking their logs. It also makes testing harder. Hence, we could change all error conditions from graceful to panicking. We should then align the exit codes across `startup.sh` and `entrypoint.sh` to ensure that every possible error condition has its own unique error code for easy debugging.
---
 runner/startup.sh | 119 ++++++++++++++++++++++++----------------------
 1 file changed, 62 insertions(+), 57 deletions(-)
diff --git a/runner/startup.sh b/runner/startup.sh
index 6352a0d5..dd287f8a 100755
--- a/runner/startup.sh
+++ b/runner/startup.sh
@@ -1,70 +1,75 @@
-#!/bin/bash
-source logger.bash
+#!/usr/bin/env bash
+set -Eeuo pipefail
 
-function wait_for_process () {
-    local max_time_wait=30
-    local process_name="$1"
-    local waited_sec=0
-    while ! pgrep "$process_name" >/dev/null && ((waited_sec < max_time_wait)); do
-        log.debug "Process $process_name is not running yet. Retrying in 1 seconds"
-        log.debug "Waited $waited_sec seconds of $max_time_wait seconds"
-        sleep 1
-        ((waited_sec=waited_sec+1))
-        if ((waited_sec >= max_time_wait)); then
-            return 1
-        fi
-    done
-    return 0
-}
+# This path can only be customized through arguments passed to the dockerd cli
+# at startup. Parsing the supervisor docker.conf file is cumbersome, and there
+# is little reason to allow users to customize this. Hence, we hardcode the path
+# here.
+#
+# See: https://docs.docker.com/engine/reference/commandline/dockerd/
+readonly dockerd_config_path=/etc/docker/daemon.json
+readonly dockerd_supervisor_config_path=/etc/supervisor/conf.d/dockerd.conf
 
-sudo /bin/bash <<SCRIPT
-mkdir -p /etc/docker
+# Extend existing config...
+dockerd_config=$(! cat $dockerd_config_path 2>&-)
 
-echo "{}" > /etc/docker/daemon.json
-
-if [ -n "${MTU}" ]; then
-jq ".\"mtu\" = ${MTU}" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
-# See https://docs.docker.com/engine/security/rootless/
-echo "environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=${MTU}" >> /etc/supervisor/conf.d/dockerd.conf
+# ...and fallback to an empty object if there is no existing configuration, or
+# otherwise the following `jq` merge commands would not produce anything.
+if [[ $dockerd_config == '' ]]; then
+  dockerd_config='{}'
 fi
 
-if [ -n "${DOCKER_REGISTRY_MIRROR}" ]; then
-jq ".\"registry-mirrors\"[0] = \"${DOCKER_REGISTRY_MIRROR}\"" /etc/docker/daemon.json > /tmp/.daemon.json && mv /tmp/.daemon.json /etc/docker/daemon.json
+readonly MTU=${MTU:-}
+if [[ $MTU ]]; then
+  # We have to verify that this value is a valid integer because we are going to
+  # start docker in the background and invalid values would never be surfaced.
+  # Well, except that startup fails and our users are sent on a nice debugging
+  # session.
+  if [[ $MTU =~ ^[1-9][0-9]*$ ]]; then
+    # See https://docs.docker.com/engine/security/rootless/
+    sudo tee -a $dockerd_supervisor_config_path 1>&- <<<"environment=DOCKERD_ROOTLESS_ROOTLESSKIT_MTU=$MTU"
+    dockerd_config=$(jq -S ".mtu = $MTU" <<<"$dockerd_config")
+  else
+    # shellcheck source=runner/logger.bash
+    source logger.bash
+    log.error "Docker MTU must be an integer, continuing bootstrapping without it, got: $MTU"
+    MTU=
+  fi
 fi
-SCRIPT
 
-dump() {
-  local path=${1:?missing required <path> argument}
-  shift
-  printf -- "%s\n---\n" "${*//\{path\}/"$path"}" 1>&2
-  cat "$path" 1>&2
-  printf -- '---\n' 1>&2
-}
+if [[ ${DOCKER_REGISTRY_MIRROR:-} != '' ]]; then
+  dockerd_config=$(jq -S --arg url "$DOCKER_REGISTRY_MIRROR" '."registry-mirrors" += ["$url"]' <<<"$dockerd_config")
+fi
 
-for config in /etc/docker/daemon.json /etc/supervisor/conf.d/dockerd.conf; do
-  dump "$config" 'Using {path} with the following content:'
+sudo mkdir -p ${dockerd_config_path%/*}
+sudo tee $dockerd_config_path 1>&- <<<"$dockerd_config"
+
+for config in $dockerd_config_path $dockerd_supervisor_config_path; do
+  (
+    echo "Using '$config' with the following content <<CONFIG"
+    cat $config
+    echo 'CONFIG'
+  ) 1>&2
 done
 
-log.debug 'Starting supervisor daemon'
-sudo /usr/bin/supervisord -n >> /dev/null 2>&1 &
+sudo /usr/bin/supervisord -c /etc/supervicor/supervisord.conf
 
-log.debug 'Waiting for processes to be running...'
-processes=(dockerd)
-
-for process in "${processes[@]}"; do
-    wait_for_process "$process"
-    if [ $? -ne 0 ]; then
-        log.error "$process is not running after max time"
-        dump /var/log/dockerd.err.log 'Dumping {path} to aid investigation'
-        exit 1
-    else
-        log.debug "$process is running"
-    fi
-done
-
-if [ -n "${MTU}" ]; then
-  sudo ifconfig docker0 mtu ${MTU} up
+if [[ $MTU ]]; then
+  # It might take a few ticks for the `docker0` device to be up after starting
+  # the docker daemon with supervisord, hence, we retry for a little while.
+  # Notice how we use `nice` to execute the command with a least favorable
+  # scheduling priority and are using a noop (`:`) instruction instead of
+  # `sleep`. We do this to ensure that we keep the startup as fast as possible.
+  if ! nice -n 19 timeout -k 70 60 -- bash -c "(until sudo ifconfig docker0 mtu '$MTU' up; do :; done) 1>&- 2>&-"; then
+    # shellcheck source=runner/logger.bash
+    source logger.bash
+    log.error 'Failed to set docker interface mtu within 1 minute, continuing bootstrapping without it...'
+  fi
 fi
 
-# Wait processes to be running
-entrypoint.sh
+# We leave the wait for docker to come up to our entrypoint.sh script, since its
+# check properly handles the case where the docker service is in a crash loop.
+# Users still have the ability to disable the wait.
+export DOCKER_ENABLED=true
+export DISABLE_WAIT_FOR_DOCKER=${DISABLE_WAIT_FOR_DOCKER:-false}
+exec entrypoint.sh