diff --git a/.github/workflows/validate-runners.yaml b/.github/workflows/validate-runners.yaml
index 99ccbfa8..8de3ee74 100644
--- a/.github/workflows/validate-runners.yaml
+++ b/.github/workflows/validate-runners.yaml
@@ -6,7 +6,7 @@ on:
       - '**'
     paths:
       - 'runner/**'
-      - 'test/entrypoint/**'
+      - 'test/startup/**'
       - '!**.md'
 
 permissions:
@@ -42,4 +42,4 @@ jobs:
       - name: Run tests
         run: |
-          make acceptance/runner/entrypoint
+          make acceptance/runner/startup
diff --git a/Makefile b/Makefile
index 332dac4a..df79619c 100644
--- a/Makefile
+++ b/Makefile
@@ -117,7 +117,7 @@ generate: controller-gen
 # Run shellcheck on runner scripts
 shellcheck: shellcheck-install
-	$(TOOLS_PATH)/shellcheck --shell bash --source-path runner runner/*.bash runner/*.sh
+	$(TOOLS_PATH)/shellcheck --shell bash --source-path runner runner/*.sh
 
 docker-buildx:
 	export DOCKER_CLI_EXPERIMENTAL=enabled ;\
@@ -203,8 +203,8 @@ acceptance/deploy:
 acceptance/tests:
 	acceptance/checks.sh
 
-acceptance/runner/entrypoint:
-	cd test/entrypoint/ && bash test.sh
+acceptance/runner/startup:
+	cd test/startup/ && bash test.sh
 
 # We use -count=1 instead of `go clean -testcache`
 # See https://terratest.gruntwork.io/docs/testing-best-practices/avoid-test-caching/
diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml
index e0ea068c..9d4fe5eb 100644
--- a/acceptance/testdata/runnerdeploy.envsubst.yaml
+++ b/acceptance/testdata/runnerdeploy.envsubst.yaml
@@ -8,6 +8,16 @@ provisioner: rancher.io/local-path
 reclaimPolicy: Delete
 volumeBindingMode: WaitForFirstConsumer
 ---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: ${NAME}-rootless-dind-work-dir
+  labels:
+    content: ${NAME}-rootless-dind-work-dir
+provisioner: rancher.io/local-path
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+---
 apiVersion: actions.summerwind.dev/v1alpha1
 kind: RunnerDeployment
 metadata:
@@ -49,7 +59,12 @@ spec:
       labels:
         - "${RUNNER_LABEL}"
+      serviceAccountName: ${RUNNER_SERVICE_ACCOUNT_NAME}
+      terminationGracePeriodSeconds: ${RUNNER_TERMINATION_GRACE_PERIOD_SECONDS}
+      env:
+        - name: RUNNER_GRACEFUL_STOP_TIMEOUT
+          value: "${RUNNER_GRACEFUL_STOP_TIMEOUT}"
         - name: ROLLING_UPDATE_PHASE
           value: "${ROLLING_UPDATE_PHASE}"
         - name: ARC_DOCKER_MTU_PROPAGATION
@@ -57,6 +72,30 @@ spec:
       dockerMTU: 1400
 
+      # Fix the following no space left errors with rootless-dind runners that can happen while running buildx build:
+      # ------
+      # > [4/5] RUN go mod download:
+      # ------
+      # ERROR: failed to solve: failed to prepare yxsw8lv9hqnuafzlfta244l0z: mkdir /home/runner/.local/share/docker/vfs/dir/yxsw8lv9hqnuafzlfta244l0z/usr/local/go/src/cmd/compile/internal/types2/testdata: no space left on device
+      # Error: Process completed with exit code 1.
+      #
+      volumeMounts:
+        - name: rootless-dind-work-dir
+          # Omit the /share/docker part of the /home/runner/.local/share/docker as
+          # that part is created by dockerd.
+          mountPath: /home/runner/.local
+          readOnly: false
+      volumes:
+        - name: rootless-dind-work-dir
+          ephemeral:
+            volumeClaimTemplate:
+              spec:
+                accessModes: [ "ReadWriteOnce" ]
+                storageClassName: "${NAME}-rootless-dind-work-dir"
+                resources:
+                  requests:
+                    storage: 3Gi
+
       #
       # Non-standard working directory
       #
@@ -72,7 +111,6 @@ spec:
             resources:
               requests:
                 storage: 10Gi
-      serviceAccountName: ${RUNNER_SERVICE_ACCOUNT_NAME}
 ---
 apiVersion: actions.summerwind.dev/v1alpha1
 kind: HorizontalRunnerAutoscaler
diff --git a/acceptance/testdata/runnerset.envsubst.yaml b/acceptance/testdata/runnerset.envsubst.yaml
index f9e6147e..4c957aad 100644
--- a/acceptance/testdata/runnerset.envsubst.yaml
+++ b/acceptance/testdata/runnerset.envsubst.yaml
@@ -54,6 +54,16 @@ provisioner: rancher.io/local-path
 reclaimPolicy: Retain
 volumeBindingMode: WaitForFirstConsumer
 ---
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: ${NAME}-rootless-dind-work-dir
+  labels:
+    content: ${NAME}-rootless-dind-work-dir
+provisioner: rancher.io/local-path
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+---
 apiVersion: actions.summerwind.dev/v1alpha1
 kind: RunnerSet
 metadata:
@@ -113,10 +123,20 @@ spec:
         app: ${NAME}
     spec:
       serviceAccountName: ${RUNNER_SERVICE_ACCOUNT_NAME}
+      terminationGracePeriodSeconds: ${RUNNER_TERMINATION_GRACE_PERIOD_SECONDS}
      containers:
+      # # Uncomment this only when you're using a non-dind runner with a docker sidecar
+      # - name: docker
+      #   # The image is required for the dind sidecar definition within the RunnerSet spec
+      #   image: "docker:dind"
+      #   env:
+      #     - name: RUNNER_GRACEFUL_STOP_TIMEOUT
+      #       value: "${RUNNER_GRACEFUL_STOP_TIMEOUT}"
       - name: runner
         imagePullPolicy: IfNotPresent
         env:
+        - name: RUNNER_GRACEFUL_STOP_TIMEOUT
+          value: "${RUNNER_GRACEFUL_STOP_TIMEOUT}"
         - name: RUNNER_FEATURE_FLAG_EPHEMERAL
           value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
         - name: GOMODCACHE
@@ -168,7 +188,15 @@ spec:
       #   #
       #   # For buildx cache
       #   - name: cache
       #     mountPath: "/home/runner/.cache"
-      # Comment out the ephemeral work volume if you're going to test the kubernetes container mode
+
+        # For fixing "no space left on device" errors on rootless dind runners
+        - name: rootless-dind-work-dir
+          # Omit the /share/docker part of the /home/runner/.local/share/docker as
+          # that part is created by dockerd.
+          mountPath: /home/runner/.local
+          readOnly: false
+
+      # Comment out the ephemeral work volume if you're going to test the kubernetes container mode
       # volumes:
       # - name: work
       #   ephemeral:
       #     volumeClaimTemplate:
       #       spec:
       #         accessModes: [ "ReadWriteOnce" ]
       #         storageClassName: "${NAME}-work"
       #         resources:
       #           requests:
       #             storage: 10Gi
+
+      # Fix the following no space left errors with rootless-dind runners that can happen while running buildx build:
+      # ------
+      # > [4/5] RUN go mod download:
+      # ------
+      # ERROR: failed to solve: failed to prepare yxsw8lv9hqnuafzlfta244l0z: mkdir /home/runner/.local/share/docker/vfs/dir/yxsw8lv9hqnuafzlfta244l0z/usr/local/go/src/cmd/compile/internal/types2/testdata: no space left on device
+      # Error: Process completed with exit code 1.
+      #
+      volumes:
+        - name: rootless-dind-work-dir
+          ephemeral:
+            volumeClaimTemplate:
+              spec:
+                accessModes: [ "ReadWriteOnce" ]
+                storageClassName: "${NAME}-rootless-dind-work-dir"
+                resources:
+                  requests:
+                    storage: 3Gi
   volumeClaimTemplates:
   - metadata:
       name: vol1
diff --git a/controllers/new_runner_pod_test.go b/controllers/new_runner_pod_test.go
index e273fde8..2d95601e 100644
--- a/controllers/new_runner_pod_test.go
+++ b/controllers/new_runner_pod_test.go
@@ -190,6 +190,17 @@ func TestNewRunnerPod(t *testing.T) {
 					SecurityContext: &corev1.SecurityContext{
 						Privileged: func(b bool) *bool { return &b }(true),
 					},
+					Lifecycle: &corev1.Lifecycle{
+						PreStop: &corev1.LifecycleHandler{
+							Exec: &corev1.ExecAction{
+								Command: []string{
+									"/bin/sh",
+									"-c",
+									"timeout \"${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}\" /bin/sh -c \"echo 'Prestop hook started'; while [ -f /runner/.runner ]; do sleep 1; done; echo 'Waiting for dockerd to start'; while ! pgrep -x dockerd; do sleep 1; done; echo 'Prestop hook stopped'\" >/proc/1/fd/1 2>&1",
+								},
+							},
+						},
+					},
 				},
 			},
 			RestartPolicy: corev1.RestartPolicyNever,
@@ -709,6 +720,17 @@ func TestNewRunnerPodFromRunnerController(t *testing.T) {
 					SecurityContext: &corev1.SecurityContext{
 						Privileged: func(b bool) *bool { return &b }(true),
 					},
+					Lifecycle: &corev1.Lifecycle{
+						PreStop: &corev1.LifecycleHandler{
+							Exec: &corev1.ExecAction{
+								Command: []string{
+									"/bin/sh",
+									"-c",
+									"timeout \"${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}\" /bin/sh -c \"echo 'Prestop hook started'; while [ -f /runner/.runner ]; do sleep 1; done; echo 'Waiting for dockerd to start'; while ! pgrep -x dockerd; do sleep 1; done; echo 'Prestop hook stopped'\" >/proc/1/fd/1 2>&1",
+								},
+							},
+						},
+					},
 				},
 			},
 			RestartPolicy: corev1.RestartPolicyNever,
diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 980b0aee..ae306edc 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -1151,6 +1151,27 @@ func newRunnerPodWithContainerMode(containerMode string, template corev1.Pod, ru
 				fmt.Sprintf("--registry-mirror=%s", dockerRegistryMirror),
 			)
 		}
+
+		dockerdContainer.Lifecycle = &corev1.Lifecycle{
+			PreStop: &corev1.LifecycleHandler{
+				Exec: &corev1.ExecAction{
+					Command: []string{
+						"/bin/sh", "-c",
+						// A prestop hook can start before dockerd starts up, for example, when the docker init is still provisioning
+						// the TLS key and the cert to be used by dockerd.
+						//
+						// The author of this prestop script encountered issues where the prestop hung for ten or more minutes on his cluster.
+						// He realized that the hang happened when a prestop hook was executed while the docker init was provisioning the key and cert.
+						// Assuming that was because the docker init ignored the SIGTERM sent by K8s after the prestop hook at that time,
+						// so that K8s had to wait for terminationGracePeriodSeconds to elapse before finally killing the container,
+						// he wrote this script so that it tries to delay SIGTERM until dockerd starts and becomes ready to process the signal.
+						//
+						// Also note that we don't need to run `pkill dockerd` at the end of the prestop script, as SIGTERM is sent by K8s after the prestop hook has completed.
+						`timeout "${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}" /bin/sh -c "echo 'Prestop hook started'; while [ -f /runner/.runner ]; do sleep 1; done; echo 'Waiting for dockerd to start'; while ! pgrep -x dockerd; do sleep 1; done; echo 'Prestop hook stopped'" >/proc/1/fd/1 2>&1`,
+					},
+				},
+			},
+		}
 	}
 
 	if runnerContainerIndex == -1 {
diff --git a/docs/detailed-docs.md b/docs/detailed-docs.md
index 84eee4d3..ad4d1be8 100644
--- a/docs/detailed-docs.md
+++ b/docs/detailed-docs.md
@@ -36,6 +36,7 @@ ToC:
   - [Runner with rootless DinD](#runner-with-rootless-dind)
   - [Runner with k8s jobs](#runner-with-k8s-jobs)
 - [Additional Tweaks](#additional-tweaks)
+  - [Runner Graceful Termination](#runner-graceful-termination)
   - [Custom Volume mounts](#custom-volume-mounts)
   - [Runner Labels](#runner-labels)
   - [Runner Groups](#runner-groups)
@@ -1220,6 +1221,66 @@ spec:
       #   privileged: true
 ```
 
+### Runner Graceful Termination
+
+As of ARC 0.27.0 (unreleased as of 2022/09/30), runners wait for only 15 seconds by default on pod termination.
+
+This can be problematic in two scenarios:
+
+- Scenario 1 - RunnerSet-only: You're frequently triggering updates other than replica changes to your `RunnerSet`. With the current implementation, every update except a `replicas` change to a RunnerSet may terminate in-progress workflow jobs, causing them to fail.
+- Scenario 2 - RunnerDeployment and RunnerSet: You have another Kubernetes controller that evicts runner pods directly, without consulting ARC.
+
+> RunnerDeployment is not affected by Scenario 1, as RunnerDeployment-managed runners already tolerate arbitrarily long in-progress jobs while being replaced; their graceful termination process is handled outside of the entrypoint and the Kubernetes pod termination process.
+
+To make termination more reliable, set the `spec.template.spec.terminationGracePeriodSeconds` field and the `RUNNER_GRACEFUL_STOP_TIMEOUT` environment variable appropriately.
+
+If you want the pod to terminate within approximately 110 seconds of the termination request, try a `terminationGracePeriodSeconds` of `110` and a `RUNNER_GRACEFUL_STOP_TIMEOUT` of around `90`.
+
+The appropriate gap between `terminationGracePeriodSeconds` and `RUNNER_GRACEFUL_STOP_TIMEOUT` can vary depending on your environment and cluster.
+
+The idea is twofold:
+
+- `RUNNER_GRACEFUL_STOP_TIMEOUT` gives the runner the longest possible time to wait for the in-progress job to complete. Keep it smaller than `terminationGracePeriodSeconds` so that you don't unnecessarily cancel running jobs.
+- `terminationGracePeriodSeconds` gives the runner the longest possible time to stop before it disappears. If the pod is forcefully terminated before a graceful stop completes, the job running within the runner pod can hang for about 10 minutes in the GitHub Actions Workflow Run/Job UI. A correct value here avoids the hang, even if the running job had to be cancelled due to the approaching deadline.
+
+> We know the default 15-second timeout is too short to be useful at all.
+> In the near future, we might raise the default to, for example, 100 seconds, so that runners whose jobs tend to run for up to 100 seconds can
+> terminate gracefully without failing those jobs. It would also allow a job that was running on a node requested for termination
+> to correctly report its status as "cancelled", rather than hanging for approximately 10 minutes in the Actions Web UI until it finally fails (without any specific error message).
+> 100 seconds is just an example. It might be a good default in case you're using AWS EC2 Spot Instances, because they tend to send a
+> termination notice two minutes before the termination.
+> If you have any other suggestions for the default value, please share your thoughts in Discussions.
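+
+To put the two settings together, here's a minimal sketch of a `RunnerDeployment` aiming for the "~110 seconds" example above (the resource and repository names and the numbers are illustrative, not recommended defaults):
+
+```yaml
+apiVersion: actions.summerwind.dev/v1alpha1
+kind: RunnerDeployment
+metadata:
+  name: example-runnerdeploy
+spec:
+  template:
+    spec:
+      repository: example/myrepo
+      # Hard upper bound for the whole pod termination process, enforced by Kubernetes.
+      terminationGracePeriodSeconds: 110
+      env:
+        # How long the runner waits for the in-progress job to complete.
+        # Keep this smaller than terminationGracePeriodSeconds.
+        - name: RUNNER_GRACEFUL_STOP_TIMEOUT
+          value: "90"
+```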
+
+#### Status and Future of this feature
+
+Note that this feature is currently intended for runner pods that are terminated by other Kubernetes controllers or human operators, or that are replaced by the ARC RunnerSet controller due to spec changes other than `replicas`. RunnerDeployment has no issue in that scenario. Non-dind runners are affected too, but this feature does not support them yet.
+
+For example, a runner pod can be terminated prematurely by cluster-autoscaler when it's about to terminate the node on cluster scale-down.
+All variants of RunnerDeployment- and RunnerSet-managed runner pods, including runners with dockerd sidecars and rootless and rootful dind runners, are affected by it. For dind runner pods only, you can use this feature to fix or alleviate the issue.
+
+To be clear, an increase or decrease in the desired replicas of a RunnerDeployment or RunnerSet will never result in workflow jobs being terminated prematurely.
+That's because it's handled BEFORE the runner pod is terminated, by the respective ARC controller.
+
+For anyone interested in improving this: adding a dedicated pod finalizer for this issue will never work.
+That's because a pod finalizer can't prevent SIGTERM from being sent once deletionTimestamp is updated to non-zero,
+which triggers the Kubernetes pod termination process anyway.
+What we want here is to delay the SIGTERM sent to the `actions/runner` process running within the runner container of the runner pod,
+not to block the removal of the pod resource from the Kubernetes cluster.
+
+Also, handling all the graceful termination scenarios with a single method may or may not work.
+
+The most viable option would be to do the graceful termination handling entirely in the SIGTERM handler within the runner entrypoint.
+But this may or may not work long-term, as it's subject to terminationGracePeriodSeconds anyway, and the author of this note thinks there is still
+no formally defined limit for terminationGracePeriodSeconds, hence we aren't sure how long terminationGracePeriodSeconds can be set in practice.
+Also, I think the max workflow job duration is approximately 24h, so Kubernetes would have to formally support a terminationGracePeriodSeconds of 24h if
+we were to move entirely to the entrypoint-based solution.
+If you have any insights about this matter, chime in on the development of ARC!
+
+That's why we still rely on ARC's own graceful termination logic in the Runner controller for spec changes and replica increases/decreases of RunnerDeployment, and
+for replica increases/decreases of RunnerSet, even though we now have the entrypoint-based graceful stop handler.
+
+Our plan is to improve the RunnerSet to have the same logic as the Runner controller, so that you don't need this SIGTERM-handler-based feature for spec changes of RunnerSet.
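+
+For illustration, here's a simplified sketch of what such an entrypoint-based SIGTERM handler boils down to. It mirrors the `graceful-stop.sh` approach added in this PR (waiting for the `/runner/.runner` registration file to disappear, bounded by `RUNNER_GRACEFUL_STOP_TIMEOUT`), but it is not the exact script shipped in the runner images:
+
+```bash
+#!/bin/bash
+# Wait for the in-progress job to complete (signalled by the /runner/.runner
+# registration file disappearing), bounded by RUNNER_GRACEFUL_STOP_TIMEOUT.
+graceful_stop() {
+  echo "Executing graceful stop"
+  local started
+  started=$(date +%s)
+  while [ -f /runner/.runner ]; do
+    if [ $(( $(date +%s) - started )) -ge "${RUNNER_GRACEFUL_STOP_TIMEOUT:-15}" ]; then
+      echo "Timed out waiting for the runner to finish the in-progress job"
+      break
+    fi
+    sleep 1
+  done
+}
+
+trap graceful_stop TERM
+```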
+
 ### Custom Volume mounts
 
 You can configure your own custom volume mounts.
 For example to have the work/docker data in memory or on NVME SSD, for
diff --git a/runner/actions-runner-dind-rootless.dockerfile b/runner/actions-runner-dind-rootless.dockerfile
index 46e88705..db5f8751 100644
--- a/runner/actions-runner-dind-rootless.dockerfile
+++ b/runner/actions-runner-dind-rootless.dockerfile
@@ -102,9 +102,9 @@ RUN export ARCH=$(echo ${TARGETPLATFORM} | cut -d / -f2) \
     && curl -f -L -o /usr/local/bin/dumb-init https://github.com/Yelp/dumb-init/releases/download/v${DUMB_INIT_VERSION}/dumb-init_${DUMB_INIT_VERSION}_${ARCH} \
     && chmod +x /usr/local/bin/dumb-init
 
-COPY entrypoint.sh logger.bash rootless-startup.sh update-status /usr/bin/
+COPY entrypoint-dind-rootless.sh startup.sh logger.sh graceful-stop.sh update-status /usr/bin/
 
-RUN chmod +x /usr/bin/rootless-startup.sh /usr/bin/entrypoint.sh
+RUN chmod +x /usr/bin/entrypoint-dind-rootless.sh /usr/bin/startup.sh
 
 # Copy the docker shim which propagates the docker MTU to underlying networks
 # to replace the docker binary in the PATH.
@@ -140,5 +140,5 @@ RUN curl -fsSL https://get.docker.com/rootless | sh
 RUN curl -L "https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-Linux-x86_64" -o /home/runner/bin/docker-compose ; \
     chmod +x /home/runner/bin/docker-compose
 
-ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
-CMD ["rootless-startup.sh"]
+ENTRYPOINT ["/bin/bash", "-c"]
+CMD ["entrypoint-dind-rootless.sh"]
diff --git a/runner/actions-runner-dind.dockerfile b/runner/actions-runner-dind.dockerfile
index 8937f9a6..7d8fa864 100644
--- a/runner/actions-runner-dind.dockerfile
+++ b/runner/actions-runner-dind.dockerfile
@@ -99,9 +99,9 @@ RUN mkdir /opt/hostedtoolcache \
 
 # We place the scripts in `/usr/bin` so that users who extend this image can
 # override them with scripts of the same name placed in `/usr/local/bin`.
-COPY entrypoint.sh logger.bash startup.sh update-status /usr/bin/
+COPY entrypoint-dind.sh startup.sh logger.sh wait.sh graceful-stop.sh update-status /usr/bin/
 COPY supervisor/ /etc/supervisor/conf.d/
-RUN chmod +x /usr/bin/startup.sh /usr/bin/entrypoint.sh
+RUN chmod +x /usr/bin/entrypoint-dind.sh /usr/bin/startup.sh
 
 # Copy the docker shim which propagates the docker MTU to underlying networks
 # to replace the docker binary in the PATH.
@@ -130,5 +130,5 @@ RUN echo "PATH=${PATH}" > /etc/environment \
 # No group definition, as that makes it harder to run docker.
 USER runner
 
-ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
-CMD ["startup.sh"]
+ENTRYPOINT ["/bin/bash", "-c"]
+CMD ["entrypoint-dind.sh"]
diff --git a/runner/actions-runner.dockerfile b/runner/actions-runner.dockerfile
index 88862057..d7b72fec 100644
--- a/runner/actions-runner.dockerfile
+++ b/runner/actions-runner.dockerfile
@@ -117,7 +117,7 @@ RUN mkdir /opt/hostedtoolcache \
 
 # We place the scripts in `/usr/bin` so that users who extend this image can
 # override them with scripts of the same name placed in `/usr/local/bin`.
-COPY entrypoint.sh logger.bash update-status /usr/bin/
+COPY entrypoint.sh startup.sh logger.sh graceful-stop.sh update-status /usr/bin/
 
 # Copy the docker shim which propagates the docker MTU to underlying networks
 # to replace the docker binary in the PATH.
@@ -136,5 +136,5 @@ RUN echo "PATH=${PATH}" > /etc/environment \
 
 USER runner
 
-ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
+ENTRYPOINT ["/bin/bash", "-c"]
 CMD ["entrypoint.sh"]
diff --git a/runner/rootless-startup.sh b/runner/entrypoint-dind-rootless.sh
similarity index 60%
rename from runner/rootless-startup.sh
rename to runner/entrypoint-dind-rootless.sh
index 52066678..16023bc8 100644
--- a/runner/rootless-startup.sh
+++ b/runner/entrypoint-dind-rootless.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
-source logger.bash
+source logger.sh
+source graceful-stop.sh
+trap graceful_stop TERM
 
 log.notice "Writing out Docker config file"
 /bin/bash <