From b09c54045a09f3c0f10621210219eb2031719328 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Fri, 8 Apr 2022 10:17:33 +0900 Subject: [PATCH] Prevent runners from getting stuck in Terminating when pod disappeared without standard termination process (#1318) This fixes the said issue by additionally treating any runner pod whose phase is Failed, or whose runner container exited with a non-zero code, as "complete" so that ARC gives up unregistering the runner from Actions and deletes the runner pod anyway. Note that there are plenty of possible causes for that. If you are deploying runner pods on AWS spot instances or GCE preemptible instances and a job assigned to a runner took more time than the shutdown grace period provided by your cloud provider (2 minutes for AWS spot instances), the runner pod would be terminated prematurely without letting actions/runner unregister itself from Actions. If your VM or hypervisor fails, runner pods that were running on the node will become PodFailed without unregistering runners from Actions. Please be aware that it is currently the user's responsibility to clean up any dangling runner resources on GitHub Actions. Ref https://github.com/actions-runner-controller/actions-runner-controller/issues/1307 Might also relate to https://github.com/actions-runner-controller/actions-runner-controller/issues/1273 --- controllers/runner_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 0619a06a..dc276dd2 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -188,7 +188,7 @@ func runnerContainerExitCode(pod *corev1.Pod) *int32 { func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { // If pod has ended up succeeded we need to restart it // Happens e.g. 
when dind is in runner and run completes - stopped := pod.Status.Phase == corev1.PodSucceeded + stopped := pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed if !stopped { if pod.Status.Phase == corev1.PodRunning { @@ -197,7 +197,7 @@ func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { continue } - if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 { + if status.State.Terminated != nil { stopped = true } }