From e46b90f75870ebce1df41966f475e5c8e16e2754 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Thu, 12 May 2022 17:34:27 +0900 Subject: [PATCH] fix: runner pods managed by RunnerSet to not stuck in Terminating (#1420) This is intended to fix #1369, mostly for RunnerSet-managed runner pods. It is "mostly" because this fix might also work well for RunnerDeployment in cases where #1395 does not work, such as when the user has explicitly set the runner pod restart policy to anything other than "Never". Ref #1369 --- controllers/runner_controller.go | 18 ++++++++++++++++++ controllers/runner_graceful_stop.go | 22 ++++++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index ec63be90..7f6ec506 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -206,6 +206,24 @@ func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { return stopped } +func ephemeralRunnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus { + if getRunnerEnv(pod, "RUNNER_EPHEMERAL") != "true" { + return nil + } + + for _, status := range pod.Status.ContainerStatuses { + if status.Name != containerName { + continue + } + + status := status + + return &status + } + + return nil +} + func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) { finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 86de46bb..82fa0322 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -113,9 +113,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l // Happens e.g. 
when dind is in runner and run completes log.Info("Runner pod has been stopped with a successful status.") } else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" { - log.Info("Runner pod is annotated to wait for completion") + ct := ephemeralRunnerContainerStatus(pod) + if ct == nil { + log.Info("Runner pod is annotated to wait for completion, and the runner container is not ephemeral") - return &ctrl.Result{RequeueAfter: retryDelay}, nil + return &ctrl.Result{RequeueAfter: retryDelay}, nil + } + + lts := ct.LastTerminationState.Terminated + if lts == nil { + log.Info("Runner pod is annotated to wait for completion, and the runner container is not restarting") + + return &ctrl.Result{RequeueAfter: retryDelay}, nil + } + + // Prevent the runner pod from getting stuck in Terminating. + // See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369 + log.Info("Deleting runner pod anyway because it has stopped prematurely. This may leave a dangling runner resource in GitHub Actions", + "lastState.exitCode", lts.ExitCode, + "lastState.message", lts.Message, + "pod.phase", pod.Status.Phase, + ) } else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil { if errors.Is(err, &gogithub.RateLimitError{}) { // We log the underlying error when we failed calling GitHub API to list or unregisters,