fix: prevent runner pods managed by RunnerSet from getting stuck in Terminating (#1420)

This is intended to fix #1369, mostly for RunnerSet-managed runner pods. It is "mostly" because this fix might also work well for RunnerDeployment in cases where #1395 does not work, such as when the user has explicitly set the runner pod restart policy to anything other than "Never".

Ref #1369
This commit is contained in:
Yusuke Kuoka 2022-05-12 17:34:27 +09:00 committed by GitHub
parent 3a7e8c844b
commit e46b90f758
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 38 additions and 2 deletions

View File

@ -206,6 +206,24 @@ func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
return stopped
}
// ephemeralRunnerContainerStatus looks up the status of the container named
// containerName in the given pod.
//
// It returns nil when getRunnerEnv does not report "true" for RUNNER_EPHEMERAL
// on the pod, or when the pod has no container status with that name.
// Otherwise it returns a pointer to a copy of the matching status.
func ephemeralRunnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
	if ephemeral := getRunnerEnv(pod, "RUNNER_EPHEMERAL"); ephemeral != "true" {
		return nil
	}

	statuses := pod.Status.ContainerStatuses
	for i := range statuses {
		if statuses[i].Name == containerName {
			// Copy the element so the returned pointer does not alias the
			// pod's own ContainerStatuses slice.
			found := statuses[i]
			return &found
		}
	}

	return nil
}
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)

View File

@ -113,9 +113,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
log.Info("Runner pod is annotated to wait for completion")
ct := ephemeralRunnerContainerStatus(pod)
if ct == nil {
log.Info("Runner pod is annotated to wait for completion, and the runner container is not ephemeral")
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}
lts := ct.LastTerminationState.Terminated
if lts == nil {
log.Info("Runner pod is annotated to wait for completion, and the runner container is not restarting")
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}
// Prevent the runner pod from getting stuck in Terminating.
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369
log.Info("Deleting runner pod anyway because it has stopped prematurely. This may leave a dangling runner resource in GitHub Actions",
"lastState.exitCode", lts.ExitCode,
"lastState.message", lts.Message,
"pod.phase", pod.Status.Phase,
)
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,