fix: prevent runner pods managed by RunnerSet from getting stuck in Terminating (#1420)
This is intended to fix #1369, mostly for RunnerSet-managed runner pods. It is "mostly" because this fix may also work for RunnerDeployment in cases where #1395 does not, such as when the user has explicitly set the runner pod restart policy to anything other than "Never". Ref #1369
This commit is contained in:
parent
3a7e8c844b
commit
e46b90f758
|
|
@ -206,6 +206,24 @@ func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
|
||||||
return stopped
|
return stopped
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func ephemeralRunnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
|
||||||
|
if getRunnerEnv(pod, "RUNNER_EPHEMERAL") != "true" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, status := range pod.Status.ContainerStatuses {
|
||||||
|
if status.Name != containerName {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
status := status
|
||||||
|
|
||||||
|
return &status
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
|
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
|
||||||
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -113,9 +113,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
|
||||||
// Happens e.g. when dind is in runner and run completes
|
// Happens e.g. when dind is in runner and run completes
|
||||||
log.Info("Runner pod has been stopped with a successful status.")
|
log.Info("Runner pod has been stopped with a successful status.")
|
||||||
} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
|
} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
|
||||||
log.Info("Runner pod is annotated to wait for completion")
|
ct := ephemeralRunnerContainerStatus(pod)
|
||||||
|
if ct == nil {
|
||||||
|
log.Info("Runner pod is annotated to wait for completion, and the runner container is not ephemeral")
|
||||||
|
|
||||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
lts := ct.LastTerminationState.Terminated
|
||||||
|
if lts == nil {
|
||||||
|
log.Info("Runner pod is annotated to wait for completion, and the runner container is not restarting")
|
||||||
|
|
||||||
|
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prevent the runner pod from getting stuck in Terminating.
|
||||||
|
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/1369
|
||||||
|
log.Info("Deleting runner pod anyway because it has stopped prematurely. This may leave a dangling runner resource in GitHub Actions",
|
||||||
|
"lastState.exitCode", lts.ExitCode,
|
||||||
|
"lastState.message", lts.Message,
|
||||||
|
"pod.phase", pod.Status.Phase,
|
||||||
|
)
|
||||||
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil {
|
} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil {
|
||||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue