Fix runner pod getting stuck in Terminating when the runner is deleted before pod scheduling (#2043)

This fixes the said issue, which I found while running a series of E2E tests to test other features and pull requests I have recently contributed.
This commit is contained in:
Yusuke Kuoka 2022-11-27 11:13:38 +09:00 committed by GitHub
parent 877c93c5c3
commit 96a930bfd9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 1 deletions

View File

@ -207,9 +207,35 @@ func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
if status.State.Terminated != nil {
stopped = true
break
}
}
}
if pod.DeletionTimestamp != nil && !pod.DeletionTimestamp.IsZero() && len(pod.Status.ContainerStatuses) == 0 {
// This falls into cases where the pod is stuck with pod status like the below:
//
// status:
// conditions:
// - lastProbeTime: null
// lastTransitionTime: "2022-11-20T07:58:05Z"
// message: 'binding rejected: running Bind plugin "DefaultBinder": Operation cannot
// be fulfilled on pods/binding "org-runnerdeploy-l579v-qx5p2": pod org-runnerdeploy-l579v-qx5p2
// is being deleted, cannot be assigned to a host'
// reason: SchedulerError
// status: "False"
// type: PodScheduled
// phase: Pending
// qosClass: BestEffort
//
// ARC usually waits for the registration timeout to elapse when the pod is terminated before getting scheduled onto a node,
// assuming there can be a race condition between ARC and Kubernetes where Kubernetes schedules the pod while ARC is deleting the pod,
// which may end up with non-gracefully terminating the runner.
//
// However, Kubernetes seems to not schedule the pod after observing status like the above.
// This if-block is therefore needed to prevent ARC from unnecessarily waiting for the registration timeout to happen.
stopped = true
}
}
return stopped

View File

@ -102,7 +102,7 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
!podIsPending(pod) {
log.Info(
"Unregistration started before runner obtains ID. Waiting for the regisration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
"Unregistration started before runner obtains ID. Waiting for the registration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
"registrationTimeout", registrationTimeout,
)
return &ctrl.Result{RequeueAfter: retryDelay}, nil