Handle missing runner ID more gracefully (#1855)
so that ARC respects the registration timeout, terminationGracePeriodSeconds, and RUNNER_GRACEFUL_STOP_TIMEOUT (#1759) when the runner pod is terminated externally too soon after its creation.

While I was running E2E tests for #1759, I discovered a potential issue: ARC can terminate runner pods without waiting for the registration timeout of 10 minutes. You won't be affected by this under normal circumstances, as the failure scenario can be triggered only when you, or another K8s controller like cluster-autoscaler, deletes the runner or the runner pod immediately after it has been created. But it's probably worth fixing anyway, because triggering it is not impossible.
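In other words, when unregistration starts before GitHub has assigned the runner an ID, ARC should keep requeueing until the ID appears, the pod stops, or the registration timeout elapses, rather than deleting the pod right away. A minimal sketch of that decision, with hypothetical names and a simplified signature (not ARC's actual code):

package main

import (
	"fmt"
	"time"
)

// decideUnregistration mirrors the branching added in this commit, reduced to
// plain values. runnerIDKnown: GitHub has assigned the runner an ID;
// podStopped: the runner pod or container has stopped; readySince: when the
// pod became Ready.
func decideUnregistration(runnerIDKnown, podStopped bool, readySince time.Time, registrationTimeout time.Duration) string {
	switch {
	case runnerIDKnown:
		return "unregister via the GitHub API as usual"
	case !podStopped && time.Since(readySince) < registrationTimeout:
		// Too early to judge: the runner may still obtain an ID. Requeue.
		return "requeue and re-check later"
	case podStopped:
		// The pod stopped before an ID appeared (job done, OOM kill, external delete).
		return "mark unregistration complete; nothing ARC can do"
	default:
		// The registration window elapsed without an ID appearing.
		return "mark unregistration complete; registration timed out"
	}
}

func main() {
	// Pod deleted 1 minute after becoming Ready, no ID yet: ARC must wait.
	fmt.Println(decideUnregistration(false, false, time.Now().Add(-time.Minute), 10*time.Minute))
}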
This commit is contained in:
parent 6aaff4ecee
commit 7ff5b7da8c
@@ -98,11 +98,27 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l
 		// If it's already unregistered in the previous reconciliation loop,
 		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
 		log.Info("Runner pod is marked as already unregistered.")
-	} else if runnerID == nil {
+	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) {
 		log.Info(
-			"Unregistration started before runner ID is assigned. " +
+			"Unregistration started before runner obtains ID. Waiting for the registration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
 			"registrationTimeout", registrationTimeout,
 		)
 		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	} else if runnerID == nil && runnerPodOrContainerIsStopped(pod) {
+		log.Info(
+			"Unregistration started before runner ID is assigned and the runner stopped before obtaining ID within registration timeout. "+
+				"Perhaps the runner successfully ran the job and stopped normally before the runner ID became visible via GitHub API? "+
+				"Perhaps the runner pod was terminated by anyone other than ARC? Was it OOM killed? "+
+				"Marking unregistration as completed anyway because there's nothing ARC can do.",
+			"registrationTimeout", registrationTimeout,
+		)
+	} else if runnerID == nil && podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) {
+		log.Info(
+			"Unregistration started before runner ID is assigned and the runner was unable to obtain ID within registration timeout. "+
+				"Perhaps the runner has communication issues, or a firewall egress rule is dropping traffic to GitHub API, or GitHub API is unavailable? "+
+				"Marking unregistration as completed anyway because there's nothing ARC can do. "+
+				"This may result in cancelling the job depending on your terminationGracePeriodSeconds and RUNNER_GRACEFUL_STOP_TIMEOUT settings.",
+			"registrationTimeout", registrationTimeout,
+		)
 	} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
 		// If it's an ephemeral runner with the actions/runner container exited with 0,
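The new branches hinge on podConditionTransitionTimeAfter, whose implementation is not part of this hunk. As a minimal sketch, assuming the helper reports whether the named pod condition last transitioned more than the given duration ago (i.e. the runner has had its full registration window since becoming Ready), it could look like this; the code below is illustrative, not ARC's source:

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Sketch only: true when the given condition last transitioned more than d ago.
// A nil pod or a missing condition conservatively yields false.
func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
	if pod == nil {
		return false
	}
	for _, c := range pod.Status.Conditions {
		if c.Type == tpe {
			return time.Now().After(c.LastTransitionTime.Add(d))
		}
	}
	return false
}

func main() {
	// A pod that became Ready 15 minutes ago has outlived a 10-minute window.
	pod := &corev1.Pod{Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{
		Type:               corev1.PodReady,
		LastTransitionTime: metav1.Time{Time: time.Now().Add(-15 * time.Minute)},
	}}}}
	fmt.Println(podConditionTransitionTimeAfter(pod, corev1.PodReady, 10*time.Minute)) // true
}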
@@ -351,6 +367,7 @@ func setRunnerEnv(pod *corev1.Pod, key, value string) {
 // Case 1. (true, nil) when it has successfully unregistered the runner.
 // Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered (hence we couldn't see its existence from GitHub Actions API yet)
 // Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
 // an error returned by GitHub API.
 //
+// When the returned value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
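To make the Case 1/2/3 contract above concrete, here is a hypothetical caller sketch; unregisterRunner is stubbed with a simplified signature, and the real ARC code differs:

package main

import (
	"context"
	"fmt"
)

// Stub with a simplified signature; the real function calls the GitHub
// Actions API. Only the documented (bool, error) contract matters here.
func unregisterRunner(ctx context.Context, name string) (bool, error) {
	return false, nil // pretend Case 2
}

func handleUnregistration(ctx context.Context, name string) string {
	ok, err := unregisterRunner(ctx, name)
	switch {
	case err != nil:
		// Case 3: the runner is busy or the GitHub API errored; retry later.
		return "requeue"
	case ok:
		// Case 1: successfully unregistered; safe to delete the runner pod.
		return "delete runner pod"
	default:
		// Case 2: already unregistered (2-1.), never to be created (2-2.), or
		// not yet visible via the GitHub Actions API (2-3.). Sub-case 2-3 is
		// why the registration-timeout handling in the first hunk exists.
		return "disambiguate the three sub-cases"
	}
}

func main() {
	fmt.Println(handleUnregistration(context.Background(), "example-runner"))
}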