Handle offline runners gracefully (#341)

* if a runner pod starts up with an invalid token, it will go in an 
infinite retry loop, appearing as RUNNING from the outside
* normally, this error situation is detected because no corresponding 
runner object exists in GitHub and the pod will get removed after the 
registration timeout
* if the GitHub runner object already existed before — e.g. because a 
finalizer was not properly run as part of a partial Kubernetes crash — 
the runner will always stay in running mode; even updating the 
registration token will not kill the problematic pod
* introducing a RunnerOffline error that can be handled in the runner 
controller and the replicaset controller
* as runners are offline when a pod is completed and marked for restart, 
only do additional restart checks if no restart was already decided, 
making code a bit cleaner and saving GitHub API calls after each job 
completion
This commit is contained in:
Johannes Nicolai 2021-02-22 02:08:04 +01:00 committed by GitHub
parent dd0b9f3e95
commit 2d7fbbfb68
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 94 additions and 48 deletions

View File

@ -244,60 +244,85 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
return ctrl.Result{}, err return ctrl.Result{}, err
} }
notRegistered := false // all checks done below only decide whether a restart is needed
// if a restart was already decided before, there is no need for the checks
// saving API calls and scary log messages
if !restart {
runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) notRegistered := false
if err != nil { offline := false
var e *github.RunnerNotFound
if errors.As(err, &e) {
log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name)
notRegistered = true runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
} else { if err != nil {
var e *gogithub.RateLimitError var notFoundException *github.RunnerNotFound
if errors.As(err, &e) { var offlineException *github.RunnerOffline
// We log the underlying error when we failed calling GitHub API to list or unregisters, if errors.As(err, &notFoundException) {
// or the runner is still busy. log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name)
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err notRegistered = true
} else if errors.As(err, &offlineException) {
log.V(1).Info("GitHub runner appears to be offline, waiting for runner to get online ...", "runnerName", runner.Name)
offline = true
} else {
var e *gogithub.RateLimitError
if errors.As(err, &e) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
} }
return ctrl.Result{}, err
} }
}
// See the `newPod` function called above for more information // See the `newPod` function called above for more information
// about when this hash changes. // about when this hash changes.
curHash := pod.Labels[LabelKeyPodTemplateHash] curHash := pod.Labels[LabelKeyPodTemplateHash]
newHash := newPod.Labels[LabelKeyPodTemplateHash] newHash := newPod.Labels[LabelKeyPodTemplateHash]
if !runnerBusy && curHash != newHash { if !runnerBusy && curHash != newHash {
restart = true restart = true
} }
registrationTimeout := 10 * time.Minute registrationTimeout := 10 * time.Minute
currentTime := time.Now() currentTime := time.Now()
registrationDidTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) > 0 registrationDidTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) > 0
if notRegistered && registrationDidTimeout { if notRegistered && registrationDidTimeout {
log.Info( log.Info(
"Runner failed to register itself to GitHub in timely manner. "+ "Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+ "Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+ "CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/summerwind/actions-runner-controller/issues/288", "See https://github.com/summerwind/actions-runner-controller/issues/288",
"podCreationTimestamp", pod.CreationTimestamp, "podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime, "currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout, "configuredRegistrationTimeout", registrationTimeout,
) )
restart = true
}
if offline && registrationDidTimeout {
log.Info(
"Already existing GitHub runner still appears offline . "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. ",
"podCreationTimestamp", pod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
}
restart = true
} }
// Don't do anything if there's no need to restart the runner // Don't do anything if there's no need to restart the runner

View File

@ -109,11 +109,15 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil { if err != nil {
notRegistered := false notRegistered := false
offline := false
var e *github.RunnerNotFound var notFoundException *github.RunnerNotFound
if errors.As(err, &e) { var offlineException *github.RunnerOffline
log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or has not managed yet to, and therefore we prioritize it for deletion", "runnerName", runner.Name) if errors.As(err, &notFoundException) {
log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name)
notRegistered = true notRegistered = true
} else if errors.As(err, &offlineException) {
offline = true
} else { } else {
var e *gogithub.RateLimitError var e *gogithub.RateLimitError
if errors.As(err, &e) { if errors.As(err, &e) {
@ -140,7 +144,7 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
if notRegistered && registrationDidTimeout { if notRegistered && registrationDidTimeout {
log.Info( log.Info(
"Runner failed to register itself to GitHub in timely manner. "+ "Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+ "Marking the runner for scale down. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+ "CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/summerwind/actions-runner-controller/issues/288", "See https://github.com/summerwind/actions-runner-controller/issues/288",
"runnerCreationTimestamp", runner.CreationTimestamp, "runnerCreationTimestamp", runner.CreationTimestamp,
@ -150,6 +154,12 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e
notBusy = append(notBusy, runner) notBusy = append(notBusy, runner)
} }
// offline runners should always be a great target for scale down
if offline {
notBusy = append(notBusy, runner)
}
} else if !busy { } else if !busy {
notBusy = append(notBusy, runner) notBusy = append(notBusy, runner)
} }

View File

@ -310,6 +310,14 @@ func (e *RunnerNotFound) Error() string {
return fmt.Sprintf("runner %q not found", e.runnerName) return fmt.Sprintf("runner %q not found", e.runnerName)
} }
// RunnerOffline is the error returned when a runner is registered with
// GitHub but its reported status is "offline", so its busy state cannot
// be determined.
type RunnerOffline struct {
	// runnerName is the GitHub-side name of the offline runner.
	runnerName string
}

// Error implements the error interface, identifying the offline runner
// by its quoted name.
func (e *RunnerOffline) Error() string {
	// Quote the name so empty or whitespace-only names remain visible in logs.
	quoted := fmt.Sprintf("%q", e.runnerName)
	return "runner " + quoted + " offline"
}
func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) { func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.ListRunners(ctx, enterprise, org, repo) runners, err := r.ListRunners(ctx, enterprise, org, repo)
if err != nil { if err != nil {
@ -318,6 +326,9 @@ func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name s
for _, runner := range runners { for _, runner := range runners {
if runner.GetName() == name { if runner.GetName() == name {
if runner.GetStatus() == "offline" {
return false, &RunnerOffline{runnerName: name}
}
return runner.GetBusy(), nil return runner.GetBusy(), nil
} }
} }