diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 3380a70a..99056212 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -244,60 +244,85 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { return ctrl.Result{}, err } - notRegistered := false + // all checks done below only decide whether a restart is needed + // if a restart was already decided before, there is no need for the checks + // saving API calls and scary log messages + if !restart { - runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) - if err != nil { - var e *github.RunnerNotFound - if errors.As(err, &e) { - log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name) + notRegistered := false + offline := false - notRegistered = true - } else { - var e *gogithub.RateLimitError - if errors.As(err, &e) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls", - retryDelayOnGitHubAPIRateLimitError, - ), - ) + runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) + if err != nil { + var notFoundException *github.RunnerNotFound + var offlineException *github.RunnerOffline + if errors.As(err, ¬FoundException) { + log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name) - return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err + notRegistered = true + } else if errors.As(err, &offlineException) { + log.V(1).Info("GitHub runner appears to be offline, waiting for runner to get online ...", "runnerName", runner.Name) + offline = true + } else { + var e *gogithub.RateLimitError + if errors.As(err, &e) { + // We log the underlying error when we failed calling GitHub API to list or unregisters, + // or the runner is still busy. + log.Error( + err, + fmt.Sprintf( + "Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls", + retryDelayOnGitHubAPIRateLimitError, + ), + ) + + return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err + } + + return ctrl.Result{}, err } - - return ctrl.Result{}, err } - } - // See the `newPod` function called above for more information - // about when this hash changes. - curHash := pod.Labels[LabelKeyPodTemplateHash] - newHash := newPod.Labels[LabelKeyPodTemplateHash] + // See the `newPod` function called above for more information + // about when this hash changes. + curHash := pod.Labels[LabelKeyPodTemplateHash] + newHash := newPod.Labels[LabelKeyPodTemplateHash] - if !runnerBusy && curHash != newHash { - restart = true - } + if !runnerBusy && curHash != newHash { + restart = true + } - registrationTimeout := 10 * time.Minute - currentTime := time.Now() - registrationDidTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) > 0 + registrationTimeout := 10 * time.Minute + currentTime := time.Now() + registrationDidTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) > 0 - if notRegistered && registrationDidTimeout { - log.Info( - "Runner failed to register itself to GitHub in timely manner. "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. "+ - "See https://github.com/summerwind/actions-runner-controller/issues/288", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) + if notRegistered && registrationDidTimeout { + log.Info( + "Runner failed to register itself to GitHub in timely manner. "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. "+ + "See https://github.com/summerwind/actions-runner-controller/issues/288", + "podCreationTimestamp", pod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) + + restart = true + } + + if offline && registrationDidTimeout { + log.Info( + "Already existing GitHub runner still appears offline . "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. ", + "podCreationTimestamp", pod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) + + restart = true + } - restart = true } // Don't do anything if there's no need to restart the runner diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go index 461fbc83..b1f8c0de 100644 --- a/controllers/runnerreplicaset_controller.go +++ b/controllers/runnerreplicaset_controller.go @@ -109,11 +109,15 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) if err != nil { notRegistered := false + offline := false - var e *github.RunnerNotFound - if errors.As(err, &e) { - log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or has not managed yet to, and therefore we prioritize it for deletion", "runnerName", runner.Name) + var notFoundException *github.RunnerNotFound + var offlineException *github.RunnerOffline + if errors.As(err, ¬FoundException) { + log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name) notRegistered = true + } else if errors.As(err, &offlineException) { + offline = true } else { var e *gogithub.RateLimitError if errors.As(err, &e) { @@ -140,7 +144,7 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e if notRegistered && registrationDidTimeout { log.Info( "Runner failed to register itself to GitHub in timely manner. "+ - "Recreating the pod to see if it resolves the issue. "+ + "Marking the runner for scale down. "+ "CAUTION: If you see this a lot, you should investigate the root cause. "+ "See https://github.com/summerwind/actions-runner-controller/issues/288", "runnerCreationTimestamp", runner.CreationTimestamp, @@ -150,6 +154,12 @@ func (r *RunnerReplicaSetReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e notBusy = append(notBusy, runner) } + + // offline runners should always be a great target for scale down + if offline { + notBusy = append(notBusy, runner) + } + } else if !busy { notBusy = append(notBusy, runner) } diff --git a/github/github.go b/github/github.go index 4bcfeee8..027ad76e 100644 --- a/github/github.go +++ b/github/github.go @@ -310,6 +310,14 @@ func (e *RunnerNotFound) Error() string { return fmt.Sprintf("runner %q not found", e.runnerName) } +type RunnerOffline struct { + runnerName string +} + +func (e *RunnerOffline) Error() string { + return fmt.Sprintf("runner %q offline", e.runnerName) +} + func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name string) (bool, error) { runners, err := r.ListRunners(ctx, enterprise, org, repo) if err != nil { @@ -318,6 +326,9 @@ func (r *Client) IsRunnerBusy(ctx context.Context, enterprise, org, repo, name s for _, runner := range runners { if runner.GetName() == name { + if runner.GetStatus() == "offline" { + return false, &RunnerOffline{runnerName: name} + } return runner.GetBusy(), nil } }