From 3f23501b8efb44e34d64c07591b98015cab08721 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Tue, 16 Mar 2021 10:52:30 +0900 Subject: [PATCH] Reduce "No runner matching the specified labels was found" errors while runner replacement (#392) We occasionally encountered those errors while the underlying RunnerReplicaSet is being recreated/replaced on RunnerDeployment.Spec.Template update. It turned out to be due to that the RunnerDeployment controller was waiting for the runner pod becomes `Running`, intead of the new replacement runner to have registered to GitHub. This fixes that, by trying to Runner.Status.Phase to `Running` only after the runner in the runner pod appears to be registered. A side-effect of this change is that runner controller would call more "ListRunners" GitHub Actions API. I've reviewed and improved the runner controller code and Runner CRD to make make the number of calls minimum. In most cases, ListRunners should be called only twice for each runner creation. --- api/v1alpha1/runner_types.go | 3 + api/v1alpha1/zz_generated.deepcopy.go | 4 + charts/actions-runner-controller/Chart.yaml | 2 +- .../crds/actions.summerwind.dev_runners.yaml | 3 + .../bases/actions.summerwind.dev_runners.yaml | 3 + controllers/runner_controller.go | 171 ++++++++++++------ controllers/runnerdeployment_controller.go | 5 +- github/github.go | 2 +- 8 files changed, 139 insertions(+), 54 deletions(-) diff --git a/api/v1alpha1/runner_types.go b/api/v1alpha1/runner_types.go index 9a1d06aa..c3fa5b27 100644 --- a/api/v1alpha1/runner_types.go +++ b/api/v1alpha1/runner_types.go @@ -125,6 +125,9 @@ type RunnerStatus struct { Phase string `json:"phase"` Reason string `json:"reason"` Message string `json:"message"` + + //+optional + LastRegistrationCheckTime *metav1.Time `json:"lastRegistrationCheckTime"` } // RunnerStatusRegistration contains runner registration status diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 05d077f0..a0d49f3e 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -715,6 +715,10 @@ func (in *RunnerSpec) DeepCopy() *RunnerSpec { func (in *RunnerStatus) DeepCopyInto(out *RunnerStatus) { *out = *in in.Registration.DeepCopyInto(&out.Registration) + if in.LastRegistrationCheckTime != nil { + in, out := &in.LastRegistrationCheckTime, &out.LastRegistrationCheckTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunnerStatus. diff --git a/charts/actions-runner-controller/Chart.yaml b/charts/actions-runner-controller/Chart.yaml index 6e2061ab..24e1219a 100644 --- a/charts/actions-runner-controller/Chart.yaml +++ b/charts/actions-runner-controller/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.9.0 +version: 0.10.0 home: https://github.com/summerwind/actions-runner-controller diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml index 090f1af6..0f77c190 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml @@ -1541,6 +1541,9 @@ spec: status: description: RunnerStatus defines the observed state of Runner properties: + lastRegistrationCheckTime: + format: date-time + type: string message: type: string phase: diff --git a/config/crd/bases/actions.summerwind.dev_runners.yaml b/config/crd/bases/actions.summerwind.dev_runners.yaml index 090f1af6..0f77c190 100644 --- a/config/crd/bases/actions.summerwind.dev_runners.yaml +++ b/config/crd/bases/actions.summerwind.dev_runners.yaml @@ -1541,6 +1541,9 @@ spec: status: description: RunnerStatus defines the observed state of Runner properties: + lastRegistrationCheckTime: + format: date-time + type: string message: type: string phase: diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 02c512bf..3b7f109a 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -22,6 +22,7 @@ import ( "fmt" gogithub "github.com/google/go-github/v33/github" "github.com/summerwind/actions-runner-controller/hash" + "k8s.io/apimachinery/pkg/util/wait" "strings" "time" @@ -129,8 +130,8 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { newRunner := runner.DeepCopy() newRunner.ObjectMeta.Finalizers = finalizers - if err := r.Update(ctx, newRunner); err != nil { - log.Error(err, "Failed to update runner") + if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner for finalizer removal") return ctrl.Result{}, err } @@ -159,31 +160,23 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { } if err := r.Create(ctx, &newPod); err != nil { + if kerrors.IsAlreadyExists(err) { + // Gracefully handle pod-already-exists errors due to informer cache delay. + // Without this we got a few errors like the below on new runner pod: + // 2021-03-16T00:23:10.116Z ERROR controller-runtime.controller Reconciler error {"controller": "runner-controller", "request": "default/example-runnerdeploy-b2g2g-j4mcp", "error": "pods \"example-runnerdeploy-b2g2g-j4mcp\" already exists"} + log.Info("Runner pod already exists. Probably this pod has been already created in previous reconcilation but the new pod is not yet cached.") + + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + log.Error(err, "Failed to create pod resource") + return ctrl.Result{}, err } r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodCreated", fmt.Sprintf("Created pod '%s'", newPod.Name)) log.Info("Created runner pod", "repository", runner.Spec.Repository) } else { - // If pod has ended up succeeded we need to restart it - // Happens e.g. when dind is in runner and run completes - restart := pod.Status.Phase == corev1.PodSucceeded - - if !restart && runner.Status.Phase != string(pod.Status.Phase) { - updated := runner.DeepCopy() - updated.Status.Phase = string(pod.Status.Phase) - updated.Status.Reason = pod.Status.Reason - updated.Status.Message = pod.Status.Message - - if err := r.Status().Update(ctx, updated); err != nil { - log.Error(err, "Failed to update runner status") - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil - } - if !pod.ObjectMeta.DeletionTimestamp.IsZero() { deletionTimeout := 1 * time.Minute currentTime := time.Now() @@ -220,6 +213,10 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { } } + // If pod has ended up succeeded we need to restart it + // Happens e.g. when dind is in runner and run completes + restart := pod.Status.Phase == corev1.PodSucceeded + if pod.Status.Phase == corev1.PodRunning { for _, status := range pod.Status.ContainerStatuses { if status.Name != containerName { @@ -244,24 +241,45 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { return ctrl.Result{}, err } + var registrationRecheckDelay time.Duration + // all checks done below only decide whether a restart is needed // if a restart was already decided before, there is no need for the checks - // saving API calls and scary log messages + // saving API calls and scary{ log messages if !restart { + registrationCheckInterval := time.Minute - notRegistered := false + // We want to call ListRunners GitHub Actions API only once per runner per minute. + // This if block, in conjunction with: + // return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil + // achieves that. + if lastCheckTime := runner.Status.LastRegistrationCheckTime; lastCheckTime != nil { + nextCheckTime := lastCheckTime.Add(registrationCheckInterval) + if nextCheckTime.After(time.Now()) { + log.Info( + fmt.Sprintf("Skipping registration check because it's deferred until %s", nextCheckTime), + ) + + // Note that we don't need to explicitly requeue on this reconcilation because + // the requeue should have been already scheduled previsouly + // (with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil` as noted above and coded below) + return ctrl.Result{}, nil + } + } + + notFound := false offline := false runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) + + currentTime := time.Now() + if err != nil { var notFoundException *github.RunnerNotFound var offlineException *github.RunnerOffline if errors.As(err, ¬FoundException) { - log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name) - - notRegistered = true + notFound = true } else if errors.As(err, &offlineException) { - log.V(1).Info("GitHub runner appears to be offline, waiting for runner to get online ...", "runnerName", runner.Name) offline = true } else { var e *gogithub.RateLimitError @@ -293,40 +311,91 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { } registrationTimeout := 10 * time.Minute - currentTime := time.Now() - registrationDidTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) > 0 + durationAfterRegistrationTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) + registrationDidTimeout := durationAfterRegistrationTimeout > 0 - if notRegistered && registrationDidTimeout { - log.Info( - "Runner failed to register itself to GitHub in timely manner. "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. "+ - "See https://github.com/summerwind/actions-runner-controller/issues/288", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) + if notFound { + if registrationDidTimeout { + log.Info( + "Runner failed to register itself to GitHub in timely manner. "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. "+ + "See https://github.com/summerwind/actions-runner-controller/issues/288", + "podCreationTimestamp", pod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) - restart = true + restart = true + } else { + log.V(1).Info( + "Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.", + "runnerName", runner.Name, + ) + } + } else if offline { + if registrationDidTimeout { + log.Info( + "Already existing GitHub runner still appears offline . "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. ", + "podCreationTimestamp", pod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) + + restart = true + } else { + log.V(1).Info( + "Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...", + "runnerName", runner.Name, + ) + } } - if offline && registrationDidTimeout { - log.Info( - "Already existing GitHub runner still appears offline . "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. ", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - restart = true + if (notFound || offline) && !registrationDidTimeout { + registrationRecheckDelay = registrationCheckInterval + wait.Jitter(10*time.Second, 0.1) } - } // Don't do anything if there's no need to restart the runner if !restart { + // This guard enables us to update runner.Status.Phase to `Running` only after + // the runner is registered to GitHub. + if registrationRecheckDelay > 0 { + log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay)) + + updated := runner.DeepCopy() + updated.Status.LastRegistrationCheckTime = &metav1.Time{Time: time.Now()} + + if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner status") + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil + } + + if runner.Status.Phase != string(pod.Status.Phase) { + if pod.Status.Phase == corev1.PodRunning { + // Seeing this message, you can expect the runner to become `Running` soon. + log.Info( + "Runner appears to have registered and running.", + "podCreationTimestamp", pod.CreationTimestamp, + ) + } + + updated := runner.DeepCopy() + updated.Status.Phase = string(pod.Status.Phase) + updated.Status.Reason = pod.Status.Reason + updated.Status.Message = pod.Status.Message + + if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner status") + return ctrl.Result{}, err + } + } + return ctrl.Result{}, nil } diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go index 6d4780e6..ed5d84e0 100644 --- a/controllers/runnerdeployment_controller.go +++ b/controllers/runnerdeployment_controller.go @@ -194,7 +194,10 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e Namespace: newestSet.Namespace, Name: newestSet.Name, }). - Info("Waiting until the newest runner replica set to be 100% available") + Info("Waiting until the newest runner replica set to be 100% available", + "ready", readyReplicas, + "desired", currentDesiredReplicas, + ) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } diff --git a/github/github.go b/github/github.go index 418a382b..2872dd99 100644 --- a/github/github.go +++ b/github/github.go @@ -158,7 +158,7 @@ func (c *Client) ListRunners(ctx context.Context, enterprise, org, repo string) var runners []*github.Runner - opts := github.ListOptions{PerPage: 10} + opts := github.ListOptions{PerPage: 100} for { list, res, err := c.listRunners(ctx, enterprise, owner, repo, &opts)