diff --git a/api/v1alpha1/runner_types.go b/api/v1alpha1/runner_types.go index 9a1d06aa..c3fa5b27 100644 --- a/api/v1alpha1/runner_types.go +++ b/api/v1alpha1/runner_types.go @@ -125,6 +125,9 @@ type RunnerStatus struct { Phase string `json:"phase"` Reason string `json:"reason"` Message string `json:"message"` + + //+optional + LastRegistrationCheckTime *metav1.Time `json:"lastRegistrationCheckTime"` } // RunnerStatusRegistration contains runner registration status diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 05d077f0..a0d49f3e 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -715,6 +715,10 @@ func (in *RunnerSpec) DeepCopy() *RunnerSpec { func (in *RunnerStatus) DeepCopyInto(out *RunnerStatus) { *out = *in in.Registration.DeepCopyInto(&out.Registration) + if in.LastRegistrationCheckTime != nil { + in, out := &in.LastRegistrationCheckTime, &out.LastRegistrationCheckTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunnerStatus. diff --git a/charts/actions-runner-controller/Chart.yaml b/charts/actions-runner-controller/Chart.yaml index 6e2061ab..24e1219a 100644 --- a/charts/actions-runner-controller/Chart.yaml +++ b/charts/actions-runner-controller/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.9.0 +version: 0.10.0 home: https://github.com/summerwind/actions-runner-controller diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml index 090f1af6..0f77c190 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runners.yaml @@ -1541,6 +1541,9 @@ spec: status: description: RunnerStatus defines the observed state of Runner properties: + lastRegistrationCheckTime: + format: date-time + type: string message: type: string phase: diff --git a/config/crd/bases/actions.summerwind.dev_runners.yaml b/config/crd/bases/actions.summerwind.dev_runners.yaml index 090f1af6..0f77c190 100644 --- a/config/crd/bases/actions.summerwind.dev_runners.yaml +++ b/config/crd/bases/actions.summerwind.dev_runners.yaml @@ -1541,6 +1541,9 @@ spec: status: description: RunnerStatus defines the observed state of Runner properties: + lastRegistrationCheckTime: + format: date-time + type: string message: type: string phase: diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 02c512bf..3b7f109a 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -22,6 +22,7 @@ import ( "fmt" gogithub "github.com/google/go-github/v33/github" "github.com/summerwind/actions-runner-controller/hash" + "k8s.io/apimachinery/pkg/util/wait" "strings" "time" @@ -129,8 +130,8 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { newRunner := runner.DeepCopy() newRunner.ObjectMeta.Finalizers = finalizers - if err := r.Update(ctx, newRunner); err != nil { - log.Error(err, "Failed to update runner") + if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner for finalizer removal") return ctrl.Result{}, err } @@ -159,31 +160,23 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { } if err := r.Create(ctx, &newPod); err != nil { + if kerrors.IsAlreadyExists(err) { + // Gracefully handle pod-already-exists errors due to informer cache delay. + // Without this we got a few errors like the below on new runner pod: + // 2021-03-16T00:23:10.116Z ERROR controller-runtime.controller Reconciler error {"controller": "runner-controller", "request": "default/example-runnerdeploy-b2g2g-j4mcp", "error": "pods \"example-runnerdeploy-b2g2g-j4mcp\" already exists"} + log.Info("Runner pod already exists. Probably this pod has been already created in previous reconcilation but the new pod is not yet cached.") + + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + } + log.Error(err, "Failed to create pod resource") + return ctrl.Result{}, err } r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodCreated", fmt.Sprintf("Created pod '%s'", newPod.Name)) log.Info("Created runner pod", "repository", runner.Spec.Repository) } else { - // If pod has ended up succeeded we need to restart it - // Happens e.g. when dind is in runner and run completes - restart := pod.Status.Phase == corev1.PodSucceeded - - if !restart && runner.Status.Phase != string(pod.Status.Phase) { - updated := runner.DeepCopy() - updated.Status.Phase = string(pod.Status.Phase) - updated.Status.Reason = pod.Status.Reason - updated.Status.Message = pod.Status.Message - - if err := r.Status().Update(ctx, updated); err != nil { - log.Error(err, "Failed to update runner status") - return ctrl.Result{}, err - } - - return ctrl.Result{}, nil - } - if !pod.ObjectMeta.DeletionTimestamp.IsZero() { deletionTimeout := 1 * time.Minute currentTime := time.Now() @@ -220,6 +213,10 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { } } + // If pod has ended up succeeded we need to restart it + // Happens e.g. when dind is in runner and run completes + restart := pod.Status.Phase == corev1.PodSucceeded + if pod.Status.Phase == corev1.PodRunning { for _, status := range pod.Status.ContainerStatuses { if status.Name != containerName { @@ -244,24 +241,45 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { return ctrl.Result{}, err } + var registrationRecheckDelay time.Duration + // all checks done below only decide whether a restart is needed // if a restart was already decided before, there is no need for the checks - // saving API calls and scary log messages + // saving API calls and scary{ log messages if !restart { + registrationCheckInterval := time.Minute - notRegistered := false + // We want to call ListRunners GitHub Actions API only once per runner per minute. + // This if block, in conjunction with: + // return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil + // achieves that. + if lastCheckTime := runner.Status.LastRegistrationCheckTime; lastCheckTime != nil { + nextCheckTime := lastCheckTime.Add(registrationCheckInterval) + if nextCheckTime.After(time.Now()) { + log.Info( + fmt.Sprintf("Skipping registration check because it's deferred until %s", nextCheckTime), + ) + + // Note that we don't need to explicitly requeue on this reconcilation because + // the requeue should have been already scheduled previsouly + // (with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil` as noted above and coded below) + return ctrl.Result{}, nil + } + } + + notFound := false offline := false runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) + + currentTime := time.Now() + if err != nil { var notFoundException *github.RunnerNotFound var offlineException *github.RunnerOffline if errors.As(err, ¬FoundException) { - log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name) - - notRegistered = true + notFound = true } else if errors.As(err, &offlineException) { - log.V(1).Info("GitHub runner appears to be offline, waiting for runner to get online ...", "runnerName", runner.Name) offline = true } else { var e *gogithub.RateLimitError @@ -293,40 +311,91 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { } registrationTimeout := 10 * time.Minute - currentTime := time.Now() - registrationDidTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) > 0 + durationAfterRegistrationTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) + registrationDidTimeout := durationAfterRegistrationTimeout > 0 - if notRegistered && registrationDidTimeout { - log.Info( - "Runner failed to register itself to GitHub in timely manner. "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. "+ - "See https://github.com/summerwind/actions-runner-controller/issues/288", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) + if notFound { + if registrationDidTimeout { + log.Info( + "Runner failed to register itself to GitHub in timely manner. "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. "+ + "See https://github.com/summerwind/actions-runner-controller/issues/288", + "podCreationTimestamp", pod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) - restart = true + restart = true + } else { + log.V(1).Info( + "Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.", + "runnerName", runner.Name, + ) + } + } else if offline { + if registrationDidTimeout { + log.Info( + "Already existing GitHub runner still appears offline . "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. ", + "podCreationTimestamp", pod.CreationTimestamp, + "currentTime", currentTime, + "configuredRegistrationTimeout", registrationTimeout, + ) + + restart = true + } else { + log.V(1).Info( + "Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...", + "runnerName", runner.Name, + ) + } } - if offline && registrationDidTimeout { - log.Info( - "Already existing GitHub runner still appears offline . "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. ", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - restart = true + if (notFound || offline) && !registrationDidTimeout { + registrationRecheckDelay = registrationCheckInterval + wait.Jitter(10*time.Second, 0.1) } - } // Don't do anything if there's no need to restart the runner if !restart { + // This guard enables us to update runner.Status.Phase to `Running` only after + // the runner is registered to GitHub. + if registrationRecheckDelay > 0 { + log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay)) + + updated := runner.DeepCopy() + updated.Status.LastRegistrationCheckTime = &metav1.Time{Time: time.Now()} + + if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner status") + return ctrl.Result{}, err + } + + return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil + } + + if runner.Status.Phase != string(pod.Status.Phase) { + if pod.Status.Phase == corev1.PodRunning { + // Seeing this message, you can expect the runner to become `Running` soon. + log.Info( + "Runner appears to have registered and running.", + "podCreationTimestamp", pod.CreationTimestamp, + ) + } + + updated := runner.DeepCopy() + updated.Status.Phase = string(pod.Status.Phase) + updated.Status.Reason = pod.Status.Reason + updated.Status.Message = pod.Status.Message + + if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner status") + return ctrl.Result{}, err + } + } + return ctrl.Result{}, nil } diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go index 6d4780e6..ed5d84e0 100644 --- a/controllers/runnerdeployment_controller.go +++ b/controllers/runnerdeployment_controller.go @@ -194,7 +194,10 @@ func (r *RunnerDeploymentReconciler) Reconcile(req ctrl.Request) (ctrl.Result, e Namespace: newestSet.Namespace, Name: newestSet.Name, }). - Info("Waiting until the newest runner replica set to be 100% available") + Info("Waiting until the newest runner replica set to be 100% available", + "ready", readyReplicas, + "desired", currentDesiredReplicas, + ) return ctrl.Result{RequeueAfter: 10 * time.Second}, nil } diff --git a/github/github.go b/github/github.go index 418a382b..2872dd99 100644 --- a/github/github.go +++ b/github/github.go @@ -158,7 +158,7 @@ func (c *Client) ListRunners(ctx context.Context, enterprise, org, repo string) var runners []*github.Runner - opts := github.ListOptions{PerPage: 10} + opts := github.ListOptions{PerPage: 100} for { list, res, err := c.listRunners(ctx, enterprise, owner, repo, &opts)