diff --git a/controllers/constants.go b/controllers/constants.go index c7dd3556..9d967fb4 100644 --- a/controllers/constants.go +++ b/controllers/constants.go @@ -39,4 +39,10 @@ const ( // This can be any value but a larger value can make an unregistration timeout longer than configured in practice. DefaultUnregistrationRetryDelay = 30 * time.Second + + // registrationTimeout is the duration until a pod times out after it becomes Ready and Running. + // A pod that is timed out can be terminated if needed. + registrationTimeout = 10 * time.Minute + + defaultRegistrationCheckInterval = time.Minute ) diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 3d71f5cc..701c6539 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -217,6 +217,30 @@ func setAnnotation(meta *metav1.ObjectMeta, key, value string) { meta.Annotations[key] = value } +func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time { + for _, c := range pod.Status.Conditions { + if c.Type == tpe && c.Status == v { + return &c.LastTransitionTime + } + } + + return nil +} + +func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool { + c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue) + if c == nil { + return false + } + + return c.Add(d).Before(time.Now()) +} + +func podRunnerID(pod *corev1.Pod) string { + id, _ := getAnnotation(&pod.ObjectMeta, AnnotationKeyRunnerID) + return id +} + // unregisterRunner unregisters the runner from GitHub Actions by name. // // This function returns: diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go index 308a6e08..e2d96ce8 100644 --- a/controllers/runner_pod_controller.go +++ b/controllers/runner_pod_controller.go @@ -18,13 +18,10 @@ package controllers import ( "context" - "errors" "fmt" "time" "github.com/go-logr/logr" - gogithub "github.com/google/go-github/v39/github" - "k8s.io/apimachinery/pkg/util/wait" kerrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" @@ -190,218 +187,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } - // If pod has ended up succeeded we need to restart it - // Happens e.g. when dind is in runner and run completes - stopped := runnerPod.Status.Phase == corev1.PodSucceeded - - if !stopped { - if runnerPod.Status.Phase == corev1.PodRunning { - for _, status := range runnerPod.Status.ContainerStatuses { - if status.Name != containerName { - continue - } - - if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 { - stopped = true - } - } - } - } - - restart := stopped - - var registrationRecheckDelay time.Duration - - // all checks done below only decide whether a restart is needed - // if a restart was already decided before, there is no need for the checks - // saving API calls and scary log messages - if !restart { - registrationCheckInterval := time.Minute - if r.RegistrationRecheckInterval > 0 { - registrationCheckInterval = r.RegistrationRecheckInterval - } - - lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime] - - var lastCheckTime *time.Time - - if lastCheckTimeStr != "" { - t, err := time.Parse(time.RFC3339, lastCheckTimeStr) - if err != nil { - log.Error(err, "failed to parase last check time %q", lastCheckTimeStr) - return ctrl.Result{}, nil - } - - lastCheckTime = &t - } - - // We want to call ListRunners GitHub Actions API only once per runner per minute. - // This if block, in conjunction with: - // return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil - // achieves that. - if lastCheckTime != nil { - nextCheckTime := lastCheckTime.Add(registrationCheckInterval) - now := time.Now() - - // Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds) - // so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is - // shorter than 1s - requeueAfter := nextCheckTime.Sub(now) - time.Second - if requeueAfter > 0 { - log.Info( - fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter), - "lastRegistrationCheckTime", lastCheckTime, - "registrationCheckInterval", registrationCheckInterval, - ) - - // Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the - // next sync period passes, which can be too much later than nextCheckTime. - // - // We need to requeue on this reconcilation even though we have already scheduled the initial - // requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`. - // Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on - // other requeues- so the initial scheduled requeue may have been reset due to requeue on - // spec/status change. - return ctrl.Result{RequeueAfter: requeueAfter}, nil - } - } - - notFound := false - offline := false - - _, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name) - - currentTime := time.Now() - - if err != nil { - var notFoundException *github.RunnerNotFound - var offlineException *github.RunnerOffline - if errors.As(err, ¬FoundException) { - notFound = true - } else if errors.As(err, &offlineException) { - offline = true - } else { - var e *gogithub.RateLimitError - if errors.As(err, &e) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls", - retryDelayOnGitHubAPIRateLimitError, - ), - ) - - return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err - } - - return ctrl.Result{}, err - } - } - - registrationTimeout := 10 * time.Minute - durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout)) - registrationDidTimeout := durationAfterRegistrationTimeout > 0 - - if notFound { - if registrationDidTimeout { - log.Info( - "Runner failed to register itself to GitHub in timely manner. "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. "+ - "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288", - "podCreationTimestamp", runnerPod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - restart = true - } else { - log.V(1).Info( - "Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.", - "runnerName", runnerPod.Name, - ) - } - } else if offline { - if registrationDidTimeout { - log.Info( - "Already existing GitHub runner still appears offline . "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. ", - "podCreationTimestamp", runnerPod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - restart = true - } else { - log.V(1).Info( - "Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...", - "runnerName", runnerPod.Name, - ) - } - } - - if (notFound || offline) && !registrationDidTimeout { - registrationRecheckJitter := 10 * time.Second - if r.RegistrationRecheckJitter > 0 { - registrationRecheckJitter = r.RegistrationRecheckJitter - } - - registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1) - } - } - - // Don't do anything if there's no need to restart the runner - if !restart { - // This guard enables us to update runner.Status.Phase to `Running` only after - // the runner is registered to GitHub. - if registrationRecheckDelay > 0 { - log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay)) - - updated := runnerPod.DeepCopy() - t := time.Now().Format(time.RFC3339) - updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t - - if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil { - log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime") - return ctrl.Result{}, err - } - - return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil - } - - // Seeing this message, you can expect the runner to become `Running` soon. - log.Info( - "Runner appears to have registered and running.", - "podCreationTimestamp", runnerPod.CreationTimestamp, - ) - - return ctrl.Result{}, nil - } - - updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) - if res != nil { - return *res, err - } - - if runnerPodOrContainerIsStopped(updated) { - log.Info("Detected runner to have successfully stopped", "name", runnerPod.Name) - return ctrl.Result{}, nil - } else { - log.Info("Runner can be safely deleted", "name", runnerPod.Name) - } - - // Delete current pod if recreation is needed - if err := r.Delete(ctx, updated); err != nil { - log.Error(err, "Failed to delete pod resource") - return ctrl.Result{}, err - } - - r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name)) - log.Info("Deleted runner pod", "name", runnerPod.Name) - return ctrl.Result{}, nil } diff --git a/controllers/runnerset_controller.go b/controllers/runnerset_controller.go index 75f17bea..1cab58b2 100644 --- a/controllers/runnerset_controller.go +++ b/controllers/runnerset_controller.go @@ -259,11 +259,12 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( log.V(2).Info("Detected some current statefulsets", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "statefulsets", names) } - var pending, running int + var pending, running, regTimeout int for _, ss := range currentStatefulSets { pending += ss.pending running += ss.running + regTimeout += ss.regTimeout } const defaultReplicas = 1 @@ -276,15 +277,23 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( newDesiredReplicas := getIntOrDefault(replicasOfDesiredStatefulSet, defaultReplicas) - log.V(2).Info("Found some pods across statefulset(s)", "pending", pending, "running", running, "desired", newDesiredReplicas, "statefulsets", len(statefulsets)) + log.V(2).Info( + "Found some pods across statefulset(s)", + "pending", pending, + "running", running, + "regTimeout", regTimeout, + "desired", newDesiredReplicas, + "statefulsets", len(statefulsets), + ) effectiveTime := runnerSet.Spec.EffectiveTime ephemeral := runnerSet.Spec.Ephemeral == nil || *runnerSet.Spec.Ephemeral + maybeRunning := pending + running - if newDesiredReplicas > pending+running && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { + if newDesiredReplicas > maybeRunning && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { log.V(2).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", newDesiredReplicas, "pending", pending, "running", running) - } else if newDesiredReplicas > pending+running { - num := newDesiredReplicas - (pending + running) + } else if newDesiredReplicas > maybeRunning { + num := newDesiredReplicas - maybeRunning for i := 0; i < num; i++ { // Add more replicas @@ -415,6 +424,7 @@ type podsForStatefulset struct { completed int running int terminating int + regTimeout int pending int templateHash string statefulset *appsv1.StatefulSet @@ -429,7 +439,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log return nil, err } - var completed, running, terminating, pending, total int + var completed, running, terminating, regTimeout, pending, total int var pods []corev1.Pod @@ -445,7 +455,21 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log if runnerPodOrContainerIsStopped(&pod) { completed++ } else if pod.Status.Phase == corev1.PodRunning { - running++ + if podRunnerID(&pod) == "" && podConditionTransitionTimeAfter(&pod, corev1.PodReady, registrationTimeout) { + log.Info( + "Runner failed to register itself to GitHub in timely manner. "+ + "Recreating the pod to see if it resolves the issue. "+ + "CAUTION: If you see this a lot, you should investigate the root cause. "+ + "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288", + "creationTimestamp", pod.CreationTimestamp, + "readyTransitionTime", podConditionTransitionTime(&pod, corev1.PodReady, corev1.ConditionTrue), + "configuredRegistrationTimeout", registrationTimeout, + ) + + regTimeout++ + } else { + running++ + } } else if !pod.DeletionTimestamp.IsZero() { terminating++ } else { @@ -466,6 +490,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log completed: completed, running: running, terminating: terminating, + regTimeout: regTimeout, pending: pending, templateHash: templateHash, statefulset: ss,