Fix regression where the registration-timeout check was not working for RunnerSet (#1178)
Follow-up for #1167
parent 9cc9f8c182
commit 95a5770d55
@@ -39,4 +39,10 @@ const (
 	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
 	DefaultUnregistrationRetryDelay = 30 * time.Second
+
+	// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
+	// A pod that is timed out can be terminated if needed.
+	registrationTimeout = 10 * time.Minute
+
+	defaultRegistrationCheckInterval = time.Minute
 )
@@ -217,6 +217,30 @@ func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
 	meta.Annotations[key] = value
 }
 
+func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
+	for _, c := range pod.Status.Conditions {
+		if c.Type == tpe && c.Status == v {
+			return &c.LastTransitionTime
+		}
+	}
+
+	return nil
+}
+
+func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
+	c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
+	if c == nil {
+		return false
+	}
+
+	return c.Add(d).Before(time.Now())
+}
+
+func podRunnerID(pod *corev1.Pod) string {
+	id, _ := getAnnotation(&pod.ObjectMeta, AnnotationKeyRunnerID)
+	return id
+}
+
 // unregisterRunner unregisters the runner from GitHub Actions by name.
 //
 // This function returns:
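These helpers feed the RunnerSet-side check added further down: a Running pod whose runner-ID annotation is still empty after it has been Ready for longer than registrationTimeout is treated as registration-timed-out. Below is a minimal, self-contained sketch of how podConditionTransitionTimeAfter classifies such a pod; the helper bodies are copied from the hunk above so the snippet runs on its own, and the 15-minute Ready age is only an illustrative value, not something taken from the commit.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Copied from the hunk above so this sketch is self-contained.
func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
	for _, c := range pod.Status.Conditions {
		if c.Type == tpe && c.Status == v {
			return &c.LastTransitionTime
		}
	}
	return nil
}

func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
	c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
	if c == nil {
		return false
	}
	return c.Add(d).Before(time.Now())
}

func main() {
	// A pod that became Ready 15 minutes ago (illustrative value) and still has no
	// runner-ID annotation would be classified as registration-timed-out, given the
	// 10-minute registrationTimeout constant added by this commit.
	pod := &corev1.Pod{
		Status: corev1.PodStatus{
			Conditions: []corev1.PodCondition{
				{
					Type:               corev1.PodReady,
					Status:             corev1.ConditionTrue,
					LastTransitionTime: metav1.NewTime(time.Now().Add(-15 * time.Minute)),
				},
			},
		},
	}

	fmt.Println(podConditionTransitionTimeAfter(pod, corev1.PodReady, 10*time.Minute)) // true
}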
@@ -18,13 +18,10 @@ package controllers
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	"time"
 
 	"github.com/go-logr/logr"
-	gogithub "github.com/google/go-github/v39/github"
-	"k8s.io/apimachinery/pkg/util/wait"
 
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -190,218 +187,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		return ctrl.Result{}, nil
 	}
 
-	// If pod has ended up succeeded we need to restart it
-	// Happens e.g. when dind is in runner and run completes
-	stopped := runnerPod.Status.Phase == corev1.PodSucceeded
-
-	if !stopped {
-		if runnerPod.Status.Phase == corev1.PodRunning {
-			for _, status := range runnerPod.Status.ContainerStatuses {
-				if status.Name != containerName {
-					continue
-				}
-
-				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
-					stopped = true
-				}
-			}
-		}
-	}
-
-	restart := stopped
-
-	var registrationRecheckDelay time.Duration
-
-	// all checks done below only decide whether a restart is needed
-	// if a restart was already decided before, there is no need for the checks
-	// saving API calls and scary log messages
-	if !restart {
-		registrationCheckInterval := time.Minute
-		if r.RegistrationRecheckInterval > 0 {
-			registrationCheckInterval = r.RegistrationRecheckInterval
-		}
-
-		lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]
-
-		var lastCheckTime *time.Time
-
-		if lastCheckTimeStr != "" {
-			t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
-			if err != nil {
-				log.Error(err, "failed to parase last check time %q", lastCheckTimeStr)
-				return ctrl.Result{}, nil
-			}
-
-			lastCheckTime = &t
-		}
-
-		// We want to call ListRunners GitHub Actions API only once per runner per minute.
-		// This if block, in conjunction with:
-		//   return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
-		// achieves that.
-		if lastCheckTime != nil {
-			nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
-			now := time.Now()
-
-			// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
-			// so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is
-			// shorter than 1s
-			requeueAfter := nextCheckTime.Sub(now) - time.Second
-			if requeueAfter > 0 {
-				log.Info(
-					fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
-					"lastRegistrationCheckTime", lastCheckTime,
-					"registrationCheckInterval", registrationCheckInterval,
-				)
-
-				// Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the
-				// next sync period passes, which can be too much later than nextCheckTime.
-				//
-				// We need to requeue on this reconcilation even though we have already scheduled the initial
-				// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
-				// Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on
-				// other requeues- so the initial scheduled requeue may have been reset due to requeue on
-				// spec/status change.
-				return ctrl.Result{RequeueAfter: requeueAfter}, nil
-			}
-		}
-
-		notFound := false
-		offline := false
-
-		_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)
-
-		currentTime := time.Now()
-
-		if err != nil {
-			var notFoundException *github.RunnerNotFound
-			var offlineException *github.RunnerOffline
-			if errors.As(err, &notFoundException) {
-				notFound = true
-			} else if errors.As(err, &offlineException) {
-				offline = true
-			} else {
-				var e *gogithub.RateLimitError
-				if errors.As(err, &e) {
-					// We log the underlying error when we failed calling GitHub API to list or unregisters,
-					// or the runner is still busy.
-					log.Error(
-						err,
-						fmt.Sprintf(
-							"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
-							retryDelayOnGitHubAPIRateLimitError,
-						),
-					)
-
-					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-				}
-
-				return ctrl.Result{}, err
-			}
-		}
-
-		registrationTimeout := 10 * time.Minute
-		durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
-		registrationDidTimeout := durationAfterRegistrationTimeout > 0
-
-		if notFound {
-			if registrationDidTimeout {
-				log.Info(
-					"Runner failed to register itself to GitHub in timely manner. "+
-						"Recreating the pod to see if it resolves the issue. "+
-						"CAUTION: If you see this a lot, you should investigate the root cause. "+
-						"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
-					"podCreationTimestamp", runnerPod.CreationTimestamp,
-					"currentTime", currentTime,
-					"configuredRegistrationTimeout", registrationTimeout,
-				)
-
-				restart = true
-			} else {
-				log.V(1).Info(
-					"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
-					"runnerName", runnerPod.Name,
-				)
-			}
-		} else if offline {
-			if registrationDidTimeout {
-				log.Info(
-					"Already existing GitHub runner still appears offline . "+
-						"Recreating the pod to see if it resolves the issue. "+
-						"CAUTION: If you see this a lot, you should investigate the root cause. ",
-					"podCreationTimestamp", runnerPod.CreationTimestamp,
-					"currentTime", currentTime,
-					"configuredRegistrationTimeout", registrationTimeout,
-				)
-
-				restart = true
-			} else {
-				log.V(1).Info(
-					"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
-					"runnerName", runnerPod.Name,
-				)
-			}
-		}
-
-		if (notFound || offline) && !registrationDidTimeout {
-			registrationRecheckJitter := 10 * time.Second
-			if r.RegistrationRecheckJitter > 0 {
-				registrationRecheckJitter = r.RegistrationRecheckJitter
-			}
-
-			registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
-		}
-	}
-
-	// Don't do anything if there's no need to restart the runner
-	if !restart {
-		// This guard enables us to update runner.Status.Phase to `Running` only after
-		// the runner is registered to GitHub.
-		if registrationRecheckDelay > 0 {
-			log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
-
-			updated := runnerPod.DeepCopy()
-			t := time.Now().Format(time.RFC3339)
-			updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t
-
-			if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
-				log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
-				return ctrl.Result{}, err
-			}
-
-			return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
-		}
-
-		// Seeing this message, you can expect the runner to become `Running` soon.
-		log.Info(
-			"Runner appears to have registered and running.",
-			"podCreationTimestamp", runnerPod.CreationTimestamp,
-		)
-
-		return ctrl.Result{}, nil
-	}
-
-	updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
-	if res != nil {
-		return *res, err
-	}
-
-	if runnerPodOrContainerIsStopped(updated) {
-		log.Info("Detected runner to have successfully stopped", "name", runnerPod.Name)
-		return ctrl.Result{}, nil
-	} else {
-		log.Info("Runner can be safely deleted", "name", runnerPod.Name)
-	}
-
-	// Delete current pod if recreation is needed
-	if err := r.Delete(ctx, updated); err != nil {
-		log.Error(err, "Failed to delete pod resource")
-		return ctrl.Result{}, err
-	}
-
-	r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
-	log.Info("Deleted runner pod", "name", runnerPod.Name)
-
 	return ctrl.Result{}, nil
 }
@@ -259,11 +259,12 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		log.V(2).Info("Detected some current statefulsets", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "statefulsets", names)
 	}
 
-	var pending, running int
+	var pending, running, regTimeout int
 
 	for _, ss := range currentStatefulSets {
 		pending += ss.pending
 		running += ss.running
+		regTimeout += ss.regTimeout
 	}
 
 	const defaultReplicas = 1
@@ -276,15 +277,23 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 
 	newDesiredReplicas := getIntOrDefault(replicasOfDesiredStatefulSet, defaultReplicas)
 
-	log.V(2).Info("Found some pods across statefulset(s)", "pending", pending, "running", running, "desired", newDesiredReplicas, "statefulsets", len(statefulsets))
+	log.V(2).Info(
+		"Found some pods across statefulset(s)",
+		"pending", pending,
+		"running", running,
+		"regTimeout", regTimeout,
+		"desired", newDesiredReplicas,
+		"statefulsets", len(statefulsets),
+	)
 
 	effectiveTime := runnerSet.Spec.EffectiveTime
 	ephemeral := runnerSet.Spec.Ephemeral == nil || *runnerSet.Spec.Ephemeral
+	maybeRunning := pending + running
 
-	if newDesiredReplicas > pending+running && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
+	if newDesiredReplicas > maybeRunning && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
 		log.V(2).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", newDesiredReplicas, "pending", pending, "running", running)
-	} else if newDesiredReplicas > pending+running {
-		num := newDesiredReplicas - (pending + running)
+	} else if newDesiredReplicas > maybeRunning {
+		num := newDesiredReplicas - maybeRunning
 
 		for i := 0; i < num; i++ {
 			// Add more replicas
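With regTimeout split out of running (see the counting hunks above and below), a registration-timed-out pod no longer counts toward maybeRunning, so the reconciler falls into the "Add more replicas" branch and can create a replacement for it. A rough sketch of that arithmetic with made-up counts; the numbers are illustrative only, not taken from the commit.

package main

import "fmt"

func main() {
	// Illustrative counts only: one pending pod, two running, one stuck past registrationTimeout.
	pending, running, regTimeout := 1, 2, 1
	newDesiredReplicas := 4

	// Registration-timed-out pods are intentionally excluded here.
	maybeRunning := pending + running // 3

	if newDesiredReplicas > maybeRunning {
		num := newDesiredReplicas - maybeRunning
		fmt.Printf("creating %d more replica(s); regTimeout=%d\n", num, regTimeout) // creating 1 more replica(s); regTimeout=1
	}
}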
@@ -415,6 +424,7 @@ type podsForStatefulset struct {
 	completed    int
 	running      int
 	terminating  int
+	regTimeout   int
 	pending      int
 	templateHash string
 	statefulset  *appsv1.StatefulSet
@@ -429,7 +439,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
 		return nil, err
 	}
 
-	var completed, running, terminating, pending, total int
+	var completed, running, terminating, regTimeout, pending, total int
 
 	var pods []corev1.Pod
@@ -445,7 +455,21 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
 		if runnerPodOrContainerIsStopped(&pod) {
 			completed++
 		} else if pod.Status.Phase == corev1.PodRunning {
-			running++
+			if podRunnerID(&pod) == "" && podConditionTransitionTimeAfter(&pod, corev1.PodReady, registrationTimeout) {
+				log.Info(
+					"Runner failed to register itself to GitHub in timely manner. "+
+						"Recreating the pod to see if it resolves the issue. "+
+						"CAUTION: If you see this a lot, you should investigate the root cause. "+
+						"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
+					"creationTimestamp", pod.CreationTimestamp,
+					"readyTransitionTime", podConditionTransitionTime(&pod, corev1.PodReady, corev1.ConditionTrue),
+					"configuredRegistrationTimeout", registrationTimeout,
+				)
+
+				regTimeout++
+			} else {
+				running++
+			}
 		} else if !pod.DeletionTimestamp.IsZero() {
 			terminating++
 		} else {
@@ -466,6 +490,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
 		completed:    completed,
 		running:      running,
 		terminating:  terminating,
+		regTimeout:   regTimeout,
 		pending:      pending,
 		templateHash: templateHash,
 		statefulset:  ss,