Fix regression that registration-timeout check was not working for runnerset (#1178)

Follow-up for #1167
Yusuke Kuoka 2022-03-05 19:31:05 +09:00 committed by GitHub
parent 9cc9f8c182
commit 95a5770d55
4 changed files with 62 additions and 222 deletions


@@ -39,4 +39,10 @@ const (
// This can be any value, but a larger value can make the effective unregistration timeout longer than configured.
DefaultUnregistrationRetryDelay = 30 * time.Second
// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
// A pod that is timed out can be terminated if needed.
registrationTimeout = 10 * time.Minute
// defaultRegistrationCheckInterval is the default interval between
// consecutive runner registration checks against the GitHub API.
defaultRegistrationCheckInterval = time.Minute
)
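
As a rough sketch of how registrationTimeout is meant to be applied (the helper name and the standalone program below are illustrative, not part of this commit):

package main

import (
	"fmt"
	"time"
)

// registrationTimeout mirrors the constant above.
const registrationTimeout = 10 * time.Minute

// registrationTimedOut reports whether a pod that became Ready at readyTime
// has exceeded the registration timeout without the runner registering.
// Sketch only: in the controller, readyTime comes from the pod's Ready
// condition transition time.
func registrationTimedOut(readyTime, now time.Time) bool {
	return readyTime.Add(registrationTimeout).Before(now)
}

func main() {
	ready := time.Now().Add(-11 * time.Minute)
	fmt.Println(registrationTimedOut(ready, time.Now())) // true: 11m since Ready > 10m timeout
}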


@@ -217,6 +217,30 @@ func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
meta.Annotations[key] = value
}
// podConditionTransitionTime returns the LastTransitionTime of the first pod
// condition matching the given type and status, or nil if none matches.
func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
for _, c := range pod.Status.Conditions {
if c.Type == tpe && c.Status == v {
return &c.LastTransitionTime
}
}
return nil
}
// podConditionTransitionTimeAfter reports whether the pod condition of the
// given type transitioned to True more than d ago.
func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
if c == nil {
return false
}
return c.Add(d).Before(time.Now())
}
// podRunnerID returns the runner ID recorded in the pod annotation, or an
// empty string if the runner has not registered itself yet.
func podRunnerID(pod *corev1.Pod) string {
id, _ := getAnnotation(&pod.ObjectMeta, AnnotationKeyRunnerID)
return id
}
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
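
Taken together, these helpers support the runnerset-side registration-timeout check restored by this commit: a Running pod that has been Ready longer than registrationTimeout yet still carries no runner ID annotation has presumably failed to register. A minimal sketch, assuming it compiles in the same package as the helpers above (needsRecreation is a hypothetical name):

// needsRecreation reports whether a Running pod has been Ready for longer
// than timeout without ever reporting a runner ID, i.e. it likely missed
// its registration window and is a candidate for recreation. Sketch only.
func needsRecreation(pod *corev1.Pod, timeout time.Duration) bool {
	return pod.Status.Phase == corev1.PodRunning &&
		podRunnerID(pod) == "" &&
		podConditionTransitionTimeAfter(pod, corev1.PodReady, timeout)
}

For a pod whose Ready condition transitioned to True 11 minutes ago and whose runner ID annotation is still empty, needsRecreation(pod, 10*time.Minute) would return true.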


@@ -18,13 +18,10 @@ package controllers
import (
"context"
"errors"
"fmt"
"time"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
"k8s.io/apimachinery/pkg/util/wait"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
@@ -190,218 +187,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
// If the pod has ended up Succeeded, we need to restart it.
// This happens e.g. when dind runs in the runner and the run completes.
stopped := runnerPod.Status.Phase == corev1.PodSucceeded
if !stopped {
if runnerPod.Status.Phase == corev1.PodRunning {
for _, status := range runnerPod.Status.ContainerStatuses {
if status.Name != containerName {
continue
}
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
stopped = true
}
}
}
}
restart := stopped
var registrationRecheckDelay time.Duration
// All the checks below only decide whether a restart is needed.
// If a restart was already decided above, there is no need to run them,
// which saves API calls and scary log messages.
if !restart {
registrationCheckInterval := time.Minute
if r.RegistrationRecheckInterval > 0 {
registrationCheckInterval = r.RegistrationRecheckInterval
}
lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]
var lastCheckTime *time.Time
if lastCheckTimeStr != "" {
t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
if err != nil {
log.Error(err, "failed to parse last check time", "lastRegistrationCheckTime", lastCheckTimeStr)
return ctrl.Result{}, nil
}
lastCheckTime = &t
}
// We want to call the ListRunners GitHub Actions API only once per runner per minute.
// This if block, in conjunction with:
// return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
// achieves that.
if lastCheckTime != nil {
nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
now := time.Now()
// A requeue scheduled via RequeueAfter can fire a bit early (by dozens of milliseconds),
// so to avoid excessive, ineffective retries we heuristically ignore the remaining delay
// when it is shorter than 1s.
requeueAfter := nextCheckTime.Sub(now) - time.Second
if requeueAfter > 0 {
log.Info(
fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
"lastRegistrationCheckTime", lastCheckTime,
"registrationCheckInterval", registrationCheckInterval,
)
// Without RequeueAfter, the controller may not retry on schedule. Instead, it would have to
// wait until the next sync period passes, which can be much later than nextCheckTime.
//
// We need to requeue on this reconciliation even though we have already scheduled the initial
// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
// Apparently, the workqueue used by controller-runtime deduplicates items and resets the delay
// on other requeues, so the initial scheduled requeue may have been reset by a requeue on a
// spec/status change.
return ctrl.Result{RequeueAfter: requeueAfter}, nil
}
}
notFound := false
offline := false
_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)
currentTime := time.Now()
if err != nil {
var notFoundException *github.RunnerNotFound
var offlineException *github.RunnerOffline
if errors.As(err, &notFoundException) {
notFound = true
} else if errors.As(err, &offlineException) {
offline = true
} else {
var e *gogithub.RateLimitError
if errors.As(err, &e) {
// We log the underlying error when we fail to call the GitHub API to list or unregister
// runners, or when the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
}
registrationTimeout := 10 * time.Minute
durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
registrationDidTimeout := durationAfterRegistrationTimeout > 0
if notFound {
if registrationDidTimeout {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
"podCreationTimestamp", runnerPod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
} else {
log.V(1).Info(
"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
"runnerName", runnerPod.Name,
)
}
} else if offline {
if registrationDidTimeout {
log.Info(
"Already existing GitHub runner still appears offline . "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. ",
"podCreationTimestamp", runnerPod.CreationTimestamp,
"currentTime", currentTime,
"configuredRegistrationTimeout", registrationTimeout,
)
restart = true
} else {
log.V(1).Info(
"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
"runnerName", runnerPod.Name,
)
}
}
if (notFound || offline) && !registrationDidTimeout {
registrationRecheckJitter := 10 * time.Second
if r.RegistrationRecheckJitter > 0 {
registrationRecheckJitter = r.RegistrationRecheckJitter
}
registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
}
}
// Don't do anything if there's no need to restart the runner
if !restart {
// This guard enables us to update runner.Status.Phase to `Running` only after
// the runner is registered to GitHub.
if registrationRecheckDelay > 0 {
log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
updated := runnerPod.DeepCopy()
t := time.Now().Format(time.RFC3339)
updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t
if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
return ctrl.Result{}, err
}
return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
}
// Seeing this message, you can expect the runner to become `Running` soon.
log.Info(
"Runner appears to have registered and running.",
"podCreationTimestamp", runnerPod.CreationTimestamp,
)
return ctrl.Result{}, nil
}
updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
if runnerPodOrContainerIsStopped(updated) {
log.Info("Detected runner to have successfully stopped", "name", runnerPod.Name)
return ctrl.Result{}, nil
} else {
log.Info("Runner can be safely deleted", "name", runnerPod.Name)
}
// Delete current pod if recreation is needed
if err := r.Delete(ctx, updated); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
log.Info("Deleted runner pod", "name", runnerPod.Name)
return ctrl.Result{}, nil
}
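
The last-check-time gating shown above — parse AnnotationKeyLastRegistrationCheckTime, compare against the recheck interval, and requeue for the remainder — can be sketched in isolation like this (names and the standalone program are illustrative):

package main

import (
	"fmt"
	"time"
)

// nextRegistrationCheck sketches the gating logic: given the RFC3339 value
// of the last-check annotation, it returns how long to requeue before the
// next GitHub API call, or zero if a check is due now. Remaining delays
// under one second are ignored, matching the heuristic above.
func nextRegistrationCheck(lastCheck string, interval time.Duration, now time.Time) (time.Duration, error) {
	if lastCheck == "" {
		return 0, nil // never checked: check immediately
	}
	t, err := time.Parse(time.RFC3339, lastCheck)
	if err != nil {
		return 0, err
	}
	requeueAfter := t.Add(interval).Sub(now) - time.Second
	if requeueAfter > 0 {
		return requeueAfter, nil
	}
	return 0, nil
}

func main() {
	last := time.Now().Add(-20 * time.Second).Format(time.RFC3339)
	d, _ := nextRegistrationCheck(last, time.Minute, time.Now())
	fmt.Println(d) // roughly 39s: defer the next ListRunners call until then
}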


@@ -259,11 +259,12 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
log.V(2).Info("Detected some current statefulsets", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "statefulsets", names)
}
var pending, running int
var pending, running, regTimeout int
for _, ss := range currentStatefulSets {
pending += ss.pending
running += ss.running
regTimeout += ss.regTimeout
}
const defaultReplicas = 1
@@ -276,15 +277,23 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
newDesiredReplicas := getIntOrDefault(replicasOfDesiredStatefulSet, defaultReplicas)
log.V(2).Info("Found some pods across statefulset(s)", "pending", pending, "running", running, "desired", newDesiredReplicas, "statefulsets", len(statefulsets))
log.V(2).Info(
"Found some pods across statefulset(s)",
"pending", pending,
"running", running,
"regTimeout", regTimeout,
"desired", newDesiredReplicas,
"statefulsets", len(statefulsets),
)
effectiveTime := runnerSet.Spec.EffectiveTime
ephemeral := runnerSet.Spec.Ephemeral == nil || *runnerSet.Spec.Ephemeral
maybeRunning := pending + running
if newDesiredReplicas > pending+running && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
if newDesiredReplicas > maybeRunning && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
log.V(2).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", newDesiredReplicas, "pending", pending, "running", running)
} else if newDesiredReplicas > pending+running {
num := newDesiredReplicas - (pending + running)
} else if newDesiredReplicas > maybeRunning {
num := newDesiredReplicas - maybeRunning
for i := 0; i < num; i++ {
// Add more replicas
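
The scale-up decision above reduces to the following hedged sketch (names and the standalone program are illustrative): create replicas only for the shortfall, and skip scale-up entirely for ephemeral runners whose disappearance since the last sync is assumed to be run completion rather than failure:

package main

import (
	"fmt"
	"time"
)

// replicasToAdd sketches the decision above: scale up by the shortfall,
// except that for ephemeral runners a shortfall observed after the last
// sync is assumed to come from runner completions and is deferred until
// EffectiveTime is updated again.
func replicasToAdd(desired, maybeRunning int, ephemeral bool, lastSync, effective *time.Time) int {
	shortfall := desired - maybeRunning
	if shortfall <= 0 {
		return 0
	}
	if ephemeral && lastSync != nil && effective != nil && lastSync.After(*effective) {
		return 0
	}
	return shortfall
}

func main() {
	now := time.Now()
	earlier := now.Add(-time.Minute)
	fmt.Println(replicasToAdd(3, 1, true, &now, &earlier)) // 0: wait for a new EffectiveTime
	fmt.Println(replicasToAdd(3, 1, true, &earlier, &now)) // 2: genuinely short on runners
}
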
@@ -415,6 +424,7 @@ type podsForStatefulset struct {
completed int
running int
terminating int
regTimeout int
pending int
templateHash string
statefulset *appsv1.StatefulSet
@@ -429,7 +439,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
return nil, err
}
var completed, running, terminating, pending, total int
var completed, running, terminating, regTimeout, pending, total int
var pods []corev1.Pod
@@ -445,7 +455,21 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
if runnerPodOrContainerIsStopped(&pod) {
completed++
} else if pod.Status.Phase == corev1.PodRunning {
running++
if podRunnerID(&pod) == "" && podConditionTransitionTimeAfter(&pod, corev1.PodReady, registrationTimeout) {
log.Info(
"Runner failed to register itself to GitHub in timely manner. "+
"Recreating the pod to see if it resolves the issue. "+
"CAUTION: If you see this a lot, you should investigate the root cause. "+
"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
"creationTimestamp", pod.CreationTimestamp,
"readyTransitionTime", podConditionTransitionTime(&pod, corev1.PodReady, corev1.ConditionTrue),
"configuredRegistrationTimeout", registrationTimeout,
)
regTimeout++
} else {
running++
}
} else if !pod.DeletionTimestamp.IsZero() {
terminating++
} else {
@@ -466,6 +490,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
completed: completed,
running: running,
terminating: terminating,
regTimeout: regTimeout,
pending: pending,
templateHash: templateHash,
statefulset: ss,
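
Read end to end, the per-pod bucketing in getPodsForStatefulset amounts to the following sketch (classifyPod is a hypothetical helper, assumed to live in the same package as the functions shown earlier):

// classifyPod reproduces the counting order above: completed first, then the
// registration-timeout case among Running pods, then running, terminating,
// and finally pending. Sketch only; the real loop increments counters.
func classifyPod(pod *corev1.Pod) string {
	switch {
	case runnerPodOrContainerIsStopped(pod):
		return "completed"
	case pod.Status.Phase == corev1.PodRunning &&
		podRunnerID(pod) == "" &&
		podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout):
		return "regTimeout"
	case pod.Status.Phase == corev1.PodRunning:
		return "running"
	case !pod.DeletionTimestamp.IsZero():
		return "terminating"
	default:
		return "pending"
	}
}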