Fix regression where the registration-timeout check was not working for RunnerSet (#1178)
Follow-up for #1167
parent 9cc9f8c182
commit 95a5770d55
@@ -39,4 +39,10 @@ const (
 	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
 	DefaultUnregistrationRetryDelay = 30 * time.Second
+
+	// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
+	// A pod that is timed out can be terminated if needed.
+	registrationTimeout = 10 * time.Minute
+
+	defaultRegistrationCheckInterval = time.Minute
 )
@@ -217,6 +217,30 @@ func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
 	meta.Annotations[key] = value
 }
 
+func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
+	for _, c := range pod.Status.Conditions {
+		if c.Type == tpe && c.Status == v {
+			return &c.LastTransitionTime
+		}
+	}
+
+	return nil
+}
+
+func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
+	c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
+	if c == nil {
+		return false
+	}
+
+	return c.Add(d).Before(time.Now())
+}
+
+func podRunnerID(pod *corev1.Pod) string {
+	id, _ := getAnnotation(&pod.ObjectMeta, AnnotationKeyRunnerID)
+	return id
+}
+
 // unregisterRunner unregisters the runner from GitHub Actions by name.
 //
 // This function returns:
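These helpers feed the RunnerSet-side check added further down: a Running pod whose runner-ID annotation is still empty after it has been Ready for longer than registrationTimeout is treated as registration-timed-out. Below is a minimal, self-contained sketch of how podConditionTransitionTimeAfter classifies such a pod; the helper bodies are copied from the hunk above so the snippet runs on its own, and the 15-minute Ready age is only an illustrative value, not something taken from the commit.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Copied from the hunk above so this sketch is self-contained.
func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
	for _, c := range pod.Status.Conditions {
		if c.Type == tpe && c.Status == v {
			return &c.LastTransitionTime
		}
	}
	return nil
}

func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
	c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
	if c == nil {
		return false
	}
	return c.Add(d).Before(time.Now())
}

func main() {
	// A pod that became Ready 15 minutes ago (illustrative value) and still has no
	// runner-ID annotation would be classified as registration-timed-out, given the
	// 10-minute registrationTimeout constant added by this commit.
	pod := &corev1.Pod{
		Status: corev1.PodStatus{
			Conditions: []corev1.PodCondition{
				{
					Type:               corev1.PodReady,
					Status:             corev1.ConditionTrue,
					LastTransitionTime: metav1.NewTime(time.Now().Add(-15 * time.Minute)),
				},
			},
		},
	}

	fmt.Println(podConditionTransitionTimeAfter(pod, corev1.PodReady, 10*time.Minute)) // true
}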
@@ -18,13 +18,10 @@ package controllers
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	"time"
 
 	"github.com/go-logr/logr"
-	gogithub "github.com/google/go-github/v39/github"
-	"k8s.io/apimachinery/pkg/util/wait"
 
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/runtime"
@@ -190,218 +187,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		return ctrl.Result{}, nil
 	}
 
-	// If pod has ended up succeeded we need to restart it
-	// Happens e.g. when dind is in runner and run completes
-	stopped := runnerPod.Status.Phase == corev1.PodSucceeded
-
-	if !stopped {
-		if runnerPod.Status.Phase == corev1.PodRunning {
-			for _, status := range runnerPod.Status.ContainerStatuses {
-				if status.Name != containerName {
-					continue
-				}
-
-				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
-					stopped = true
-				}
-			}
-		}
-	}
-
-	restart := stopped
-
-	var registrationRecheckDelay time.Duration
-
-	// all checks done below only decide whether a restart is needed
-	// if a restart was already decided before, there is no need for the checks
-	// saving API calls and scary log messages
-	if !restart {
-		registrationCheckInterval := time.Minute
-		if r.RegistrationRecheckInterval > 0 {
-			registrationCheckInterval = r.RegistrationRecheckInterval
-		}
-
-		lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]
-
-		var lastCheckTime *time.Time
-
-		if lastCheckTimeStr != "" {
-			t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
-			if err != nil {
-				log.Error(err, "failed to parase last check time %q", lastCheckTimeStr)
-				return ctrl.Result{}, nil
-			}
-
-			lastCheckTime = &t
-		}
-
-		// We want to call ListRunners GitHub Actions API only once per runner per minute.
-		// This if block, in conjunction with:
-		//   return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
-		// achieves that.
-		if lastCheckTime != nil {
-			nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
-			now := time.Now()
-
-			// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
-			// so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is
-			// shorter than 1s
-			requeueAfter := nextCheckTime.Sub(now) - time.Second
-			if requeueAfter > 0 {
-				log.Info(
-					fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
-					"lastRegistrationCheckTime", lastCheckTime,
-					"registrationCheckInterval", registrationCheckInterval,
-				)
-
-				// Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the
-				// next sync period passes, which can be too much later than nextCheckTime.
-				//
-				// We need to requeue on this reconcilation even though we have already scheduled the initial
-				// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
-				// Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on
-				// other requeues- so the initial scheduled requeue may have been reset due to requeue on
-				// spec/status change.
-				return ctrl.Result{RequeueAfter: requeueAfter}, nil
-			}
-		}
-
-		notFound := false
-		offline := false
-
-		_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)
-
-		currentTime := time.Now()
-
-		if err != nil {
-			var notFoundException *github.RunnerNotFound
-			var offlineException *github.RunnerOffline
-			if errors.As(err, &notFoundException) {
-				notFound = true
-			} else if errors.As(err, &offlineException) {
-				offline = true
-			} else {
-				var e *gogithub.RateLimitError
-				if errors.As(err, &e) {
-					// We log the underlying error when we failed calling GitHub API to list or unregisters,
-					// or the runner is still busy.
-					log.Error(
-						err,
-						fmt.Sprintf(
-							"Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
-							retryDelayOnGitHubAPIRateLimitError,
-						),
-					)
-
-					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-				}
-
-				return ctrl.Result{}, err
-			}
-		}
-
-		registrationTimeout := 10 * time.Minute
-		durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
-		registrationDidTimeout := durationAfterRegistrationTimeout > 0
-
-		if notFound {
-			if registrationDidTimeout {
-				log.Info(
-					"Runner failed to register itself to GitHub in timely manner. "+
-						"Recreating the pod to see if it resolves the issue. "+
-						"CAUTION: If you see this a lot, you should investigate the root cause. "+
-						"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
-					"podCreationTimestamp", runnerPod.CreationTimestamp,
-					"currentTime", currentTime,
-					"configuredRegistrationTimeout", registrationTimeout,
-				)
-
-				restart = true
-			} else {
-				log.V(1).Info(
-					"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
-					"runnerName", runnerPod.Name,
-				)
-			}
-		} else if offline {
-			if registrationDidTimeout {
-				log.Info(
-					"Already existing GitHub runner still appears offline . "+
-						"Recreating the pod to see if it resolves the issue. "+
-						"CAUTION: If you see this a lot, you should investigate the root cause. ",
-					"podCreationTimestamp", runnerPod.CreationTimestamp,
-					"currentTime", currentTime,
-					"configuredRegistrationTimeout", registrationTimeout,
-				)
-
-				restart = true
-			} else {
-				log.V(1).Info(
-					"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
-					"runnerName", runnerPod.Name,
-				)
-			}
-		}
-
-		if (notFound || offline) && !registrationDidTimeout {
-			registrationRecheckJitter := 10 * time.Second
-			if r.RegistrationRecheckJitter > 0 {
-				registrationRecheckJitter = r.RegistrationRecheckJitter
-			}
-
-			registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
-		}
-	}
-
-	// Don't do anything if there's no need to restart the runner
-	if !restart {
-		// This guard enables us to update runner.Status.Phase to `Running` only after
-		// the runner is registered to GitHub.
-		if registrationRecheckDelay > 0 {
-			log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
-
-			updated := runnerPod.DeepCopy()
-			t := time.Now().Format(time.RFC3339)
-			updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t
-
-			if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
-				log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
-				return ctrl.Result{}, err
-			}
-
-			return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
-		}
-
-		// Seeing this message, you can expect the runner to become `Running` soon.
-		log.Info(
-			"Runner appears to have registered and running.",
-			"podCreationTimestamp", runnerPod.CreationTimestamp,
-		)
-
-		return ctrl.Result{}, nil
-	}
-
-	updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
-	if res != nil {
-		return *res, err
-	}
-
-	if runnerPodOrContainerIsStopped(updated) {
-		log.Info("Detected runner to have successfully stopped", "name", runnerPod.Name)
-		return ctrl.Result{}, nil
-	} else {
-		log.Info("Runner can be safely deleted", "name", runnerPod.Name)
-	}
-
-	// Delete current pod if recreation is needed
-	if err := r.Delete(ctx, updated); err != nil {
-		log.Error(err, "Failed to delete pod resource")
-		return ctrl.Result{}, err
-	}
-
-	r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
-	log.Info("Deleted runner pod", "name", runnerPod.Name)
-
 	return ctrl.Result{}, nil
 }
@@ -259,11 +259,12 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		log.V(2).Info("Detected some current statefulsets", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "statefulsets", names)
 	}
 
-	var pending, running int
+	var pending, running, regTimeout int
 
 	for _, ss := range currentStatefulSets {
 		pending += ss.pending
 		running += ss.running
+		regTimeout += ss.regTimeout
 	}
 
 	const defaultReplicas = 1
@@ -276,15 +277,23 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 
 	newDesiredReplicas := getIntOrDefault(replicasOfDesiredStatefulSet, defaultReplicas)
 
-	log.V(2).Info("Found some pods across statefulset(s)", "pending", pending, "running", running, "desired", newDesiredReplicas, "statefulsets", len(statefulsets))
+	log.V(2).Info(
+		"Found some pods across statefulset(s)",
+		"pending", pending,
+		"running", running,
+		"regTimeout", regTimeout,
+		"desired", newDesiredReplicas,
+		"statefulsets", len(statefulsets),
+	)
 
 	effectiveTime := runnerSet.Spec.EffectiveTime
 	ephemeral := runnerSet.Spec.Ephemeral == nil || *runnerSet.Spec.Ephemeral
+	maybeRunning := pending + running
 
-	if newDesiredReplicas > pending+running && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
+	if newDesiredReplicas > maybeRunning && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
 		log.V(2).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", newDesiredReplicas, "pending", pending, "running", running)
-	} else if newDesiredReplicas > pending+running {
-		num := newDesiredReplicas - (pending + running)
+	} else if newDesiredReplicas > maybeRunning {
+		num := newDesiredReplicas - maybeRunning
 
 		for i := 0; i < num; i++ {
 			// Add more replicas
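With regTimeout split out of running (see the counting hunks above and below), a registration-timed-out pod no longer counts toward maybeRunning, so the reconciler falls into the "Add more replicas" branch and can create a replacement for it. A rough sketch of that arithmetic with made-up counts; the numbers are illustrative only, not taken from the commit.

package main

import "fmt"

func main() {
	// Illustrative counts only: one pending pod, two running, one stuck past registrationTimeout.
	pending, running, regTimeout := 1, 2, 1
	newDesiredReplicas := 4

	// Registration-timed-out pods are intentionally excluded here.
	maybeRunning := pending + running // 3

	if newDesiredReplicas > maybeRunning {
		num := newDesiredReplicas - maybeRunning
		fmt.Printf("creating %d more replica(s); regTimeout=%d\n", num, regTimeout) // creating 1 more replica(s); regTimeout=1
	}
}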
@@ -415,6 +424,7 @@ type podsForStatefulset struct {
 	completed    int
 	running      int
 	terminating  int
+	regTimeout   int
 	pending      int
 	templateHash string
 	statefulset  *appsv1.StatefulSet
@@ -429,7 +439,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
 		return nil, err
 	}
 
-	var completed, running, terminating, pending, total int
+	var completed, running, terminating, regTimeout, pending, total int
 
 	var pods []corev1.Pod
@@ -445,7 +455,21 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
 		if runnerPodOrContainerIsStopped(&pod) {
 			completed++
 		} else if pod.Status.Phase == corev1.PodRunning {
-			running++
+			if podRunnerID(&pod) == "" && podConditionTransitionTimeAfter(&pod, corev1.PodReady, registrationTimeout) {
+				log.Info(
+					"Runner failed to register itself to GitHub in timely manner. "+
+						"Recreating the pod to see if it resolves the issue. "+
+						"CAUTION: If you see this a lot, you should investigate the root cause. "+
+						"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
+					"creationTimestamp", pod.CreationTimestamp,
+					"readyTransitionTime", podConditionTransitionTime(&pod, corev1.PodReady, corev1.ConditionTrue),
+					"configuredRegistrationTimeout", registrationTimeout,
+				)
+
+				regTimeout++
+			} else {
+				running++
+			}
 		} else if !pod.DeletionTimestamp.IsZero() {
 			terminating++
 		} else {
@@ -466,6 +490,7 @@ func (r *RunnerSetReconciler) getPodsForStatefulset(ctx context.Context, log log
 		completed:    completed,
 		running:      running,
 		terminating:  terminating,
+		regTimeout:   regTimeout,
 		pending:      pending,
 		templateHash: templateHash,
 		statefulset:  ss,