From 471f81d0b3cd0a5826e7ab17d7a6893aa223e3ee Mon Sep 17 00:00:00 2001 From: Rob Howie Date: Tue, 17 Mar 2026 17:20:24 +0000 Subject: [PATCH] fix: Add a sync period to ensure we don't overload the github rate limit --- controllers/actions.github.com/constants.go | 7 +-- .../ephemeralrunner_controller.go | 52 +++++++++++++++---- .../ephemeralrunner_controller_test.go | 6 ++- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/controllers/actions.github.com/constants.go b/controllers/actions.github.com/constants.go index ad1841fc..6b4f7690 100644 --- a/controllers/actions.github.com/constants.go +++ b/controllers/actions.github.com/constants.go @@ -75,8 +75,9 @@ const DefaultScaleSetListenerLogFormat = string(logging.LogFormatText) // ownerKey is field selector matching the owner name of a particular resource const resourceOwnerKey = ".metadata.controller" -// EphemeralRunner pod creation failure reasons +// EphemeralRunner failure reasons const ( - ReasonTooManyPodFailures = "TooManyPodFailures" - ReasonInvalidPodFailure = "InvalidPod" + ReasonTooManyPodFailures = "TooManyPodFailures" + ReasonInvalidPodFailure = "InvalidPod" + ReasonRegistrationInvalidated = "RegistrationInvalidated" ) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 5e4dfbc4..4282b741 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -23,6 +23,7 @@ import ( "net/http" "strconv" "strings" + "sync" "time" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -44,12 +45,22 @@ const ( ephemeralRunnerActionsFinalizerName = "ephemeralrunner.actions.github.com/runner-registration-finalizer" ) +const defaultHealthCheckInterval = 5 * time.Minute + // EphemeralRunnerReconciler reconciles a EphemeralRunner object type EphemeralRunnerReconciler struct { client.Client Log logr.Logger Scheme *runtime.Scheme ResourceBuilder + + // HealthCheckInterval controls how often the GitHub registration + // health check runs per runner. Defaults to defaultHealthCheckInterval. + HealthCheckInterval time.Duration + + // healthCheckLastRun tracks the last time each runner's registration + // was checked, keyed by "namespace/name". + healthCheckLastRun sync.Map } // precompute backoff durations for failed ephemeral runners @@ -82,6 +93,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ ephemeralRunner := new(v1alpha1.EphemeralRunner) if err := r.Get(ctx, req.NamespacedName, ephemeralRunner); err != nil { + if client.IgnoreNotFound(err) == nil { + r.healthCheckLastRun.Delete(req.Namespace + "/" + req.Name) + } return ctrl.Result{}, client.IgnoreNotFound(err) } @@ -357,16 +371,19 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, err } - // Check if runner's GitHub registration is still valid - healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log) - if err != nil { - return ctrl.Result{}, err - } - if !healthy { - if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", "RegistrationInvalidated", log); err != nil { + // Only check registration when the container is actually running + // (not in Waiting states like ImagePullBackOff or CrashLoopBackOff) + if cs.State.Running != nil { + healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log) + if err != nil { return ctrl.Result{}, err } - return ctrl.Result{}, nil + if !healthy { + if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", ReasonRegistrationInvalidated, log); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } } return ctrl.Result{}, nil @@ -817,15 +834,31 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context, return nil } +func (r *EphemeralRunnerReconciler) healthCheckIntervalOrDefault() time.Duration { + if r.HealthCheckInterval > 0 { + return r.HealthCheckInterval + } + return defaultHealthCheckInterval +} + // checkRunnerRegistration verifies the runner's GitHub-side registration is still valid. // Returns true if the runner is healthy or we can't determine status (API errors). // Returns false if the registration is confirmed gone (404). +// Calls are throttled to at most once per HealthCheckInterval per runner. func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) { // Only check runners that have been registered (have a RunnerId) if ephemeralRunner.Status.RunnerId == 0 { return true, nil } + // Throttle: skip if we checked this runner recently + key := ephemeralRunner.Namespace + "/" + ephemeralRunner.Name + if lastRun, ok := r.healthCheckLastRun.Load(key); ok { + if time.Since(lastRun.(time.Time)) < r.healthCheckIntervalOrDefault() { + return true, nil + } + } + actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) if err != nil { log.Error(err, "Failed to get actions client for health check, skipping") @@ -833,6 +866,7 @@ func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, } _, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId)) + r.healthCheckLastRun.Store(key, time.Now()) if err == nil { return true, nil } @@ -847,7 +881,7 @@ func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, } // For non-404 errors (transient API issues), don't take action - log.Info("Health check API call failed, will retry on next reconciliation", + log.Info("Health check API call failed, will retry after health check interval", "runnerId", ephemeralRunner.Status.RunnerId, "error", err.Error(), ) diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index af3d6579..bc1db8bd 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -1460,7 +1460,8 @@ var _ = Describe("EphemeralRunner", func() { controller = &EphemeralRunnerReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), - Log: logf.Log, + Log: logf.Log, + HealthCheckInterval: 1 * time.Second, ResourceBuilder: ResourceBuilder{ SecretResolver: &SecretResolver{ k8sClient: mgr.GetClient(), @@ -1557,7 +1558,8 @@ var _ = Describe("EphemeralRunner", func() { controller = &EphemeralRunnerReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), - Log: logf.Log, + Log: logf.Log, + HealthCheckInterval: 1 * time.Second, ResourceBuilder: ResourceBuilder{ SecretResolver: &SecretResolver{ k8sClient: mgr.GetClient(),