fix: Add a sync period to ensure we don't overload the GitHub rate limit

This commit is contained in:
Rob Howie 2026-03-17 17:20:24 +00:00
parent ff7b81dc8f
commit 471f81d0b3
3 changed files with 51 additions and 14 deletions

View File

@ -75,8 +75,9 @@ const DefaultScaleSetListenerLogFormat = string(logging.LogFormatText)
// resourceOwnerKey is a field selector matching the owner name of a particular resource
const resourceOwnerKey = ".metadata.controller"
// EphemeralRunner failure reasons recorded on the status when a runner is
// marked failed (pod-level failures and GitHub-side registration loss).
const (
	ReasonTooManyPodFailures      = "TooManyPodFailures"
	ReasonInvalidPodFailure       = "InvalidPod"
	ReasonRegistrationInvalidated = "RegistrationInvalidated"
)

View File

@ -23,6 +23,7 @@ import (
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
@ -44,12 +45,22 @@ const (
ephemeralRunnerActionsFinalizerName = "ephemeralrunner.actions.github.com/runner-registration-finalizer"
)
// defaultHealthCheckInterval is the fallback throttle period between GitHub
// registration health checks per runner, used when HealthCheckInterval is unset.
const defaultHealthCheckInterval = 5 * time.Minute
// EphemeralRunnerReconciler reconciles an EphemeralRunner object.
type EphemeralRunnerReconciler struct {
	client.Client
	// Log is the base logger for this reconciler.
	Log logr.Logger
	// Scheme maps Go types to Kubernetes GroupVersionKinds.
	Scheme *runtime.Scheme
	ResourceBuilder
	// HealthCheckInterval controls how often the GitHub registration
	// health check runs per runner. Defaults to defaultHealthCheckInterval.
	HealthCheckInterval time.Duration
	// healthCheckLastRun tracks the last time each runner's registration
	// was checked, keyed by "namespace/name".
	healthCheckLastRun sync.Map
}
// precompute backoff durations for failed ephemeral runners
@ -82,6 +93,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
ephemeralRunner := new(v1alpha1.EphemeralRunner)
if err := r.Get(ctx, req.NamespacedName, ephemeralRunner); err != nil {
if client.IgnoreNotFound(err) == nil {
r.healthCheckLastRun.Delete(req.Namespace + "/" + req.Name)
}
return ctrl.Result{}, client.IgnoreNotFound(err)
}
@ -357,16 +371,19 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, err
}
// Check if runner's GitHub registration is still valid
healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log)
if err != nil {
return ctrl.Result{}, err
}
if !healthy {
if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", "RegistrationInvalidated", log); err != nil {
// Only check registration when the container is actually running
// (not in Waiting states like ImagePullBackOff or CrashLoopBackOff)
if cs.State.Running != nil {
healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log)
if err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
if !healthy {
if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", ReasonRegistrationInvalidated, log); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
}
return ctrl.Result{}, nil
@ -817,15 +834,31 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
return nil
}
// healthCheckIntervalOrDefault resolves the effective throttle period for
// GitHub registration health checks: the configured HealthCheckInterval when
// positive, otherwise defaultHealthCheckInterval.
func (r *EphemeralRunnerReconciler) healthCheckIntervalOrDefault() time.Duration {
	interval := r.HealthCheckInterval
	if interval <= 0 {
		interval = defaultHealthCheckInterval
	}
	return interval
}
// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid.
// Returns true if the runner is healthy or we can't determine status (API errors).
// Returns false if the registration is confirmed gone (404).
// Calls are throttled to at most once per HealthCheckInterval per runner.
func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) {
// Only check runners that have been registered (have a RunnerId)
if ephemeralRunner.Status.RunnerId == 0 {
return true, nil
}
// Throttle: skip if we checked this runner recently
key := ephemeralRunner.Namespace + "/" + ephemeralRunner.Name
if lastRun, ok := r.healthCheckLastRun.Load(key); ok {
if time.Since(lastRun.(time.Time)) < r.healthCheckIntervalOrDefault() {
return true, nil
}
}
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
log.Error(err, "Failed to get actions client for health check, skipping")
@ -833,6 +866,7 @@ func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context,
}
_, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
r.healthCheckLastRun.Store(key, time.Now())
if err == nil {
return true, nil
}
@ -847,7 +881,7 @@ func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context,
}
// For non-404 errors (transient API issues), don't take action
log.Info("Health check API call failed, will retry on next reconciliation",
log.Info("Health check API call failed, will retry after health check interval",
"runnerId", ephemeralRunner.Status.RunnerId,
"error", err.Error(),
)

View File

@ -1460,7 +1460,8 @@ var _ = Describe("EphemeralRunner", func() {
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
Log: logf.Log,
HealthCheckInterval: 1 * time.Second,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
@ -1557,7 +1558,8 @@ var _ = Describe("EphemeralRunner", func() {
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
Log: logf.Log,
HealthCheckInterval: 1 * time.Second,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),