This commit is contained in:
Rob Howie 2026-03-16 10:55:09 +00:00 committed by GitHub
commit 317706ab18
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 273 additions and 0 deletions

View File

@ -357,6 +357,19 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
return ctrl.Result{}, err
}
// Check if runner's GitHub registration is still valid
healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log)
if err != nil {
return ctrl.Result{}, err
}
if !healthy {
if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", "RegistrationInvalidated", log); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
return ctrl.Result{}, nil
case cs.State.Terminated.ExitCode != 0: // failed
@ -805,6 +818,43 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
return nil
}
// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid.
// Returns true if the runner is healthy or we can't determine status (API errors).
// Returns false if the registration is confirmed gone (404).
func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) {
// Only check runners that have been registered (have a RunnerId)
if ephemeralRunner.Status.RunnerId == 0 {
return true, nil
}
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
log.Error(err, "Failed to get actions client for health check, skipping")
return true, nil
}
_, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
if err == nil {
return true, nil
}
var actionsErr *actions.ActionsError
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
log.Info("Runner registration not found on GitHub, marking as failed",
"runnerId", ephemeralRunner.Status.RunnerId,
"runnerName", ephemeralRunner.Status.RunnerName,
)
return false, nil
}
// For non-404 errors (transient API issues), don't take action
log.Info("Health check API call failed, will retry on next reconciliation",
"runnerId", ephemeralRunner.Status.RunnerId,
"error", err.Error(),
)
return true, nil
}
func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
client, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
@ -814,6 +864,11 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context,
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId)
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
if err != nil {
var actionsErr *actions.ActionsError
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
log.Info("Runner already removed from service", "runnerId", ephemeralRunner.Status.RunnerId)
return nil
}
return fmt.Errorf("failed to remove runner from the service: %w", err)
}

View File

@ -1445,4 +1445,216 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeTrue(), "failed to contact server")
})
})
Describe("Stale runner health check", func() {
var ctx context.Context
var mgr ctrl.Manager
var autoscalingNS *corev1.Namespace
var configSecret *corev1.Secret
var controller *EphemeralRunnerReconciler
Context("when GitHub registration is invalidated (404)", func() {
var ephemeralRunner *v1alpha1.EphemeralRunner
BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)
// GetRunner returns 404 — simulates GitHub forgetting the registration
// RemoveRunner also returns 404 — the runner is already gone
fakeClient := fake.NewFakeClient(
fake.WithGetRunner(nil, &actions.ActionsError{
StatusCode: 404,
Err: &actions.ActionsExceptionError{
ExceptionName: "AgentNotFoundException",
Message: "runner not found",
},
}),
fake.WithRemoveRunnerError(&actions.ActionsError{
StatusCode: 404,
Err: &actions.ActionsExceptionError{
ExceptionName: "AgentNotFoundException",
Message: "runner not found",
},
}),
)
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
},
},
}
err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")
ephemeralRunner = newExampleRunner("stale-runner", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")
startManagers(GinkgoT(), mgr)
})
It("should mark the runner as failed when GetRunner returns 404", func() {
// Wait for the controller to set up the runner (finalizers, secret, pod, status)
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return updated.Status.RunnerId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).ShouldNot(Equal(0), "runner should have a RunnerId set")
// Wait for the pod to exist, then simulate a running container
pod := new(corev1.Pod)
Eventually(
func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "pod should exist")
// Set pod to running with a running container status
pod.Status.Phase = corev1.PodRunning
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Now(),
},
},
},
}
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status to running")
// Now the health check should detect the stale registration and mark it failed
Eventually(
func() (corev1.PodPhase, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Phase, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(Equal(corev1.PodFailed), "runner with invalidated registration should be marked failed")
})
})
Context("when GetRunner returns a transient error (500)", func() {
var ephemeralRunner *v1alpha1.EphemeralRunner
BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)
// GetRunner returns 500 — transient error, should NOT kill the runner
fakeClient := fake.NewFakeClient(
fake.WithGetRunner(nil, &actions.ActionsError{
StatusCode: 500,
Err: fmt.Errorf("internal server error"),
}),
)
controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
},
},
}
err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")
ephemeralRunner = newExampleRunner("transient-error-runner", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")
startManagers(GinkgoT(), mgr)
})
It("should NOT mark the runner as failed on transient errors", func() {
// Wait for runner to get a RunnerId
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return updated.Status.RunnerId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).ShouldNot(Equal(0), "runner should have a RunnerId set")
// Wait for pod and set it to running (same as the 404 test)
pod := new(corev1.Pod)
Eventually(
func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "pod should exist")
pod.Status.Phase = corev1.PodRunning
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Now(),
},
},
},
}
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status to running")
// Give the controller time to process — it should NOT mark as failed
Consistently(
func() (corev1.PodPhase, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Phase, nil
},
time.Second*3,
ephemeralRunnerInterval,
).ShouldNot(Equal(corev1.PodFailed), "runner should NOT be marked failed on transient errors")
})
})
})
})

View File

@ -45,6 +45,12 @@ func WithUpdateRunnerScaleSet(scaleSet *actions.RunnerScaleSet, err error) Optio
}
}
func WithRemoveRunnerError(err error) Option {
return func(f *FakeClient) {
f.removeRunnerResult.err = err
}
}
var defaultRunnerScaleSet = &actions.RunnerScaleSet{
Id: 1,
Name: "testset",