diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 4d0d978f..5e4dfbc4 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -356,6 +356,19 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ log.Info("Failed to update ephemeral runner status. Requeue to not miss this event") return ctrl.Result{}, err } + + // Check if runner's GitHub registration is still valid + healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log) + if err != nil { + return ctrl.Result{}, err + } + if !healthy { + if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", "RegistrationInvalidated", log); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + return ctrl.Result{}, nil case cs.State.Terminated.ExitCode != 0: // failed @@ -804,6 +817,43 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context, return nil } +// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid. +// Returns true if the runner is healthy or we can't determine status (API errors). +// Returns false if the registration is confirmed gone (404). +func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) { + // Only check runners that have been registered (have a RunnerId) + if ephemeralRunner.Status.RunnerId == 0 { + return true, nil + } + + actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) + if err != nil { + log.Error(err, "Failed to get actions client for health check, skipping") + return true, nil + } + + _, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId)) + if err == nil { + return true, nil + } + + var actionsErr *actions.ActionsError + if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 { + log.Info("Runner registration not found on GitHub, marking as failed", + "runnerId", ephemeralRunner.Status.RunnerId, + "runnerName", ephemeralRunner.Status.RunnerName, + ) + return false, nil + } + + // For non-404 errors (transient API issues), don't take action + log.Info("Health check API call failed, will retry on next reconciliation", + "runnerId", ephemeralRunner.Status.RunnerId, + "error", err.Error(), + ) + return true, nil +} + func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error { client, err := r.GetActionsService(ctx, ephemeralRunner) if err != nil { @@ -813,6 +863,11 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId) err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)) if err != nil { + var actionsErr *actions.ActionsError + if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 { + log.Info("Runner already removed from service", "runnerId", ephemeralRunner.Status.RunnerId) + return nil + } return fmt.Errorf("failed to remove runner from the service: %w", err) } diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 5ee0df03..af3d6579 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -1422,4 +1422,216 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeTrue(), "failed to contact server") }) }) + + Describe("Stale runner health check", func() { + var ctx context.Context + var mgr ctrl.Manager + var autoscalingNS *corev1.Namespace + var configSecret *corev1.Secret + var controller *EphemeralRunnerReconciler + + Context("when GitHub registration is invalidated (404)", func() { + var ephemeralRunner *v1alpha1.EphemeralRunner + + BeforeEach(func() { + ctx = context.Background() + autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient) + configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name) + + // GetRunner returns 404 — simulates GitHub forgetting the registration + // RemoveRunner also returns 404 — the runner is already gone + fakeClient := fake.NewFakeClient( + fake.WithGetRunner(nil, &actions.ActionsError{ + StatusCode: 404, + Err: &actions.ActionsExceptionError{ + ExceptionName: "AgentNotFoundException", + Message: "runner not found", + }, + }), + fake.WithRemoveRunnerError(&actions.ActionsError{ + StatusCode: 404, + Err: &actions.ActionsExceptionError{ + ExceptionName: "AgentNotFoundException", + Message: "runner not found", + }, + }), + ) + + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + ResourceBuilder: ResourceBuilder{ + SecretResolver: &SecretResolver{ + k8sClient: mgr.GetClient(), + multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)), + }, + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("stale-runner", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("should mark the runner as failed when GetRunner returns 404", func() { + // Wait for the controller to set up the runner (finalizers, secret, pod, status) + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return updated.Status.RunnerId, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).ShouldNot(Equal(0), "runner should have a RunnerId set") + + // Wait for the pod to exist, then simulate a running container + pod := new(corev1.Pod) + Eventually( + func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(true), "pod should exist") + + // Set pod to running with a running container status + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + }, + } + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status to running") + + // Now the health check should detect the stale registration and mark it failed + Eventually( + func() (corev1.PodPhase, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return "", err + } + return updated.Status.Phase, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Equal(corev1.PodFailed), "runner with invalidated registration should be marked failed") + }) + }) + + Context("when GetRunner returns a transient error (500)", func() { + var ephemeralRunner *v1alpha1.EphemeralRunner + + BeforeEach(func() { + ctx = context.Background() + autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient) + configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name) + + // GetRunner returns 500 — transient error, should NOT kill the runner + fakeClient := fake.NewFakeClient( + fake.WithGetRunner(nil, &actions.ActionsError{ + StatusCode: 500, + Err: fmt.Errorf("internal server error"), + }), + ) + + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + ResourceBuilder: ResourceBuilder{ + SecretResolver: &SecretResolver{ + k8sClient: mgr.GetClient(), + multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)), + }, + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("transient-error-runner", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("should NOT mark the runner as failed on transient errors", func() { + // Wait for runner to get a RunnerId + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return updated.Status.RunnerId, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).ShouldNot(Equal(0), "runner should have a RunnerId set") + + // Wait for pod and set it to running (same as the 404 test) + pod := new(corev1.Pod) + Eventually( + func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(true), "pod should exist") + + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + }, + } + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status to running") + + // Give the controller time to process — it should NOT mark as failed + Consistently( + func() (corev1.PodPhase, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return "", err + } + return updated.Status.Phase, nil + }, + time.Second*3, + ephemeralRunnerInterval, + ).ShouldNot(Equal(corev1.PodFailed), "runner should NOT be marked failed on transient errors") + }) + }) + }) }) diff --git a/github/actions/fake/client.go b/github/actions/fake/client.go index a108b902..9df026e5 100644 --- a/github/actions/fake/client.go +++ b/github/actions/fake/client.go @@ -45,6 +45,12 @@ func WithUpdateRunnerScaleSet(scaleSet *actions.RunnerScaleSet, err error) Optio } } +func WithRemoveRunnerError(err error) Option { + return func(f *FakeClient) { + f.removeRunnerResult.err = err + } +} + var defaultRunnerScaleSet = &actions.RunnerScaleSet{ Id: 1, Name: "testset",