Merge ff7b81dc8f into 2fc51aaf32
This commit is contained in:
commit
317706ab18
|
|
@ -357,6 +357,19 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
// Check if runner's GitHub registration is still valid
|
||||
healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log)
|
||||
if err != nil {
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
if !healthy {
|
||||
if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", "RegistrationInvalidated", log); err != nil {
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
|
||||
case cs.State.Terminated.ExitCode != 0: // failed
|
||||
|
|
@ -805,6 +818,43 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
|
|||
return nil
|
||||
}
|
||||
|
||||
// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid.
|
||||
// Returns true if the runner is healthy or we can't determine status (API errors).
|
||||
// Returns false if the registration is confirmed gone (404).
|
||||
func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) {
|
||||
// Only check runners that have been registered (have a RunnerId)
|
||||
if ephemeralRunner.Status.RunnerId == 0 {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
|
||||
if err != nil {
|
||||
log.Error(err, "Failed to get actions client for health check, skipping")
|
||||
return true, nil
|
||||
}
|
||||
|
||||
_, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
|
||||
if err == nil {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
var actionsErr *actions.ActionsError
|
||||
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
|
||||
log.Info("Runner registration not found on GitHub, marking as failed",
|
||||
"runnerId", ephemeralRunner.Status.RunnerId,
|
||||
"runnerName", ephemeralRunner.Status.RunnerName,
|
||||
)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// For non-404 errors (transient API issues), don't take action
|
||||
log.Info("Health check API call failed, will retry on next reconciliation",
|
||||
"runnerId", ephemeralRunner.Status.RunnerId,
|
||||
"error", err.Error(),
|
||||
)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
|
||||
client, err := r.GetActionsService(ctx, ephemeralRunner)
|
||||
if err != nil {
|
||||
|
|
@ -814,6 +864,11 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context,
|
|||
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId)
|
||||
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
|
||||
if err != nil {
|
||||
var actionsErr *actions.ActionsError
|
||||
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
|
||||
log.Info("Runner already removed from service", "runnerId", ephemeralRunner.Status.RunnerId)
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("failed to remove runner from the service: %w", err)
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1445,4 +1445,216 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
).Should(BeTrue(), "failed to contact server")
|
||||
})
|
||||
})
|
||||
|
||||
Describe("Stale runner health check", func() {
|
||||
var ctx context.Context
|
||||
var mgr ctrl.Manager
|
||||
var autoscalingNS *corev1.Namespace
|
||||
var configSecret *corev1.Secret
|
||||
var controller *EphemeralRunnerReconciler
|
||||
|
||||
Context("when GitHub registration is invalidated (404)", func() {
|
||||
var ephemeralRunner *v1alpha1.EphemeralRunner
|
||||
|
||||
BeforeEach(func() {
|
||||
ctx = context.Background()
|
||||
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
|
||||
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)
|
||||
|
||||
// GetRunner returns 404 — simulates GitHub forgetting the registration
|
||||
// RemoveRunner also returns 404 — the runner is already gone
|
||||
fakeClient := fake.NewFakeClient(
|
||||
fake.WithGetRunner(nil, &actions.ActionsError{
|
||||
StatusCode: 404,
|
||||
Err: &actions.ActionsExceptionError{
|
||||
ExceptionName: "AgentNotFoundException",
|
||||
Message: "runner not found",
|
||||
},
|
||||
}),
|
||||
fake.WithRemoveRunnerError(&actions.ActionsError{
|
||||
StatusCode: 404,
|
||||
Err: &actions.ActionsExceptionError{
|
||||
ExceptionName: "AgentNotFoundException",
|
||||
Message: "runner not found",
|
||||
},
|
||||
}),
|
||||
)
|
||||
|
||||
controller = &EphemeralRunnerReconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
Log: logf.Log,
|
||||
ResourceBuilder: ResourceBuilder{
|
||||
SecretResolver: &SecretResolver{
|
||||
k8sClient: mgr.GetClient(),
|
||||
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
err := controller.SetupWithManager(mgr)
|
||||
Expect(err).To(BeNil(), "failed to setup controller")
|
||||
|
||||
ephemeralRunner = newExampleRunner("stale-runner", autoscalingNS.Name, configSecret.Name)
|
||||
err = k8sClient.Create(ctx, ephemeralRunner)
|
||||
Expect(err).To(BeNil(), "failed to create ephemeral runner")
|
||||
|
||||
startManagers(GinkgoT(), mgr)
|
||||
})
|
||||
|
||||
It("should mark the runner as failed when GetRunner returns 404", func() {
|
||||
// Wait for the controller to set up the runner (finalizers, secret, pod, status)
|
||||
Eventually(
|
||||
func() (int, error) {
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return updated.Status.RunnerId, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).ShouldNot(Equal(0), "runner should have a RunnerId set")
|
||||
|
||||
// Wait for the pod to exist, then simulate a running container
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(
|
||||
func() (bool, error) {
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeEquivalentTo(true), "pod should exist")
|
||||
|
||||
// Set pod to running with a running container status
|
||||
pod.Status.Phase = corev1.PodRunning
|
||||
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
|
||||
{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Running: &corev1.ContainerStateRunning{
|
||||
StartedAt: metav1.Now(),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status to running")
|
||||
|
||||
// Now the health check should detect the stale registration and mark it failed
|
||||
Eventually(
|
||||
func() (corev1.PodPhase, error) {
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return updated.Status.Phase, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(Equal(corev1.PodFailed), "runner with invalidated registration should be marked failed")
|
||||
})
|
||||
})
|
||||
|
||||
Context("when GetRunner returns a transient error (500)", func() {
|
||||
var ephemeralRunner *v1alpha1.EphemeralRunner
|
||||
|
||||
BeforeEach(func() {
|
||||
ctx = context.Background()
|
||||
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
|
||||
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)
|
||||
|
||||
// GetRunner returns 500 — transient error, should NOT kill the runner
|
||||
fakeClient := fake.NewFakeClient(
|
||||
fake.WithGetRunner(nil, &actions.ActionsError{
|
||||
StatusCode: 500,
|
||||
Err: fmt.Errorf("internal server error"),
|
||||
}),
|
||||
)
|
||||
|
||||
controller = &EphemeralRunnerReconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
Log: logf.Log,
|
||||
ResourceBuilder: ResourceBuilder{
|
||||
SecretResolver: &SecretResolver{
|
||||
k8sClient: mgr.GetClient(),
|
||||
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
err := controller.SetupWithManager(mgr)
|
||||
Expect(err).To(BeNil(), "failed to setup controller")
|
||||
|
||||
ephemeralRunner = newExampleRunner("transient-error-runner", autoscalingNS.Name, configSecret.Name)
|
||||
err = k8sClient.Create(ctx, ephemeralRunner)
|
||||
Expect(err).To(BeNil(), "failed to create ephemeral runner")
|
||||
|
||||
startManagers(GinkgoT(), mgr)
|
||||
})
|
||||
|
||||
It("should NOT mark the runner as failed on transient errors", func() {
|
||||
// Wait for runner to get a RunnerId
|
||||
Eventually(
|
||||
func() (int, error) {
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return updated.Status.RunnerId, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).ShouldNot(Equal(0), "runner should have a RunnerId set")
|
||||
|
||||
// Wait for pod and set it to running (same as the 404 test)
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(
|
||||
func() (bool, error) {
|
||||
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
|
||||
return false, err
|
||||
}
|
||||
return true, nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeEquivalentTo(true), "pod should exist")
|
||||
|
||||
pod.Status.Phase = corev1.PodRunning
|
||||
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
|
||||
{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Running: &corev1.ContainerStateRunning{
|
||||
StartedAt: metav1.Now(),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "failed to update pod status to running")
|
||||
|
||||
// Give the controller time to process — it should NOT mark as failed
|
||||
Consistently(
|
||||
func() (corev1.PodPhase, error) {
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return updated.Status.Phase, nil
|
||||
},
|
||||
time.Second*3,
|
||||
ephemeralRunnerInterval,
|
||||
).ShouldNot(Equal(corev1.PodFailed), "runner should NOT be marked failed on transient errors")
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||
|
|
|
|||
|
|
@ -45,6 +45,12 @@ func WithUpdateRunnerScaleSet(scaleSet *actions.RunnerScaleSet, err error) Optio
|
|||
}
|
||||
}
|
||||
|
||||
func WithRemoveRunnerError(err error) Option {
|
||||
return func(f *FakeClient) {
|
||||
f.removeRunnerResult.err = err
|
||||
}
|
||||
}
|
||||
|
||||
var defaultRunnerScaleSet = &actions.RunnerScaleSet{
|
||||
Id: 1,
|
||||
Name: "testset",
|
||||
|
|
|
|||
Loading…
Reference in New Issue