diff --git a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go index e34b255e..0c13bf7c 100644 --- a/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go +++ b/apis/actions.github.com/v1alpha1/ephemeralrunner_types.go @@ -119,7 +119,7 @@ type EphemeralRunnerStatus struct { RunnerJITConfig string `json:"runnerJITConfig,omitempty"` // +optional - Failures map[string]bool `json:"failures,omitempty"` + Failures map[string]metav1.Time `json:"failures,omitempty"` // +optional JobRequestId int64 `json:"jobRequestId,omitempty"` @@ -137,6 +137,20 @@ type EphemeralRunnerStatus struct { JobDisplayName string `json:"jobDisplayName,omitempty"` } +func (s *EphemeralRunnerStatus) LastFailure() metav1.Time { + var maxTime metav1.Time + if len(s.Failures) == 0 { + return maxTime + } + + for _, ts := range s.Failures { + if ts.After(maxTime.Time) { + maxTime = ts + } + } + return maxTime +} + // +kubebuilder:object:root=true // EphemeralRunnerList contains a list of EphemeralRunner diff --git a/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go b/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go index dd7553f0..b0947659 100644 --- a/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go +++ b/apis/actions.github.com/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ package v1alpha1 import ( "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -459,9 +460,9 @@ func (in *EphemeralRunnerStatus) DeepCopyInto(out *EphemeralRunnerStatus) { *out = *in if in.Failures != nil { in, out := &in.Failures, &out.Failures - *out = make(map[string]bool, len(*in)) + *out = make(map[string]metav1.Time, len(*in)) for key, val := range *in { - (*out)[key] = val + (*out)[key] = *val.DeepCopy() } } } diff --git a/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml b/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml index e1505280..f7cf1139 100644 --- a/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml +++ b/charts/gha-runner-scale-set-controller/crds/actions.github.com_ephemeralrunners.yaml @@ -7794,7 +7794,8 @@ spec: properties: failures: additionalProperties: - type: boolean + format: date-time + type: string type: object jobDisplayName: type: string diff --git a/config/crd/bases/actions.github.com_ephemeralrunners.yaml b/config/crd/bases/actions.github.com_ephemeralrunners.yaml index e1505280..f7cf1139 100644 --- a/config/crd/bases/actions.github.com_ephemeralrunners.yaml +++ b/config/crd/bases/actions.github.com_ephemeralrunners.yaml @@ -7794,7 +7794,8 @@ spec: properties: failures: additionalProperties: - type: boolean + format: date-time + type: string type: object jobDisplayName: type: string diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 750ea1b6..5ac80df7 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -28,6 +28,7 @@ import ( "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" @@ -50,6 +51,19 @@ type EphemeralRunnerReconciler struct { ResourceBuilder } +// precompute backoff durations for failed ephemeral runners +// the len(failedRunnerBackoff) must be equal to maxFailures + 1 +var failedRunnerBackoff = []time.Duration{ + 0, + 5 * time.Second, + 10 * time.Second, + 20 * time.Second, + 40 * time.Second, + 80 * time.Second, +} + +const maxFailures = 5 + // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete @@ -173,6 +187,29 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } + if len(ephemeralRunner.Status.Failures) > maxFailures { + log.Info(fmt.Sprintf("EphemeralRunner has failed more than %d times. Deleting ephemeral runner so it can be re-created", maxFailures)) + if err := r.Delete(ctx, ephemeralRunner); err != nil { + log.Error(fmt.Errorf("failed to delete ephemeral runner after %d failures: %w", maxFailures, err), "Failed to delete ephemeral runner") + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil + } + + now := metav1.Now() + lastFailure := ephemeralRunner.Status.LastFailure() + backoffDuration := failedRunnerBackoff[len(ephemeralRunner.Status.Failures)] + nextReconciliation := lastFailure.Add(backoffDuration) + if !lastFailure.IsZero() && now.Before(&metav1.Time{Time: nextReconciliation}) { + log.Info("Backing off the next reconciliation due to failure", + "lastFailure", lastFailure, + "nextReconciliation", nextReconciliation, + "requeueAfter", nextReconciliation.Sub(now.Time), + ) + return ctrl.Result{RequeueAfter: now.Sub(nextReconciliation)}, nil + } + secret := new(corev1.Secret) if err := r.Get(ctx, req.NamespacedName, secret); err != nil { if !kerrors.IsNotFound(err) { @@ -196,39 +233,28 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ pod := new(corev1.Pod) if err := r.Get(ctx, req.NamespacedName, pod); err != nil { - switch { - case !kerrors.IsNotFound(err): + if !kerrors.IsNotFound(err) { log.Error(err, "Failed to fetch the pod") return ctrl.Result{}, err + } - case len(ephemeralRunner.Status.Failures) > 5: - log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed") - errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message) - if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil { + // Pod was not found. Create if the pod has never been created + log.Info("Creating new EphemeralRunner pod.") + result, err := r.createPod(ctx, ephemeralRunner, secret, log) + switch { + case err == nil: + return result, nil + case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): + log.Error(err, "Failed to create a pod due to unrecoverable failure") + errMessage := fmt.Sprintf("Failed to create the pod: %v", err) + if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { log.Error(err, "Failed to set ephemeral runner to phase Failed") return ctrl.Result{}, err } return ctrl.Result{}, nil - default: - // Pod was not found. Create if the pod has never been created - log.Info("Creating new EphemeralRunner pod.") - result, err := r.createPod(ctx, ephemeralRunner, secret, log) - switch { - case err == nil: - return result, nil - case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): - log.Error(err, "Failed to create a pod due to unrecoverable failure") - errMessage := fmt.Sprintf("Failed to create the pod: %v", err) - if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { - log.Error(err, "Failed to set ephemeral runner to phase Failed") - return ctrl.Result{}, err - } - return ctrl.Result{}, nil - default: - log.Error(err, "Failed to create the pod") - return ctrl.Result{}, err - } + log.Error(err, "Failed to create the pod") + return ctrl.Result{}, err } } @@ -484,9 +510,9 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem log.Info("Updating ephemeral runner status to track the failure count") if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { if obj.Status.Failures == nil { - obj.Status.Failures = make(map[string]bool) + obj.Status.Failures = make(map[string]metav1.Time) } - obj.Status.Failures[string(pod.UID)] = true + obj.Status.Failures[string(pod.UID)] = metav1.Now() obj.Status.Ready = false obj.Status.Reason = pod.Status.Reason obj.Status.Message = pod.Status.Message diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 1305bfca..0b842d5c 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -30,7 +30,7 @@ import ( const ( ephemeralRunnerTimeout = time.Second * 20 - ephemeralRunnerInterval = time.Millisecond * 250 + ephemeralRunnerInterval = time.Millisecond * 10 runnerImage = "ghcr.io/actions/actions-runner:latest" ) @@ -528,44 +528,26 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeEquivalentTo("")) }) - It("It should not re-create pod indefinitely", func() { + It("It should eventually delete ephemeral runner after consecutive failures", func() { updated := new(v1alpha1.EphemeralRunner) - pod := new(corev1.Pod) Eventually( - func() (bool, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) - if err != nil { - return false, err - } - - err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) - if err != nil { - if kerrors.IsNotFound(err) && len(updated.Status.Failures) > 5 { - return true, nil - } - - return false, err - } - - pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ - Name: v1alpha1.EphemeralRunnerContainerName, - State: corev1.ContainerState{ - Terminated: &corev1.ContainerStateTerminated{ - ExitCode: 1, - }, - }, - }) - err = k8sClient.Status().Update(ctx, pod) - Expect(err).To(BeNil(), "Failed to update pod status") - return false, fmt.Errorf("pod haven't failed for 5 times.") + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) }, ephemeralRunnerTimeout, ephemeralRunnerInterval, - ).Should(BeEquivalentTo(true), "we should stop creating pod after 5 failures") + ).Should(Succeed(), "failed to get ephemeral runner") + + failEphemeralRunnerPod := func() *corev1.Pod { + pod := new(corev1.Pod) + Eventually( + func() error { + return k8sClient.Get(ctx, client.ObjectKey{Name: updated.Name, Namespace: updated.Namespace}, pod) + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Succeed(), "failed to get ephemeral runner pod") - // In case we still have pod created due to controller-runtime cache delay, mark the container as exited - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) - if err == nil { pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ Name: v1alpha1.EphemeralRunnerContainerName, State: corev1.ContainerState{ @@ -576,25 +558,70 @@ var _ = Describe("EphemeralRunner", func() { }) err := k8sClient.Status().Update(ctx, pod) Expect(err).To(BeNil(), "Failed to update pod status") + + return pod } - // EphemeralRunner should failed with reason TooManyPodFailures - Eventually(func() (string, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) - if err != nil { - return "", err - } - return updated.Status.Reason, nil - }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("TooManyPodFailures"), "Reason should be TooManyPodFailures") + for i := range 5 { + pod := failEphemeralRunnerPod() - // EphemeralRunner should not have any pod - Eventually(func() (bool, error) { - err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) - if err == nil { - return false, nil - } - return kerrors.IsNotFound(err), nil - }, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) + Eventually( + func() (int, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return len(updated.Status.Failures), nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(i + 1)) + + Eventually( + func() error { + nextPod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: pod.Namespace}, nextPod) + if err != nil { + return err + } + if nextPod.UID != pod.UID { + return nil + } + return fmt.Errorf("pod not recreated") + }, + ).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(Succeed(), "pod should be recreated") + + Eventually( + func() (bool, error) { + pod := new(corev1.Pod) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) + if err != nil { + return false, err + } + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == v1alpha1.EphemeralRunnerContainerName { + return cs.State.Terminated == nil, nil + } + } + + return true, nil + }, + ).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(BeEquivalentTo(true), "pod should be terminated") + } + + failEphemeralRunnerPod() + + Eventually( + func() (bool, error) { + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if kerrors.IsNotFound(err) { + return true, nil + } + return false, err + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeTrue(), "Ephemeral runner should eventually be deleted") }) It("It should re-create pod on eviction", func() { diff --git a/controllers/actions.github.com/ephemeralrunnerset_controller_test.go b/controllers/actions.github.com/ephemeralrunnerset_controller_test.go index 0ea2027d..665279e8 100644 --- a/controllers/actions.github.com/ephemeralrunnerset_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunnerset_controller_test.go @@ -10,6 +10,7 @@ import ( "os" "path/filepath" "strings" + "testing" "time" corev1 "k8s.io/api/core/v1" @@ -21,6 +22,7 @@ import ( "github.com/go-logr/logr" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -35,6 +37,10 @@ const ( ephemeralRunnerSetTestGitHubToken = "gh_token" ) +func TestPrecomputedConstants(t *testing.T) { + require.Equal(t, len(failedRunnerBackoff), maxFailures+1) +} + var _ = Describe("Test EphemeralRunnerSet controller", func() { var ctx context.Context var mgr ctrl.Manager diff --git a/controllers/actions.github.com/suite_test.go b/controllers/actions.github.com/suite_test.go index 80fb4196..46b97eb7 100644 --- a/controllers/actions.github.com/suite_test.go +++ b/controllers/actions.github.com/suite_test.go @@ -20,6 +20,7 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/onsi/ginkgo/config" @@ -79,6 +80,15 @@ var _ = BeforeSuite(func() { k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) Expect(err).ToNot(HaveOccurred()) Expect(k8sClient).ToNot(BeNil()) + + failedRunnerBackoff = []time.Duration{ + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + 20 * time.Millisecond, + } }) var _ = AfterSuite(func() {