Create backoff mechanism for failed runners and allow re-creation of failed ephemeral runners (#4059)
This commit is contained in:
		
							parent
							
								
									d6e2790db5
								
							
						
					
					
						commit
						cae7efa2c6
					
				|  | @ -119,7 +119,7 @@ type EphemeralRunnerStatus struct { | |||
| 	RunnerJITConfig string `json:"runnerJITConfig,omitempty"` | ||||
| 
 | ||||
| 	// +optional
 | ||||
| 	Failures map[string]bool `json:"failures,omitempty"` | ||||
| 	Failures map[string]metav1.Time `json:"failures,omitempty"` | ||||
| 
 | ||||
| 	// +optional
 | ||||
| 	JobRequestId int64 `json:"jobRequestId,omitempty"` | ||||
|  | @ -137,6 +137,20 @@ type EphemeralRunnerStatus struct { | |||
| 	JobDisplayName string `json:"jobDisplayName,omitempty"` | ||||
| } | ||||
| 
 | ||||
| func (s *EphemeralRunnerStatus) LastFailure() metav1.Time { | ||||
| 	var maxTime metav1.Time | ||||
| 	if len(s.Failures) == 0 { | ||||
| 		return maxTime | ||||
| 	} | ||||
| 
 | ||||
| 	for _, ts := range s.Failures { | ||||
| 		if ts.After(maxTime.Time) { | ||||
| 			maxTime = ts | ||||
| 		} | ||||
| 	} | ||||
| 	return maxTime | ||||
| } | ||||
| 
 | ||||
| // +kubebuilder:object:root=true
 | ||||
| 
 | ||||
| // EphemeralRunnerList contains a list of EphemeralRunner
 | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ package v1alpha1 | |||
| 
 | ||||
| import ( | ||||
| 	"k8s.io/api/core/v1" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| 	runtime "k8s.io/apimachinery/pkg/runtime" | ||||
| ) | ||||
| 
 | ||||
|  | @ -459,9 +460,9 @@ func (in *EphemeralRunnerStatus) DeepCopyInto(out *EphemeralRunnerStatus) { | |||
| 	*out = *in | ||||
| 	if in.Failures != nil { | ||||
| 		in, out := &in.Failures, &out.Failures | ||||
| 		*out = make(map[string]bool, len(*in)) | ||||
| 		*out = make(map[string]metav1.Time, len(*in)) | ||||
| 		for key, val := range *in { | ||||
| 			(*out)[key] = val | ||||
| 			(*out)[key] = *val.DeepCopy() | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
|  |  | |||
|  | @ -7794,7 +7794,8 @@ spec: | |||
|               properties: | ||||
|                 failures: | ||||
|                   additionalProperties: | ||||
|                     type: boolean | ||||
|                     format: date-time | ||||
|                     type: string | ||||
|                   type: object | ||||
|                 jobDisplayName: | ||||
|                   type: string | ||||
|  |  | |||
|  | @ -7794,7 +7794,8 @@ spec: | |||
|               properties: | ||||
|                 failures: | ||||
|                   additionalProperties: | ||||
|                     type: boolean | ||||
|                     format: date-time | ||||
|                     type: string | ||||
|                   type: object | ||||
|                 jobDisplayName: | ||||
|                   type: string | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ import ( | |||
| 	"github.com/go-logr/logr" | ||||
| 	corev1 "k8s.io/api/core/v1" | ||||
| 	kerrors "k8s.io/apimachinery/pkg/api/errors" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| 	"k8s.io/apimachinery/pkg/runtime" | ||||
| 	"k8s.io/apimachinery/pkg/types" | ||||
| 	ctrl "sigs.k8s.io/controller-runtime" | ||||
|  | @ -50,6 +51,19 @@ type EphemeralRunnerReconciler struct { | |||
| 	ResourceBuilder | ||||
| } | ||||
| 
 | ||||
| // precompute backoff durations for failed ephemeral runners
 | ||||
| // the len(failedRunnerBackoff) must be equal to maxFailures + 1
 | ||||
| var failedRunnerBackoff = []time.Duration{ | ||||
| 	0, | ||||
| 	5 * time.Second, | ||||
| 	10 * time.Second, | ||||
| 	20 * time.Second, | ||||
| 	40 * time.Second, | ||||
| 	80 * time.Second, | ||||
| } | ||||
| 
 | ||||
| const maxFailures = 5 | ||||
| 
 | ||||
| // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete
 | ||||
| // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch
 | ||||
| // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete
 | ||||
|  | @ -173,6 +187,29 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if len(ephemeralRunner.Status.Failures) > maxFailures { | ||||
| 		log.Info(fmt.Sprintf("EphemeralRunner has failed more than %d times. Deleting ephemeral runner so it can be re-created", maxFailures)) | ||||
| 		if err := r.Delete(ctx, ephemeralRunner); err != nil { | ||||
| 			log.Error(fmt.Errorf("failed to delete ephemeral runner after %d failures: %w", maxFailures, err), "Failed to delete ephemeral runner") | ||||
| 			return ctrl.Result{}, err | ||||
| 		} | ||||
| 
 | ||||
| 		return ctrl.Result{}, nil | ||||
| 	} | ||||
| 
 | ||||
| 	now := metav1.Now() | ||||
| 	lastFailure := ephemeralRunner.Status.LastFailure() | ||||
| 	backoffDuration := failedRunnerBackoff[len(ephemeralRunner.Status.Failures)] | ||||
| 	nextReconciliation := lastFailure.Add(backoffDuration) | ||||
| 	if !lastFailure.IsZero() && now.Before(&metav1.Time{Time: nextReconciliation}) { | ||||
| 		log.Info("Backing off the next reconciliation due to failure", | ||||
| 			"lastFailure", lastFailure, | ||||
| 			"nextReconciliation", nextReconciliation, | ||||
| 			"requeueAfter", nextReconciliation.Sub(now.Time), | ||||
| 		) | ||||
| 		return ctrl.Result{RequeueAfter: now.Sub(nextReconciliation)}, nil | ||||
| 	} | ||||
| 
 | ||||
| 	secret := new(corev1.Secret) | ||||
| 	if err := r.Get(ctx, req.NamespacedName, secret); err != nil { | ||||
| 		if !kerrors.IsNotFound(err) { | ||||
|  | @ -196,39 +233,28 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ | |||
| 
 | ||||
| 	pod := new(corev1.Pod) | ||||
| 	if err := r.Get(ctx, req.NamespacedName, pod); err != nil { | ||||
| 		switch { | ||||
| 		case !kerrors.IsNotFound(err): | ||||
| 		if !kerrors.IsNotFound(err) { | ||||
| 			log.Error(err, "Failed to fetch the pod") | ||||
| 			return ctrl.Result{}, err | ||||
| 		} | ||||
| 
 | ||||
| 		case len(ephemeralRunner.Status.Failures) > 5: | ||||
| 			log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed") | ||||
| 			errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message) | ||||
| 			if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil { | ||||
| 		// Pod was not found. Create if the pod has never been created
 | ||||
| 		log.Info("Creating new EphemeralRunner pod.") | ||||
| 		result, err := r.createPod(ctx, ephemeralRunner, secret, log) | ||||
| 		switch { | ||||
| 		case err == nil: | ||||
| 			return result, nil | ||||
| 		case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): | ||||
| 			log.Error(err, "Failed to create a pod due to unrecoverable failure") | ||||
| 			errMessage := fmt.Sprintf("Failed to create the pod: %v", err) | ||||
| 			if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { | ||||
| 				log.Error(err, "Failed to set ephemeral runner to phase Failed") | ||||
| 				return ctrl.Result{}, err | ||||
| 			} | ||||
| 			return ctrl.Result{}, nil | ||||
| 
 | ||||
| 		default: | ||||
| 			// Pod was not found. Create if the pod has never been created
 | ||||
| 			log.Info("Creating new EphemeralRunner pod.") | ||||
| 			result, err := r.createPod(ctx, ephemeralRunner, secret, log) | ||||
| 			switch { | ||||
| 			case err == nil: | ||||
| 				return result, nil | ||||
| 			case kerrors.IsInvalid(err) || kerrors.IsForbidden(err): | ||||
| 				log.Error(err, "Failed to create a pod due to unrecoverable failure") | ||||
| 				errMessage := fmt.Sprintf("Failed to create the pod: %v", err) | ||||
| 				if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil { | ||||
| 					log.Error(err, "Failed to set ephemeral runner to phase Failed") | ||||
| 					return ctrl.Result{}, err | ||||
| 				} | ||||
| 				return ctrl.Result{}, nil | ||||
| 			default: | ||||
| 				log.Error(err, "Failed to create the pod") | ||||
| 				return ctrl.Result{}, err | ||||
| 			} | ||||
| 			log.Error(err, "Failed to create the pod") | ||||
| 			return ctrl.Result{}, err | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
|  | @ -484,9 +510,9 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem | |||
| 	log.Info("Updating ephemeral runner status to track the failure count") | ||||
| 	if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { | ||||
| 		if obj.Status.Failures == nil { | ||||
| 			obj.Status.Failures = make(map[string]bool) | ||||
| 			obj.Status.Failures = make(map[string]metav1.Time) | ||||
| 		} | ||||
| 		obj.Status.Failures[string(pod.UID)] = true | ||||
| 		obj.Status.Failures[string(pod.UID)] = metav1.Now() | ||||
| 		obj.Status.Ready = false | ||||
| 		obj.Status.Reason = pod.Status.Reason | ||||
| 		obj.Status.Message = pod.Status.Message | ||||
|  |  | |||
|  | @ -30,7 +30,7 @@ import ( | |||
| 
 | ||||
| const ( | ||||
| 	ephemeralRunnerTimeout  = time.Second * 20 | ||||
| 	ephemeralRunnerInterval = time.Millisecond * 250 | ||||
| 	ephemeralRunnerInterval = time.Millisecond * 10 | ||||
| 	runnerImage             = "ghcr.io/actions/actions-runner:latest" | ||||
| ) | ||||
| 
 | ||||
|  | @ -528,44 +528,26 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 			).Should(BeEquivalentTo("")) | ||||
| 		}) | ||||
| 
 | ||||
| 		It("It should not re-create pod indefinitely", func() { | ||||
| 		It("It should eventually delete ephemeral runner after consecutive failures", func() { | ||||
| 			updated := new(v1alpha1.EphemeralRunner) | ||||
| 			pod := new(corev1.Pod) | ||||
| 			Eventually( | ||||
| 				func() (bool, error) { | ||||
| 					err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) | ||||
| 					if err != nil { | ||||
| 						return false, err | ||||
| 					} | ||||
| 
 | ||||
| 					err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) | ||||
| 					if err != nil { | ||||
| 						if kerrors.IsNotFound(err) && len(updated.Status.Failures) > 5 { | ||||
| 							return true, nil | ||||
| 						} | ||||
| 
 | ||||
| 						return false, err | ||||
| 					} | ||||
| 
 | ||||
| 					pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ | ||||
| 						Name: v1alpha1.EphemeralRunnerContainerName, | ||||
| 						State: corev1.ContainerState{ | ||||
| 							Terminated: &corev1.ContainerStateTerminated{ | ||||
| 								ExitCode: 1, | ||||
| 							}, | ||||
| 						}, | ||||
| 					}) | ||||
| 					err = k8sClient.Status().Update(ctx, pod) | ||||
| 					Expect(err).To(BeNil(), "Failed to update pod status") | ||||
| 					return false, fmt.Errorf("pod haven't failed for 5 times.") | ||||
| 				func() error { | ||||
| 					return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(BeEquivalentTo(true), "we should stop creating pod after 5 failures") | ||||
| 			).Should(Succeed(), "failed to get ephemeral runner") | ||||
| 
 | ||||
| 			failEphemeralRunnerPod := func() *corev1.Pod { | ||||
| 				pod := new(corev1.Pod) | ||||
| 				Eventually( | ||||
| 					func() error { | ||||
| 						return k8sClient.Get(ctx, client.ObjectKey{Name: updated.Name, Namespace: updated.Namespace}, pod) | ||||
| 					}, | ||||
| 					ephemeralRunnerTimeout, | ||||
| 					ephemeralRunnerInterval, | ||||
| 				).Should(Succeed(), "failed to get ephemeral runner pod") | ||||
| 
 | ||||
| 			// In case we still have pod created due to controller-runtime cache delay, mark the container as exited
 | ||||
| 			err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) | ||||
| 			if err == nil { | ||||
| 				pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ | ||||
| 					Name: v1alpha1.EphemeralRunnerContainerName, | ||||
| 					State: corev1.ContainerState{ | ||||
|  | @ -576,25 +558,70 @@ var _ = Describe("EphemeralRunner", func() { | |||
| 				}) | ||||
| 				err := k8sClient.Status().Update(ctx, pod) | ||||
| 				Expect(err).To(BeNil(), "Failed to update pod status") | ||||
| 
 | ||||
| 				return pod | ||||
| 			} | ||||
| 
 | ||||
| 			// EphemeralRunner should failed with reason TooManyPodFailures
 | ||||
| 			Eventually(func() (string, error) { | ||||
| 				err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) | ||||
| 				if err != nil { | ||||
| 					return "", err | ||||
| 				} | ||||
| 				return updated.Status.Reason, nil | ||||
| 			}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("TooManyPodFailures"), "Reason should be TooManyPodFailures") | ||||
| 			for i := range 5 { | ||||
| 				pod := failEphemeralRunnerPod() | ||||
| 
 | ||||
| 			// EphemeralRunner should not have any pod
 | ||||
| 			Eventually(func() (bool, error) { | ||||
| 				err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) | ||||
| 				if err == nil { | ||||
| 					return false, nil | ||||
| 				} | ||||
| 				return kerrors.IsNotFound(err), nil | ||||
| 			}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) | ||||
| 				Eventually( | ||||
| 					func() (int, error) { | ||||
| 						err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) | ||||
| 						if err != nil { | ||||
| 							return 0, err | ||||
| 						} | ||||
| 						return len(updated.Status.Failures), nil | ||||
| 					}, | ||||
| 					ephemeralRunnerTimeout, | ||||
| 					ephemeralRunnerInterval, | ||||
| 				).Should(BeEquivalentTo(i + 1)) | ||||
| 
 | ||||
| 				Eventually( | ||||
| 					func() error { | ||||
| 						nextPod := new(corev1.Pod) | ||||
| 						err := k8sClient.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: pod.Namespace}, nextPod) | ||||
| 						if err != nil { | ||||
| 							return err | ||||
| 						} | ||||
| 						if nextPod.UID != pod.UID { | ||||
| 							return nil | ||||
| 						} | ||||
| 						return fmt.Errorf("pod not recreated") | ||||
| 					}, | ||||
| 				).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(Succeed(), "pod should be recreated") | ||||
| 
 | ||||
| 				Eventually( | ||||
| 					func() (bool, error) { | ||||
| 						pod := new(corev1.Pod) | ||||
| 						err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) | ||||
| 						if err != nil { | ||||
| 							return false, err | ||||
| 						} | ||||
| 						for _, cs := range pod.Status.ContainerStatuses { | ||||
| 							if cs.Name == v1alpha1.EphemeralRunnerContainerName { | ||||
| 								return cs.State.Terminated == nil, nil | ||||
| 							} | ||||
| 						} | ||||
| 
 | ||||
| 						return true, nil | ||||
| 					}, | ||||
| 				).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(BeEquivalentTo(true), "pod should be terminated") | ||||
| 			} | ||||
| 
 | ||||
| 			failEphemeralRunnerPod() | ||||
| 
 | ||||
| 			Eventually( | ||||
| 				func() (bool, error) { | ||||
| 					err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) | ||||
| 					if kerrors.IsNotFound(err) { | ||||
| 						return true, nil | ||||
| 					} | ||||
| 					return false, err | ||||
| 				}, | ||||
| 				ephemeralRunnerTimeout, | ||||
| 				ephemeralRunnerInterval, | ||||
| 			).Should(BeTrue(), "Ephemeral runner should eventually be deleted") | ||||
| 		}) | ||||
| 
 | ||||
| 		It("It should re-create pod on eviction", func() { | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ import ( | |||
| 	"os" | ||||
| 	"path/filepath" | ||||
| 	"strings" | ||||
| 	"testing" | ||||
| 	"time" | ||||
| 
 | ||||
| 	corev1 "k8s.io/api/core/v1" | ||||
|  | @ -21,6 +22,7 @@ import ( | |||
| 	"github.com/go-logr/logr" | ||||
| 	. "github.com/onsi/ginkgo/v2" | ||||
| 	. "github.com/onsi/gomega" | ||||
| 	"github.com/stretchr/testify/require" | ||||
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||
| 
 | ||||
| 	"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" | ||||
|  | @ -35,6 +37,10 @@ const ( | |||
| 	ephemeralRunnerSetTestGitHubToken = "gh_token" | ||||
| ) | ||||
| 
 | ||||
| func TestPrecomputedConstants(t *testing.T) { | ||||
| 	require.Equal(t, len(failedRunnerBackoff), maxFailures+1) | ||||
| } | ||||
| 
 | ||||
| var _ = Describe("Test EphemeralRunnerSet controller", func() { | ||||
| 	var ctx context.Context | ||||
| 	var mgr ctrl.Manager | ||||
|  |  | |||
|  | @ -20,6 +20,7 @@ import ( | |||
| 	"os" | ||||
| 	"path/filepath" | ||||
| 	"testing" | ||||
| 	"time" | ||||
| 
 | ||||
| 	"github.com/onsi/ginkgo/config" | ||||
| 
 | ||||
|  | @ -79,6 +80,15 @@ var _ = BeforeSuite(func() { | |||
| 	k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) | ||||
| 	Expect(err).ToNot(HaveOccurred()) | ||||
| 	Expect(k8sClient).ToNot(BeNil()) | ||||
| 
 | ||||
| 	failedRunnerBackoff = []time.Duration{ | ||||
| 		20 * time.Millisecond, | ||||
| 		20 * time.Millisecond, | ||||
| 		20 * time.Millisecond, | ||||
| 		20 * time.Millisecond, | ||||
| 		20 * time.Millisecond, | ||||
| 		20 * time.Millisecond, | ||||
| 	} | ||||
| }) | ||||
| 
 | ||||
| var _ = AfterSuite(func() { | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue