Create backoff mechanism for failed runners and allow re-creation of failed ephemeral runners (#4059)

This commit is contained in:
Nikola Jokic 2025-05-14 15:38:50 +02:00 committed by GitHub
parent d6e2790db5
commit cae7efa2c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 167 additions and 81 deletions

View File

@ -119,7 +119,7 @@ type EphemeralRunnerStatus struct {
RunnerJITConfig string `json:"runnerJITConfig,omitempty"` RunnerJITConfig string `json:"runnerJITConfig,omitempty"`
// +optional // +optional
Failures map[string]bool `json:"failures,omitempty"` Failures map[string]metav1.Time `json:"failures,omitempty"`
// +optional // +optional
JobRequestId int64 `json:"jobRequestId,omitempty"` JobRequestId int64 `json:"jobRequestId,omitempty"`
@ -137,6 +137,20 @@ type EphemeralRunnerStatus struct {
JobDisplayName string `json:"jobDisplayName,omitempty"` JobDisplayName string `json:"jobDisplayName,omitempty"`
} }
func (s *EphemeralRunnerStatus) LastFailure() metav1.Time {
var maxTime metav1.Time
if len(s.Failures) == 0 {
return maxTime
}
for _, ts := range s.Failures {
if ts.After(maxTime.Time) {
maxTime = ts
}
}
return maxTime
}
// +kubebuilder:object:root=true // +kubebuilder:object:root=true
// EphemeralRunnerList contains a list of EphemeralRunner // EphemeralRunnerList contains a list of EphemeralRunner

View File

@ -22,6 +22,7 @@ package v1alpha1
import ( import (
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime" runtime "k8s.io/apimachinery/pkg/runtime"
) )
@ -459,9 +460,9 @@ func (in *EphemeralRunnerStatus) DeepCopyInto(out *EphemeralRunnerStatus) {
*out = *in *out = *in
if in.Failures != nil { if in.Failures != nil {
in, out := &in.Failures, &out.Failures in, out := &in.Failures, &out.Failures
*out = make(map[string]bool, len(*in)) *out = make(map[string]metav1.Time, len(*in))
for key, val := range *in { for key, val := range *in {
(*out)[key] = val (*out)[key] = *val.DeepCopy()
} }
} }
} }

View File

@ -7794,7 +7794,8 @@ spec:
properties: properties:
failures: failures:
additionalProperties: additionalProperties:
type: boolean format: date-time
type: string
type: object type: object
jobDisplayName: jobDisplayName:
type: string type: string

View File

@ -7794,7 +7794,8 @@ spec:
properties: properties:
failures: failures:
additionalProperties: additionalProperties:
type: boolean format: date-time
type: string
type: object type: object
jobDisplayName: jobDisplayName:
type: string type: string

View File

@ -28,6 +28,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors" kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime" ctrl "sigs.k8s.io/controller-runtime"
@ -50,6 +51,19 @@ type EphemeralRunnerReconciler struct {
ResourceBuilder ResourceBuilder
} }
// precompute backoff durations for failed ephemeral runners
// the len(failedRunnerBackoff) must be equal to maxFailures + 1
var failedRunnerBackoff = []time.Duration{
0,
5 * time.Second,
10 * time.Second,
20 * time.Second,
40 * time.Second,
80 * time.Second,
}
const maxFailures = 5
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete
@ -173,6 +187,29 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
} }
} }
if len(ephemeralRunner.Status.Failures) > maxFailures {
log.Info(fmt.Sprintf("EphemeralRunner has failed more than %d times. Deleting ephemeral runner so it can be re-created", maxFailures))
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(fmt.Errorf("failed to delete ephemeral runner after %d failures: %w", maxFailures, err), "Failed to delete ephemeral runner")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
now := metav1.Now()
lastFailure := ephemeralRunner.Status.LastFailure()
backoffDuration := failedRunnerBackoff[len(ephemeralRunner.Status.Failures)]
nextReconciliation := lastFailure.Add(backoffDuration)
if !lastFailure.IsZero() && now.Before(&metav1.Time{Time: nextReconciliation}) {
log.Info("Backing off the next reconciliation due to failure",
"lastFailure", lastFailure,
"nextReconciliation", nextReconciliation,
"requeueAfter", nextReconciliation.Sub(now.Time),
)
return ctrl.Result{RequeueAfter: now.Sub(nextReconciliation)}, nil
}
secret := new(corev1.Secret) secret := new(corev1.Secret)
if err := r.Get(ctx, req.NamespacedName, secret); err != nil { if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
if !kerrors.IsNotFound(err) { if !kerrors.IsNotFound(err) {
@ -196,39 +233,28 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
pod := new(corev1.Pod) pod := new(corev1.Pod)
if err := r.Get(ctx, req.NamespacedName, pod); err != nil { if err := r.Get(ctx, req.NamespacedName, pod); err != nil {
switch { if !kerrors.IsNotFound(err) {
case !kerrors.IsNotFound(err):
log.Error(err, "Failed to fetch the pod") log.Error(err, "Failed to fetch the pod")
return ctrl.Result{}, err return ctrl.Result{}, err
}
case len(ephemeralRunner.Status.Failures) > 5: // Pod was not found. Create if the pod has never been created
log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed") log.Info("Creating new EphemeralRunner pod.")
errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message) result, err := r.createPod(ctx, ephemeralRunner, secret, log)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil { switch {
case err == nil:
return result, nil
case kerrors.IsInvalid(err) || kerrors.IsForbidden(err):
log.Error(err, "Failed to create a pod due to unrecoverable failure")
errMessage := fmt.Sprintf("Failed to create the pod: %v", err)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {
log.Error(err, "Failed to set ephemeral runner to phase Failed") log.Error(err, "Failed to set ephemeral runner to phase Failed")
return ctrl.Result{}, err return ctrl.Result{}, err
} }
return ctrl.Result{}, nil return ctrl.Result{}, nil
default: default:
// Pod was not found. Create if the pod has never been created log.Error(err, "Failed to create the pod")
log.Info("Creating new EphemeralRunner pod.") return ctrl.Result{}, err
result, err := r.createPod(ctx, ephemeralRunner, secret, log)
switch {
case err == nil:
return result, nil
case kerrors.IsInvalid(err) || kerrors.IsForbidden(err):
log.Error(err, "Failed to create a pod due to unrecoverable failure")
errMessage := fmt.Sprintf("Failed to create the pod: %v", err)
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonInvalidPodFailure, log); err != nil {
log.Error(err, "Failed to set ephemeral runner to phase Failed")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
default:
log.Error(err, "Failed to create the pod")
return ctrl.Result{}, err
}
} }
} }
@ -484,9 +510,9 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem
log.Info("Updating ephemeral runner status to track the failure count") log.Info("Updating ephemeral runner status to track the failure count")
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) { if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
if obj.Status.Failures == nil { if obj.Status.Failures == nil {
obj.Status.Failures = make(map[string]bool) obj.Status.Failures = make(map[string]metav1.Time)
} }
obj.Status.Failures[string(pod.UID)] = true obj.Status.Failures[string(pod.UID)] = metav1.Now()
obj.Status.Ready = false obj.Status.Ready = false
obj.Status.Reason = pod.Status.Reason obj.Status.Reason = pod.Status.Reason
obj.Status.Message = pod.Status.Message obj.Status.Message = pod.Status.Message

View File

@ -30,7 +30,7 @@ import (
const ( const (
ephemeralRunnerTimeout = time.Second * 20 ephemeralRunnerTimeout = time.Second * 20
ephemeralRunnerInterval = time.Millisecond * 250 ephemeralRunnerInterval = time.Millisecond * 10
runnerImage = "ghcr.io/actions/actions-runner:latest" runnerImage = "ghcr.io/actions/actions-runner:latest"
) )
@ -528,44 +528,26 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeEquivalentTo("")) ).Should(BeEquivalentTo(""))
}) })
It("It should not re-create pod indefinitely", func() { It("It should eventually delete ephemeral runner after consecutive failures", func() {
updated := new(v1alpha1.EphemeralRunner) updated := new(v1alpha1.EphemeralRunner)
pod := new(corev1.Pod)
Eventually( Eventually(
func() (bool, error) { func() error {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return false, err
}
err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
if err != nil {
if kerrors.IsNotFound(err) && len(updated.Status.Failures) > 5 {
return true, nil
}
return false, err
}
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
ExitCode: 1,
},
},
})
err = k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "Failed to update pod status")
return false, fmt.Errorf("pod haven't failed for 5 times.")
}, },
ephemeralRunnerTimeout, ephemeralRunnerTimeout,
ephemeralRunnerInterval, ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "we should stop creating pod after 5 failures") ).Should(Succeed(), "failed to get ephemeral runner")
failEphemeralRunnerPod := func() *corev1.Pod {
pod := new(corev1.Pod)
Eventually(
func() error {
return k8sClient.Get(ctx, client.ObjectKey{Name: updated.Name, Namespace: updated.Namespace}, pod)
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(Succeed(), "failed to get ephemeral runner pod")
// In case we still have pod created due to controller-runtime cache delay, mark the container as exited
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
if err == nil {
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{ pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
Name: v1alpha1.EphemeralRunnerContainerName, Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{ State: corev1.ContainerState{
@ -576,25 +558,70 @@ var _ = Describe("EphemeralRunner", func() {
}) })
err := k8sClient.Status().Update(ctx, pod) err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "Failed to update pod status") Expect(err).To(BeNil(), "Failed to update pod status")
return pod
} }
// EphemeralRunner should failed with reason TooManyPodFailures for i := range 5 {
Eventually(func() (string, error) { pod := failEphemeralRunnerPod()
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Reason, nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("TooManyPodFailures"), "Reason should be TooManyPodFailures")
// EphemeralRunner should not have any pod Eventually(
Eventually(func() (bool, error) { func() (int, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod) err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err == nil { if err != nil {
return false, nil return 0, err
} }
return kerrors.IsNotFound(err), nil return len(updated.Status.Failures), nil
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true)) },
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(i + 1))
Eventually(
func() error {
nextPod := new(corev1.Pod)
err := k8sClient.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: pod.Namespace}, nextPod)
if err != nil {
return err
}
if nextPod.UID != pod.UID {
return nil
}
return fmt.Errorf("pod not recreated")
},
).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(Succeed(), "pod should be recreated")
Eventually(
func() (bool, error) {
pod := new(corev1.Pod)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
if err != nil {
return false, err
}
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == v1alpha1.EphemeralRunnerContainerName {
return cs.State.Terminated == nil, nil
}
}
return true, nil
},
).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(BeEquivalentTo(true), "pod should be terminated")
}
failEphemeralRunnerPod()
Eventually(
func() (bool, error) {
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if kerrors.IsNotFound(err) {
return true, nil
}
return false, err
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
}) })
It("It should re-create pod on eviction", func() { It("It should re-create pod on eviction", func() {

View File

@ -10,6 +10,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
"testing"
"time" "time"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
@ -21,6 +22,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
@ -35,6 +37,10 @@ const (
ephemeralRunnerSetTestGitHubToken = "gh_token" ephemeralRunnerSetTestGitHubToken = "gh_token"
) )
func TestPrecomputedConstants(t *testing.T) {
require.Equal(t, len(failedRunnerBackoff), maxFailures+1)
}
var _ = Describe("Test EphemeralRunnerSet controller", func() { var _ = Describe("Test EphemeralRunnerSet controller", func() {
var ctx context.Context var ctx context.Context
var mgr ctrl.Manager var mgr ctrl.Manager

View File

@ -20,6 +20,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"testing" "testing"
"time"
"github.com/onsi/ginkgo/config" "github.com/onsi/ginkgo/config"
@ -79,6 +80,15 @@ var _ = BeforeSuite(func() {
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(k8sClient).ToNot(BeNil()) Expect(k8sClient).ToNot(BeNil())
failedRunnerBackoff = []time.Duration{
20 * time.Millisecond,
20 * time.Millisecond,
20 * time.Millisecond,
20 * time.Millisecond,
20 * time.Millisecond,
20 * time.Millisecond,
}
}) })
var _ = AfterSuite(func() { var _ = AfterSuite(func() {