Create backoff mechanism for failed runners and allow re-creation of failed ephemeral runners (#4059)
This commit is contained in:
parent
d6e2790db5
commit
cae7efa2c6
|
|
@ -119,7 +119,7 @@ type EphemeralRunnerStatus struct {
|
|||
RunnerJITConfig string `json:"runnerJITConfig,omitempty"`
|
||||
|
||||
// +optional
|
||||
Failures map[string]bool `json:"failures,omitempty"`
|
||||
Failures map[string]metav1.Time `json:"failures,omitempty"`
|
||||
|
||||
// +optional
|
||||
JobRequestId int64 `json:"jobRequestId,omitempty"`
|
||||
|
|
@ -137,6 +137,20 @@ type EphemeralRunnerStatus struct {
|
|||
JobDisplayName string `json:"jobDisplayName,omitempty"`
|
||||
}
|
||||
|
||||
func (s *EphemeralRunnerStatus) LastFailure() metav1.Time {
|
||||
var maxTime metav1.Time
|
||||
if len(s.Failures) == 0 {
|
||||
return maxTime
|
||||
}
|
||||
|
||||
for _, ts := range s.Failures {
|
||||
if ts.After(maxTime.Time) {
|
||||
maxTime = ts
|
||||
}
|
||||
}
|
||||
return maxTime
|
||||
}
|
||||
|
||||
// +kubebuilder:object:root=true
|
||||
|
||||
// EphemeralRunnerList contains a list of EphemeralRunner
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ package v1alpha1
|
|||
|
||||
import (
|
||||
"k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
runtime "k8s.io/apimachinery/pkg/runtime"
|
||||
)
|
||||
|
||||
|
|
@ -459,9 +460,9 @@ func (in *EphemeralRunnerStatus) DeepCopyInto(out *EphemeralRunnerStatus) {
|
|||
*out = *in
|
||||
if in.Failures != nil {
|
||||
in, out := &in.Failures, &out.Failures
|
||||
*out = make(map[string]bool, len(*in))
|
||||
*out = make(map[string]metav1.Time, len(*in))
|
||||
for key, val := range *in {
|
||||
(*out)[key] = val
|
||||
(*out)[key] = *val.DeepCopy()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7794,7 +7794,8 @@ spec:
|
|||
properties:
|
||||
failures:
|
||||
additionalProperties:
|
||||
type: boolean
|
||||
format: date-time
|
||||
type: string
|
||||
type: object
|
||||
jobDisplayName:
|
||||
type: string
|
||||
|
|
|
|||
|
|
@ -7794,7 +7794,8 @@ spec:
|
|||
properties:
|
||||
failures:
|
||||
additionalProperties:
|
||||
type: boolean
|
||||
format: date-time
|
||||
type: string
|
||||
type: object
|
||||
jobDisplayName:
|
||||
type: string
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ import (
|
|||
"github.com/go-logr/logr"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
kerrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
|
|
@ -50,6 +51,19 @@ type EphemeralRunnerReconciler struct {
|
|||
ResourceBuilder
|
||||
}
|
||||
|
||||
// precompute backoff durations for failed ephemeral runners
|
||||
// the len(failedRunnerBackoff) must be equal to maxFailures + 1
|
||||
var failedRunnerBackoff = []time.Duration{
|
||||
0,
|
||||
5 * time.Second,
|
||||
10 * time.Second,
|
||||
20 * time.Second,
|
||||
40 * time.Second,
|
||||
80 * time.Second,
|
||||
}
|
||||
|
||||
const maxFailures = 5
|
||||
|
||||
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/status,verbs=get;update;patch
|
||||
// +kubebuilder:rbac:groups=actions.github.com,resources=ephemeralrunners/finalizers,verbs=get;list;watch;create;update;patch;delete
|
||||
|
|
@ -173,6 +187,29 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
}
|
||||
}
|
||||
|
||||
if len(ephemeralRunner.Status.Failures) > maxFailures {
|
||||
log.Info(fmt.Sprintf("EphemeralRunner has failed more than %d times. Deleting ephemeral runner so it can be re-created", maxFailures))
|
||||
if err := r.Delete(ctx, ephemeralRunner); err != nil {
|
||||
log.Error(fmt.Errorf("failed to delete ephemeral runner after %d failures: %w", maxFailures, err), "Failed to delete ephemeral runner")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
now := metav1.Now()
|
||||
lastFailure := ephemeralRunner.Status.LastFailure()
|
||||
backoffDuration := failedRunnerBackoff[len(ephemeralRunner.Status.Failures)]
|
||||
nextReconciliation := lastFailure.Add(backoffDuration)
|
||||
if !lastFailure.IsZero() && now.Before(&metav1.Time{Time: nextReconciliation}) {
|
||||
log.Info("Backing off the next reconciliation due to failure",
|
||||
"lastFailure", lastFailure,
|
||||
"nextReconciliation", nextReconciliation,
|
||||
"requeueAfter", nextReconciliation.Sub(now.Time),
|
||||
)
|
||||
return ctrl.Result{RequeueAfter: now.Sub(nextReconciliation)}, nil
|
||||
}
|
||||
|
||||
secret := new(corev1.Secret)
|
||||
if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
|
||||
if !kerrors.IsNotFound(err) {
|
||||
|
|
@ -196,21 +233,11 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
|
||||
pod := new(corev1.Pod)
|
||||
if err := r.Get(ctx, req.NamespacedName, pod); err != nil {
|
||||
switch {
|
||||
case !kerrors.IsNotFound(err):
|
||||
if !kerrors.IsNotFound(err) {
|
||||
log.Error(err, "Failed to fetch the pod")
|
||||
return ctrl.Result{}, err
|
||||
|
||||
case len(ephemeralRunner.Status.Failures) > 5:
|
||||
log.Info("EphemeralRunner has failed more than 5 times. Marking it as failed")
|
||||
errMessage := fmt.Sprintf("Pod has failed to start more than 5 times: %s", pod.Status.Message)
|
||||
if err := r.markAsFailed(ctx, ephemeralRunner, errMessage, ReasonTooManyPodFailures, log); err != nil {
|
||||
log.Error(err, "Failed to set ephemeral runner to phase Failed")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{}, nil
|
||||
|
||||
default:
|
||||
// Pod was not found. Create if the pod has never been created
|
||||
log.Info("Creating new EphemeralRunner pod.")
|
||||
result, err := r.createPod(ctx, ephemeralRunner, secret, log)
|
||||
|
|
@ -230,7 +257,6 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
|
|||
return ctrl.Result{}, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cs := runnerContainerStatus(pod)
|
||||
switch {
|
||||
|
|
@ -484,9 +510,9 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem
|
|||
log.Info("Updating ephemeral runner status to track the failure count")
|
||||
if err := patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {
|
||||
if obj.Status.Failures == nil {
|
||||
obj.Status.Failures = make(map[string]bool)
|
||||
obj.Status.Failures = make(map[string]metav1.Time)
|
||||
}
|
||||
obj.Status.Failures[string(pod.UID)] = true
|
||||
obj.Status.Failures[string(pod.UID)] = metav1.Now()
|
||||
obj.Status.Ready = false
|
||||
obj.Status.Reason = pod.Status.Reason
|
||||
obj.Status.Message = pod.Status.Message
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ import (
|
|||
|
||||
const (
|
||||
ephemeralRunnerTimeout = time.Second * 20
|
||||
ephemeralRunnerInterval = time.Millisecond * 250
|
||||
ephemeralRunnerInterval = time.Millisecond * 10
|
||||
runnerImage = "ghcr.io/actions/actions-runner:latest"
|
||||
)
|
||||
|
||||
|
|
@ -528,44 +528,26 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
).Should(BeEquivalentTo(""))
|
||||
})
|
||||
|
||||
It("It should not re-create pod indefinitely", func() {
|
||||
It("It should eventually delete ephemeral runner after consecutive failures", func() {
|
||||
updated := new(v1alpha1.EphemeralRunner)
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(
|
||||
func() (bool, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
err = k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
if err != nil {
|
||||
if kerrors.IsNotFound(err) && len(updated.Status.Failures) > 5 {
|
||||
return true, nil
|
||||
}
|
||||
|
||||
return false, err
|
||||
}
|
||||
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
Terminated: &corev1.ContainerStateTerminated{
|
||||
ExitCode: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
err = k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "Failed to update pod status")
|
||||
return false, fmt.Errorf("pod haven't failed for 5 times.")
|
||||
func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeEquivalentTo(true), "we should stop creating pod after 5 failures")
|
||||
).Should(Succeed(), "failed to get ephemeral runner")
|
||||
|
||||
failEphemeralRunnerPod := func() *corev1.Pod {
|
||||
pod := new(corev1.Pod)
|
||||
Eventually(
|
||||
func() error {
|
||||
return k8sClient.Get(ctx, client.ObjectKey{Name: updated.Name, Namespace: updated.Namespace}, pod)
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(Succeed(), "failed to get ephemeral runner pod")
|
||||
|
||||
// In case we still have pod created due to controller-runtime cache delay, mark the container as exited
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
if err == nil {
|
||||
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
|
||||
Name: v1alpha1.EphemeralRunnerContainerName,
|
||||
State: corev1.ContainerState{
|
||||
|
|
@ -576,25 +558,70 @@ var _ = Describe("EphemeralRunner", func() {
|
|||
})
|
||||
err := k8sClient.Status().Update(ctx, pod)
|
||||
Expect(err).To(BeNil(), "Failed to update pod status")
|
||||
|
||||
return pod
|
||||
}
|
||||
|
||||
// EphemeralRunner should failed with reason TooManyPodFailures
|
||||
Eventually(func() (string, error) {
|
||||
for i := range 5 {
|
||||
pod := failEphemeralRunnerPod()
|
||||
|
||||
Eventually(
|
||||
func() (int, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if err != nil {
|
||||
return "", err
|
||||
return 0, err
|
||||
}
|
||||
return updated.Status.Reason, nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("TooManyPodFailures"), "Reason should be TooManyPodFailures")
|
||||
return len(updated.Status.Failures), nil
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeEquivalentTo(i + 1))
|
||||
|
||||
// EphemeralRunner should not have any pod
|
||||
Eventually(func() (bool, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
if err == nil {
|
||||
return false, nil
|
||||
Eventually(
|
||||
func() error {
|
||||
nextPod := new(corev1.Pod)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: pod.Name, Namespace: pod.Namespace}, nextPod)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return kerrors.IsNotFound(err), nil
|
||||
}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))
|
||||
if nextPod.UID != pod.UID {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("pod not recreated")
|
||||
},
|
||||
).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(Succeed(), "pod should be recreated")
|
||||
|
||||
Eventually(
|
||||
func() (bool, error) {
|
||||
pod := new(corev1.Pod)
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
for _, cs := range pod.Status.ContainerStatuses {
|
||||
if cs.Name == v1alpha1.EphemeralRunnerContainerName {
|
||||
return cs.State.Terminated == nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return true, nil
|
||||
},
|
||||
).WithTimeout(20*time.Second).WithPolling(10*time.Millisecond).Should(BeEquivalentTo(true), "pod should be terminated")
|
||||
}
|
||||
|
||||
failEphemeralRunnerPod()
|
||||
|
||||
Eventually(
|
||||
func() (bool, error) {
|
||||
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
|
||||
if kerrors.IsNotFound(err) {
|
||||
return true, nil
|
||||
}
|
||||
return false, err
|
||||
},
|
||||
ephemeralRunnerTimeout,
|
||||
ephemeralRunnerInterval,
|
||||
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
|
||||
})
|
||||
|
||||
It("It should re-create pod on eviction", func() {
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
|
@ -21,6 +22,7 @@ import (
|
|||
"github.com/go-logr/logr"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
"github.com/stretchr/testify/require"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
|
||||
"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
|
||||
|
|
@ -35,6 +37,10 @@ const (
|
|||
ephemeralRunnerSetTestGitHubToken = "gh_token"
|
||||
)
|
||||
|
||||
func TestPrecomputedConstants(t *testing.T) {
|
||||
require.Equal(t, len(failedRunnerBackoff), maxFailures+1)
|
||||
}
|
||||
|
||||
var _ = Describe("Test EphemeralRunnerSet controller", func() {
|
||||
var ctx context.Context
|
||||
var mgr ctrl.Manager
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/onsi/ginkgo/config"
|
||||
|
||||
|
|
@ -79,6 +80,15 @@ var _ = BeforeSuite(func() {
|
|||
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(k8sClient).ToNot(BeNil())
|
||||
|
||||
failedRunnerBackoff = []time.Duration{
|
||||
20 * time.Millisecond,
|
||||
20 * time.Millisecond,
|
||||
20 * time.Millisecond,
|
||||
20 * time.Millisecond,
|
||||
20 * time.Millisecond,
|
||||
20 * time.Millisecond,
|
||||
}
|
||||
})
|
||||
|
||||
var _ = AfterSuite(func() {
|
||||
|
|
|
|||
Loading…
Reference in New Issue