Prevent unnecessary ephemeral runner recreations
This commit is contained in:
parent
d4a9750e20
commit
b8e65aa857
|
|
@ -107,6 +107,9 @@ type CapacityReservation struct {
|
||||||
Name string `json:"name,omitempty"`
|
Name string `json:"name,omitempty"`
|
||||||
ExpirationTime metav1.Time `json:"expirationTime,omitempty"`
|
ExpirationTime metav1.Time `json:"expirationTime,omitempty"`
|
||||||
Replicas int `json:"replicas,omitempty"`
|
Replicas int `json:"replicas,omitempty"`
|
||||||
|
|
||||||
|
// +optional
|
||||||
|
EffectiveTime metav1.Time `json:"effectiveTime,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ScaleTargetRef struct {
|
type ScaleTargetRef struct {
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct {
|
||||||
// +nullable
|
// +nullable
|
||||||
Replicas *int `json:"replicas,omitempty"`
|
Replicas *int `json:"replicas,omitempty"`
|
||||||
|
|
||||||
|
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||||
|
// It is usually populated by the webhook-based autoscaler via HRA.
|
||||||
|
// The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||||
|
//
|
||||||
|
// +optional
|
||||||
|
// +nullable
|
||||||
|
EffectiveTime *metav1.Time `json:"effectiveTime"`
|
||||||
|
|
||||||
// +optional
|
// +optional
|
||||||
// +nullable
|
// +nullable
|
||||||
Selector *metav1.LabelSelector `json:"selector"`
|
Selector *metav1.LabelSelector `json:"selector"`
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct {
|
||||||
// +nullable
|
// +nullable
|
||||||
Replicas *int `json:"replicas,omitempty"`
|
Replicas *int `json:"replicas,omitempty"`
|
||||||
|
|
||||||
|
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||||
|
// It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment.
|
||||||
|
// The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners
|
||||||
|
// based on potentially outdated Replicas value.
|
||||||
|
//
|
||||||
|
// +optional
|
||||||
|
// +nullable
|
||||||
|
EffectiveTime *metav1.Time `json:"effectiveTime"`
|
||||||
|
|
||||||
// +optional
|
// +optional
|
||||||
// +nullable
|
// +nullable
|
||||||
Selector *metav1.LabelSelector `json:"selector"`
|
Selector *metav1.LabelSelector `json:"selector"`
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry {
|
||||||
func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
|
func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
|
||||||
*out = *in
|
*out = *in
|
||||||
in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
|
in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
|
||||||
|
in.EffectiveTime.DeepCopyInto(&out.EffectiveTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
|
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
|
||||||
|
|
@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) {
|
||||||
*out = new(int)
|
*out = new(int)
|
||||||
**out = **in
|
**out = **in
|
||||||
}
|
}
|
||||||
|
if in.EffectiveTime != nil {
|
||||||
|
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||||
|
*out = (*in).DeepCopy()
|
||||||
|
}
|
||||||
if in.Selector != nil {
|
if in.Selector != nil {
|
||||||
in, out := &in.Selector, &out.Selector
|
in, out := &in.Selector, &out.Selector
|
||||||
*out = new(metav1.LabelSelector)
|
*out = new(metav1.LabelSelector)
|
||||||
|
|
@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) {
|
||||||
*out = new(int)
|
*out = new(int)
|
||||||
**out = **in
|
**out = **in
|
||||||
}
|
}
|
||||||
|
if in.EffectiveTime != nil {
|
||||||
|
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||||
|
*out = (*in).DeepCopy()
|
||||||
|
}
|
||||||
if in.Selector != nil {
|
if in.Selector != nil {
|
||||||
in, out := &in.Selector, &out.Selector
|
in, out := &in.Selector, &out.Selector
|
||||||
*out = new(metav1.LabelSelector)
|
*out = new(metav1.LabelSelector)
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,9 @@ spec:
|
||||||
items:
|
items:
|
||||||
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
||||||
properties:
|
properties:
|
||||||
|
effectiveTime:
|
||||||
|
format: date-time
|
||||||
|
type: string
|
||||||
expirationTime:
|
expirationTime:
|
||||||
format: date-time
|
format: date-time
|
||||||
type: string
|
type: string
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,11 @@ spec:
|
||||||
spec:
|
spec:
|
||||||
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
|
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
|
||||||
properties:
|
properties:
|
||||||
|
effectiveTime:
|
||||||
|
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||||
|
format: date-time
|
||||||
|
nullable: true
|
||||||
|
type: string
|
||||||
replicas:
|
replicas:
|
||||||
nullable: true
|
nullable: true
|
||||||
type: integer
|
type: integer
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,11 @@ spec:
|
||||||
spec:
|
spec:
|
||||||
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
|
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
|
||||||
properties:
|
properties:
|
||||||
|
effectiveTime:
|
||||||
|
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value.
|
||||||
|
format: date-time
|
||||||
|
nullable: true
|
||||||
|
type: string
|
||||||
replicas:
|
replicas:
|
||||||
nullable: true
|
nullable: true
|
||||||
type: integer
|
type: integer
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,9 @@ spec:
|
||||||
items:
|
items:
|
||||||
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
||||||
properties:
|
properties:
|
||||||
|
effectiveTime:
|
||||||
|
format: date-time
|
||||||
|
type: string
|
||||||
expirationTime:
|
expirationTime:
|
||||||
format: date-time
|
format: date-time
|
||||||
type: string
|
type: string
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,11 @@ spec:
|
||||||
spec:
|
spec:
|
||||||
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
|
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
|
||||||
properties:
|
properties:
|
||||||
|
effectiveTime:
|
||||||
|
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||||
|
format: date-time
|
||||||
|
nullable: true
|
||||||
|
type: string
|
||||||
replicas:
|
replicas:
|
||||||
nullable: true
|
nullable: true
|
||||||
type: integer
|
type: integer
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,11 @@ spec:
|
||||||
spec:
|
spec:
|
||||||
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
|
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
|
||||||
properties:
|
properties:
|
||||||
|
effectiveTime:
|
||||||
|
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value.
|
||||||
|
format: date-time
|
||||||
|
nullable: true
|
||||||
|
type: string
|
||||||
replicas:
|
replicas:
|
||||||
nullable: true
|
nullable: true
|
||||||
type: integer
|
type: integer
|
||||||
|
|
|
||||||
|
|
@ -770,8 +770,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
|
||||||
capacityReservations := getValidCapacityReservations(copy)
|
capacityReservations := getValidCapacityReservations(copy)
|
||||||
|
|
||||||
if amount > 0 {
|
if amount > 0 {
|
||||||
|
now := time.Now()
|
||||||
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
|
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
|
||||||
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
|
EffectiveTime: metav1.Time{Time: now},
|
||||||
|
ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
|
||||||
Replicas: amount,
|
Replicas: amount,
|
||||||
})
|
})
|
||||||
} else if amount < 0 {
|
} else if amount < 0 {
|
||||||
|
|
|
||||||
|
|
@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
|
||||||
return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
|
return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
|
||||||
currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
|
currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
|
||||||
|
|
||||||
|
ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral
|
||||||
|
|
||||||
|
var effectiveTime *time.Time
|
||||||
|
|
||||||
|
for _, r := range hra.Spec.CapacityReservations {
|
||||||
|
t := r.EffectiveTime
|
||||||
|
if effectiveTime == nil || effectiveTime.Before(t.Time) {
|
||||||
|
effectiveTime = &t.Time
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
|
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
|
||||||
if currentDesiredReplicas != newDesiredReplicas {
|
if currentDesiredReplicas != newDesiredReplicas {
|
||||||
copy := rd.DeepCopy()
|
copy := rd.DeepCopy()
|
||||||
copy.Spec.Replicas = &newDesiredReplicas
|
copy.Spec.Replicas = &newDesiredReplicas
|
||||||
|
|
||||||
|
if ephemeral && effectiveTime != nil {
|
||||||
|
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
||||||
|
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
||||||
|
}
|
||||||
|
} else if ephemeral && effectiveTime != nil {
|
||||||
|
copy := rd.DeepCopy()
|
||||||
|
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||||
|
|
||||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
||||||
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
||||||
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
|
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
|
||||||
if currentDesiredReplicas != newDesiredReplicas {
|
if currentDesiredReplicas != newDesiredReplicas {
|
||||||
newestSet.Spec.Replicas = &newDesiredReplicas
|
newestSet.Spec.Replicas = &newDesiredReplicas
|
||||||
|
newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime
|
||||||
|
|
||||||
if err := r.Client.Update(ctx, newestSet); err != nil {
|
if err := r.Client.Update(ctx, newestSet); err != nil {
|
||||||
log.Error(err, "Failed to update runnerreplicaset resource")
|
log.Error(err, "Failed to update runnerreplicaset resource")
|
||||||
|
|
@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str
|
||||||
Labels: newRSTemplate.ObjectMeta.Labels,
|
Labels: newRSTemplate.ObjectMeta.Labels,
|
||||||
},
|
},
|
||||||
Spec: v1alpha1.RunnerReplicaSetSpec{
|
Spec: v1alpha1.RunnerReplicaSetSpec{
|
||||||
Replicas: rd.Spec.Replicas,
|
Replicas: rd.Spec.Replicas,
|
||||||
Selector: newRSSelector,
|
Selector: newRSSelector,
|
||||||
Template: newRSTemplate,
|
Template: newRSTemplate,
|
||||||
|
EffectiveTime: rd.Spec.EffectiveTime,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct {
|
||||||
Name string
|
Name string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
SyncTimeAnnotationKey = "sync-time"
|
||||||
|
)
|
||||||
|
|
||||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
|
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
|
||||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
|
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
|
||||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
|
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
|
||||||
|
|
@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var myRunners []v1alpha1.Runner
|
|
||||||
|
|
||||||
var (
|
var (
|
||||||
current int
|
current int
|
||||||
ready int
|
ready int
|
||||||
available int
|
available int
|
||||||
|
|
||||||
|
lastSyncTime *time.Time
|
||||||
)
|
)
|
||||||
|
|
||||||
for _, r := range allRunners.Items {
|
for _, r := range allRunners.Items {
|
||||||
// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
|
// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
|
||||||
// to not treat all the runners in the namespace as its children.
|
// to not treat all the runners in the namespace as its children.
|
||||||
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
|
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
|
||||||
myRunners = append(myRunners, r)
|
// If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
|
||||||
|
// or by runnerreplicaset controller (in case it was deleted in the previous reconcilation loop),
|
||||||
|
// we don't need to bother calling GitHub API to re-mark the runner for deletion.
|
||||||
|
// Just hold on, and runners will disappear as long as the runner controller is up and running.
|
||||||
|
if !r.DeletionTimestamp.IsZero() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.Annotations != nil {
|
||||||
|
if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
|
||||||
|
t, err := time.Parse(time.RFC3339, a)
|
||||||
|
if err == nil {
|
||||||
|
if lastSyncTime == nil || lastSyncTime.Before(t) {
|
||||||
|
lastSyncTime = &t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
current += 1
|
current += 1
|
||||||
|
|
||||||
|
|
@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if current > desired {
|
effectiveTime := rs.Spec.EffectiveTime
|
||||||
|
ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
|
||||||
|
|
||||||
|
if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
|
||||||
|
log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
|
||||||
|
} else if current > desired {
|
||||||
|
// If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
|
||||||
|
// you're unlikely to fall into this branch.
|
||||||
|
//
|
||||||
|
// That's becaseu all the stakeholders work like this:
|
||||||
|
//
|
||||||
|
// 1. A runner pod completes with the runner container exiting with code 0
|
||||||
|
// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
|
||||||
|
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
|
||||||
|
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation
|
||||||
|
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
|
||||||
|
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
|
||||||
|
// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
|
||||||
|
//
|
||||||
|
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
|
||||||
|
// marked for deletion by the runner controller.
|
||||||
|
// This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to
|
||||||
|
// fall into this branch when you're using ephemeral runners with webhook-based-autoscaler.
|
||||||
|
|
||||||
n := current - desired
|
n := current - desired
|
||||||
|
|
||||||
log.V(0).Info(fmt.Sprintf("Deleting %d runners", n), "desired", desired, "current", current, "ready", ready)
|
log.V(0).Info(fmt.Sprintf("Deleting %d runners", n), "desired", desired, "current", current, "ready", ready)
|
||||||
|
|
@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1
|
||||||
|
|
||||||
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
|
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
|
||||||
objectMeta.Namespace = rs.ObjectMeta.Namespace
|
objectMeta.Namespace = rs.ObjectMeta.Namespace
|
||||||
|
if objectMeta.Annotations == nil {
|
||||||
|
objectMeta.Annotations = map[string]string{}
|
||||||
|
}
|
||||||
|
objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339)
|
||||||
|
|
||||||
runner := v1alpha1.Runner{
|
runner := v1alpha1.Runner{
|
||||||
TypeMeta: metav1.TypeMeta{},
|
TypeMeta: metav1.TypeMeta{},
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue