From b8e65aa857cf37a4cd829cb3ba61a1edf11ab468 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 20 Feb 2022 06:59:56 +0000 Subject: [PATCH] Prevent unnecessary ephemeral runner recreations --- .../horizontalrunnerautoscaler_types.go | 3 + api/v1alpha1/runnerdeployment_types.go | 8 +++ api/v1alpha1/runnerreplicaset_types.go | 9 +++ api/v1alpha1/zz_generated.deepcopy.go | 9 +++ ...rwind.dev_horizontalrunnerautoscalers.yaml | 3 + ...ions.summerwind.dev_runnerdeployments.yaml | 5 ++ ...ions.summerwind.dev_runnerreplicasets.yaml | 5 ++ ...rwind.dev_horizontalrunnerautoscalers.yaml | 3 + ...ions.summerwind.dev_runnerdeployments.yaml | 5 ++ ...ions.summerwind.dev_runnerreplicasets.yaml | 5 ++ .../horizontal_runner_autoscaler_webhook.go | 4 +- .../horizontalrunnerautoscaler_controller.go | 22 ++++++++ controllers/runnerdeployment_controller.go | 8 ++- controllers/runnerreplicaset_controller.go | 56 +++++++++++++++++-- 14 files changed, 137 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/horizontalrunnerautoscaler_types.go b/api/v1alpha1/horizontalrunnerautoscaler_types.go index 350583e7..b5da8275 100644 --- a/api/v1alpha1/horizontalrunnerautoscaler_types.go +++ b/api/v1alpha1/horizontalrunnerautoscaler_types.go @@ -107,6 +107,9 @@ type CapacityReservation struct { Name string `json:"name,omitempty"` ExpirationTime metav1.Time `json:"expirationTime,omitempty"` Replicas int `json:"replicas,omitempty"` + + // +optional + EffectiveTime metav1.Time `json:"effectiveTime,omitempty"` } type ScaleTargetRef struct { diff --git a/api/v1alpha1/runnerdeployment_types.go b/api/v1alpha1/runnerdeployment_types.go index 23b0d0d2..588a6855 100644 --- a/api/v1alpha1/runnerdeployment_types.go +++ b/api/v1alpha1/runnerdeployment_types.go @@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct { // +nullable Replicas *int `json:"replicas,omitempty"` + // EffectiveTime is the time the upstream controller requested to sync Replicas. + // It is usually populated by the webhook-based autoscaler via HRA. + // The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated. + // + // +optional + // +nullable + EffectiveTime *metav1.Time `json:"effectiveTime"` + // +optional // +nullable Selector *metav1.LabelSelector `json:"selector"` diff --git a/api/v1alpha1/runnerreplicaset_types.go b/api/v1alpha1/runnerreplicaset_types.go index 4c698feb..9ecf1349 100644 --- a/api/v1alpha1/runnerreplicaset_types.go +++ b/api/v1alpha1/runnerreplicaset_types.go @@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct { // +nullable Replicas *int `json:"replicas,omitempty"` + // EffectiveTime is the time the upstream controller requested to sync Replicas. + // It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. + // The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners + // based on potentially outdated Replicas value. + // + // +optional + // +nullable + EffectiveTime *metav1.Time `json:"effectiveTime"` + // +optional // +nullable Selector *metav1.LabelSelector `json:"selector"` diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index d040cc78..d37ce954 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry { func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) { *out = *in in.ExpirationTime.DeepCopyInto(&out.ExpirationTime) + in.EffectiveTime.DeepCopyInto(&out.EffectiveTime) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation. @@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) { *out = new(int) **out = **in } + if in.EffectiveTime != nil { + in, out := &in.EffectiveTime, &out.EffectiveTime + *out = (*in).DeepCopy() + } if in.Selector != nil { in, out := &in.Selector, &out.Selector *out = new(metav1.LabelSelector) @@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) { *out = new(int) **out = **in } + if in.EffectiveTime != nil { + in, out := &in.EffectiveTime, &out.EffectiveTime + *out = (*in).DeepCopy() + } if in.Selector != nil { in, out := &in.Selector, &out.Selector *out = new(metav1.LabelSelector) diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml index 0a34c042..2f1719fd 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml @@ -49,6 +49,9 @@ spec: items: description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime. properties: + effectiveTime: + format: date-time + type: string expirationTime: format: date-time type: string diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml index 1883f901..9a693f9b 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml @@ -48,6 +48,11 @@ spec: spec: description: RunnerDeploymentSpec defines the desired state of RunnerDeployment properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml index 1bd5051d..7527c088 100644 --- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml +++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml @@ -45,6 +45,11 @@ spec: spec: description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml b/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml index 0a34c042..2f1719fd 100644 --- a/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml +++ b/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml @@ -49,6 +49,9 @@ spec: items: description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime. properties: + effectiveTime: + format: date-time + type: string expirationTime: format: date-time type: string diff --git a/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml b/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml index 1883f901..9a693f9b 100644 --- a/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml +++ b/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml @@ -48,6 +48,11 @@ spec: spec: description: RunnerDeploymentSpec defines the desired state of RunnerDeployment properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml b/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml index 1bd5051d..7527c088 100644 --- a/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml +++ b/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml @@ -45,6 +45,11 @@ spec: spec: description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet properties: + effectiveTime: + description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value. + format: date-time + nullable: true + type: string replicas: nullable: true type: integer diff --git a/controllers/horizontal_runner_autoscaler_webhook.go b/controllers/horizontal_runner_autoscaler_webhook.go index ea407fdb..d097b7e9 100644 --- a/controllers/horizontal_runner_autoscaler_webhook.go +++ b/controllers/horizontal_runner_autoscaler_webhook.go @@ -770,8 +770,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context. capacityReservations := getValidCapacityReservations(copy) if amount > 0 { + now := time.Now() copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{ - ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)}, + EffectiveTime: metav1.Time{Time: now}, + ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)}, Replicas: amount, }) } else if amount < 0 { diff --git a/controllers/horizontalrunnerautoscaler_controller.go b/controllers/horizontalrunnerautoscaler_controller.go index 2fd949bb..140a8f51 100644 --- a/controllers/horizontalrunnerautoscaler_controller.go +++ b/controllers/horizontalrunnerautoscaler_controller.go @@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error { currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas) + ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral + + var effectiveTime *time.Time + + for _, r := range hra.Spec.CapacityReservations { + t := r.EffectiveTime + if effectiveTime == nil || effectiveTime.Before(t.Time) { + effectiveTime = &t.Time + } + } + // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption if currentDesiredReplicas != newDesiredReplicas { copy := rd.DeepCopy() copy.Spec.Replicas = &newDesiredReplicas + if ephemeral && effectiveTime != nil { + copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} + } + + if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { + return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) + } + } else if ephemeral && effectiveTime != nil { + copy := rd.DeepCopy() + copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} + if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) } diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go index 6725bc64..9a55ae3d 100644 --- a/controllers/runnerdeployment_controller.go +++ b/controllers/runnerdeployment_controller.go @@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption if currentDesiredReplicas != newDesiredReplicas { newestSet.Spec.Replicas = &newDesiredReplicas + newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime if err := r.Client.Update(ctx, newestSet); err != nil { log.Error(err, "Failed to update runnerreplicaset resource") @@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str Labels: newRSTemplate.ObjectMeta.Labels, }, Spec: v1alpha1.RunnerReplicaSetSpec{ - Replicas: rd.Spec.Replicas, - Selector: newRSSelector, - Template: newRSTemplate, + Replicas: rd.Spec.Replicas, + Selector: newRSSelector, + Template: newRSTemplate, + EffectiveTime: rd.Spec.EffectiveTime, }, } diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go index 2acc93aa..e4ac040f 100644 --- a/controllers/runnerreplicaset_controller.go +++ b/controllers/runnerreplicaset_controller.go @@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct { Name string } +const ( + SyncTimeAnnotationKey = "sync-time" +) + // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch @@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req } } - var myRunners []v1alpha1.Runner - var ( current int ready int available int + + lastSyncTime *time.Time ) for _, r := range allRunners.Items { // This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before // to not treat all the runners in the namespace as its children. if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) { - myRunners = append(myRunners, r) + // If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion) + // or by runnerreplicaset controller (in case it was deleted in the previous reconcilation loop), + // we don't need to bother calling GitHub API to re-mark the runner for deletion. + // Just hold on, and runners will disappear as long as the runner controller is up and running. + if !r.DeletionTimestamp.IsZero() { + continue + } + + if r.Annotations != nil { + if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok { + t, err := time.Parse(time.RFC3339, a) + if err == nil { + if lastSyncTime == nil || lastSyncTime.Before(t) { + lastSyncTime = &t + } + } + } + } current += 1 @@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req } } - if current > desired { + effectiveTime := rs.Spec.EffectiveTime + ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral + + if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { + log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready) + } else if current > desired { + // If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally, + // you're unlikely to fall into this branch. + // + // That's becaseu all the stakeholders work like this: + // + // 1. A runner pod completes with the runner container exiting with code 0 + // 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero) + // 3. GitHub triggers a corresponding workflow_job "complete" webhook event + // 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation + // 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations + // 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas + // 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet + // + // In a normally working ARC installation, the runner that was used to run the workflow job should already have been + // marked for deletion by the runner controller. + // This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to + // fall into this branch when you're using ephemeral runners with webhook-based-autoscaler. + n := current - desired log.V(0).Info(fmt.Sprintf("Deleting %d runners", n), "desired", desired, "current", current, "ready", ready) @@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1 objectMeta.GenerateName = rs.ObjectMeta.Name + "-" objectMeta.Namespace = rs.ObjectMeta.Namespace + if objectMeta.Annotations == nil { + objectMeta.Annotations = map[string]string{} + } + objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339) runner := v1alpha1.Runner{ TypeMeta: metav1.TypeMeta{},