edge: Enable scaling from zero with PercentageRunnersBusy (#524)

`PercentageRunnersBusy`, in combination with a secondary `TotalNumberOfQueuedAndInProgressWorkflowRuns` metric, enables scale-from-zero for PercentageRunnersBusy. Please see the new `Autoscaling to/from 0` section in the updated documentation for how it works. Resolves #522
2021-05-05 14:27:17 +09:00 · 2021-05-05 14:27:17 +09:00 · 4e7b8b57c0
parent e7020c7c0f
commit 4e7b8b57c0
2 changed files with 88 additions and 19 deletions
--- a/README.md
+++ b/README.md
@ -17,9 +17,10 @@ ToC:
  - [Organization Runners](#organization-runners)
  - [Enterprise Runners](#enterprise-runners)
  - [Runner Deployments](#runnerdeployments)
-    - [Note on scaling to/from 0](#note-on-scaling-to-from-zero)
-    - [Autoscaling](#autoscaling)
-      - [Faster Autoscaling with GitHub Webhook](#faster-autoscaling-with-github-webhook)
+    - [Note on scaling to/from 0](#note-on-scaling-tofrom-0)
+  - [Autoscaling](#autoscaling)
+    - [Faster Autoscaling with GitHub Webhook](#faster-autoscaling-with-github-webhook)
+    - [Autoscaling to/from 0](#autoscaling-tofrom-0)
  - [Runner with DinD](#runner-with-dind)
  - [Additional tweaks](#additional-tweaks)
  - [Runner labels](#runner-labels)
@ -296,7 +297,12 @@ spec:

 The implication of setting `replicas: 0` instead of deleting the runner depoyment is that you can let GitHub Actions queue jobs until there will be one or more runners. See [#465](https://github.com/actions-runner-controller/actions-runner-controller/pull/465) for more information.

-#### Autoscaling
+Also note that the controller creates a "registration-only" runner per RunnerReplicaSet when it is scaled to zero,
+and retains it until there are one or more runners available.
+
+This, in combination with a correctly configured HorizontalRunnerAutoscaler, allows you to automatically [scale to/from 0](#autoscaling-tofrom-0).
+
+### Autoscaling

 __**IMPORTANT : Due to limitations / a bug with GitHub's [routing engine](https://docs.github.com/en/actions/hosting-your-own-runners/using-self-hosted-runners-in-a-workflow#routing-precedence-for-self-hosted-runners) autoscaling does NOT work correctly with RunnerDeployments that target the enterprise level. Scaling activity works as expected however jobs fail to get assigned to the scaled out replicas. This was explored in issue [#470](https://github.com/actions-runner-controller/actions-runner-controller/issues/470). Once GitHub resolves the issue with their backend service we expect the solution to be able to support autoscaled enterprise runnerdeploments without any additional changes.**__

@ -544,6 +550,27 @@ spec:

 See ["activity types"](https://docs.github.com/en/actions/reference/events-that-trigger-workflows#pull_request) for the list of valid values for `scaleUpTriggers[].githubEvent.pullRequest.types`.

+
+#### Autoscaling to/from 0
+
+Previously, we discussed [how to scale a RunnerDeployment to/from 0](#note-on-scaling-tofrom-0).
+
+To automate the process of scaling to/from 0, you can use `HorizontalRunnerAutoscaler`, with one caveat.
+
+That is, you need to choose one of the following configurations for metrics and triggers:
+
+- `TotalNumberOfQueuedAndInProgressWorkflowRuns`
+- `PercentageRunnersBusy` + `TotalNumberOfQueuedAndInProgressWorkflowRuns`
+- `PercentageRunnersBusy` + Webhook-based autoscaling
+
+This is because `PercentageRunnersBusy`, by its definition, needs one or more GitHub runners that can become `busy`, which cannot happen at all when you have 0 active runners.
+
+If and only if HorizontalRunnerAutoscaler is configured to have a secondary metric of `TotalNumberOfQueuedAndInProgressWorkflowRuns` and the controller sees that the primary metric of `PercentageRunnersBusy` returned 0 desired replicas, it uses the secondary metric to calculate the desired replicas once again.
+
+A correctly configured `TotalNumberOfQueuedAndInProgressWorkflowRuns` can return non-zero desired replicas even when there are no runners other than [registration-only runners](#note-on-scaling-tofrom-0), hence the `PercentageRunnersBusy` + `TotalNumberOfQueuedAndInProgressWorkflowRuns` configuration makes scaling from zero possible.
+
+Similarly, Webhook-based autoscaling works regardless of whether there are active runners, hence the `PercentageRunnersBusy` + Webhook-based autoscaling configuration makes scaling from zero possible, too.
+
 ### Runner with DinD

 When using default runner, runner pod starts up 2 containers: runner and DinD (Docker-in-Docker). This might create issues if there's `LimitRange` set to namespace.
--- a/controllers/autoscaling.go
+++ b/controllers/autoscaling.go
@ -71,25 +71,68 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestDesiredReplicas(rd v1alpha
 	}

 	metrics := hra.Spec.Metrics
-	if len(metrics) == 0 {
+	numMetrics := len(metrics)
+	if numMetrics == 0 {
 		if len(hra.Spec.ScaleUpTriggers) == 0 {
-			return r.suggestReplicasByQueuedAndInProgressWorkflowRuns(rd, hra)
+			return r.suggestReplicasByQueuedAndInProgressWorkflowRuns(rd, hra, nil)
 		}

 		return nil, nil
-	} else if metrics[0].Type == v1alpha1.AutoscalingMetricTypeTotalNumberOfQueuedAndInProgressWorkflowRuns {
-		return r.suggestReplicasByQueuedAndInProgressWorkflowRuns(rd, hra)
-	} else if metrics[0].Type == v1alpha1.AutoscalingMetricTypePercentageRunnersBusy {
-		return r.suggestReplicasByPercentageRunnersBusy(rd, hra)
-	} else {
-		return nil, fmt.Errorf("validting autoscaling metrics: unsupported metric type %q", metrics[0].Type)
+	} else if numMetrics > 2 {
+		return nil, fmt.Errorf("Too many autoscaling metrics configured: It must be 0 to 2, but got %d", numMetrics)
 	}
+
+	primaryMetric := metrics[0]
+	primaryMetricType := primaryMetric.Type
+
+	var (
+		suggested *int
+		err       error
+	)
+
+	switch primaryMetricType {
+	case v1alpha1.AutoscalingMetricTypeTotalNumberOfQueuedAndInProgressWorkflowRuns:
+		suggested, err = r.suggestReplicasByQueuedAndInProgressWorkflowRuns(rd, hra, &primaryMetric)
+	case v1alpha1.AutoscalingMetricTypePercentageRunnersBusy:
+		suggested, err = r.suggestReplicasByPercentageRunnersBusy(rd, hra, primaryMetric)
+	default:
+		return nil, fmt.Errorf("validting autoscaling metrics: unsupported metric type %q", primaryMetric)
+	}
+
+	if err != nil {
+		return nil, err
+	}
+
+	if suggested != nil && *suggested > 0 {
+		return suggested, nil
+	}
+
+	if len(metrics) == 1 {
+		// This is never supposed to happen but anyway-
+		// Fall-back to `minReplicas + capacityReservedThroughWebhook`.
+		return nil, nil
+	}
+
+	// At this point, we are sure that there are exactly 2 Metrics entries.
+
+	fallbackMetric := metrics[1]
+	fallbackMetricType := fallbackMetric.Type
+
+	if primaryMetricType != v1alpha1.AutoscalingMetricTypePercentageRunnersBusy ||
+		fallbackMetricType != v1alpha1.AutoscalingMetricTypeTotalNumberOfQueuedAndInProgressWorkflowRuns {
+
+		return nil, fmt.Errorf(
+			"invalid HRA Spec: Metrics[0] of %s cannot be combined with Metrics[1] of %s: The only allowed combination is 0=PercentageRunnersBusy and 1=TotalNumberOfQueuedAndInProgressWorkflowRuns",
+			primaryMetricType, fallbackMetricType,
+		)
+	}
+
+	return r.suggestReplicasByQueuedAndInProgressWorkflowRuns(rd, hra, &fallbackMetric)
 }

-func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByQueuedAndInProgressWorkflowRuns(rd v1alpha1.RunnerDeployment, hra v1alpha1.HorizontalRunnerAutoscaler) (*int, error) {
+func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByQueuedAndInProgressWorkflowRuns(rd v1alpha1.RunnerDeployment, hra v1alpha1.HorizontalRunnerAutoscaler, metrics *v1alpha1.MetricSpec) (*int, error) {

 	var repos [][]string
-	metrics := hra.Spec.Metrics
 	repoID := rd.Spec.Template.Spec.Repository
 	if repoID == "" {
 		orgName := rd.Spec.Template.Spec.Organization
@ -100,15 +143,15 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByQueuedAndInProgr
 		// In case it's an organizational runners deployment without any scaling metrics defined,
 		// we assume that the desired replicas should always be `minReplicas + capacityReservedThroughWebhook`.
 		// See https://github.com/summerwind/actions-runner-controller/issues/377#issuecomment-793372693
-		if len(metrics) == 0 {
+		if metrics == nil {
 			return nil, nil
 		}

-		if len(metrics[0].RepositoryNames) == 0 {
+		if len(metrics.RepositoryNames) == 0 {
 			return nil, errors.New("validating autoscaling metrics: spec.autoscaling.metrics[].repositoryNames is required and must have one more more entries for organizational runner deployment")
 		}

-		for _, repoName := range metrics[0].RepositoryNames {
+		for _, repoName := range metrics.RepositoryNames {
 			repos = append(repos, []string{orgName, repoName})
 		}
 	} else {
@ -194,9 +237,8 @@ func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByQueuedAndInProgr
 	return &necessaryReplicas, nil
 }

-func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunnersBusy(rd v1alpha1.RunnerDeployment, hra v1alpha1.HorizontalRunnerAutoscaler) (*int, error) {
+func (r *HorizontalRunnerAutoscalerReconciler) suggestReplicasByPercentageRunnersBusy(rd v1alpha1.RunnerDeployment, hra v1alpha1.HorizontalRunnerAutoscaler, metrics v1alpha1.MetricSpec) (*int, error) {
 	ctx := context.Background()
-	metrics := hra.Spec.Metrics[0]
 	scaleUpThreshold := defaultScaleUpThreshold
 	scaleDownThreshold := defaultScaleDownThreshold
 	scaleUpFactor := defaultScaleUpFactor