diff --git a/acceptance/deploy.sh b/acceptance/deploy.sh index 2922a396..953293e8 100755 --- a/acceptance/deploy.sh +++ b/acceptance/deploy.sh @@ -37,7 +37,10 @@ fi tool=${ACCEPTANCE_TEST_DEPLOYMENT_TOOL} +TEST_ID=${TEST_ID:-default} + if [ "${tool}" == "helm" ]; then + set -v helm upgrade --install actions-runner-controller \ charts/actions-runner-controller \ -n actions-runner-system \ @@ -46,7 +49,10 @@ if [ "${tool}" == "helm" ]; then --set authSecret.create=false \ --set image.repository=${NAME} \ --set image.tag=${VERSION} \ + --set podAnnotations.test-id=${TEST_ID} \ + --set githubWebhookServer.podAnnotations.test-id=${TEST_ID} \ -f ${VALUES_FILE} + set +v # To prevent `CustomResourceDefinition.apiextensions.k8s.io "runners.actions.summerwind.dev" is invalid: metadata.annotations: Too long: must have at most 262144 bytes` # errors kubectl create -f charts/actions-runner-controller/crds || kubectl replace -f charts/actions-runner-controller/crds @@ -77,7 +83,7 @@ else fi if [ -n "${TEST_ORG}" ]; then - cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= NAME=org-runnerdeploy envsubst | kubectl apply -f - + cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerdeploy envsubst | kubectl apply -f - if [ -n "${TEST_ORG_GROUP}" ]; then cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl apply -f - diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml index ad935125..e0862f60 100644 --- a/acceptance/testdata/runnerdeploy.envsubst.yaml +++ b/acceptance/testdata/runnerdeploy.envsubst.yaml @@ -28,6 +28,7 @@ spec: ## Replace `mumoshu/actions-runner-dind:dev` with your dind image #dockerdWithinRunnerContainer: true #image: mumoshu/actions-runner-dind:dev + dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER} # # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu) @@ -56,6 +57,7 @@ spec: scaleUpTriggers: - githubEvent: {} amount: 1 - duration: "1m" - minReplicas: 0 + duration: "10m" + minReplicas: ${RUNNER_MIN_REPLICAS} maxReplicas: 10 + scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT} diff --git a/acceptance/values.yaml b/acceptance/values.yaml index 496589b9..1a86610e 100644 --- a/acceptance/values.yaml +++ b/acceptance/values.yaml @@ -1,7 +1,8 @@ # Set actions-runner-controller settings for testing githubAPICacheDuration: 10s +logLevel: "-3" githubWebhookServer: - logLevel: debug + logLevel: "-3" enabled: true labels: {} replicaCount: 1 diff --git a/api/v1alpha1/horizontalrunnerautoscaler_types.go b/api/v1alpha1/horizontalrunnerautoscaler_types.go index 350583e7..b5da8275 100644 --- a/api/v1alpha1/horizontalrunnerautoscaler_types.go +++ b/api/v1alpha1/horizontalrunnerautoscaler_types.go @@ -107,6 +107,9 @@ type CapacityReservation struct { Name string `json:"name,omitempty"` ExpirationTime metav1.Time `json:"expirationTime,omitempty"` Replicas int `json:"replicas,omitempty"` + + // +optional + EffectiveTime metav1.Time `json:"effectiveTime,omitempty"` } type ScaleTargetRef struct { diff --git a/api/v1alpha1/runnerdeployment_types.go b/api/v1alpha1/runnerdeployment_types.go index 23b0d0d2..588a6855 100644 --- a/api/v1alpha1/runnerdeployment_types.go +++ b/api/v1alpha1/runnerdeployment_types.go @@ -31,6 +31,14 @@ type 
RunnerDeploymentSpec struct {
	// +nullable
	Replicas *int `json:"replicas,omitempty"`
+	// EffectiveTime is the time the upstream controller requested to sync Replicas.
+	// It is usually populated by the webhook-based autoscaler via HRA.
+	// The value is inherited by RunnerReplicaSet(s) and used to prevent ephemeral runners from being unnecessarily recreated.
+	//
+	// +optional
+	// +nullable
+	EffectiveTime *metav1.Time `json:"effectiveTime"`
+
	// +optional
	// +nullable
	Selector *metav1.LabelSelector `json:"selector"`
diff --git a/api/v1alpha1/runnerreplicaset_types.go b/api/v1alpha1/runnerreplicaset_types.go
index 4c698feb..9ecf1349 100644
--- a/api/v1alpha1/runnerreplicaset_types.go
+++ b/api/v1alpha1/runnerreplicaset_types.go
@@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct {
	// +nullable
	Replicas *int `json:"replicas,omitempty"`
+	// EffectiveTime is the time the upstream controller requested to sync Replicas.
+	// It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment.
+	// The value is used to prevent the runnerreplicaset controller from unnecessarily recreating ephemeral runners
+	// based on a potentially outdated Replicas value.
+	//
+	// +optional
+	// +nullable
+	EffectiveTime *metav1.Time `json:"effectiveTime"`
+
	// +optional
	// +nullable
	Selector *metav1.LabelSelector `json:"selector"`
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index d040cc78..d37ce954 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry {
func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
	*out = *in
	in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
+	in.EffectiveTime.DeepCopyInto(&out.EffectiveTime)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
@@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) {
		*out = new(int)
		**out = **in
	}
+	if in.EffectiveTime != nil {
+		in, out := &in.EffectiveTime, &out.EffectiveTime
+		*out = (*in).DeepCopy()
+	}
	if in.Selector != nil {
		in, out := &in.Selector, &out.Selector
		*out = new(metav1.LabelSelector)
@@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) {
		*out = new(int)
		**out = **in
	}
+	if in.EffectiveTime != nil {
+		in, out := &in.EffectiveTime, &out.EffectiveTime
+		*out = (*in).DeepCopy()
+	}
	if in.Selector != nil {
		in, out := &in.Selector, &out.Selector
		*out = new(metav1.LabelSelector)
diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml
index 0a34c042..2f1719fd 100644
--- a/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml
+++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_horizontalrunnerautoscalers.yaml
@@ -49,6 +49,9 @@ spec:
            items:
              description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
              properties:
+               effectiveTime:
+                 format: date-time
+                 type: string
                expirationTime:
                  format: date-time
                  type: string
diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml
index 1883f901..9a693f9b 100644
--- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml
+++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerdeployments.yaml
@@ -48,6 +48,11 @@ spec:
          spec:
            description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
            properties:
+             effectiveTime:
+               description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited by RunnerReplicaSet(s) and used to prevent ephemeral runners from being unnecessarily recreated.
+               format: date-time
+               nullable: true
+               type: string
              replicas:
                nullable: true
                type: integer
diff --git a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml
index 1bd5051d..7527c088 100644
--- a/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml
+++ b/charts/actions-runner-controller/crds/actions.summerwind.dev_runnerreplicasets.yaml
@@ -45,6 +45,11 @@ spec:
          spec:
            description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
            properties:
+             effectiveTime:
+               description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent the runnerreplicaset controller from unnecessarily recreating ephemeral runners based on a potentially outdated Replicas value.
+               format: date-time
+               nullable: true
+               type: string
              replicas:
                nullable: true
                type: integer
diff --git a/charts/actions-runner-controller/templates/deployment.yaml b/charts/actions-runner-controller/templates/deployment.yaml
index 234d36a7..d2f602ca 100644
--- a/charts/actions-runner-controller/templates/deployment.yaml
+++ b/charts/actions-runner-controller/templates/deployment.yaml
@@ -14,6 +14,7 @@ spec:
    metadata:
    {{- with .Values.podAnnotations }}
      annotations:
+       kubectl.kubernetes.io/default-logs-container: "manager"
      {{- toYaml . | nindent 8 }}
    {{- end }}
      labels:
diff --git a/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml b/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml
index 3776fb09..6dbb428c 100644
--- a/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml
+++ b/charts/actions-runner-controller/templates/githubwebhook.deployment.yaml
@@ -15,6 +15,7 @@ spec:
    metadata:
    {{- with .Values.githubWebhookServer.podAnnotations }}
      annotations:
+       kubectl.kubernetes.io/default-logs-container: "github-webhook-server"
      {{- toYaml .
| nindent 8 }} {{- end }} labels: diff --git a/cmd/githubwebhookserver/main.go b/cmd/githubwebhookserver/main.go index 436e8b4a..4e3e604b 100644 --- a/cmd/githubwebhookserver/main.go +++ b/cmd/githubwebhookserver/main.go @@ -29,15 +29,14 @@ import ( actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1" "github.com/actions-runner-controller/actions-runner-controller/controllers" "github.com/actions-runner-controller/actions-runner-controller/github" + "github.com/actions-runner-controller/actions-runner-controller/logging" "github.com/kelseyhightower/envconfig" - zaplib "go.uber.org/zap" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/exec" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" _ "k8s.io/client-go/plugin/pkg/client/auth/oidc" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/log/zap" // +kubebuilder:scaffold:imports ) @@ -47,11 +46,6 @@ var ( ) const ( - logLevelDebug = "debug" - logLevelInfo = "info" - logLevelWarn = "warn" - logLevelError = "error" - webhookSecretTokenEnvName = "GITHUB_WEBHOOK_SECRET_TOKEN" ) @@ -97,7 +91,7 @@ func main() { flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change") - flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) + flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) flag.StringVar(&webhookSecretToken, "github-webhook-secret-token", "", "The personal access token of GitHub.") flag.StringVar(&c.Token, "github-token", c.Token, "The personal access token of GitHub.") flag.Int64Var(&c.AppID, "github-app-id", c.AppID, "The application ID of GitHub App.") @@ -126,23 +120,7 @@ func main() { setupLog.Info("-watch-namespace is %q. Only HorizontalRunnerAutoscalers in %q are watched, cached, and considered as scale targets.") } - logger := zap.New(func(o *zap.Options) { - switch logLevel { - case logLevelDebug: - o.Development = true - lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2) - o.Level = &lvl - case logLevelInfo: - lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel) - o.Level = &lvl - case logLevelWarn: - lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel) - o.Level = &lvl - case logLevelError: - lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel) - o.Level = &lvl - } - }) + logger := logging.NewLogger(logLevel) ctrl.SetLogger(logger) @@ -152,6 +130,8 @@ func main() { // That is, all runner groups managed by ARC are assumed to be visible to any repositories, // which is wrong when you have one or more non-default runner groups in your organization or enterprise. 
if len(c.Token) > 0 || (c.AppID > 0 && c.AppInstallationID > 0 && c.AppPrivateKey != "") || (len(c.BasicauthUsername) > 0 && len(c.BasicauthPassword) > 0) {
+		c.Log = &logger
+
		ghClient, err = c.NewClient()
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error: Client creation failed.", err)
diff --git a/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml b/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml
index 0a34c042..2f1719fd 100644
--- a/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml
+++ b/config/crd/bases/actions.summerwind.dev_horizontalrunnerautoscalers.yaml
@@ -49,6 +49,9 @@ spec:
            items:
              description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
              properties:
+               effectiveTime:
+                 format: date-time
+                 type: string
                expirationTime:
                  format: date-time
                  type: string
diff --git a/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml b/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml
index 1883f901..9a693f9b 100644
--- a/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml
+++ b/config/crd/bases/actions.summerwind.dev_runnerdeployments.yaml
@@ -48,6 +48,11 @@ spec:
          spec:
            description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
            properties:
+             effectiveTime:
+               description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited by RunnerReplicaSet(s) and used to prevent ephemeral runners from being unnecessarily recreated.
+               format: date-time
+               nullable: true
+               type: string
              replicas:
                nullable: true
                type: integer
diff --git a/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml b/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml
index 1bd5051d..7527c088 100644
--- a/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml
+++ b/config/crd/bases/actions.summerwind.dev_runnerreplicasets.yaml
@@ -45,6 +45,11 @@ spec:
          spec:
            description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
            properties:
+             effectiveTime:
+               description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent the runnerreplicaset controller from unnecessarily recreating ephemeral runners based on a potentially outdated Replicas value.
+               format: date-time
+               nullable: true
+               type: string
              replicas:
                nullable: true
                type: integer
diff --git a/controllers/horizontal_runner_autoscaler_webhook.go b/controllers/horizontal_runner_autoscaler_webhook.go
index ea407fdb..9553c33c 100644
--- a/controllers/horizontal_runner_autoscaler_webhook.go
+++ b/controllers/horizontal_runner_autoscaler_webhook.go
@@ -770,8 +770,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
	capacityReservations := getValidCapacityReservations(copy)
	if amount > 0 {
+		now := time.Now()
		copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
-			ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
+			EffectiveTime: metav1.Time{Time: now},
+			ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
			Replicas: amount,
		})
	} else if amount < 0 {
@@ -790,10 +792,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
copy.Spec.CapacityReservations = reservations } - autoscaler.Log.Info( + before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations) + expired := before - len(capacityReservations) + after := len(copy.Spec.CapacityReservations) + + autoscaler.Log.V(1).Info( "Patching hra for capacityReservations update", - "before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations, - "after", copy.Spec.CapacityReservations, + "before", before, + "expired", expired, + "amount", amount, + "after", after, ) if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil { diff --git a/controllers/horizontalrunnerautoscaler_controller.go b/controllers/horizontalrunnerautoscaler_controller.go index 2fd949bb..140a8f51 100644 --- a/controllers/horizontalrunnerautoscaler_controller.go +++ b/controllers/horizontalrunnerautoscaler_controller.go @@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error { currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas) + ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral + + var effectiveTime *time.Time + + for _, r := range hra.Spec.CapacityReservations { + t := r.EffectiveTime + if effectiveTime == nil || effectiveTime.Before(t.Time) { + effectiveTime = &t.Time + } + } + // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption if currentDesiredReplicas != newDesiredReplicas { copy := rd.DeepCopy() copy.Spec.Replicas = &newDesiredReplicas + if ephemeral && effectiveTime != nil { + copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} + } + + if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { + return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) + } + } else if ephemeral && effectiveTime != nil { + copy := rd.DeepCopy() + copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime} + if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil { return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err) } diff --git a/controllers/integration_test.go b/controllers/integration_test.go index d87bdb18..2e038e4d 100644 --- a/controllers/integration_test.go +++ b/controllers/integration_test.go @@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment { Name: controllerName("runner"), RegistrationRecheckInterval: time.Millisecond, RegistrationRecheckJitter: time.Millisecond, + UnregistrationTimeout: 1 * time.Second, + UnregistrationRetryDelay: 1 * time.Second, } err = runnerController.SetupWithManager(mgr) Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller") diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 1bfa8bcc..03c22b27 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -72,6 +72,9 @@ type RunnerReconciler struct { Name string RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration + + UnregistrationTimeout time.Duration + UnregistrationRetryDelay time.Duration } // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete @@ -110,8 +113,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } } else { + var p 
*corev1.Pod
+
+		{
+			var pod corev1.Pod
+			if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+				if !kerrors.IsNotFound(err) {
+					log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
+
+					return ctrl.Result{Requeue: true}, nil
+				}
+			} else {
+				p = &pod
+			}
+		}
+
		// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
-		return r.processRunnerDeletion(runner, ctx, log)
+		return r.processRunnerDeletion(runner, ctx, log, p)
	}
	registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
@@ -159,20 +177,27 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
		// If pod has ended up succeeded we need to restart it
		// Happens e.g. when dind is in runner and run completes
-		stopped := pod.Status.Phase == corev1.PodSucceeded
+		stopped := runnerPodOrContainerIsStopped(&pod)
-		if !stopped {
-			if pod.Status.Phase == corev1.PodRunning {
-				for _, status := range pod.Status.ContainerStatuses {
-					if status.Name != containerName {
-						continue
-					}
+		ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
-					if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
-						stopped = true
-					}
-				}
+		if stopped && ephemeral {
+			log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
+
+			// This is the key to making ephemeral runners work reliably with webhook-based autoscaling.
+			// See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
+			//
+			// In the next reconciliation loop, this triggers a runner unregistration.
+			// (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation)
+			//
+			// See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
+			// which handles the unregistration, the removal of the completed pod, and so on.
+			if err := r.Delete(ctx, &runner); err != nil {
+				log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
+				return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
			}
+
+			return ctrl.Result{Requeue: true}, nil
		}
		restart := stopped
@@ -404,64 +429,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
	return ctrl.Result{}, nil
	}
-	// Try to delete current pod if recreation is needed
-	safeToDeletePod := false
-	ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-	if err != nil {
-		log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
-	} else {
-		// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
-		if !ok {
-			log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
-		}
-
-		safeToDeletePod = true
+	updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
+	if res != nil {
+		return *res, err
	}
-	if safeToDeletePod {
-		// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
-		// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
- if err := r.Delete(ctx, &pod); err != nil { - log.Error(err, "Failed to delete pod resource") - return ctrl.Result{}, err - } - - r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) - log.Info("Deleted runner pod", "repository", runner.Spec.Repository) + // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service. + // This should help us avoid race condition between runner pickup job after we think the runner is not busy. + if err := r.Delete(ctx, updatedPod); err != nil { + log.Error(err, "Failed to delete pod resource") + return ctrl.Result{}, err } + r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) + log.Info("Deleted runner pod", "repository", runner.Spec.Repository) + return ctrl.Result{}, nil } -func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) { +func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { + // If pod has ended up succeeded we need to restart it + // Happens e.g. when dind is in runner and run completes + stopped := pod.Status.Phase == corev1.PodSucceeded + + if !stopped { + if pod.Status.Phase == corev1.PodRunning { + for _, status := range pod.Status.ContainerStatuses { + if status.Name != containerName { + continue + } + + if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 { + stopped = true + } + } + } + } + + return stopped +} + +func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) { finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if removed { - if len(runner.Status.Registration.Token) > 0 { - ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) - if err != nil { - if errors.Is(err, &gogithub.RateLimitError{}) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to unregister runner due to GitHub API rate limits. 
Delaying retry for %s to avoid excessive GitHub API calls",
-						retryDelayOnGitHubAPIRateLimitError,
-					),
-				)
-
-				return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-			}
-
-			return ctrl.Result{}, err
-		}
-
-		if !ok {
-			log.V(1).Info("Runner no longer exists on GitHub")
-		}
-	} else {
-		log.V(1).Info("Runner was never registered on GitHub")
+		_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
+		if res != nil {
+			return *res, err
		}
		newRunner := runner.DeepCopy()
@@ -478,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
	return ctrl.Result{}, nil
}
+func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
+	unregistrationTimeout := DefaultUnregistrationTimeout
+
+	if r.UnregistrationTimeout > 0 {
+		unregistrationTimeout = r.UnregistrationTimeout
+	}
+	return unregistrationTimeout
+}
+
+func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
+	retryDelay := DefaultUnregistrationRetryDelay
+
+	if r.UnregistrationRetryDelay > 0 {
+		retryDelay = r.UnregistrationRetryDelay
+	}
+	return retryDelay
+}
+
func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
	deletionTimeout := 1 * time.Minute
	currentTime := time.Now()
diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go
new file mode 100644
index 00000000..83e4b41e
--- /dev/null
+++ b/controllers/runner_graceful_stop.go
@@ -0,0 +1,249 @@
+package controllers
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/actions-runner-controller/actions-runner-controller/github"
+	"github.com/go-logr/logr"
+	gogithub "github.com/google/go-github/v39/github"
+	corev1 "k8s.io/api/core/v1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
+	unregistrationStartTimestamp = "unregistration-start-timestamp"
+
+	// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
+	// and RemoveRunner API (to actually unregister the runner) calls.
+	// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
+	// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC being unable to see the runner in the ListRunners response
+	// for up to 60 seconds (or even more depending on the situation).
+	DefaultUnregistrationTimeout = 60 * time.Second
+
+	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
+	DefaultUnregistrationRetryDelay = 30 * time.Second
+)
+
+// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
+// we can delete the runner pod without disrupting a workflow job.
+//
+// This function returns a non-nil pointer to corev1.Pod as the first return value
+// if the runner is considered to have gracefully stopped, hence its pod is safe for deletion.
+//
+// It's a "tick" operation so a graceful stop can take multiple calls to complete.
+// This function is designed to complete a lengthy graceful stop process in a non-blocking way.
+// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, and may or may not populate the error in the third return value.
+// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
+func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
+	if pod != nil {
+		if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
+			updated := pod.DeepCopy()
+			setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
+			if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
+				log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
+				return nil, &ctrl.Result{}, err
+			}
+			pod = updated
+
+			log.Info("Runner has started unregistration")
+		} else {
+			log.Info("Runner has already started unregistration")
+		}
+	}
+
+	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
+		return nil, res, err
+	}
+
+	if pod != nil {
+		if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
+			updated := pod.DeepCopy()
+			setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
+			if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
+				log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
+				return nil, &ctrl.Result{}, err
+			}
+			pod = updated
+
+			log.Info("Runner has completed unregistration")
+		} else {
+			log.Info("Runner has already completed unregistration")
+		}
+	}
+
+	return pod, nil, nil
+}
+
+// If the first return value is nil, it's safe to delete the runner pod.
+func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
+	ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
+	if err != nil {
+		if errors.Is(err, &gogithub.RateLimitError{}) {
+			// We log the underlying error when we failed calling the GitHub API to list or unregister runners,
+			// or when the runner is still busy.
+			log.Error(
+				err,
+				fmt.Sprintf(
+					"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
+					retryDelayOnGitHubAPIRateLimitError,
+				),
+			)
+
+			return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
+		}
+
+		log.Error(err, "Failed to unregister runner before deleting the pod.")
+
+		return &ctrl.Result{}, err
+	} else if ok {
+		log.Info("Runner has just been unregistered. Removing the runner pod.")
+	} else if pod == nil {
+		// `unregisterRunner()` will return `false, nil` if the runner is not found on GitHub.
+		// However, that doesn't always mean the pod can be safely removed.
+		//
+		// If the pod does not exist for the runner,
+		// it may be because the runner pod has never been created.
+		// In that case we can safely assume that the runner will never be registered.
+
+		log.Info("Runner was not found on GitHub and the runner pod was not found on Kubernetes.")
+	} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
+		// If it's already unregistered in the previous reconciliation loop,
+		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
+		log.Info("Runner pod is marked as already unregistered.")
+	} else if runnerPodOrContainerIsStopped(pod) {
+		// If it's an ephemeral runner with the actions/runner container exited with 0,
+		// we can safely assume that it has unregistered itself from GitHub Actions
+		// so it's natural that RemoveRunner fails due to 404.
+
+		// If pod has ended up succeeded we need to restart it
+		// Happens e.g. when dind is in runner and run completes
+		log.Info("Runner pod has been stopped with a successful status.")
+	} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
+		t, err := time.Parse(time.RFC3339, ts)
+		if err != nil {
+			return &ctrl.Result{RequeueAfter: retryDelay}, err
+		}
+
+		if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
+			log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
+			return &ctrl.Result{RequeueAfter: retryDelay}, err
+		}
+
+		log.Info("Runner unregistration has timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
+	} else {
+		// A runner and a runner pod that is created by this version of ARC should match
+		// any of the above branches.
+		//
+		// But we leave this match-all branch for potential backward-compatibility.
+		// The caller is expected to take appropriate actions, like annotating the pod as having started the unregistration process,
+		// and retry later.
+		log.V(1).Info("Runner unregistration is being retried later.")
+
+		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	}
+
+	return nil, nil
+}
+
+func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
+	if pod.Annotations == nil {
+		return "", false
+	}
+
+	v, ok := pod.Annotations[key]
+
+	return v, ok
+}
+
+func setAnnotation(pod *corev1.Pod, key, value string) {
+	if pod.Annotations == nil {
+		pod.Annotations = map[string]string{}
+	}
+
+	pod.Annotations[key] = value
+}
+
+// unregisterRunner unregisters the runner from GitHub Actions by name.
+//
+// This function returns:
+//
+// Case 1. (true, nil) when it has successfully unregistered the runner.
+// Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered (hence we couldn't see its existence from the GitHub Actions API yet)
+// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
+// an error returned by the GitHub API.
+//
+// When the returned value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
+// In other words, all those three sub-cases cannot be distinguished by this function alone.
+//
+// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconciliation loop or it was an ephemeral runner that finished its job run (an ephemeral runner is designed to stop after a job run).
+// You'd need to maintain the runner state (i.e. whether it's already unregistered or not) somewhere,
+// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1."
when you got (false, nil).
+//
+// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
+// Waiting and retrying forever in this case is not a solution, because `config.sh` won't succeed with a wrong token, hence the runner gets stuck in this state forever.
+// There isn't a perfect solution to this, but a practical workaround would be to implement a "grace period" on the caller side.
+//
+// - Case "2-3." can happen when e.g. ARC recreated an ephemeral runner pod in a previous reconciliation loop and then it was requested to delete the runner before the runner comes up.
+// If handled inappropriately, this can cause a race condition between a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
+//
+// Once you have successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
+// unless you recreate the runner pod.
+//
+// If it was "2-3.", you need a workaround to avoid the race condition.
+//
+// You shall introduce a "grace period" mechanism, similar or equal to the one required for "Case 2-2.", so that you only
+// start the runner pod deletion after it's more and more likely that the runner pod is not coming up.
+//
+// Beware though, you need extra care to set an appropriate grace period depending on your environment.
+// There isn't a single right grace period that works for everyone.
+// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
+// while the shorter the grace period is, the more likely you may encounter the race issue.
+func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
+	runners, err := client.ListRunners(ctx, enterprise, org, repo)
+	if err != nil {
+		return false, err
+	}
+
+	id := int64(0)
+	for _, runner := range runners {
+		if runner.GetName() == name {
+			id = runner.GetID()
+			break
+		}
+	}
+
+	if id == int64(0) {
+		return false, nil
+	}
+
+	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but that's no longer true.
+	// The reason ARC did so was to prevent a runner running a job from stopping prematurely.
+	//
+	// However, we learned that RemoveRunner already has the ability to prevent stopping a busy runner,
+	// so ARC doesn't need to do anything special for a graceful runner stop.
+	// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
+	// the runner pod is safe for deletion.
+	//
+	// Trying to remove a busy runner can result in errors like the following:
+	//   failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
+	//
+	// # NOTES
+	//
+	// - It can be "status=offline" at the same time but that's another story.
+	// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
+	//   determine if the runner is busy can be more outdated than before, as those responses are now cached for 60 seconds.
+	// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have strict control over it but we assume it won't
+	//   change from 60 seconds.
+	//
+	// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
+	if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
+		return false, err
+	}
+
+	return true, nil
+}
diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go
index d43eea37..e93f53c8 100644
--- a/controllers/runner_pod_controller.go
+++ b/controllers/runner_pod_controller.go
@@ -47,6 +47,9 @@ type RunnerPodReconciler struct {
	Name string
	RegistrationRecheckInterval time.Duration
	RegistrationRecheckJitter time.Duration
+
+	UnregistrationTimeout time.Duration
+	UnregistrationRetryDelay time.Duration
	}
const (
@@ -105,33 +108,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
	finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
	if removed {
-		ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
-		if err != nil {
-			if errors.Is(err, &gogithub.RateLimitError{}) {
-				// We log the underlying error when we failed calling GitHub API to list or unregisters,
-				// or the runner is still busy.
-				log.Error(
-					err,
-					fmt.Sprintf(
-						"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
-						retryDelayOnGitHubAPIRateLimitError,
-					),
-				)
-
-				return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-			}
-
-			return ctrl.Result{}, err
+		updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+		if res != nil {
+			return *res, err
		}
-		if !ok {
-			log.V(1).Info("Runner no longer exists on GitHub")
-		}
+		patchedPod := updatedPod.DeepCopy()
+		patchedPod.ObjectMeta.Finalizers = finalizers
-		newRunner := runnerPod.DeepCopy()
-		newRunner.ObjectMeta.Finalizers = finalizers
-
-		if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
+		// We commit the removal of the finalizer so that Kubernetes notices it and deletes the pod resource from the cluster.
+ if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil { log.Error(err, "Failed to update runner for finalizer removal") return ctrl.Result{}, err } @@ -365,8 +351,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } + updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + if res != nil { + return *res, err + } + // Delete current pod if recreation is needed - if err := r.Delete(ctx, &runnerPod); err != nil { + if err := r.Delete(ctx, updated); err != nil { log.Error(err, "Failed to delete pod resource") return ctrl.Result{}, err } @@ -377,8 +368,22 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } -func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) { - return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name) +func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration { + unregistrationTimeout := DefaultUnregistrationTimeout + + if r.UnregistrationTimeout > 0 { + unregistrationTimeout = r.UnregistrationTimeout + } + return unregistrationTimeout +} + +func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration { + retryDelay := DefaultUnregistrationRetryDelay + + if r.UnregistrationRetryDelay > 0 { + retryDelay = r.UnregistrationRetryDelay + } + return retryDelay } func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error { diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go index 6725bc64..9a55ae3d 100644 --- a/controllers/runnerdeployment_controller.go +++ b/controllers/runnerdeployment_controller.go @@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req // Please add more conditions that we can in-place update the newest runnerreplicaset without disruption if currentDesiredReplicas != newDesiredReplicas { newestSet.Spec.Replicas = &newDesiredReplicas + newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime if err := r.Client.Update(ctx, newestSet); err != nil { log.Error(err, "Failed to update runnerreplicaset resource") @@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str Labels: newRSTemplate.ObjectMeta.Labels, }, Spec: v1alpha1.RunnerReplicaSetSpec{ - Replicas: rd.Spec.Replicas, - Selector: newRSSelector, - Template: newRSTemplate, + Replicas: rd.Spec.Replicas, + Selector: newRSSelector, + Template: newRSTemplate, + EffectiveTime: rd.Spec.EffectiveTime, }, } diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go index 2acc93aa..e4ac040f 100644 --- a/controllers/runnerreplicaset_controller.go +++ b/controllers/runnerreplicaset_controller.go @@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct { Name string } +const ( + SyncTimeAnnotationKey = "sync-time" +) + // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch @@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx 
context.Context, req ctrl.Req
		}
	}
-	var myRunners []v1alpha1.Runner
-
	var (
		current int
		ready int
		available int
+
+		lastSyncTime *time.Time
	)
	for _, r := range allRunners.Items {
		// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
		// to not treat all the runners in the namespace as its children.
		if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
-			myRunners = append(myRunners, r)
+			// If the runner is already marked for deletion (=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
+			// or by the runnerreplicaset controller (in case it was deleted in the previous reconciliation loop),
+			// we don't need to bother calling the GitHub API to re-mark the runner for deletion.
+			// Just hold on, and runners will disappear as long as the runner controller is up and running.
+			if !r.DeletionTimestamp.IsZero() {
+				continue
+			}
+
+			if r.Annotations != nil {
+				if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
+					t, err := time.Parse(time.RFC3339, a)
+					if err == nil {
+						if lastSyncTime == nil || lastSyncTime.Before(t) {
+							lastSyncTime = &t
+						}
+					}
+				}
+			}
			current += 1
@@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
		}
	}
-	if current > desired {
+	effectiveTime := rs.Spec.EffectiveTime
+	ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
+
+	if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
+		log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to ephemeral runner completions, so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
+	} else if current > desired {
+		// If you use ephemeral runners with the webhook-based autoscaler and the runner controller is working normally,
+		// you're unlikely to fall into this branch.
+		//
+		// That's because all the stakeholders work like this:
+		//
+		// 1. A runner pod completes with the runner container exiting with code 0
+		// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
+		// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
+		// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event and updates the HRA, removing the oldest capacity reservation
+		// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
+		// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
+		// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
+		//
+		// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
+		// marked for deletion by the runner controller.
+		// This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to
+		// fall into this branch when you're using ephemeral runners with the webhook-based autoscaler.
+ n := current - desired log.V(0).Info(fmt.Sprintf("Deleting %d runners", n), "desired", desired, "current", current, "ready", ready) @@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1 objectMeta.GenerateName = rs.ObjectMeta.Name + "-" objectMeta.Namespace = rs.ObjectMeta.Namespace + if objectMeta.Annotations == nil { + objectMeta.Annotations = map[string]string{} + } + objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339) runner := v1alpha1.Runner{ TypeMeta: metav1.TypeMeta{}, diff --git a/controllers/unregister.go b/controllers/unregister.go deleted file mode 100644 index 9c139be0..00000000 --- a/controllers/unregister.go +++ /dev/null @@ -1,49 +0,0 @@ -package controllers - -import ( - "context" - "fmt" - - "github.com/actions-runner-controller/actions-runner-controller/github" -) - -// unregisterRunner unregisters the runner from GitHub Actions by name. -// -// This function returns: -// - (true, nil) when it has successfully unregistered the runner. -// - (false, nil) when the runner has been already unregistered. -// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to -// an error returned by GitHub API. -func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) { - runners, err := client.ListRunners(ctx, enterprise, org, repo) - if err != nil { - return false, err - } - - id := int64(0) - for _, runner := range runners { - if runner.GetName() == name { - // Note that sometimes a runner can stuck "busy" even though it is already "offline". - // But we assume that it's not actually offline and still running a job. - if runner.GetBusy() { - return false, fmt.Errorf("runner is busy") - } - id = runner.GetID() - break - } - } - - if id == int64(0) { - return false, nil - } - - // Trying to remove a busy runner can result in errors like the following: - // failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" [] - // - // TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422? 
- if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil { - return false, err - } - - return true, nil -} diff --git a/github/github.go b/github/github.go index 89a49913..d8d0acf6 100644 --- a/github/github.go +++ b/github/github.go @@ -11,8 +11,11 @@ import ( "time" "github.com/actions-runner-controller/actions-runner-controller/github/metrics" + "github.com/actions-runner-controller/actions-runner-controller/logging" "github.com/bradleyfalzon/ghinstallation" + "github.com/go-logr/logr" "github.com/google/go-github/v39/github" + "github.com/gregjones/httpcache" "golang.org/x/oauth2" ) @@ -28,6 +31,8 @@ type Config struct { BasicauthUsername string `split_words:"true"` BasicauthPassword string `split_words:"true"` RunnerGitHubURL string `split_words:"true"` + + Log *logr.Logger } // Client wraps GitHub client with some additional @@ -82,8 +87,11 @@ func (c *Config) NewClient() (*Client, error) { transport = tr } - transport = metrics.Transport{Transport: transport} - httpClient := &http.Client{Transport: transport} + cached := httpcache.NewTransport(httpcache.NewMemoryCache()) + cached.Transport = transport + loggingTransport := logging.Transport{Transport: cached, Log: c.Log} + metricsTransport := metrics.Transport{Transport: loggingTransport} + httpClient := &http.Client{Transport: metricsTransport} var client *github.Client var githubBaseURL string diff --git a/go.mod b/go.mod index f9627220..25380078 100644 --- a/go.mod +++ b/go.mod @@ -41,6 +41,7 @@ require ( github.com/google/gofuzz v1.1.0 // indirect github.com/google/uuid v1.1.2 // indirect github.com/googleapis/gnostic v0.5.5 // indirect + github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect github.com/imdario/mergo v0.3.12 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect diff --git a/go.sum b/go.sum index d9d4388c..65d55022 100644 --- a/go.sum +++ b/go.sum @@ -259,6 +259,8 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI= github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= +github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= diff --git a/logging/logger.go b/logging/logger.go new file mode 100644 index 00000000..463726b9 --- /dev/null +++ b/logging/logger.go @@ -0,0 +1,54 @@ +package logging + +import ( + "fmt" + "os" + "strconv" + + "github.com/go-logr/logr" + zaplib "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +const ( + LogLevelDebug = "debug" + LogLevelInfo = "info" + LogLevelWarn = "warn" + LogLevelError = "error" +) + +func NewLogger(logLevel string) logr.Logger { + log := zap.New(func(o *zap.Options) { + switch logLevel { + case LogLevelDebug: + 
o.Development = true
+			lvl := zaplib.NewAtomicLevelAt(zaplib.DebugLevel) // maps to logr's V(1)
+			o.Level = &lvl
+		case LogLevelInfo:
+			lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
+			o.Level = &lvl
+		case LogLevelWarn:
+			lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
+			o.Level = &lvl
+		case LogLevelError:
+			lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
+			o.Level = &lvl
+		default:
+			// We use a bit size of 8 as zapcore.Level is a type alias for int8
+			levelInt, err := strconv.ParseInt(logLevel, 10, 8)
+			if err != nil {
+				fmt.Fprintf(os.Stderr, "Failed to parse --log-level=%s: %v", logLevel, err)
+				os.Exit(1)
+			}
+
+			// For example, --log-level=debug a.k.a. --log-level=-1 maps to zaplib.DebugLevel, which is associated with logr's V(1),
+			// and --log-level=-2 maps to the custom log level that is associated with logr's V(2).
+			level := zapcore.Level(levelInt)
+			atomicLevel := zaplib.NewAtomicLevelAt(level)
+			o.Level = &atomicLevel
+		}
+	})
+
+	return log
+}
diff --git a/logging/transport.go b/logging/transport.go
new file mode 100644
index 00000000..9d61a318
--- /dev/null
+++ b/logging/transport.go
@@ -0,0 +1,51 @@
+// Package logging provides various logging helpers for ARC
+package logging
+
+import (
+	"net/http"
+
+	"github.com/go-logr/logr"
+	"github.com/gregjones/httpcache"
+)
+
+const (
+	// https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting
+	headerRateLimitRemaining = "X-RateLimit-Remaining"
+)
+
+// Transport wraps a transport with logging of requests and responses
+type Transport struct {
+	Transport http.RoundTripper
+
+	Log *logr.Logger
+}
+
+func (t Transport) RoundTrip(req *http.Request) (*http.Response, error) {
+	resp, err := t.Transport.RoundTrip(req)
+	if resp != nil {
+		t.log(req, resp)
+	}
+	return resp, err
+}
+
+func (t Transport) log(req *http.Request, resp *http.Response) {
+	if t.Log == nil {
+		return
+	}
+
+	var args []interface{}
+
+	marked := resp.Header.Get(httpcache.XFromCache) == "1"
+
+	args = append(args, "from_cache", marked, "method", req.Method, "url", req.URL.String())
+
+	if !marked {
+		// Do not log an outdated rate limit remaining value
+
+		remaining := resp.Header.Get(headerRateLimitRemaining)
+
+		args = append(args, "ratelimit_remaining", remaining)
+	}
+
+	t.Log.V(3).Info("Seen HTTP response", args...)
+}
diff --git a/main.go b/main.go
index 43c55961..7383fc5f 100644
--- a/main.go
+++ b/main.go
@@ -26,24 +26,18 @@ import (
	actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
	"github.com/actions-runner-controller/actions-runner-controller/controllers"
	"github.com/actions-runner-controller/actions-runner-controller/github"
+	"github.com/actions-runner-controller/actions-runner-controller/logging"
	"github.com/kelseyhightower/envconfig"
-	zaplib "go.uber.org/zap"
	"k8s.io/apimachinery/pkg/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
	ctrl "sigs.k8s.io/controller-runtime"
-	"sigs.k8s.io/controller-runtime/pkg/log/zap"
	// +kubebuilder:scaffold:imports
)
const (
	defaultRunnerImage = "summerwind/actions-runner:latest"
	defaultDockerImage = "docker:dind"
-
-	logLevelDebug = "debug"
-	logLevelInfo = "info"
-	logLevelWarn = "warn"
-	logLevelError = "error"
)
var (
@@ -120,24 +114,12 @@ func main() {
	flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled.
diff --git a/main.go b/main.go index 43c55961..7383fc5f 100644 --- a/main.go +++ b/main.go @@ -26,24 +26,18 @@ import ( actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1" "github.com/actions-runner-controller/actions-runner-controller/controllers" "github.com/actions-runner-controller/actions-runner-controller/github" + "github.com/actions-runner-controller/actions-runner-controller/logging" "github.com/kelseyhightower/envconfig" - zaplib "go.uber.org/zap" "k8s.io/apimachinery/pkg/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/log/zap" // +kubebuilder:scaffold:imports ) const ( defaultRunnerImage = "summerwind/actions-runner:latest" defaultDockerImage = "docker:dind" - - logLevelDebug = "debug" - logLevelInfo = "info" - logLevelWarn = "warn" - logLevelError = "error" ) var ( @@ -120,24 +114,12 @@ func main() { flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set this to a lower value, like 10 minutes, because it corresponds to the minimum time needed to react to a demand change. If you're tweaking this in order to make autoscaling more responsive, you'll probably want to tweak github-api-cache-duration, too") flag.Var(&commonRunnerLabels, "common-runner-labels", "Runner labels in the K1=V1,K2=V2,... format that are inherited by all the runners created by the controller. See https://github.com/actions-runner-controller/actions-runner-controller/issues/321 for more information") flag.StringVar(&namespace, "watch-namespace", "", "The namespace to watch for custom resources. Set to empty for letting it watch for all namespaces.") - flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`) + flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error", or an integer zap level such as "-2" for even more verbose output. Defaults to "debug".`) flag.Parse() - logger := zap.New(func(o *zap.Options) { - switch logLevel { - case logLevelDebug: - o.Development = true - case logLevelInfo: - lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel) - o.Level = &lvl - case logLevelWarn: - lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel) - o.Level = &lvl - case logLevelError: - lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel) - o.Level = &lvl - } - }) + logger := logging.NewLogger(logLevel) + + c.Log = &logger ghClient, err = c.NewClient() if err != nil { diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 0d93821a..728ba4ff 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -17,12 +17,15 @@ var ( controllerImageTag = "e2e" controllerImage = testing.Img(controllerImageRepo, controllerImageTag) runnerImageRepo = "actionsrunnercontrollere2e/actions-runner" + runnerDindImageRepo = "actionsrunnercontrollere2e/actions-runner-dind" runnerImageTag = "e2e" runnerImage = testing.Img(runnerImageRepo, runnerImageTag) + runnerDindImage = testing.Img(runnerDindImageRepo, runnerImageTag) prebuildImages = []testing.ContainerImage{ controllerImage, runnerImage, + runnerDindImage, } builds = []testing.DockerBuild{ @@ -36,6 +39,11 @@ var ( Args: []testing.BuildArg{}, Image: runnerImage, }, + { + Dockerfile: "../../runner/Dockerfile.dindrunner", + Args: []testing.BuildArg{}, + Image: runnerDindImage, + }, } certManagerVersion = "v1.1.1" @@ -52,7 +60,6 @@ var ( "SYNC_PERIOD=" + "10s", "NAME=" + controllerImageRepo, "VERSION=" + controllerImageTag, - "RUNNER_NAME=" + runnerImageRepo, "RUNNER_TAG=" + runnerImageTag, } @@ -167,11 +174,15 @@ type env struct { useRunnerSet bool testID string + testName string repoToCommit string runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string githubTokenWebhook string testEnterprise string featureFlagEphemeral bool + scaleDownDelaySecondsAfterScaleOut int64 + minReplicas int64 + dockerdWithinRunnerContainer bool testJobs []job } @@ -184,11 +195,12 @@ func initTestEnv(t *testing.T) *env { id := e.ID() - testID := t.Name() + " " + id + testName := t.Name() + " " + id - t.Logf("Using test id %s", testID) + t.Logf("Initializing test with name %s", testName) - e.testID = testID + e.testID = id + e.testName = testName e.runnerLabel = "test-" + id e.githubToken = testing.Getenv(t, "GITHUB_TOKEN") e.githubTokenWebhook = testing.Getenv(t, "WEBHOOK_GITHUB_TOKEN") @@ -197,9 +209,17 @@ func initTestEnv(t *testing.T) *env { e.testOrg = testing.Getenv(t, "TEST_ORG", "") e.testOrgRepo = testing.Getenv(t,
"TEST_ORG_REPO", "") e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE") - e.testJobs = createTestJobs(id, testResultCMNamePrefix, 10) + e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100) ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL")) e.featureFlagEphemeral = ephemeral + e.scaleDownDelaySecondsAfterScaleOut, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT", "10"), 10, 32) + e.minReplicas, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_MIN_REPLICAS", "1"), 10, 32) + + var err error + e.dockerdWithinRunnerContainer, err = strconv.ParseBool(testing.Getenv(t, "TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER", "false")) + if err != nil { + panic(fmt.Sprintf("unable to parse bool from TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER: %v", err)) + } return e } @@ -254,7 +274,22 @@ func (e *env) installActionsRunnerController(t *testing.T) { "GITHUB_TOKEN=" + e.githubToken, "WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook, "RUNNER_LABEL=" + e.runnerLabel, + "TEST_ID=" + e.testID, fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral), + fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut), + fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas), + } + + if e.dockerdWithinRunnerContainer { + varEnv = append(varEnv, + "RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=true", + "RUNNER_NAME="+runnerDindImageRepo, + ) + } else { + varEnv = append(varEnv, + "RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=false", + "RUNNER_NAME="+runnerImageRepo, + ) } scriptEnv = append(scriptEnv, varEnv...) @@ -273,7 +308,7 @@ func (e *env) createControllerNamespaceAndServiceAccount(t *testing.T) { func (e *env) installActionsWorkflow(t *testing.T) { t.Helper() - installActionsWorkflow(t, e.testID, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs) + installActionsWorkflow(t, e.testName, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs) } func (e *env) verifyActionsWorkflowRun(t *testing.T) { @@ -302,13 +337,13 @@ func createTestJobs(id, testResultCMNamePrefix string, numJobs int) []job { const Branch = "main" -func installActionsWorkflow(t *testing.T, testID, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) { +func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) { t.Helper() ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - wfName := "E2E " + testID + wfName := "E2E " + testName wf := testing.Workflow{ Name: wfName, On: testing.On{