From c4b24f8366e1847f8bf961f7ccf45b564fe11d69 Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 07:22:04 +0000
Subject: [PATCH 01/13] Prevent static runners from terminating due to unregister timeout

The unregister timeout of 1 minute (no matter how long it is) can negatively
impact the availability of a static runner constantly running workflow jobs,
and of an ephemeral runner that runs a long-running job.

We deal with that by completely removing the unregistration timeout, so that
regardless of the type of runner (static or ephemeral) it waits forever until
it is successfully unregistered before being terminated.
---
 README.md                                |  4 +++-
 .../testdata/runnerdeploy.envsubst.yaml  |  2 ++
 controllers/constants.go                 | 15 +++++++--------
 controllers/integration_test.go          |  1 -
 controllers/runner_controller.go         |  7 +++++--
 controllers/runner_graceful_stop.go      | 19 ++++++-------------
 controllers/runner_pod_controller.go     | 14 ++------------
 test/e2e/e2e_test.go                     |  5 ++++-
 8 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index e0daa215..ecce405e 100644
--- a/README.md
+++ b/README.md
@@ -1320,7 +1320,9 @@ Once the ephemeral runner has completed running a workflow job, it stops with a
 
 As it's removed after a workflow job run, the runner pod is never reused across multiple GitHub Actions workflow jobs, providing you a clean environment per each workflow job.
 
-Although not recommended, it's possible to disable passing `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after workflow job run, and `actions/runner` is known to clean only runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs. It may worth trying only when you do want to prioritize job run speed more than job reliability and security.
+Although not recommended, it's possible to disable passing the `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after a workflow job run, and in this mode `actions/runner` is known to clean only the runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs.
+
+Static runners may be worth trying only when you want to prioritize job run speed over job reliability and security.
 
 > In early days of the project, the flag passed to the runner when `ephemeral: true` was `--once` rather than `--ephemeral`.
 >
diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml
index e0862f60..ae6375e7 100644
--- a/acceptance/testdata/runnerdeploy.envsubst.yaml
+++ b/acceptance/testdata/runnerdeploy.envsubst.yaml
@@ -17,6 +17,8 @@ spec:
       image: ${RUNNER_NAME}:${RUNNER_TAG}
       imagePullPolicy: IfNotPresent
 
+      ephemeral: ${TEST_EPHEMERAL}
+
       # Whether to pass --ephemeral (true) or --once (false, deprecated)
       env:
       - name: RUNNER_FEATURE_FLAG_EPHEMERAL
diff --git a/controllers/constants.go b/controllers/constants.go
index 510fa911..24dba05d 100644
--- a/controllers/constants.go
+++ b/controllers/constants.go
@@ -34,15 +34,14 @@ const (
 
 	AnnotationKeyRunnerID = annotationKeyPrefix + "id"
 
-	// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
-	// and RemoveRunner API (to actually unregister the runner) calls.
-	// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
-	// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
-	// up to 60 seconds (or even more depending on the situation).
-	DefaultUnregistrationTimeout = 60 * time.Second
-	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
-	DefaultUnregistrationRetryDelay = 30 * time.Second
+	DefaultUnregistrationRetryDelay = time.Minute
+
+	// RetryDelayOnCreateRegistrationError is the delay between retry attempts for runner registration token creation.
+	// Usually, a retry in this case happens when e.g. your PAT has no access to a certain scope of runners, like when you're using a repository admin's token
+	// for creating a broader-scoped runner token, like an organization or enterprise runner token.
+	// Such a permission issue will never be fixed automatically, so we don't need to retry so often, hence this value.
+	RetryDelayOnCreateRegistrationError = 3 * time.Minute
 
 	// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
 	// A pod that is timed out can be terminated if needed.
diff --git a/controllers/integration_test.go b/controllers/integration_test.go index 105d8658..5695b44d 100644 --- a/controllers/integration_test.go +++ b/controllers/integration_test.go @@ -110,7 +110,6 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment { Name: controllerName("runner"), RegistrationRecheckInterval: time.Millisecond * 100, RegistrationRecheckJitter: time.Millisecond * 10, - UnregistrationTimeout: 1 * time.Second, UnregistrationRetryDelay: 1 * time.Second, } err = runnerController.SetupWithManager(mgr) diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 5811494b..09241318 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -73,7 +73,6 @@ type RunnerReconciler struct { RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration - UnregistrationTimeout time.Duration UnregistrationRetryDelay time.Duration } @@ -212,7 +211,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) { if updated, err := r.updateRegistrationToken(ctx, runner); err != nil { - return ctrl.Result{}, err + return ctrl.Result{RequeueAfter: RetryDelayOnCreateRegistrationError}, nil } else if updated { return ctrl.Result{Requeue: true}, nil } @@ -254,6 +253,10 @@ func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v rt, err := r.GitHubClient.GetRegistrationToken(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) if err != nil { + // An error can be a permanent, permission issue like the below: + // POST https://api.github.com/enterprises/YOUR_ENTERPRISE/actions/runners/registration-token: 403 Resource not accessible by integration [] + // In such case retrying in seconds might not make much sense. + r.Recorder.Event(&runner, corev1.EventTypeWarning, "FailedUpdateRegistrationToken", "Updating registration token failed") log.Error(err, "Failed to get new registration token") return false, err diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index d95d94a8..68e6650e 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -26,13 +26,13 @@ import ( // This function is designed to complete a lengthy graceful stop process in a unblocking way. // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value. // The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry. 
-func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
+func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
 	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
 	if err != nil {
 		return nil, &ctrl.Result{}, err
 	}
 
-	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
+	if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
 		return nil, res, err
 	}
 
@@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
 }
 
 // If the first return value is nil, it's safe to delete the runner pod.
-func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
+func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
 	var runnerID *int64
 
 	if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
@@ -177,17 +177,10 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 		log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
 	} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
-		t, err := time.Parse(time.RFC3339, ts)
-		if err != nil {
-			return &ctrl.Result{RequeueAfter: retryDelay}, err
-		}
+		log.Info("Runner unregistration is in-progress. It can take forever to complete if it's a static runner constantly running jobs."+
+			" It can also take a very long time if it's an ephemeral runner that is running a long-running job.", "error", err)
 
-		if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
-			log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
-			return &ctrl.Result{RequeueAfter: retryDelay}, err
-		}
-
-		log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
+		return &ctrl.Result{RequeueAfter: retryDelay}, nil
 	} else {
 		// A runner and a runner pod that is created by this version of ARC should match
 		// any of the above branches.
diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go index ab683124..3a631671 100644 --- a/controllers/runner_pod_controller.go +++ b/controllers/runner_pod_controller.go @@ -45,7 +45,6 @@ type RunnerPodReconciler struct { RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration - UnregistrationTimeout time.Duration UnregistrationRetryDelay time.Duration } @@ -104,7 +103,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner to be gracefully stopped before the deletion timestamp is set. // But for the case that the user manually deleted it for whatever reason, // we have to ensure it to gracefully stop now. - updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) if res != nil { return *res, err } @@ -172,7 +171,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // // In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all, // so that it isn't subject to terminationGracePeriod and can safely take hours to finish it's work. - _, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + _, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) if res != nil { return *res, err } @@ -190,15 +189,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } -func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration { - unregistrationTimeout := DefaultUnregistrationTimeout - - if r.UnregistrationTimeout > 0 { - unregistrationTimeout = r.UnregistrationTimeout - } - return unregistrationTimeout -} - func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration { retryDelay := DefaultUnregistrationRetryDelay diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index c2994d23..84d7d611 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -186,6 +186,7 @@ type env struct { runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string githubTokenWebhook string testEnterprise string + testEphemeral string featureFlagEphemeral *bool scaleDownDelaySecondsAfterScaleOut int64 minReplicas int64 @@ -219,7 +220,8 @@ func initTestEnv(t *testing.T) *env { e.testOrg = testing.Getenv(t, "TEST_ORG", "") e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "") e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE") - e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100) + e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "") + e.testJobs = createTestJobs(id, testResultCMNamePrefix, 20) if ephemeral, err := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL", "")); err == nil { e.featureFlagEphemeral = &ephemeral @@ -288,6 +290,7 @@ func (e *env) installActionsRunnerController(t *testing.T) { "WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook, "RUNNER_LABEL=" + e.runnerLabel, "TEST_ID=" + e.testID, + "TEST_EPHEMERAL=" + 
e.testEphemeral,
 		fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
 		fmt.Sprintf("REPO_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
 		fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),

From 791634fb128b871999486aa3ffb2acbb90cd4efb Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 07:24:35 +0000
Subject: [PATCH 02/13] Fix static runners not scaling up

It turned out that #1179 broke static runners in a way that they are no longer
able to scale up at all when the desired replicas is updated. This fixes that
by correcting a short-circuit intended only for ephemeral runners so that it is
not mistakenly triggered for static runners.
---
 controllers/runner_pod_owner.go | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go
index 6cf1ca23..9e150fed 100644
--- a/controllers/runner_pod_owner.go
+++ b/controllers/runner_pod_owner.go
@@ -329,7 +329,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 	maybeRunning := pending + running
 
 	wantMoreRunners := newDesiredReplicas > maybeRunning
-	alreadySyncedAfterEffectiveTime := lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
+	alreadySyncedAfterEffectiveTime := ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time)
 	runnerPodRecreationDelayAfterWebhookScale := lastSyncTime != nil && time.Now().Before(lastSyncTime.Add(DefaultRunnerPodRecreationDelayAfterWebhookScale))
 
 	log = log.WithValues(
@@ -344,6 +344,16 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 	)
 
 	if wantMoreRunners && alreadySyncedAfterEffectiveTime && runnerPodRecreationDelayAfterWebhookScale {
+		// This is our special handling of the situation for ephemeral runners only.
+		//
+		// Handling static runners this way results in scale-up not working at all,
+		// because then any scale-up attempts for static runners fall within this condition, for two reasons.
+		// First, static (persistent) runners will never restart on their own.
+		// Second, we don't update EffectiveTime for static runners.
+		//
+		// We do need to skip this condition for static runners, and that's why we take the `ephemeral` flag into account when
+		// computing `alreadySyncedAfterEffectiveTime`.
+
 		log.V(2).Info(
 			"Detected that some ephemeral runners have disappeared. " +
" + "Usually this is due to that ephemeral runner completions " + From 8ca39caff5ac03795f95004316fb09c7a3b80554 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 13 Mar 2022 12:11:11 +0000 Subject: [PATCH 03/13] Fix log message on runner deletion --- controllers/runner_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 09241318..907bcc9f 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -199,11 +199,11 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con newRunner.ObjectMeta.Finalizers = finalizers if err := r.Patch(ctx, newRunner, client.MergeFrom(&runner)); err != nil { - log.Error(err, "Failed to update runner for finalizer removal") + log.Error(err, "Unable to remove finalizer") return ctrl.Result{}, err } - log.Info("Removed runner from GitHub", "repository", runner.Spec.Repository, "organization", runner.Spec.Organization) + log.Info("Removed finalizer") } return ctrl.Result{}, nil From f153870f5f7a8d4b18c6dea31eb1986f3833bd22 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 13 Mar 2022 12:12:01 +0000 Subject: [PATCH 04/13] fix: Do not block indefinitely on runner that cannot be deleted due to 403 --- controllers/runner_graceful_stop.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 68e6650e..8f054ad9 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -120,6 +120,15 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l errRes := &gogithub.ErrorResponse{} if errors.As(err, &errRes) { + if errRes.Response.StatusCode == 403 { + log.Error(err, "Unable to unregister due to permission error. "+ + "Perhaps you've changed the permissions of PAT or GitHub App, or you updated authentication method of ARC in a wrong way? "+ + "ARC considers it as already unregistered and continue removing the pod. 
"+ + "You may need to remove the runner on GitHub UI.") + + return nil, nil + } + runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner) var runnerID int64 From e4280dcb0d67f910c5184d9a56f79be436fdd533 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 13 Mar 2022 12:14:14 +0000 Subject: [PATCH 05/13] Fix patch MergeFrom target --- controllers/runner_pod_owner.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go index 9e150fed..b78465b9 100644 --- a/controllers/runner_pod_owner.go +++ b/controllers/runner_pod_owner.go @@ -434,12 +434,12 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok { updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) - if err := c.Patch(ctx, updated, client.MergeFrom(ss.object)); err != nil { - log.Error(err, fmt.Sprintf("Failed to patch object to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) + if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) return nil, err } - log.V(2).Info("Redundant object has been annotated to start the unregistration before deletion") + log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion") } else { log.V(2).Info("BUG: Redundant object was already annotated") } @@ -539,7 +539,7 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok { updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339)) - if err := c.Patch(ctx, updated, client.MergeFrom(res.object)); err != nil { + if err := c.Patch(ctx, updated, client.MergeFrom(res.owner)); err != nil { log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp)) return nil, err } From efb7fca3080e5d4f7ab26c7c57f046452fac6da9 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 13 Mar 2022 12:15:49 +0000 Subject: [PATCH 06/13] Fix externally deleted runner pod to not block unregistration process --- controllers/runner_graceful_stop.go | 36 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 8f054ad9..86de46bb 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -81,12 +81,29 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l runnerID = &v } + if runnerID == nil { + runner, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner) + if err != nil { + return &ctrl.Result{}, err + } + + if runner != nil && runner.ID != nil { + runnerID = runner.ID + } + } + code := runnerContainerExitCode(pod) if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" { // If it's already unregistered in the previous reconcilation loop, // you can safely assume that it won't get registered again so it's safe to delete the runner pod. 
log.Info("Runner pod is marked as already unregistered.") + } else if runnerID == nil { + log.Info( + "Unregistration started before runner ID is assigned. " + + "Perhaps the runner pod was terminated by anyone other than ARC? Was it OOM killed? " + + "Marking unregistration as completed anyway because there's nothing ARC can do.", + ) } else if pod != nil && runnerPodOrContainerIsStopped(pod) { // If it's an ephemeral runner with the actions/runner container exited with 0, // we can safely assume that it has unregistered itself from GitHub Actions @@ -99,7 +116,7 @@ func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, l log.Info("Runner pod is annotated to wait for completion") return &ctrl.Result{RequeueAfter: retryDelay}, nil - } else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, runnerID); err != nil { + } else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner, *runnerID); err != nil { if errors.Is(err, &gogithub.RateLimitError{}) { // We log the underlying error when we failed calling GitHub API to list or unregisters, // or the runner is still busy. @@ -335,20 +352,7 @@ func setRunnerEnv(pod *corev1.Pod, key, value string) { // There isn't a single right grace period that works for everyone. // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions, // while the shorter the grace period is, the more likely you may encounter the race issue. -func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id *int64) (bool, error) { - if id == nil { - runner, err := getRunner(ctx, client, enterprise, org, repo, name) - if err != nil { - return false, err - } - - if runner == nil || runner.ID == nil { - return false, nil - } - - id = runner.ID - } - +func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string, id int64) (bool, error) { // For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true. // The reason ARC did so was to let a runner running a job to not stop prematurely. // @@ -369,7 +373,7 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or // change from 60 seconds. // // TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422? 
-	if err := client.RemoveRunner(ctx, enterprise, org, repo, *id); err != nil {
+	if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
 		return false, err
 	}

From fa8ff70aa225de11ae3f84b297e9906b8dcea808 Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 12:16:29 +0000
Subject: [PATCH 07/13] Add log when deletion timestamp is being set on owner object

---
 controllers/runner_pod_owner.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go
index b78465b9..d34c9372 100644
--- a/controllers/runner_pod_owner.go
+++ b/controllers/runner_pod_owner.go
@@ -518,6 +518,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
 				log.Error(err, "Failed to delete owner")
 				return nil, err
 			}
+
+			log.V(2).Info("Started deletion of owner")
+
 			continue
 		}

From 326d6a1fe8919966d1ddc95083331571de7e41a2 Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 12:16:55 +0000
Subject: [PATCH 08/13] Fix the timing of `Marking owner for unregistration completion` log

---
 controllers/runner_pod_owner.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go
index d34c9372..e594e3fa 100644
--- a/controllers/runner_pod_owner.go
+++ b/controllers/runner_pod_owner.go
@@ -536,9 +536,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
 			}
 		}
 
-		log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total)
-
 		if deletionSafe == res.total {
+			log.V(2).Info("Marking owner for unregistration completion", "deletionSafe", deletionSafe, "total", res.total)
+
 			if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok {
 				updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))

From c612e87d85b069645c677fdaca26cf1dc9a5fa31 Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 12:18:22 +0000
Subject: [PATCH 09/13] fix: Let RunnerDeployment scale RunnerReplicaSet to zero before terminating it so that hopefully RunnerDeployment can gracefully terminate older RunnerReplicaSet on update.
---
 controllers/runnerdeployment_controller.go | 27 +++++++++++++++++++++++---
 controllers/runnerreplicaset_controller.go |  4 ++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go
index 9a55ae3d..f3a2ba7d 100644
--- a/controllers/runnerdeployment_controller.go
+++ b/controllers/runnerdeployment_controller.go
@@ -222,15 +222,38 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 		for i := range oldSets {
 			rs := oldSets[i]
 
+			rslog := log.WithValues("runnerreplicaset", rs.Name)
+
+			if rs.Status.Replicas != nil && *rs.Status.Replicas > 0 {
+				if rs.Spec.Replicas != nil && *rs.Spec.Replicas == 0 {
+					rslog.V(2).Info("Waiting for runnerreplicaset to scale to zero")
+
+					continue
+				}
+
+				updated := rs.DeepCopy()
+				zero := 0
+				updated.Spec.Replicas = &zero
+				if err := r.Client.Update(ctx, updated); err != nil {
+					rslog.Error(err, "Failed to scale runnerreplicaset to zero")
+
+					return ctrl.Result{}, err
+				}
+
+				rslog.Info("Scaled runnerreplicaset to zero")
+
+				continue
+			}
+
 			if err := r.Client.Delete(ctx, &rs); err != nil {
-				log.Error(err, "Failed to delete runnerreplicaset resource")
+				rslog.Error(err, "Failed to delete runnerreplicaset resource")
 
 				return ctrl.Result{}, err
 			}
 
 			r.Recorder.Event(&rd, corev1.EventTypeNormal, "RunnerReplicaSetDeleted", fmt.Sprintf("Deleted runnerreplicaset '%s'", rs.Name))
 
-			log.Info("Deleted runnerreplicaset", "runnerdeployment", rd.ObjectMeta.Name, "runnerreplicaset", rs.Name)
+			rslog.Info("Deleted runnerreplicaset")
 		}
 	}
diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go
index 3433fc58..6ceb559b 100644
--- a/controllers/runnerreplicaset_controller.go
+++ b/controllers/runnerreplicaset_controller.go
@@ -65,6 +65,10 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	}
 
 	if !rs.ObjectMeta.DeletionTimestamp.IsZero() {
+		// RunnerReplicaSet cannot be gracefully removed.
+		// That means any runner that is running a job can be prematurely terminated.
+		// To gracefully remove a RunnerReplicaSet, scale it down to zero first, observe RunnerReplicaSet's status replicas,
+		// and remove it only after the status replicas becomes zero.
 		return ctrl.Result{}, nil
 	}

From cc55d0bd7d62d4d5eafc40840431b3741f718322 Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 12:25:53 +0000
Subject: [PATCH 10/13] Let runnerdeployment controller log runnerreplicaset creation

---
 controllers/runnerdeployment_controller.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/controllers/runnerdeployment_controller.go b/controllers/runnerdeployment_controller.go
index f3a2ba7d..e893f33c 100644
--- a/controllers/runnerdeployment_controller.go
+++ b/controllers/runnerdeployment_controller.go
@@ -118,6 +118,8 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 			return ctrl.Result{}, err
 		}
 
+		log.Info("Created runnerreplicaset", "runnerreplicaset", desiredRS.Name)
+
 		return ctrl.Result{}, nil
 	}
 
@@ -142,6 +144,8 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
 			return ctrl.Result{}, err
 		}
 
+		log.Info("Created runnerreplicaset", "runnerreplicaset", desiredRS.Name)
+
 		// We requeue in order to clean up old runner replica sets later.
 		// Otherwise, they aren't cleaned up until the next re-sync interval.
 		return ctrl.Result{RequeueAfter: 5 * time.Second}, nil

From 7123b18a4726ec29fc75f1b9f486df36a105d3fb Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 13:04:28 +0000
Subject: [PATCH 11/13] chore: Log more variables when log level is -2

---
 controllers/runner_pod_owner.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go
index e594e3fa..c505db78 100644
--- a/controllers/runner_pod_owner.go
+++ b/controllers/runner_pod_owner.go
@@ -302,9 +302,11 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 		log.V(2).Info("Detected some current object(s)", "creationTimestampFirst", timestampFirst, "creationTimestampLast", timestampLast, "names", names)
 	}
 
-	var pending, running, regTimeout int
+	var total, terminating, pending, running, regTimeout int
 
 	for _, ss := range currentObjects {
+		total += ss.total
+		terminating += ss.terminating
 		pending += ss.pending
 		running += ss.running
 		regTimeout += ss.regTimeout
@@ -319,6 +321,8 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 
 	log.V(2).Info(
 		"Found some pods across owner(s)",
+		"total", total,
+		"terminating", terminating,
 		"pending", pending,
 		"running", running,
 		"regTimeout", regTimeout,

From 4551309e30a1fb1ba43b9031b7f7f6467e9003fd Mon Sep 17 00:00:00 2001
From: Yusuke Kuoka
Date: Sun, 13 Mar 2022 13:09:46 +0000
Subject: [PATCH 12/13] Fix runners to not terminate before unregistration when scaling down

#1179 was not working particularly for scale-down of static (and perhaps
long-running ephemeral) runners, which resulted in some runner pods being
terminated before the requested unregistration processes completed, which
triggered some in-progress workflow jobs to hang forever.

This fixes an edge case in which a decrease in the desired replicas triggered
the failure, so that every runner is unregistered and then terminated, as
originally designed.
---
 controllers/runner_pod_owner.go | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go
index c505db78..abc5020a 100644
--- a/controllers/runner_pod_owner.go
+++ b/controllers/runner_pod_owner.go
@@ -429,24 +429,26 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
 		// or actions/runner may potentially misbehave on SIGTERM immediately sent by K8s.
 		// We'd better unregister first and then start a pod deletion process.
 		// The annotation works as a mark to start the pod unregistration and deletion process of ours.
+ + if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); ok { + log.V(2).Info("Still waiting for runner pod(s) unregistration to complete") + + continue + } + for _, po := range ss.pods { if _, err := annotatePodOnce(ctx, c, log, &po, AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)); err != nil { return nil, err } } - if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok { - updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) - - if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { - log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) - return nil, err - } - - log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion") - } else { - log.V(2).Info("BUG: Redundant object was already annotated") + updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) + if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { + log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) + return nil, err } + + log.V(2).Info("Redundant owner has been annotated to start the unregistration before deletion") } } else if retained > newDesiredReplicas { log.V(2).Info("Waiting sync before scale down", "retained", retained, "newDesiredReplicas", newDesiredReplicas) @@ -555,9 +557,9 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, } else { log.V(2).Info("BUG: Redundant owner was already annotated to start the deletion") } - } - continue + continue + } } if annotations := res.owner.GetAnnotations(); annotations != nil { From 1cc06e7408d1283cb4d03bcdaf8d27fa99158030 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 13 Mar 2022 13:11:26 +0000 Subject: [PATCH 13/13] e2e: Make enterprise runners optional for testing GitHub App As GitHub App does not allow ARC to access enterprise runner related API endpoints, like the create-registration-token API. --- test/e2e/e2e_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 84d7d611..293051c3 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -135,6 +135,7 @@ func TestE2ERunnerDeploy(t *testing.T) { } env := initTestEnv(t) + env.useApp = true t.Run("build and load images", func(t *testing.T) { env.buildAndLoadImages(t) @@ -219,7 +220,7 @@ func initTestEnv(t *testing.T) *env { e.testRepo = testing.Getenv(t, "TEST_REPO", "") e.testOrg = testing.Getenv(t, "TEST_ORG", "") e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "") - e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE") + e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE", "") e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "") e.testJobs = createTestJobs(id, testResultCMNamePrefix, 20)