diff --git a/README.md b/README.md
index e0daa215..ecce405e 100644
--- a/README.md
+++ b/README.md
@@ -1320,7 +1320,9 @@ Once the ephemeral runner has completed running a workflow job, it stops with a
 As it's removed after a workflow job run, the runner pod is never reused across multiple GitHub Actions workflow jobs, providing you a clean environment per each workflow job.

-Although not recommended, it's possible to disable passing `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after workflow job run, and `actions/runner` is known to clean only runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs. It may worth trying only when you do want to prioritize job run speed more than job reliability and security.
+Although not recommended, it's possible to disable passing the `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after a workflow job run, and in this mode `actions/runner` is known to clean only the runner's work dir after each job. That means your runner's environment, including various actions caches, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs.
+
+Static runners may be worth trying only when you want to prioritize job run speed over job reliability and security.

 > In early days of the project, the flag passed to the runner when `ephemeral: true` was `--once` rather than `--ephemeral`.
 >
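For reference, the `ephemeral: false` setting discussed in the README change above is a single field on the runner template spec. A minimal `RunnerDeployment` sketch (the metadata name and repository below are placeholders, not part of this change) might look like:

```yaml
apiVersion: actions.summerwind.dev/v1alpha1
kind: RunnerDeployment
metadata:
  name: example-static-runnerdeploy
spec:
  replicas: 2
  template:
    spec:
      repository: example-org/example-repo
      # Illustrative only: opting out of --ephemeral keeps the runner (and its caches) alive across workflow job runs.
      ephemeral: false
```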
diff --git a/acceptance/testdata/runnerdeploy.envsubst.yaml b/acceptance/testdata/runnerdeploy.envsubst.yaml
index e0862f60..ae6375e7 100644
--- a/acceptance/testdata/runnerdeploy.envsubst.yaml
+++ b/acceptance/testdata/runnerdeploy.envsubst.yaml
@@ -17,6 +17,8 @@ spec:
       image: ${RUNNER_NAME}:${RUNNER_TAG}
       imagePullPolicy: IfNotPresent

+      ephemeral: ${TEST_EPHEMERAL}
+
       # Whether to pass --ephemeral (true) or --once (false, deprecated)
       env:
       - name: RUNNER_FEATURE_FLAG_EPHEMERAL
diff --git a/controllers/constants.go b/controllers/constants.go
index 510fa911..24dba05d 100644
--- a/controllers/constants.go
+++ b/controllers/constants.go
@@ -34,15 +34,14 @@ const (
 	AnnotationKeyRunnerID = annotationKeyPrefix + "id"

-	// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
-	// and RemoveRunner API (to actually unregister the runner) calls.
-	// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
-	// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
-	// up to 60 seconds (or even more depending on the situation).
-	DefaultUnregistrationTimeout = 60 * time.Second
-	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
-	DefaultUnregistrationRetryDelay = 30 * time.Second
+	DefaultUnregistrationRetryDelay = time.Minute
+
+	// RetryDelayOnCreateRegistrationError is the delay between retry attempts for runner registration token creation.
+	// Usually, a retry in this case happens when your PAT has no access to the requested scope of runners, e.g. you're using a repository admin's token
+	// to create a broader-scoped runner token, like an organization or enterprise runner token.
+	// Such a permission issue will never be fixed automatically, so we don't need to retry very often, hence this relatively long delay.
+	RetryDelayOnCreateRegistrationError = 3 * time.Minute

 	// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
 	// A pod that is timed out can be terminated if needed.
diff --git a/controllers/integration_test.go b/controllers/integration_test.go
index 105d8658..5695b44d 100644
--- a/controllers/integration_test.go
+++ b/controllers/integration_test.go
@@ -110,7 +110,6 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
 		Name: controllerName("runner"),
 		RegistrationRecheckInterval: time.Millisecond * 100,
 		RegistrationRecheckJitter: time.Millisecond * 10,
-		UnregistrationTimeout: 1 * time.Second,
 		UnregistrationRetryDelay: 1 * time.Second,
 	}
 	err = runnerController.SetupWithManager(mgr)
diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 5811494b..09241318 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -73,7 +73,6 @@ type RunnerReconciler struct {
 	RegistrationRecheckInterval time.Duration
 	RegistrationRecheckJitter time.Duration

-	UnregistrationTimeout time.Duration
 	UnregistrationRetryDelay time.Duration
 }

@@ -212,7 +211,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
 func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
 	if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
-		return ctrl.Result{}, err
+		return ctrl.Result{RequeueAfter: RetryDelayOnCreateRegistrationError}, nil
 	} else if updated {
 		return ctrl.Result{Requeue: true}, nil
 	}
@@ -254,6 +253,10 @@ func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v
 	rt, err := r.GitHubClient.GetRegistrationToken(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
 	if err != nil {
+		// An error can be a permanent permission issue like the below:
+		// POST https://api.github.com/enterprises/YOUR_ENTERPRISE/actions/runners/registration-token: 403 Resource not accessible by integration []
+		// In such a case, retrying after only a few seconds might not make much sense.
+
 		r.Recorder.Event(&runner, corev1.EventTypeWarning, "FailedUpdateRegistrationToken", "Updating registration token failed")
 		log.Error(err, "Failed to get new registration token")
 		return false, err
diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go
index d95d94a8..68e6650e 100644
--- a/controllers/runner_graceful_stop.go
+++ b/controllers/runner_graceful_stop.go
@@ -26,13 +26,13 @@ import (
 // This function is designed to complete a lengthy graceful stop process in a unblocking way.
 // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value.
 // The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry.
-func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
+func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
 	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
 	if err != nil {
 		return nil, &ctrl.Result{}, err
 	}

-	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
+	if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
 		return nil, res, err
 	}

@@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
 }

 // If the first return value is nil, it's safe to delete the runner pod.
-func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
+func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
 	var runnerID *int64

 	if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
@@ -177,17 +177,10 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 		log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
 	} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
-		t, err := time.Parse(time.RFC3339, ts)
-		if err != nil {
-			return &ctrl.Result{RequeueAfter: retryDelay}, err
-		}
+		log.Info("Runner unregistration is in-progress. It can take forever to complete if it's a static runner constantly running jobs."+
+			" It can also take a very long time if it's an ephemeral runner that is running a long-running job.", "error", err)

-		if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
-			log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
-			return &ctrl.Result{RequeueAfter: retryDelay}, err
-		}
-
-		log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
+		return &ctrl.Result{RequeueAfter: retryDelay}, nil
 	} else {
 		// A runner and a runner pod that is created by this version of ARC should match
 		// any of the above branches.
diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go
index ab683124..3a631671 100644
--- a/controllers/runner_pod_controller.go
+++ b/controllers/runner_pod_controller.go
@@ -45,7 +45,6 @@ type RunnerPodReconciler struct {
 	RegistrationRecheckInterval time.Duration
 	RegistrationRecheckJitter time.Duration

-	UnregistrationTimeout time.Duration
 	UnregistrationRetryDelay time.Duration
 }

@@ -104,7 +103,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner to be gracefully stopped before the deletion timestamp is set.
 		// But for the case that the user manually deleted it for whatever reason,
 		// we have to ensure it to gracefully stop now.
-		updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+		updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
 		if res != nil {
 			return *res, err
 		}
@@ -172,7 +171,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		//
 		// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
 		// so that it isn't subject to terminationGracePeriod and can safely take hours to finish it's work.
-		_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+		_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
 		if res != nil {
 			return *res, err
 		}
@@ -190,15 +189,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	return ctrl.Result{}, nil
 }

-func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
-	unregistrationTimeout := DefaultUnregistrationTimeout
-
-	if r.UnregistrationTimeout > 0 {
-		unregistrationTimeout = r.UnregistrationTimeout
-	}
-	return unregistrationTimeout
-}
-
 func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
 	retryDelay := DefaultUnregistrationRetryDelay

diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index c2994d23..84d7d611 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -186,6 +186,7 @@ type env struct {
 	runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
 	githubTokenWebhook string
 	testEnterprise string
+	testEphemeral string
 	featureFlagEphemeral *bool
 	scaleDownDelaySecondsAfterScaleOut int64
 	minReplicas int64
@@ -219,7 +220,8 @@ func initTestEnv(t *testing.T) *env {
 	e.testOrg = testing.Getenv(t, "TEST_ORG", "")
 	e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
 	e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
-	e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
+	e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
+	e.testJobs = createTestJobs(id, testResultCMNamePrefix, 20)

 	if ephemeral, err := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL", "")); err == nil {
 		e.featureFlagEphemeral = &ephemeral
@@ -288,6 +290,7 @@ func (e *env) installActionsRunnerController(t *testing.T) {
 		"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
 		"RUNNER_LABEL=" + e.runnerLabel,
 		"TEST_ID=" + e.testID,
+		"TEST_EPHEMERAL=" + e.testEphemeral,
 		fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
 		fmt.Sprintf("REPO_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
 		fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),