Prevent static runners from terminating due to unregister timeout

The 1-minute unregistration timeout (and indeed any finite timeout) can hurt the availability of a static runner that is constantly running workflow jobs, as well as of an ephemeral runner that is running a long-running job.
We address this by removing the unregistration timeout entirely, so that regardless of the runner type (static or ephemeral), ARC waits indefinitely for the runner to be successfully unregistered before terminating it.
Yusuke Kuoka 2022-03-13 07:22:04 +00:00
parent a1c6d1d11a
commit c4b24f8366
8 changed files with 29 additions and 38 deletions
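
To make the new behavior concrete, here is a minimal sketch, assuming a controller-runtime reconciler, of the "retry until unregistered, no deadline" pattern this commit switches to. `unregisterRunner`, `reconcileRunnerDeletion`, and `retryDelay` are hypothetical stand-ins for ARC's ListRunners/RemoveRunner combo and `DefaultUnregistrationRetryDelay`; this is not ARC's actual code.

// Sketch only: requeue indefinitely instead of enforcing an unregistration deadline.
package sketch

import (
	"context"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// retryDelay mirrors DefaultUnregistrationRetryDelay from the diff below.
const retryDelay = time.Minute

// unregisterRunner is a placeholder for the ListRunners (find the runner ID by
// name) + RemoveRunner (actually unregister it) combo that ARC performs.
func unregisterRunner(ctx context.Context, runner string) (gone bool, err error) {
	// Pretend the runner is still registered (e.g. it is busy running a job).
	return false, nil
}

// reconcileRunnerDeletion keeps rescheduling itself until the runner is gone,
// whether it's a busy static runner or an ephemeral runner on a long job.
func reconcileRunnerDeletion(ctx context.Context, runner string) (ctrl.Result, error) {
	gone, err := unregisterRunner(ctx, runner)
	if err != nil || !gone {
		// Before this commit, ARC gave up after DefaultUnregistrationTimeout
		// (60s) and deleted the pod anyway; now it simply tries again later.
		return ctrl.Result{RequeueAfter: retryDelay}, nil
	}
	// Only now is it safe to delete the runner pod.
	return ctrl.Result{}, nil
}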

View File

@@ -1320,7 +1320,9 @@ Once the ephemeral runner has completed running a workflow job, it stops with a
 As it's removed after a workflow job run, the runner pod is never reused across multiple GitHub Actions workflow jobs, providing you a clean environment per each workflow job.
-Although not recommended, it's possible to disable passing `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after workflow job run, and `actions/runner` is known to clean only runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs. It may worth trying only when you do want to prioritize job run speed more than job reliability and security.
+Although not recommended, it's possible to disable passing `--ephemeral` flag by explicitly setting `ephemeral: false` in the `RunnerDeployment` or `RunnerSet` spec. When disabled, your runner becomes "static". A static runner does not stop after workflow job run, and in this mode `actions/runner` is known to clean only runner's work dir after each job. That means your runner's environment, including various actions cache, docker images stored in the `dind` and layer cache, is retained across multiple workflow job runs.
+Static runners may worth trying only when you do want to prioritize job run speed more than job reliability and security.
 > In early days of the project, the flag passed to the runner when `ephemeral: true` was `--once` rather than `--ephemeral`.
 >

View File

@@ -17,6 +17,8 @@ spec:
   image: ${RUNNER_NAME}:${RUNNER_TAG}
   imagePullPolicy: IfNotPresent
+  ephemeral: ${TEST_EPHEMERAL}
   # Whether to pass --ephemeral (true) or --once (false, deprecated)
   env:
   - name: RUNNER_FEATURE_FLAG_EPHEMERAL

View File

@@ -34,15 +34,14 @@ const (
 	AnnotationKeyRunnerID = annotationKeyPrefix + "id"
-	// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
-	// and RemoveRunner API (to actually unregister the runner) calls.
-	// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
-	// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
-	// up to 60 seconds (or even more depending on the situation).
-	DefaultUnregistrationTimeout = 60 * time.Second
 	// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
-	DefaultUnregistrationRetryDelay = 30 * time.Second
+	DefaultUnregistrationRetryDelay = time.Minute
+	// RetryDelayOnCreateRegistrationError is the delay between retry attempts for runner registration token creation.
+	// Usually, a retry in this case happens when e.g. your PAT has no access to certain scope of runners, like you're using repository admin's token
+	// for creating a broader scoped runner token, like organizationa or enterprise runner token.
+	// Such permission issue will never fixed automatically, so we don't need to retry so often, hence this value.
+	RetryDelayOnCreateRegistrationError = 3 * time.Minute
 	// registrationTimeout is the duration until a pod times out after it becomes Ready and Running.
 	// A pod that is timed out can be terminated if needed.

View File

@@ -110,7 +110,6 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
 	Name: controllerName("runner"),
 	RegistrationRecheckInterval: time.Millisecond * 100,
 	RegistrationRecheckJitter: time.Millisecond * 10,
-	UnregistrationTimeout: 1 * time.Second,
 	UnregistrationRetryDelay: 1 * time.Second,
 }
 	err = runnerController.SetupWithManager(mgr)

View File

@@ -73,7 +73,6 @@ type RunnerReconciler struct {
 	RegistrationRecheckInterval time.Duration
 	RegistrationRecheckJitter time.Duration
-	UnregistrationTimeout time.Duration
 	UnregistrationRetryDelay time.Duration
 }
@@ -212,7 +211,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
 func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
 	if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
-		return ctrl.Result{}, err
+		return ctrl.Result{RequeueAfter: RetryDelayOnCreateRegistrationError}, nil
 	} else if updated {
 		return ctrl.Result{Requeue: true}, nil
 	}
@@ -254,6 +253,10 @@ func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v
 	rt, err := r.GitHubClient.GetRegistrationToken(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
 	if err != nil {
+		// An error can be a permanent, permission issue like the below:
+		// POST https://api.github.com/enterprises/YOUR_ENTERPRISE/actions/runners/registration-token: 403 Resource not accessible by integration []
+		// In such case retrying in seconds might not make much sense.
 		r.Recorder.Event(&runner, corev1.EventTypeWarning, "FailedUpdateRegistrationToken", "Updating registration token failed")
 		log.Error(err, "Failed to get new registration token")
 		return false, err
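
A note on the `processRunnerCreation` change above: returning `ctrl.Result{}, err` hands the retry schedule to controller-runtime's default rate-limited backoff (which typically retries within milliseconds at first), whereas returning a nil error with `RequeueAfter` pins the retry to a fixed interval, which better suits a permission error that will never resolve on its own. Below is a self-contained sketch of the two styles; `getRegistrationToken`, `reconcileRegistrationToken`, and the constant are illustrative stand-ins, not ARC's code.

// Sketch only: contrasts error-driven backoff with a fixed RequeueAfter delay.
package sketch

import (
	"context"
	"errors"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// Mirrors RetryDelayOnCreateRegistrationError from the diff above.
const retryDelayOnCreateRegistrationError = 3 * time.Minute

// getRegistrationToken stands in for the GitHub registration-token API call.
func getRegistrationToken(ctx context.Context) error {
	// e.g. "403 Resource not accessible by integration" for a mis-scoped token.
	return errors.New("permission denied")
}

func reconcileRegistrationToken(ctx context.Context) (ctrl.Result, error) {
	if err := getRegistrationToken(ctx); err != nil {
		// Old style: return ctrl.Result{}, err — controller-runtime re-enqueues
		// the request under its rate limiter, retrying quickly and repeatedly.
		// New style: a fixed, deliberately long delay, because a permission
		// error like the one above will not fix itself.
		return ctrl.Result{RequeueAfter: retryDelayOnCreateRegistrationError}, nil
	}
	return ctrl.Result{}, nil
}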

View File

@@ -26,13 +26,13 @@ import (
 // This function is designed to complete a lengthy graceful stop process in a unblocking way.
 // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value.
 // The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry.
-func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
+func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
 	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
 	if err != nil {
 		return nil, &ctrl.Result{}, err
 	}
-	if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
+	if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
 		return nil, res, err
 	}
@@ -69,7 +69,7 @@ func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod
 }
 // If the first return value is nil, it's safe to delete the runner pod.
-func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
+func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
 	var runnerID *int64
 	if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
@@ -177,17 +177,10 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 	log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
 	} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
-		t, err := time.Parse(time.RFC3339, ts)
-		if err != nil {
-			return &ctrl.Result{RequeueAfter: retryDelay}, err
-		}
-		if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
-			log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
-			return &ctrl.Result{RequeueAfter: retryDelay}, err
-		}
-		log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
+		log.Info("Runner unregistration is in-progress. It can take forever to complete if if it's a static runner constantly running jobs."+
+			" It can also take very long time if it's an ephemeral runner that is running a log-running job.", "error", err)
+		return &ctrl.Result{RequeueAfter: retryDelay}, nil
 	} else {
 		// A runner and a runner pod that is created by this version of ARC should match
 		// any of the above branches.

View File

@@ -45,7 +45,6 @@ type RunnerPodReconciler struct {
 	RegistrationRecheckInterval time.Duration
 	RegistrationRecheckJitter time.Duration
-	UnregistrationTimeout time.Duration
 	UnregistrationRetryDelay time.Duration
 }
@@ -104,7 +103,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner to be gracefully stopped before the deletion timestamp is set.
 	// But for the case that the user manually deleted it for whatever reason,
 	// we have to ensure it to gracefully stop now.
-	updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+	updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
 	if res != nil {
 		return *res, err
 	}
@@ -172,7 +171,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	//
 	// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
 	// so that it isn't subject to terminationGracePeriod and can safely take hours to finish it's work.
-	_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+	_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
 	if res != nil {
 		return *res, err
 	}
@@ -190,15 +189,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	return ctrl.Result{}, nil
 }
-func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
-	unregistrationTimeout := DefaultUnregistrationTimeout
-	if r.UnregistrationTimeout > 0 {
-		unregistrationTimeout = r.UnregistrationTimeout
-	}
-	return unregistrationTimeout
-}
 func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
 	retryDelay := DefaultUnregistrationRetryDelay
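
The hunk above cuts off after the first line of `unregistrationRetryDelay()`. By analogy with the deleted `unregistrationTimeout()` helper, the remainder presumably follows the same fall-back-to-default shape; the continuation below is a hedged sketch, not verified against the full file.

func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
	retryDelay := DefaultUnregistrationRetryDelay

	// Presumed continuation, mirroring the removed unregistrationTimeout():
	if r.UnregistrationRetryDelay > 0 {
		retryDelay = r.UnregistrationRetryDelay
	}

	return retryDelay
}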

View File

@@ -186,6 +186,7 @@ type env struct {
 	runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
 	githubTokenWebhook string
 	testEnterprise string
+	testEphemeral string
 	featureFlagEphemeral *bool
 	scaleDownDelaySecondsAfterScaleOut int64
 	minReplicas int64
@@ -219,7 +220,8 @@ func initTestEnv(t *testing.T) *env {
 	e.testOrg = testing.Getenv(t, "TEST_ORG", "")
 	e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
 	e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
-	e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
+	e.testEphemeral = testing.Getenv(t, "TEST_EPHEMERAL", "")
+	e.testJobs = createTestJobs(id, testResultCMNamePrefix, 20)
 	if ephemeral, err := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL", "")); err == nil {
 		e.featureFlagEphemeral = &ephemeral
@@ -288,6 +290,7 @@ func (e *env) installActionsRunnerController(t *testing.T) {
 	"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
 	"RUNNER_LABEL=" + e.runnerLabel,
 	"TEST_ID=" + e.testID,
+	"TEST_EPHEMERAL=" + e.testEphemeral,
 	fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
 	fmt.Sprintf("REPO_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
 	fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),