diff --git a/controllers/integration_test.go b/controllers/integration_test.go index d87bdb18..2e038e4d 100644 --- a/controllers/integration_test.go +++ b/controllers/integration_test.go @@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment { Name: controllerName("runner"), RegistrationRecheckInterval: time.Millisecond, RegistrationRecheckJitter: time.Millisecond, + UnregistrationTimeout: 1 * time.Second, + UnregistrationRetryDelay: 1 * time.Second, } err = runnerController.SetupWithManager(mgr) Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller") diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index efbb74bc..03c22b27 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -72,6 +72,9 @@ type RunnerReconciler struct { Name string RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration + + UnregistrationTimeout time.Duration + UnregistrationRetryDelay time.Duration } // +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete @@ -426,7 +429,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } - updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod) + updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod) if res != nil { return *res, err } @@ -470,7 +473,7 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if removed { - _, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod) + _, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod) if res != nil { return *res, err } @@ -489,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con return ctrl.Result{}, nil } +func (r *RunnerReconciler) unregistrationTimeout() time.Duration { + unregistrationTimeout := DefaultUnregistrationTimeout + + if r.UnregistrationTimeout > 0 { + unregistrationTimeout = r.UnregistrationTimeout + } + return unregistrationTimeout +} + +func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration { + retryDelay := DefaultUnregistrationRetryDelay + + if r.UnregistrationRetryDelay > 0 { + retryDelay = r.UnregistrationRetryDelay + } + return retryDelay +} + func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) { deletionTimeout := 1 * time.Minute currentTime := time.Now() diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index ce242d53..83e4b41e 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -17,6 +17,16 @@ import ( const ( unregistrationCompleteTimestamp = "unregistration-complete-timestamp" unregistrationStartTimestamp = "unregistration-start-timestamp" + + // DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name) + // and RemoveRunner API (to actually unregister the runner) calls. + // This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s + // and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response + // up to 60 seconds (or even more depending on the situation). + DefaultUnregistrationTimeout = 60 * time.Second + + // This can be any value but a larger value can make an unregistration timeout longer than configured in practice. + DefaultUnregistrationRetryDelay = 30 * time.Second ) // tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that @@ -29,7 +39,7 @@ const ( // This function is designed to complete a length graceful stop process in a unblocking way. // When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value. // The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry. -func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) { +func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) { if pod != nil { if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok { updated := pod.DeepCopy() @@ -46,7 +56,7 @@ func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *gith } } - if res, err := ensureRunnerUnregistration(ctx, 10*time.Minute, 30*time.Second, log, ghClient, enterprise, organization, repository, runner, pod); res != nil { + if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil { return nil, res, err } diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go index 57851f16..e93f53c8 100644 --- a/controllers/runner_pod_controller.go +++ b/controllers/runner_pod_controller.go @@ -47,6 +47,9 @@ type RunnerPodReconciler struct { Name string RegistrationRecheckInterval time.Duration RegistrationRecheckJitter time.Duration + + UnregistrationTimeout time.Duration + UnregistrationRetryDelay time.Duration } const ( @@ -105,7 +108,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName) if removed { - updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) if res != nil { return *res, err } @@ -348,7 +351,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } - updated, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) + updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod) if res != nil { return *res, err } @@ -365,6 +368,24 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } +func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration { + unregistrationTimeout := DefaultUnregistrationTimeout + + if r.UnregistrationTimeout > 0 { + unregistrationTimeout = r.UnregistrationTimeout + } + return unregistrationTimeout +} + +func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration { + retryDelay := DefaultUnregistrationRetryDelay + + if r.UnregistrationRetryDelay > 0 { + retryDelay = r.UnregistrationRetryDelay + } + return retryDelay +} + func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error { name := "runnerpod-controller" if r.Name != "" {