diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go
index 1bfa8bcc..d007e9fa 100644
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@@ -110,8 +110,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 			return ctrl.Result{}, nil
 		}
 	} else {
+		var p *corev1.Pod
+
+		{
+			var pod corev1.Pod
+			if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+				if !kerrors.IsNotFound(err) {
+					log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
+
+					return ctrl.Result{Requeue: true}, nil
+				}
+			} else {
+				p = &pod
+			}
+		}
+
 		// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
-		return r.processRunnerDeletion(runner, ctx, log)
+		return r.processRunnerDeletion(runner, ctx, log, p)
 	}
 
 	registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
@@ -159,21 +174,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 
 	// If pod has ended up succeeded we need to restart it
 	// Happens e.g. when dind is in runner and run completes
-	stopped := pod.Status.Phase == corev1.PodSucceeded
-
-	if !stopped {
-		if pod.Status.Phase == corev1.PodRunning {
-			for _, status := range pod.Status.ContainerStatuses {
-				if status.Name != containerName {
-					continue
-				}
-
-				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
-					stopped = true
-				}
-			}
-		}
-	}
+	stopped := runnerPodOrContainerIsStopped(&pod)
 
 	restart := stopped
 
@@ -404,64 +405,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 		return ctrl.Result{}, nil
 	}
 
-	// Try to delete current pod if recreation is needed
-	safeToDeletePod := false
-	ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-	if err != nil {
-		log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
-	} else {
-		// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
-		if !ok {
-			log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
-		}
-
-		safeToDeletePod = true
+	updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
+	if res != nil {
+		return *res, err
 	}
 
-	if safeToDeletePod {
-		// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
-		// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
-		if err := r.Delete(ctx, &pod); err != nil {
-			log.Error(err, "Failed to delete pod resource")
-			return ctrl.Result{}, err
-		}
-
-		r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
-		log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
+	// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
+	// This should help us avoid a race condition where the runner picks up a new job just after we observed it as not busy.
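+	// At this point tickRunnerGracefulStop has either unregistered the runner from GitHub or
+	// confirmed that it is already gone, so deleting the pod below should not disrupt a workflow job.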
+	if err := r.Delete(ctx, updatedPod); err != nil {
+		log.Error(err, "Failed to delete pod resource")
+		return ctrl.Result{}, err
 	}
 
+	r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
+	log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
+
 	return ctrl.Result{}, nil
 }
 
-func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
+func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
+	// If the pod has ended up succeeded we need to restart it
+	// Happens e.g. when dind is in the runner and the run completes
+	stopped := pod.Status.Phase == corev1.PodSucceeded
+
+	if !stopped {
+		if pod.Status.Phase == corev1.PodRunning {
+			for _, status := range pod.Status.ContainerStatuses {
+				if status.Name != containerName {
+					continue
+				}
+
+				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
+					stopped = true
+				}
+			}
+		}
+	}
+
+	return stopped
+}
+
+func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
 	finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
 
 	if removed {
-		if len(runner.Status.Registration.Token) > 0 {
-			ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-			if err != nil {
-				if errors.Is(err, &gogithub.RateLimitError{}) {
-					// We log the underlying error when we failed calling GitHub API to list or unregisters,
-					// or the runner is still busy.
-					log.Error(
-						err,
-						fmt.Sprintf(
-							"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
-							retryDelayOnGitHubAPIRateLimitError,
-						),
-					)
-
-					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-				}
-
-				return ctrl.Result{}, err
-			}
-
-			if !ok {
-				log.V(1).Info("Runner no longer exists on GitHub")
-			}
-		} else {
-			log.V(1).Info("Runner was never registered on GitHub")
+		_, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
+		if res != nil {
+			return *res, err
 		}
 
 		newRunner := runner.DeepCopy()
diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go
new file mode 100644
index 00000000..ce242d53
--- /dev/null
+++ b/controllers/runner_graceful_stop.go
@@ -0,0 +1,239 @@
+package controllers
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/actions-runner-controller/actions-runner-controller/github"
+	"github.com/go-logr/logr"
+	gogithub "github.com/google/go-github/v39/github"
+	corev1 "k8s.io/api/core/v1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
+	unregistrationStartTimestamp    = "unregistration-start-timestamp"
+)
+
+// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
+// we can delete the runner pod without disrupting a workflow job.
+//
+// This function returns a non-nil pointer to corev1.Pod as the first return value
+// if the runner is considered to have gracefully stopped, hence its pod is safe for deletion.
+//
+// It's a "tick" operation, so a graceful stop can take multiple calls to complete.
+// This function is designed to complete a lengthy graceful stop process in a non-blocking way.
+// When it wants to be retried later, it returns a non-nil *ctrl.Result as the second return value, and may or may not populate the error in the third return value.
+// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
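+//
+// A typical call site looks like the following sketch (variable names are illustrative):
+//
+//	updatedPod, res, err := tickRunnerGracefulStop(ctx, log, ghClient, c, enterprise, org, repo, name, &pod)
+//	if res != nil {
+//		// Not done yet; requeue and let the next reconciliation loop retry.
+//		return *res, err
+//	}
+//	// The graceful stop completed, so updatedPod is now safe to delete.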
+func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
+	if pod != nil {
+		if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
+			updated := pod.DeepCopy()
+			setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
+			if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
+				log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
+				return nil, &ctrl.Result{}, err
+			}
+			pod = updated
+
+			log.Info("Runner has started unregistration")
+		} else {
+			log.Info("Runner has already started unregistration")
+		}
+	}
+
+	if res, err := ensureRunnerUnregistration(ctx, 10*time.Minute, 30*time.Second, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
+		return nil, res, err
+	}
+
+	if pod != nil {
+		if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
+			updated := pod.DeepCopy()
+			setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
+			if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
+				log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
+				return nil, &ctrl.Result{}, err
+			}
+			pod = updated
+
+			log.Info("Runner has completed unregistration")
+		} else {
+			log.Info("Runner has already completed unregistration")
+		}
+	}
+
+	return pod, nil, nil
+}
+
+// ensureRunnerUnregistration returns a non-nil *ctrl.Result when the caller should requeue and retry.
+// If the first return value is nil, it's safe to delete the runner pod.
+func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
+	ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
+	if err != nil {
+		if errors.Is(err, &gogithub.RateLimitError{}) {
+			// We log the underlying error when we failed calling the GitHub API to list or unregister,
+			// or the runner is still busy.
+			log.Error(
+				err,
+				fmt.Sprintf(
+					"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
+					retryDelayOnGitHubAPIRateLimitError,
+				),
+			)
+
+			return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
+		}
+
+		log.Error(err, "Failed to unregister runner before deleting the pod.")
+
+		return &ctrl.Result{}, err
+	} else if ok {
+		log.Info("Runner has just been unregistered. Removing the runner pod.")
+	} else if pod == nil {
+		// `unregisterRunner()` returns `false, nil` if the runner is not found on GitHub.
+		// However, that doesn't always mean the pod can be safely removed.
+		//
+		// If the pod does not exist for the runner,
+		// it may be because the runner pod has never been created.
+		// In that case we can safely assume that the runner will never get registered.
+
+		log.Info("Runner was not found on GitHub and the runner pod was not found on Kubernetes.")
+	} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
+		// If it was already unregistered in a previous reconciliation loop,
+		// we can safely assume that it won't get registered again, so it's safe to delete the runner pod.
+		log.Info("Runner pod is marked as already unregistered.")
+	} else if runnerPodOrContainerIsStopped(pod) {
+		// If it's an ephemeral runner whose actions/runner container exited with 0,
+		// we can safely assume that it has unregistered itself from GitHub Actions,
+		// so it's natural that RemoveRunner fails due to 404.
+		// (A pod can end up succeeded e.g. when dind runs in the runner and the run completes.)
+		log.Info("Runner pod has been stopped with a successful status.")
+	} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
+		t, err := time.Parse(time.RFC3339, ts)
+		if err != nil {
+			return &ctrl.Result{RequeueAfter: retryDelay}, err
+		}
+
+		if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
+			log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
+			return &ctrl.Result{RequeueAfter: retryDelay}, err
+		}
+
+		log.Info("Runner unregistration has timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
+	} else {
+		// A runner and a runner pod created by this version of ARC should match
+		// one of the above branches.
+		//
+		// But we leave this catch-all branch for potential backward compatibility.
+		// The caller is expected to take an appropriate action, like annotating the pod to mark
+		// the start of the unregistration process, and retry later.
+		log.V(1).Info("Runner unregistration is being retried later.")
+
+		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+	}
+
+	return nil, nil
+}
+
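+// getAnnotation and setAnnotation below are nil-safe helpers around pod.Annotations. They persist
+// the unregistration-start-timestamp and unregistration-complete-timestamp markers on the runner
+// pod so that graceful-stop progress survives across reconciliation loops.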
+func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
+	if pod.Annotations == nil {
+		return "", false
+	}
+
+	v, ok := pod.Annotations[key]
+
+	return v, ok
+}
+
+func setAnnotation(pod *corev1.Pod, key, value string) {
+	if pod.Annotations == nil {
+		pod.Annotations = map[string]string{}
+	}
+
+	pod.Annotations[key] = value
+}
+
+// unregisterRunner unregisters the runner from GitHub Actions by name.
+//
+// This function returns:
+//
+// Case 1. (true, nil) when it has successfully unregistered the runner.
+// Case 2. (false, nil) when (2-1.) the runner has already been unregistered, OR (2-2.) the runner will never be created, OR (2-3.) the runner is not created yet and is about to be registered (hence we couldn't see its existence via the GitHub Actions API yet).
+// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
+// an error returned by the GitHub API.
+//
+// When the return value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
+// In other words, those three sub-cases cannot be distinguished by this function alone.
+//
+// - Case "2-1." can happen when e.g. ARC successfully unregistered the runner in a previous reconciliation loop, or it was an ephemeral runner that finished its job run (an ephemeral runner is designed to stop after a job run).
+// You'd need to maintain the runner state (i.e., whether it's already unregistered or not) somewhere,
+// so that you can either not call this function at all when the state says it's already unregistered, or determine that it's case "2-1." when you get (false, nil).
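+// (For reference, ensureRunnerUnregistration in this file does exactly that: it treats a pod carrying
+// the unregistration-complete-timestamp annotation, or a pod whose runner container already exited
+// with code 0, as case "2-1." and hence safe to delete.)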
+//
+// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container could never succeed.
+// Waiting and retrying forever in this case is not a solution, because `config.sh` won't succeed with a wrong token, hence the runner would be stuck in this state forever.
+// There isn't a perfect solution to this, but a practical workaround is to implement a "grace period" on the caller side.
+//
+// - Case "2-3." can happen when e.g. ARC recreated an ephemeral runner pod in a previous reconciliation loop and then it was requested to delete the runner before the runner came up.
+// If handled inappropriately, this can cause a race condition between the deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
+//
+// Once you have successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
+// as long as you don't recreate the runner pod.
+//
+// If it was "2-3.", you need a workaround to avoid the race condition.
+//
+// You should introduce a "grace period" mechanism, similar or equal to the one required for case "2-2.", so that you
+// start the runner pod deletion only after it has become sufficiently unlikely that the runner pod will ever come up.
+//
+// Beware though, you need extra care to set an appropriate grace period depending on your environment.
+// There isn't a single right grace period that works for everyone.
+// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
+// while the shorter the grace period is, the more likely you may encounter the race issue.
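+//
+// Such a grace-period check could look like the following sketch; registrationGracePeriod is an
+// illustrative constant, not something defined by this change:
+//
+//	// Give the pod enough time to register itself before concluding that it never will.
+//	if time.Since(pod.CreationTimestamp.Time) < registrationGracePeriod {
+//		return &ctrl.Result{RequeueAfter: retryDelay}, nil
+//	}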
+func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
+	runners, err := client.ListRunners(ctx, enterprise, org, repo)
+	if err != nil {
+		return false, err
+	}
+
+	id := int64(0)
+	for _, runner := range runners {
+		if runner.GetName() == name {
+			id = runner.GetID()
+			break
+		}
+	}
+
+	if id == int64(0) {
+		return false, nil
+	}
+
+	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but that's no longer true.
+	// The reason ARC did so was to prevent a runner that was running a job from being stopped prematurely.
+	//
+	// However, we learned that RemoveRunner already has the ability to prevent a busy runner from being stopped,
+	// so ARC doesn't need to do anything special for a graceful runner stop.
+	// It can just call RemoveRunner, and if it returns 200 you're guaranteed that the runner will not automatically come back and
+	// the runner pod is safe for deletion.
+	//
+	// Trying to remove a busy runner can result in errors like the following:
+	//   failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
+	//
+	// # NOTES
+	//
+	// - It can be "status=offline" at the same time but that's another story.
+	// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
+	//   determine if the runner is busy can be more outdated than before, as those responses are now cached for 60 seconds.
+	// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub, so we don't have strict control over it,
+	//   but we assume it won't change from 60 seconds.
+	//
+	// TODO: Probably we can just remove the runner by ID without checking if the runner is busy, by treating it as busy when a remove-runner call fails with 422?
+	if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
+		return false, err
+	}
+
+	return true, nil
+}
diff --git a/controllers/runner_pod_controller.go b/controllers/runner_pod_controller.go
index d43eea37..57851f16 100644
--- a/controllers/runner_pod_controller.go
+++ b/controllers/runner_pod_controller.go
@@ -105,33 +105,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
 
 	if removed {
-		ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
-		if err != nil {
-			if errors.Is(err, &gogithub.RateLimitError{}) {
-				// We log the underlying error when we failed calling GitHub API to list or unregisters,
-				// or the runner is still busy.
-				log.Error(
-					err,
-					fmt.Sprintf(
-						"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
-						retryDelayOnGitHubAPIRateLimitError,
-					),
-				)
-
-				return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-			}
-
-			return ctrl.Result{}, err
+		updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+		if res != nil {
+			return *res, err
 		}
 
-		if !ok {
-			log.V(1).Info("Runner no longer exists on GitHub")
-		}
+		patchedPod := updatedPod.DeepCopy()
+		patchedPod.ObjectMeta.Finalizers = finalizers
 
-		newRunner := runnerPod.DeepCopy()
-		newRunner.ObjectMeta.Finalizers = finalizers
-
-		if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
+		// We commit the removal of the finalizer so that Kubernetes notices it and deletes the pod resource from the cluster.
+		if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
 			log.Error(err, "Failed to update runner for finalizer removal")
 			return ctrl.Result{}, err
 		}
@@ -365,8 +348,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		return ctrl.Result{}, nil
 	}
 
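+	// Run the same graceful-stop tick as the runner controller before recreating the pod,
+	// so that an ephemeral runner that is still registered on GitHub is not deleted mid-job.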
+	updated, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
+	if res != nil {
+		return *res, err
+	}
+
 	// Delete current pod if recreation is needed
-	if err := r.Delete(ctx, &runnerPod); err != nil {
+	if err := r.Delete(ctx, updated); err != nil {
 		log.Error(err, "Failed to delete pod resource")
 		return ctrl.Result{}, err
 	}
@@ -377,10 +365,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	return ctrl.Result{}, nil
 }
 
-func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
-	return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
-}
-
 func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
 	name := "runnerpod-controller"
 	if r.Name != "" {
diff --git a/controllers/unregister.go b/controllers/unregister.go
deleted file mode 100644
index 9c139be0..00000000
--- a/controllers/unregister.go
+++ /dev/null
@@ -1,49 +0,0 @@
-package controllers
-
-import (
-	"context"
-	"fmt"
-
-	"github.com/actions-runner-controller/actions-runner-controller/github"
-)
-
-// unregisterRunner unregisters the runner from GitHub Actions by name.
-//
-// This function returns:
-// - (true, nil) when it has successfully unregistered the runner.
-// - (false, nil) when the runner has been already unregistered.
-// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
-// an error returned by GitHub API.
-func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
-	runners, err := client.ListRunners(ctx, enterprise, org, repo)
-	if err != nil {
-		return false, err
-	}
-
-	id := int64(0)
-	for _, runner := range runners {
-		if runner.GetName() == name {
-			// Note that sometimes a runner can stuck "busy" even though it is already "offline".
-			// But we assume that it's not actually offline and still running a job.
-			if runner.GetBusy() {
-				return false, fmt.Errorf("runner is busy")
-			}
-			id = runner.GetID()
-			break
-		}
-	}
-
-	if id == int64(0) {
-		return false, nil
-	}
-
-	// Trying to remove a busy runner can result in errors like the following:
-	//   failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
-	//
-	// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
-	if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
-		return false, err
-	}
-
-	return true, nil
-}