package actionssummerwindnet

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"time"

	"github.com/actions/actions-runner-controller/github"
	"github.com/go-logr/logr"
	gogithub "github.com/google/go-github/v52/github"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
// we can delete the runner pod without disrupting a workflow job.
//
// This function returns a non-nil pointer to corev1.Pod as the first return value
// if the runner is considered to have gracefully stopped, hence its pod is safe for deletion.
//
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
// This function is designed to complete a lengthy graceful stop process in a non-blocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, and may or may not populate the error in the third return value.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconciliation loop and trigger a scheduled retry.
func tickRunnerGracefulStop(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
	if err != nil {
		return nil, &ctrl.Result{}, err
	}

	if res, err := ensureRunnerUnregistration(ctx, retryDelay, log, ghClient, c, enterprise, organization, repository, runner, pod); res != nil {
		return nil, res, err
	}

	pod, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
	if err != nil {
		return nil, &ctrl.Result{}, err
	}

	return pod, nil, nil
}
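
// The following is an illustrative sketch (not part of the original controller code) of how a caller
// might consume tickRunnerGracefulStop's "tick" contract: a non-nil *ctrl.Result means "requeue and
// retry later", while a non-nil pod means the graceful stop completed and the pod can be deleted.
// The receiver type and its fields (r.Client, r.GitHubClient, r.UnregistrationRetryDelay) are
// assumptions made for the sake of the example.
//
//	func (r *RunnerReconciler) processRunnerDeletion(ctx context.Context, log logr.Logger, enterprise, org, repo, name string, pod *corev1.Pod) (ctrl.Result, error) {
//		stopped, res, err := tickRunnerGracefulStop(ctx, r.UnregistrationRetryDelay, log, r.GitHubClient, r.Client, enterprise, org, repo, name, pod)
//		if res != nil {
//			// Not done yet; return the suggested result (and error, if any) to schedule a retry.
//			return *res, err
//		}
//		// Graceful stop completed; it is now safe to delete the runner pod.
//		return ctrl.Result{}, r.Client.Delete(ctx, stopped)
//	}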

// annotatePodOnce annotates the pod if it wasn't already annotated.
// Returns the provided pod as-is if it was already annotated.
// Returns the updated pod if the pod was missing the annotation and the update to add the annotation succeeded.
func annotatePodOnce(ctx context.Context, c client.Client, log logr.Logger, pod *corev1.Pod, k, v string) (*corev1.Pod, error) {
	if pod == nil {
		return nil, nil
	}

	if _, ok := getAnnotation(pod, k); ok {
		return pod, nil
	}

	updated := pod.DeepCopy()
	setAnnotation(&updated.ObjectMeta, k, v)
	if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
		log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", k))
		return nil, err
	}

	log.V(2).Info("Annotated pod", "key", k, "value", v)

	return updated, nil
}
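
// Illustrative usage sketch (not part of the original code): because annotatePodOnce is a no-op when
// the annotation already exists, it can be called on every reconciliation without overwriting the
// originally recorded timestamp. The annotation key below is the real one used by this package; the
// surrounding variables are assumptions for the example.
//
//	pod, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))
//	if err != nil {
//		return err
//	}
//	// Calling it again later returns the pod unchanged and keeps the first timestamp.
//	pod, _ = annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationStartTimestamp, time.Now().Format(time.RFC3339))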

// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
	var runnerID *int64

	if id, ok := getAnnotation(pod, AnnotationKeyRunnerID); ok {
		v, err := strconv.ParseInt(id, 10, 64)
		if err != nil {
			return &ctrl.Result{}, err
		}

		runnerID = &v
	}

	if runnerID == nil {
		runner, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
		if err != nil {
			return &ctrl.Result{}, err
		}

		if runner != nil && runner.ID != nil {
			runnerID = runner.ID
		}
	}

	code := runnerContainerExitCode(pod)

	if pod != nil && pod.Annotations[AnnotationKeyUnregistrationCompleteTimestamp] != "" {
		// If it's already unregistered in the previous reconciliation loop,
		// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
		log.Info("Runner pod is marked as already unregistered.")
	} else if runnerID == nil && !runnerPodOrContainerIsStopped(pod) && !podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) &&
		!podIsPending(pod) {

		log.Info(
			"Unregistration started before runner obtains ID. Waiting for the registration timeout to elapse, or the runner to obtain ID, or the runner pod to stop",
			"registrationTimeout", registrationTimeout,
		)
		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	} else if runnerID == nil && podIsPending(pod) {
		// Note: This logic is here to prevent a dead-lock between ARC and the PV provider.
		//
		// The author of this logic thinks that some (or all?) of CSI plugins and PV providers
		// do not support provisioning dynamic PVs for a pod that is already marked for deletion.
		// If we didn't handle this case here, ARC would end up waiting forever until the
		// PV provider(s) provision PVs for the pod, which seems to never happen.
		//
		// For reference, the below is an example of pod.status that you might see when it happened:
		// status:
		//  conditions:
		//  - lastProbeTime: null
		//    lastTransitionTime: "2022-11-04T00:04:05Z"
		//    message: 'binding rejected: running Bind plugin "DefaultBinder": Operation cannot
		//      be fulfilled on pods/binding "org-runnerdeploy-xv2lg-pm6t2": pod org-runnerdeploy-xv2lg-pm6t2
		//      is being deleted, cannot be assigned to a host'
		//    reason: SchedulerError
		//    status: "False"
		//    type: PodScheduled
		//  phase: Pending
		//  qosClass: BestEffort
		log.Info(
			"Unregistration started before runner pod gets scheduled onto a node. "+
				"Perhaps the runner is taking a long time due to e.g. a slow CSI plugin not giving us a PV in a timely manner, or your Kubernetes cluster is overloaded? "+
				"Marking unregistration as completed anyway because there's nothing ARC can do.",
			"registrationTimeout", registrationTimeout,
		)
	} else if runnerID == nil && runnerPodOrContainerIsStopped(pod) {
		log.Info(
			"Unregistration started before runner ID is assigned and the runner stopped before obtaining ID within registration timeout. "+
				"Perhaps the runner successfully ran the job and stopped normally before the runner ID becomes visible via GitHub API? "+
				"Perhaps the runner pod was terminated by anyone other than ARC? Was it OOM killed? "+
				"Marking unregistration as completed anyway because there's nothing ARC can do.",
			"registrationTimeout", registrationTimeout,
		)
	} else if runnerID == nil && podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationTimeout) {
		log.Info(
			"Unregistration started before runner ID is assigned and the runner was unable to obtain ID within registration timeout. "+
				"Perhaps the runner has a communication issue, or a firewall egress rule is dropping traffic to GitHub API, or GitHub API is unavailable? "+
				"Marking unregistration as completed anyway because there's nothing ARC can do. "+
				"This may result in cancelling the job depending on your terminationGracePeriodSeconds and RUNNER_GRACEFUL_STOP_TIMEOUT settings.",
			"registrationTimeout", registrationTimeout,
		)
	} else if pod != nil && runnerPodOrContainerIsStopped(pod) {
		// If it's an ephemeral runner with the actions/runner container exited with 0,
		// we can safely assume that it has unregistered itself from GitHub Actions
		// so it's natural that RemoveRunner fails due to 404.

		// If the pod has ended up succeeded we need to restart it.
		// Happens e.g. when dind is in the runner and the run completes.
		log.Info("Runner pod has been stopped with a successful status.")
	} else if pod != nil && pod.Annotations[AnnotationKeyRunnerCompletionWaitStartTimestamp] != "" {
		ct := ephemeralRunnerContainerStatus(pod)
		if ct == nil {
			log.Info("Runner pod is annotated to wait for completion, and the runner container is not ephemeral")

			return &ctrl.Result{RequeueAfter: retryDelay}, nil
		}

		lts := ct.LastTerminationState.Terminated
		if lts == nil {
			log.Info("Runner pod is annotated to wait for completion, and the runner container is not restarting")

			return &ctrl.Result{RequeueAfter: retryDelay}, nil
		}

		// Prevent the runner pod from getting stuck in Terminating.
		// See https://github.com/actions/actions-runner-controller/issues/1369
		log.Info("Deleting runner pod anyway because it has stopped prematurely. This may leave a dangling runner resource in GitHub Actions",
			"lastState.exitCode", lts.ExitCode,
			"lastState.message", lts.Message,
			"pod.phase", pod.Status.Phase,
		)
	} else if ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, *runnerID); err != nil {
		if errors.Is(err, &gogithub.RateLimitError{}) {
			// We log the underlying error when we failed calling GitHub API to list or unregister runners,
			// or the runner is still busy.
			log.Error(
				err,
				fmt.Sprintf(
					"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
					retryDelayOnGitHubAPIRateLimitError,
				),
			)

			return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
		}

		log.V(1).Info("Failed to unregister runner before deleting the pod.", "error", err)

		var (
			runnerBusy                         bool
			runnerUnregistrationFailureMessage string
		)

		errRes := &gogithub.ErrorResponse{}
		if errors.As(err, &errRes) {
			if errRes.Response.StatusCode == 403 {
				log.Error(err, "Unable to unregister due to permission error. "+
					"Perhaps you've changed the permissions of your PAT or GitHub App, or you updated the authentication method of ARC in a wrong way? "+
					"ARC considers it as already unregistered and continues removing the pod. "+
					"You may need to remove the runner on the GitHub UI.")

				return nil, nil
			}

			runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)

			var runnerID int64

			if runner != nil && runner.ID != nil {
				runnerID = *runner.ID
			}

			runnerBusy = errRes.Response.StatusCode == 422
			runnerUnregistrationFailureMessage = errRes.Message

			if runnerBusy && code != nil {
				log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
					"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
					"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
					"runnerExitCode", *code,
					"runnerID", runnerID,
				)

				return nil, nil
			}
		}

		if runnerBusy {
			_, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyUnregistrationFailureMessage, runnerUnregistrationFailureMessage)
			if err != nil {
				return &ctrl.Result{}, err
			}

			// We want to prevent spamming the deletion attempts, but returning ctrl.Result with RequeueAfter doesn't
			// work as the reconciliation can happen earlier due to a pod status update.
			// For ephemeral runners, we can expect it to stop and unregister itself on completion.
			// So we can just wait for the completion without actively retrying unregistration.
			ephemeral := getRunnerEnv(pod, EnvVarEphemeral)
			if ephemeral == "true" {
				_, err = annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerCompletionWaitStartTimestamp, time.Now().Format(time.RFC3339))
				if err != nil {
					return &ctrl.Result{}, err
				}

				return &ctrl.Result{}, nil
			}

			log.V(2).Info("Retrying runner unregistration because the static runner is still busy")
			// Otherwise we may end up spamming 422 errors,
			// each call consuming GitHub API rate limit
			// https://github.com/actions/actions-runner-controller/pull/1167#issuecomment-1064213271
			return &ctrl.Result{RequeueAfter: retryDelay}, nil
		}

		return &ctrl.Result{}, err
	} else if ok {
		log.Info("Runner has just been unregistered.")
	} else if pod == nil {
		// `r.unregisterRunner()` will return `false, nil` if the runner is not found on GitHub.
		// However, that doesn't always mean the pod can be safely removed.
		//
		// If the pod does not exist for the runner,
		// it may be because the runner pod has never been created.
		// In that case we can safely assume that the runner will never be registered.

		log.Info("Runner was not found on GitHub and the runner pod was not found on Kubernetes.")
	} else if ts := pod.Annotations[AnnotationKeyUnregistrationStartTimestamp]; ts != "" {
		log.Info("Runner unregistration is in-progress. It can take forever to complete if it's a static runner constantly running jobs."+
			" It can also take a very long time if it's an ephemeral runner that is running a long-running job.", "error", err)

		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	} else {
		// A runner and a runner pod that is created by this version of ARC should match
		// any of the above branches.
		//
		// But we leave this catch-all branch for potential backward-compatibility.
		// The caller is expected to take appropriate actions, like annotating the pod as having started the unregistration process,
		// and retry later.
		log.V(1).Info("Runner unregistration is being retried later.")

		return &ctrl.Result{RequeueAfter: retryDelay}, nil
	}

	return nil, nil
}

// ensureRunnerPodRegistered annotates the runner pod with the GitHub-assigned runner ID once the
// runner becomes visible via the GitHub API. It returns a non-nil *ctrl.Result when the caller
// should requeue and retry later.
func ensureRunnerPodRegistered(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
	_, hasRunnerID := getAnnotation(pod, AnnotationKeyRunnerID)
	if runnerPodOrContainerIsStopped(pod) || hasRunnerID {
		return pod, nil, nil
	}

	r, err := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
	if err != nil {
		return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
	}

	if r == nil || r.ID == nil {
		return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
	}

	id := *r.ID

	updated, err := annotatePodOnce(ctx, c, log, pod, AnnotationKeyRunnerID, fmt.Sprintf("%d", id))
	if err != nil {
		return nil, &ctrl.Result{RequeueAfter: 10 * time.Second}, err
	}

	return updated, nil, nil
}

func getAnnotation(obj client.Object, key string) (string, bool) {
	if obj.GetAnnotations() == nil {
		return "", false
	}

	v, ok := obj.GetAnnotations()[key]

	return v, ok
}

func setAnnotation(meta *metav1.ObjectMeta, key, value string) {
	if meta.Annotations == nil {
		meta.Annotations = map[string]string{}
	}

	meta.Annotations[key] = value
}

// podConditionTransitionTime returns the time at which the given condition last transitioned to the
// given status, or nil if the pod has no such condition.
func podConditionTransitionTime(pod *corev1.Pod, tpe corev1.PodConditionType, v corev1.ConditionStatus) *metav1.Time {
	for _, c := range pod.Status.Conditions {
		if c.Type == tpe && c.Status == v {
			return &c.LastTransitionTime
		}
	}

	return nil
}

// podConditionTransitionTimeAfter returns true if the given condition transitioned to True more than
// d ago. It returns false if the condition is not (or not yet) True.
func podConditionTransitionTimeAfter(pod *corev1.Pod, tpe corev1.PodConditionType, d time.Duration) bool {
	c := podConditionTransitionTime(pod, tpe, corev1.ConditionTrue)
	if c == nil {
		return false
	}

	return c.Add(d).Before(time.Now())
}

func podIsPending(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodPending
}

func podRunnerID(pod *corev1.Pod) string {
	id, _ := getAnnotation(pod, AnnotationKeyRunnerID)
	return id
}

func getRunnerEnv(pod *corev1.Pod, key string) string {
	for _, c := range pod.Spec.Containers {
		if c.Name == containerName {
			for _, e := range c.Env {
				if e.Name == key {
					return e.Value
				}
			}
		}
	}
	return ""
}

func setRunnerEnv(pod *corev1.Pod, key, value string) {
	for i := range pod.Spec.Containers {
		c := pod.Spec.Containers[i]
		if c.Name == containerName {
			for j, env := range c.Env {
				if env.Name == key {
					pod.Spec.Containers[i].Env[j].Value = value
					return
				}
			}
			pod.Spec.Containers[i].Env = append(c.Env, corev1.EnvVar{Name: key, Value: value})
		}
	}
}

// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
//
// Case 1. (true, nil) when it has successfully unregistered the runner.
// Case 2. (false, nil) when (2-1.) the runner has already been unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered (hence we couldn't see its existence via the GitHub Actions API yet)
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
//
//	an error returned by GitHub API.
//
// When the returned value is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
// In other words, those three sub-cases cannot be distinguished by this function alone.
//
//   - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconciliation loop or it was an ephemeral runner that finished its job run (an ephemeral runner is designed to stop after a job run).
//     You'd need to maintain the runner state (i.e. whether it's already unregistered or not) somewhere,
//     so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
//
//   - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
//     Waiting and retrying forever in this case is not a solution, because `config.sh` won't succeed with a wrong token, hence the runner gets stuck in this state forever.
//     There isn't a perfect solution to this, but a practical workaround would be to implement a "grace period" on the caller side.
//
//   - Case "2-3." can happen when e.g. ARC recreated an ephemeral runner pod in a previous reconciliation loop and then it was requested to delete the runner before the runner comes up.
//     If handled inappropriately, this can cause a race condition between a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
//
// Once you have successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
// as long as you don't recreate the runner pod.
//
// If it was "2-3.", you need a workaround to avoid the race condition.
//
// You shall introduce a "grace period" mechanism, similar or equal to the one required for "Case 2-2.", so that you only
// start the runner pod deletion after it's more and more likely that the runner pod is not coming up.
//
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo string, id int64) (bool, error) {
	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but that's no longer true.
	// The reason ARC did so was to let a runner running a job not stop prematurely.
	//
	// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
	// so ARC doesn't need to do anything special for a graceful runner stop.
	// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
	// the runner pod is safe for deletion.
	//
	// Trying to remove a busy runner can result in errors like the following:
	//    failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
	//
	// # NOTES
	//
	// - It can be "status=offline" at the same time but that's another story.
	// - After https://github.com/actions/actions-runner-controller/pull/1127, ListRunners responses that are used to
	//   determine if the runner is busy can be more outdated than before, as those responses are now cached for 60 seconds.
	// - Note that the 60 seconds is controlled by the Cache-Control response header provided by GitHub, so we don't have strict control over it, but we assume it won't
	//   change from 60 seconds.
	//
	// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
	if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
		return false, err
	}

	return true, nil
}
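
// Illustrative sketch (not part of the original code) of the "grace period" workaround described in
// the comment above for the ambiguous (false, nil) result: only treat the runner as permanently gone
// once enough time has passed since the pod became Ready without the runner ever showing up in the
// GitHub API. The registrationGracePeriod value and the choice to reuse podConditionTransitionTimeAfter
// are assumptions for the example.
//
//	const registrationGracePeriod = 10 * time.Minute
//
//	ok, err := unregisterRunner(ctx, ghClient, enterprise, org, repo, id)
//	if err != nil {
//		// Case 3: busy runner or API failure; requeue and retry later.
//	} else if !ok {
//		if podConditionTransitionTimeAfter(pod, corev1.PodReady, registrationGracePeriod) {
//			// Grace period elapsed: likely case 2-1 or 2-2, so the pod is safe to delete.
//		} else {
//			// Possibly case 2-3: the runner may still be about to register; requeue instead of deleting.
//		}
//	}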

// getRunner returns the GitHub Actions runner whose name matches the given name, or (nil, nil) if no
// such runner is currently registered.
func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) {
	runners, err := client.ListRunners(ctx, enterprise, org, repo)
	if err != nil {
		return nil, err
	}

	for _, runner := range runners {
		if runner.GetName() == name {
			return runner, nil
		}
	}

	return nil, nil
}