Introduce consistent timeouts for runner unregistration and runner pod deletion
Enhances runner controller and runner pod controller to have consistent timeouts for runner unregistration and runner pod deletion, so that we are very much unlikely to terminate pods that are running any jobs.
This commit is contained in:
parent
9e356b419e
commit
3c16188371
|
|
@ -110,8 +110,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
var p *corev1.Pod
|
||||||
|
|
||||||
|
{
|
||||||
|
var pod corev1.Pod
|
||||||
|
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
|
||||||
|
if !kerrors.IsNotFound(err) {
|
||||||
|
log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
|
||||||
|
|
||||||
|
return ctrl.Result{Requeue: true}, nil
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
p = &pod
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
|
// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
|
||||||
return r.processRunnerDeletion(runner, ctx, log)
|
return r.processRunnerDeletion(runner, ctx, log, p)
|
||||||
}
|
}
|
||||||
|
|
||||||
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
|
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
|
||||||
|
|
@ -159,21 +174,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||||
|
|
||||||
// If pod has ended up succeeded we need to restart it
|
// If pod has ended up succeeded we need to restart it
|
||||||
// Happens e.g. when dind is in runner and run completes
|
// Happens e.g. when dind is in runner and run completes
|
||||||
stopped := pod.Status.Phase == corev1.PodSucceeded
|
stopped := runnerPodOrContainerIsStopped(&pod)
|
||||||
|
|
||||||
if !stopped {
|
|
||||||
if pod.Status.Phase == corev1.PodRunning {
|
|
||||||
for _, status := range pod.Status.ContainerStatuses {
|
|
||||||
if status.Name != containerName {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
|
|
||||||
stopped = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
restart := stopped
|
restart := stopped
|
||||||
|
|
||||||
|
|
@ -404,64 +405,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try to delete current pod if recreation is needed
|
updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
|
||||||
safeToDeletePod := false
|
if res != nil {
|
||||||
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
return *res, err
|
||||||
if err != nil {
|
|
||||||
log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
|
|
||||||
} else {
|
|
||||||
// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
|
|
||||||
if !ok {
|
|
||||||
log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
|
|
||||||
}
|
|
||||||
|
|
||||||
safeToDeletePod = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if safeToDeletePod {
|
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
|
||||||
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
|
// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
|
||||||
// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
|
if err := r.Delete(ctx, updatedPod); err != nil {
|
||||||
if err := r.Delete(ctx, &pod); err != nil {
|
log.Error(err, "Failed to delete pod resource")
|
||||||
log.Error(err, "Failed to delete pod resource")
|
return ctrl.Result{}, err
|
||||||
return ctrl.Result{}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
|
|
||||||
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
|
||||||
|
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
|
||||||
|
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
|
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
|
||||||
|
// If pod has ended up succeeded we need to restart it
|
||||||
|
// Happens e.g. when dind is in runner and run completes
|
||||||
|
stopped := pod.Status.Phase == corev1.PodSucceeded
|
||||||
|
|
||||||
|
if !stopped {
|
||||||
|
if pod.Status.Phase == corev1.PodRunning {
|
||||||
|
for _, status := range pod.Status.ContainerStatuses {
|
||||||
|
if status.Name != containerName {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
|
||||||
|
stopped = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return stopped
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
|
||||||
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
||||||
|
|
||||||
if removed {
|
if removed {
|
||||||
if len(runner.Status.Registration.Token) > 0 {
|
_, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
|
||||||
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
if res != nil {
|
||||||
if err != nil {
|
return *res, err
|
||||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
|
||||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
|
||||||
// or the runner is still busy.
|
|
||||||
log.Error(
|
|
||||||
err,
|
|
||||||
fmt.Sprintf(
|
|
||||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
|
||||||
retryDelayOnGitHubAPIRateLimitError,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return ctrl.Result{}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if !ok {
|
|
||||||
log.V(1).Info("Runner no longer exists on GitHub")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
log.V(1).Info("Runner was never registered on GitHub")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
newRunner := runner.DeepCopy()
|
newRunner := runner.DeepCopy()
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,239 @@
|
||||||
|
package controllers
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||||
|
"github.com/go-logr/logr"
|
||||||
|
gogithub "github.com/google/go-github/v39/github"
|
||||||
|
corev1 "k8s.io/api/core/v1"
|
||||||
|
ctrl "sigs.k8s.io/controller-runtime"
|
||||||
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
|
||||||
|
unregistrationStartTimestamp = "unregistration-start-timestamp"
|
||||||
|
)
|
||||||
|
|
||||||
|
// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
|
||||||
|
// we can delete the runner pod without disrupting a workflow job.
|
||||||
|
//
|
||||||
|
// This function returns a non-nil pointer to corev1.Pod as the first return value
|
||||||
|
// if the runner is considered to have gracefully stopped, hence it's pod is safe for deletion.
|
||||||
|
//
|
||||||
|
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
|
||||||
|
// This function is designed to complete a length graceful stop process in a unblocking way.
|
||||||
|
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value.
|
||||||
|
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry.
|
||||||
|
func tickRunnerGracefulStop(ctx context.Context, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
|
||||||
|
if pod != nil {
|
||||||
|
if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
|
||||||
|
updated := pod.DeepCopy()
|
||||||
|
setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
|
||||||
|
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
|
||||||
|
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
|
||||||
|
return nil, &ctrl.Result{}, err
|
||||||
|
}
|
||||||
|
pod = updated
|
||||||
|
|
||||||
|
log.Info("Runner has started unregistration")
|
||||||
|
} else {
|
||||||
|
log.Info("Runner has already started unregistration")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if res, err := ensureRunnerUnregistration(ctx, 10*time.Minute, 30*time.Second, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
|
||||||
|
return nil, res, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if pod != nil {
|
||||||
|
if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
|
||||||
|
updated := pod.DeepCopy()
|
||||||
|
setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
|
||||||
|
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
|
||||||
|
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
|
||||||
|
return nil, &ctrl.Result{}, err
|
||||||
|
}
|
||||||
|
pod = updated
|
||||||
|
|
||||||
|
log.Info("Runner has completed unregistration")
|
||||||
|
} else {
|
||||||
|
log.Info("Runner has already completed unregistration")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return pod, nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the first return value is nil, it's safe to delete the runner pod.
|
||||||
|
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
|
||||||
|
ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||||
|
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||||
|
// or the runner is still busy.
|
||||||
|
log.Error(
|
||||||
|
err,
|
||||||
|
fmt.Sprintf(
|
||||||
|
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||||
|
retryDelayOnGitHubAPIRateLimitError,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Error(err, "Failed to unregister runner before deleting the pod.")
|
||||||
|
|
||||||
|
return &ctrl.Result{}, err
|
||||||
|
} else if ok {
|
||||||
|
log.Info("Runner has just been unregistered. Removing the runner pod.")
|
||||||
|
} else if pod == nil {
|
||||||
|
// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
|
||||||
|
// However, that doesn't always mean the pod can be safely removed.
|
||||||
|
//
|
||||||
|
// If the pod does not exist for the runner,
|
||||||
|
// it may be due to that the runner pod has never been created.
|
||||||
|
// In that case we can safely assume that the runner will never be registered.
|
||||||
|
|
||||||
|
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
|
||||||
|
} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
|
||||||
|
// If it's already unregistered in the previous reconcilation loop,
|
||||||
|
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
|
||||||
|
log.Info("Runner pod is marked as already unregistered.")
|
||||||
|
} else if runnerPodOrContainerIsStopped(pod) {
|
||||||
|
// If it's an ephemeral runner with the actions/runner container exited with 0,
|
||||||
|
// we can safely assume that it has unregistered itself from GitHub Actions
|
||||||
|
// so it's natural that RemoveRunner fails due to 404.
|
||||||
|
|
||||||
|
// If pod has ended up succeeded we need to restart it
|
||||||
|
// Happens e.g. when dind is in runner and run completes
|
||||||
|
log.Info("Runner pod has been stopped with a successful status.")
|
||||||
|
} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
|
||||||
|
t, err := time.Parse(time.RFC3339, ts)
|
||||||
|
if err != nil {
|
||||||
|
return &ctrl.Result{RequeueAfter: retryDelay}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
|
||||||
|
log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
|
||||||
|
return &ctrl.Result{RequeueAfter: retryDelay}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
|
||||||
|
} else {
|
||||||
|
// A runner and a runner pod that is created by this version of ARC should match
|
||||||
|
// any of the above branches.
|
||||||
|
//
|
||||||
|
// But we leave this match all branch for potential backward-compatibility.
|
||||||
|
// The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process,
|
||||||
|
// and retry later.
|
||||||
|
log.V(1).Info("Runner unregistration is being retried later.")
|
||||||
|
|
||||||
|
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
|
||||||
|
if pod.Annotations == nil {
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
v, ok := pod.Annotations[key]
|
||||||
|
|
||||||
|
return v, ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func setAnnotation(pod *corev1.Pod, key, value string) {
|
||||||
|
if pod.Annotations == nil {
|
||||||
|
pod.Annotations = map[string]string{}
|
||||||
|
}
|
||||||
|
|
||||||
|
pod.Annotations[key] = value
|
||||||
|
}
|
||||||
|
|
||||||
|
// unregisterRunner unregisters the runner from GitHub Actions by name.
|
||||||
|
//
|
||||||
|
// This function returns:
|
||||||
|
//
|
||||||
|
// Case 1. (true, nil) when it has successfully unregistered the runner.
|
||||||
|
// Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered(hence we couldn't see it's existence from GitHub Actions API yet)
|
||||||
|
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
|
||||||
|
// an error returned by GitHub API.
|
||||||
|
//
|
||||||
|
// When the returned values is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
|
||||||
|
// In other words, all those three sub-cases cannot be distinguished by this function alone.
|
||||||
|
//
|
||||||
|
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run).
|
||||||
|
// You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere,
|
||||||
|
// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
|
||||||
|
//
|
||||||
|
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
|
||||||
|
// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever.
|
||||||
|
// There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side.
|
||||||
|
//
|
||||||
|
// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up.
|
||||||
|
// If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
|
||||||
|
//
|
||||||
|
// Once successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
|
||||||
|
// as long as you recreate the runner pod.
|
||||||
|
//
|
||||||
|
// If it was "2-3.", you need a workaround to avoid the race condition.
|
||||||
|
//
|
||||||
|
// You shall introduce a "grace period" mechanism, similar or equal to that is required for "Case 2-2.", so that you ever
|
||||||
|
// start the runner pod deletion only after it's more and more likely that the runner pod is not coming up.
|
||||||
|
//
|
||||||
|
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
|
||||||
|
// There isn't a single right grace period that works for everyone.
|
||||||
|
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
|
||||||
|
// while the shorter the grace period is, the more likely you may encounter the race issue.
|
||||||
|
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
|
||||||
|
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
id := int64(0)
|
||||||
|
for _, runner := range runners {
|
||||||
|
if runner.GetName() == name {
|
||||||
|
id = runner.GetID()
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if id == int64(0) {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
|
||||||
|
// The reason ARC did so was to let a runner running a job to not stop prematurely.
|
||||||
|
//
|
||||||
|
// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
|
||||||
|
// so ARC doesn't need to do anything special for a graceful runner stop.
|
||||||
|
// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
|
||||||
|
// the runner pod is safe for deletion.
|
||||||
|
//
|
||||||
|
// Trying to remove a busy runner can result in errors like the following:
|
||||||
|
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
|
||||||
|
//
|
||||||
|
// # NOTES
|
||||||
|
//
|
||||||
|
// - It can be "status=offline" at the same time but that's another story.
|
||||||
|
// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
|
||||||
|
// determine if the runner is busy can be more outdated than before, as those responeses are now cached for 60 seconds.
|
||||||
|
// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
|
||||||
|
// change from 60 seconds.
|
||||||
|
//
|
||||||
|
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
|
||||||
|
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
@ -105,33 +105,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||||
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
|
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
|
||||||
|
|
||||||
if removed {
|
if removed {
|
||||||
ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
|
updatedPod, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||||
if err != nil {
|
if res != nil {
|
||||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
return *res, err
|
||||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
|
||||||
// or the runner is still busy.
|
|
||||||
log.Error(
|
|
||||||
err,
|
|
||||||
fmt.Sprintf(
|
|
||||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
|
||||||
retryDelayOnGitHubAPIRateLimitError,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return ctrl.Result{}, err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if !ok {
|
patchedPod := updatedPod.DeepCopy()
|
||||||
log.V(1).Info("Runner no longer exists on GitHub")
|
patchedPod.ObjectMeta.Finalizers = finalizers
|
||||||
}
|
|
||||||
|
|
||||||
newRunner := runnerPod.DeepCopy()
|
// We commit the removal of the finalizer so that Kuberenetes notices it and delete the pod resource from the cluster.
|
||||||
newRunner.ObjectMeta.Finalizers = finalizers
|
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
|
||||||
|
|
||||||
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
|
|
||||||
log.Error(err, "Failed to update runner for finalizer removal")
|
log.Error(err, "Failed to update runner for finalizer removal")
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
|
|
@ -365,8 +348,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
updated, res, err := tickRunnerGracefulStop(ctx, log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||||
|
if res != nil {
|
||||||
|
return *res, err
|
||||||
|
}
|
||||||
|
|
||||||
// Delete current pod if recreation is needed
|
// Delete current pod if recreation is needed
|
||||||
if err := r.Delete(ctx, &runnerPod); err != nil {
|
if err := r.Delete(ctx, updated); err != nil {
|
||||||
log.Error(err, "Failed to delete pod resource")
|
log.Error(err, "Failed to delete pod resource")
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
|
|
@ -377,10 +365,6 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
|
|
||||||
return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||||
name := "runnerpod-controller"
|
name := "runnerpod-controller"
|
||||||
if r.Name != "" {
|
if r.Name != "" {
|
||||||
|
|
|
||||||
|
|
@ -1,49 +0,0 @@
|
||||||
package controllers
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
|
||||||
)
|
|
||||||
|
|
||||||
// unregisterRunner unregisters the runner from GitHub Actions by name.
|
|
||||||
//
|
|
||||||
// This function returns:
|
|
||||||
// - (true, nil) when it has successfully unregistered the runner.
|
|
||||||
// - (false, nil) when the runner has been already unregistered.
|
|
||||||
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
|
|
||||||
// an error returned by GitHub API.
|
|
||||||
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
|
|
||||||
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
|
||||||
if err != nil {
|
|
||||||
return false, err
|
|
||||||
}
|
|
||||||
|
|
||||||
id := int64(0)
|
|
||||||
for _, runner := range runners {
|
|
||||||
if runner.GetName() == name {
|
|
||||||
// Note that sometimes a runner can stuck "busy" even though it is already "offline".
|
|
||||||
// But we assume that it's not actually offline and still running a job.
|
|
||||||
if runner.GetBusy() {
|
|
||||||
return false, fmt.Errorf("runner is busy")
|
|
||||||
}
|
|
||||||
id = runner.GetID()
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if id == int64(0) {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trying to remove a busy runner can result in errors like the following:
|
|
||||||
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
|
|
||||||
//
|
|
||||||
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
|
|
||||||
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
|
||||||
return false, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return true, nil
|
|
||||||
}
|
|
||||||
Loading…
Reference in New Issue