Fix RunerSet managed runner pods to terminate more gracefully (#1126)

Make RunnerSet-managed runners as reliable as RunnerDeployment-managed runners.

Ref https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1042404460
This commit is contained in:
Yusuke Kuoka 2022-02-19 21:19:37 +09:00 committed by GitHub
parent 921f547200
commit a5ed6bd263
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 58 additions and 61 deletions

View File

@ -531,32 +531,15 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a
return ctrl.Result{}, nil
}
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
// - (true, nil) when it has successfully unregistered the runner.
// - (false, nil) when the runner has been already unregistered.
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by GitHub API.
func (r *RunnerReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
if runner.GetBusy() {
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
}
func (r *RunnerReconciler) updateRegistrationToken(ctx context.Context, runner v1alpha1.Runner) (bool, error) {

View File

@ -378,42 +378,7 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
var busy bool
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
// Sometimes a runner can stuck "busy" even though it is already "offline".
// Thus removing the condition on status can block the runner pod from being terminated forever.
busy = runner.GetBusy()
if runner.GetStatus() != "offline" && busy {
r.Log.Info("This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy", "name", runner.GetName(), "status", runner.GetStatus(), "busy", runner.GetBusy())
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// Sometimes a runner can stuck "busy" even though it is already "offline".
// Trying to remove the offline but busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
if !busy {
if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
}
return true, nil
return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
}
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {

49
controllers/unregister.go Normal file
View File

@ -0,0 +1,49 @@
package controllers
import (
"context"
"fmt"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
// - (true, nil) when it has successfully unregistered the runner.
// - (false, nil) when the runner has been already unregistered.
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by GitHub API.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
// Note that sometimes a runner can stuck "busy" even though it is already "offline".
// But we assume that it's not actually offline and still running a job.
if runner.GetBusy() {
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}