From 11be6c1fb6c2bdb771b33b2f5b683efbde5d03d0 Mon Sep 17 00:00:00 2001 From: Yusuke Kuoka Date: Sun, 27 Feb 2022 11:55:06 +0000 Subject: [PATCH] Prevent runner pod deletion delay when pod disappeared before unregistration --- controllers/runner_controller.go | 14 ++++++++ controllers/runner_graceful_stop.go | 53 +++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 03c22b27..3b607f47 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -447,6 +447,20 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } +func runnerContainerExitCode(pod *corev1.Pod) *int32 { + for _, status := range pod.Status.ContainerStatuses { + if status.Name != containerName { + continue + } + + if status.State.Terminated != nil { + return &status.State.Terminated.ExitCode + } + } + + return nil +} + func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { // If pod has ended up succeeded we need to restart it // Happens e.g. when dind is in runner and run completes diff --git a/controllers/runner_graceful_stop.go b/controllers/runner_graceful_stop.go index 83e4b41e..8a6dda2c 100644 --- a/controllers/runner_graceful_stop.go +++ b/controllers/runner_graceful_stop.go @@ -99,6 +99,30 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time. log.Error(err, "Failed to unregister runner before deleting the pod.") + errRes := &gogithub.ErrorResponse{} + if errors.As(err, &errRes) { + code := runnerContainerExitCode(pod) + + runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner) + + var runnerID int64 + + if runner != nil && runner.ID != nil { + runnerID = *runner.ID + } + + if errRes.Response.StatusCode == 422 && code != nil { + log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+ + "This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+ + "ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API", + "runnerExitCode", *code, + "runnerID", runnerID, + ) + + return nil, nil + } + } + return &ctrl.Result{}, err } else if ok { log.Info("Runner has just been unregistered. Removing the runner pod.") @@ -204,23 +228,17 @@ func setAnnotation(pod *corev1.Pod, key, value string) { // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions, // while the shorter the grace period is, the more likely you may encounter the race issue. func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) { - runners, err := client.ListRunners(ctx, enterprise, org, repo) + runner, err := getRunner(ctx, client, enterprise, org, repo, name) if err != nil { return false, err } - id := int64(0) - for _, runner := range runners { - if runner.GetName() == name { - id = runner.GetID() - break - } - } - - if id == int64(0) { + if runner == nil || runner.ID == nil { return false, nil } + id := *runner.ID + // For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true. // The reason ARC did so was to let a runner running a job to not stop prematurely. // @@ -247,3 +265,18 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or return true, nil } + +func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) { + runners, err := client.ListRunners(ctx, enterprise, org, repo) + if err != nil { + return nil, err + } + + for _, runner := range runners { + if runner.GetName() == name { + return runner, nil + } + } + + return nil, nil +}