Prevent runner pod deletion delay when pod disappeared before unregistration
commit 11be6c1fb6
parent 59c3288e87
@@ -447,6 +447,20 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 	return ctrl.Result{}, nil
 }
 
+func runnerContainerExitCode(pod *corev1.Pod) *int32 {
+	for _, status := range pod.Status.ContainerStatuses {
+		if status.Name != containerName {
+			continue
+		}
+
+		if status.State.Terminated != nil {
+			return &status.State.Terminated.ExitCode
+		}
+	}
+
+	return nil
+}
+
 func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
 	// If pod has ended up succeeded we need to restart it
 	// Happens e.g. when dind is in runner and run completes
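As an illustration only (not part of the commit), here is a minimal, self-contained sketch of how the new runnerContainerExitCode helper reads the runner container's terminated state. The containerName constant, the package main scaffolding, and the sample pod below are assumptions made purely to produce a runnable example; the helper body is copied from the hunk above for self-containment.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// Assumption: ARC addresses the runner container by this name.
const containerName = "runner"

// Copied from the diff above so the sketch compiles on its own.
func runnerContainerExitCode(pod *corev1.Pod) *int32 {
	for _, status := range pod.Status.ContainerStatuses {
		if status.Name != containerName {
			continue
		}

		if status.State.Terminated != nil {
			return &status.State.Terminated.ExitCode
		}
	}

	return nil
}

func main() {
	// A fabricated pod whose runner container has already terminated (e.g. OOM-killed).
	pod := &corev1.Pod{
		Status: corev1.PodStatus{
			ContainerStatuses: []corev1.ContainerStatus{
				{
					Name: containerName,
					State: corev1.ContainerState{
						Terminated: &corev1.ContainerStateTerminated{ExitCode: 137},
					},
				},
			},
		},
	}

	if code := runnerContainerExitCode(pod); code != nil {
		fmt.Printf("runner container exited with code %d\n", *code) // prints 137
	} else {
		fmt.Println("runner container has not terminated (or has no status yet)")
	}
}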
@@ -99,6 +99,30 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
 		log.Error(err, "Failed to unregister runner before deleting the pod.")
 
+		errRes := &gogithub.ErrorResponse{}
+		if errors.As(err, &errRes) {
+			code := runnerContainerExitCode(pod)
+
+			runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
+
+			var runnerID int64
+
+			if runner != nil && runner.ID != nil {
+				runnerID = *runner.ID
+			}
+
+			if errRes.Response.StatusCode == 422 && code != nil {
+				log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
+					"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
+					"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
+					"runnerExitCode", *code,
+					"runnerID", runnerID,
+				)
+
+				return nil, nil
+			}
+		}
+
 		return &ctrl.Result{}, err
 	} else if ok {
 		log.Info("Runner has just been unregistered. Removing the runner pod.")
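Again purely as an illustration (not part of the commit), a minimal sketch of the error-handling pattern used above: go-github surfaces API failures as *gogithub.ErrorResponse, so errors.As plus the HTTP status code identify the 422 "cannot unregister" case. The v39 import path and the message text are placeholders; ARC pins its own go-github version.

package main

import (
	"errors"
	"fmt"
	"net/http"

	gogithub "github.com/google/go-github/v39/github"
)

func main() {
	// Simulated failure from the runner-removal API call: GitHub answers
	// 422 Unprocessable Entity when the runner cannot be deleted.
	var err error = &gogithub.ErrorResponse{
		Response: &http.Response{StatusCode: http.StatusUnprocessableEntity},
		Message:  "runner cannot be deleted", // placeholder message
	}

	errRes := &gogithub.ErrorResponse{}
	if errors.As(err, &errRes) && errRes.Response.StatusCode == 422 {
		// This is the branch where the controller stops retrying unregistration,
		// logs the runner container's exit code, and deletes the pod anyway.
		fmt.Println("422 from GitHub; terminating the pod without unregistration")
	}
}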
@@ -204,23 +228,17 @@ func setAnnotation(pod *corev1.Pod, key, value string) {
 // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
 // while the shorter the grace period is, the more likely you may encounter the race issue.
 func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
-	runners, err := client.ListRunners(ctx, enterprise, org, repo)
+	runner, err := getRunner(ctx, client, enterprise, org, repo, name)
 	if err != nil {
 		return false, err
 	}
 
-	id := int64(0)
-	for _, runner := range runners {
-		if runner.GetName() == name {
-			id = runner.GetID()
-			break
-		}
-	}
-
-	if id == int64(0) {
+	if runner == nil || runner.ID == nil {
 		return false, nil
 	}
 
+	id := *runner.ID
+
 	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
 	// The reason ARC did so was to let a runner running a job to not stop prematurely.
 	//
@@ -247,3 +265,18 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or
 
 	return true, nil
 }
+
+func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) {
+	runners, err := client.ListRunners(ctx, enterprise, org, repo)
+	if err != nil {
+		return nil, err
+	}
+
+	for _, runner := range runners {
+		if runner.GetName() == name {
+			return runner, nil
+		}
+	}
+
+	return nil, nil
+}
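Finally, a small standalone sketch (not part of the commit) of the name-based lookup that getRunner performs over the ListRunners result, using go-github's Runner accessors. findByName is a hypothetical stand-in mirroring getRunner's loop, and the runner names and v39 import path are placeholders.

package main

import (
	"fmt"

	gogithub "github.com/google/go-github/v39/github"
)

// findByName mirrors the loop inside getRunner: return the registered runner
// whose name matches the pod name, or nil if it is not registered.
func findByName(runners []*gogithub.Runner, name string) *gogithub.Runner {
	for _, r := range runners {
		if r.GetName() == name {
			return r
		}
	}
	return nil
}

func main() {
	runners := []*gogithub.Runner{
		{ID: gogithub.Int64(101), Name: gogithub.String("runner-deployment-abc-xyz")},
		{ID: gogithub.Int64(102), Name: gogithub.String("runner-deployment-abc-uvw")},
	}

	if r := findByName(runners, "runner-deployment-abc-uvw"); r != nil {
		fmt.Printf("resolved runner ID %d for unregistration\n", r.GetID())
	} else {
		fmt.Println("runner not found; treat it as already unregistered")
	}
}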