diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index 17976231..1bfa8bcc 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -404,14 +404,31 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } - // Delete current pod if recreation is needed - if err := r.Delete(ctx, &pod); err != nil { - log.Error(err, "Failed to delete pod resource") - return ctrl.Result{}, err + // Try to delete current pod if recreation is needed + safeToDeletePod := false + ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) + if err != nil { + log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name) + } else { + // `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub. + if !ok { + log.Info("Runner no longer exists on GitHub", "runner", runner.Name) + } + + safeToDeletePod = true } - r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) - log.Info("Deleted runner pod", "repository", runner.Spec.Repository) + if safeToDeletePod { + // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service. + // This should help us avoid race condition between runner pickup job after we think the runner is not busy. + if err := r.Delete(ctx, &pod); err != nil { + log.Error(err, "Failed to delete pod resource") + return ctrl.Result{}, err + } + + r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) + log.Info("Deleted runner pod", "repository", runner.Spec.Repository) + } return ctrl.Result{}, nil }