Prevent runner pod deletion delay when pod disappeared before unregistration

This commit is contained in:
Yusuke Kuoka 2022-02-27 11:55:06 +00:00
parent 59c3288e87
commit 11be6c1fb6
2 changed files with 57 additions and 10 deletions

View File

@ -447,6 +447,20 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, nil
}
// runnerContainerExitCode looks through the pod's container statuses for a
// container named containerName that is in the terminated state and returns a
// pointer to its exit code. It returns nil when no such terminated container
// is found.
func runnerContainerExitCode(pod *corev1.Pod) *int32 {
	statuses := pod.Status.ContainerStatuses
	for i := range statuses {
		terminated := statuses[i].State.Terminated
		// Only a terminated container with the expected name carries the exit code we want.
		if statuses[i].Name == containerName && terminated != nil {
			return &terminated.ExitCode
		}
	}
	return nil
}
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes

View File

@ -99,6 +99,30 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
log.Error(err, "Failed to unregister runner before deleting the pod.")
errRes := &gogithub.ErrorResponse{}
if errors.As(err, &errRes) {
code := runnerContainerExitCode(pod)
runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
var runnerID int64
if runner != nil && runner.ID != nil {
runnerID = *runner.ID
}
if errRes.Response.StatusCode == 422 && code != nil {
log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
"runnerExitCode", *code,
"runnerID", runnerID,
)
return nil, nil
}
}
return &ctrl.Result{}, err
} else if ok {
log.Info("Runner has just been unregistered. Removing the runner pod.")
@ -204,23 +228,17 @@ func setAnnotation(pod *corev1.Pod, key, value string) {
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
runner, err := getRunner(ctx, client, enterprise, org, repo, name)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
id = runner.GetID()
break
}
}
if id == int64(0) {
if runner == nil || runner.ID == nil {
return false, nil
}
id := *runner.ID
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
// The reason ARC did so was to prevent a runner that is running a job from stopping prematurely.
//
@ -247,3 +265,18 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or
return true, nil
}
// getRunner lists the runners for the given enterprise, org, and repo via
// client.ListRunners and returns the first one whose name equals name.
//
// It returns (nil, nil) when the listing succeeds but no runner matches, and
// (nil, err) when the listing itself fails.
func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) {
	all, listErr := client.ListRunners(ctx, enterprise, org, repo)
	if listErr != nil {
		return nil, listErr
	}

	for i := range all {
		if candidate := all[i]; candidate.GetName() == name {
			return candidate, nil
		}
	}

	// No runner with the requested name was found in the listing.
	return nil, nil
}