Prevent runner pod deletion delay when pod disappeared before unregistration
This commit is contained in:
parent
59c3288e87
commit
11be6c1fb6
|
|
@ -447,6 +447,20 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
||||||
return ctrl.Result{}, nil
|
return ctrl.Result{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runnerContainerExitCode(pod *corev1.Pod) *int32 {
|
||||||
|
for _, status := range pod.Status.ContainerStatuses {
|
||||||
|
if status.Name != containerName {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if status.State.Terminated != nil {
|
||||||
|
return &status.State.Terminated.ExitCode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
|
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
|
||||||
// If pod has ended up succeeded we need to restart it
|
// If pod has ended up succeeded we need to restart it
|
||||||
// Happens e.g. when dind is in runner and run completes
|
// Happens e.g. when dind is in runner and run completes
|
||||||
|
|
|
||||||
|
|
@ -99,6 +99,30 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.
|
||||||
|
|
||||||
log.Error(err, "Failed to unregister runner before deleting the pod.")
|
log.Error(err, "Failed to unregister runner before deleting the pod.")
|
||||||
|
|
||||||
|
errRes := &gogithub.ErrorResponse{}
|
||||||
|
if errors.As(err, &errRes) {
|
||||||
|
code := runnerContainerExitCode(pod)
|
||||||
|
|
||||||
|
runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner)
|
||||||
|
|
||||||
|
var runnerID int64
|
||||||
|
|
||||||
|
if runner != nil && runner.ID != nil {
|
||||||
|
runnerID = *runner.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
if errRes.Response.StatusCode == 422 && code != nil {
|
||||||
|
log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+
|
||||||
|
"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+
|
||||||
|
"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API",
|
||||||
|
"runnerExitCode", *code,
|
||||||
|
"runnerID", runnerID,
|
||||||
|
)
|
||||||
|
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return &ctrl.Result{}, err
|
return &ctrl.Result{}, err
|
||||||
} else if ok {
|
} else if ok {
|
||||||
log.Info("Runner has just been unregistered. Removing the runner pod.")
|
log.Info("Runner has just been unregistered. Removing the runner pod.")
|
||||||
|
|
@ -204,23 +228,17 @@ func setAnnotation(pod *corev1.Pod, key, value string) {
|
||||||
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
|
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
|
||||||
// while the shorter the grace period is, the more likely you may encounter the race issue.
|
// while the shorter the grace period is, the more likely you may encounter the race issue.
|
||||||
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
|
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
|
||||||
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
runner, err := getRunner(ctx, client, enterprise, org, repo, name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
|
|
||||||
id := int64(0)
|
if runner == nil || runner.ID == nil {
|
||||||
for _, runner := range runners {
|
|
||||||
if runner.GetName() == name {
|
|
||||||
id = runner.GetID()
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if id == int64(0) {
|
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
id := *runner.ID
|
||||||
|
|
||||||
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
|
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
|
||||||
// The reason ARC did so was to let a runner running a job to not stop prematurely.
|
// The reason ARC did so was to let a runner running a job to not stop prematurely.
|
||||||
//
|
//
|
||||||
|
|
@ -247,3 +265,18 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or
|
||||||
|
|
||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) {
|
||||||
|
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, runner := range runners {
|
||||||
|
if runner.GetName() == name {
|
||||||
|
return runner, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue