Prevent runner pod deletion delay when pod disappeared before unregistration
This commit is contained in:
		
							parent
							
								
									59c3288e87
								
							
						
					
					
						commit
						11be6c1fb6
					
				|  | @ -447,6 +447,20 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr | ||||||
| 	return ctrl.Result{}, nil | 	return ctrl.Result{}, nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | func runnerContainerExitCode(pod *corev1.Pod) *int32 { | ||||||
|  | 	for _, status := range pod.Status.ContainerStatuses { | ||||||
|  | 		if status.Name != containerName { | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		if status.State.Terminated != nil { | ||||||
|  | 			return &status.State.Terminated.ExitCode | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return nil | ||||||
|  | } | ||||||
|  | 
 | ||||||
| func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { | func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool { | ||||||
| 	// If pod has ended up succeeded we need to restart it
 | 	// If pod has ended up succeeded we need to restart it
 | ||||||
| 	// Happens e.g. when dind is in runner and run completes
 | 	// Happens e.g. when dind is in runner and run completes
 | ||||||
|  |  | ||||||
|  | @ -99,6 +99,30 @@ func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time. | ||||||
| 
 | 
 | ||||||
| 		log.Error(err, "Failed to unregister runner before deleting the pod.") | 		log.Error(err, "Failed to unregister runner before deleting the pod.") | ||||||
| 
 | 
 | ||||||
|  | 		errRes := &gogithub.ErrorResponse{} | ||||||
|  | 		if errors.As(err, &errRes) { | ||||||
|  | 			code := runnerContainerExitCode(pod) | ||||||
|  | 
 | ||||||
|  | 			runner, _ := getRunner(ctx, ghClient, enterprise, organization, repository, runner) | ||||||
|  | 
 | ||||||
|  | 			var runnerID int64 | ||||||
|  | 
 | ||||||
|  | 			if runner != nil && runner.ID != nil { | ||||||
|  | 				runnerID = *runner.ID | ||||||
|  | 			} | ||||||
|  | 
 | ||||||
|  | 			if errRes.Response.StatusCode == 422 && code != nil { | ||||||
|  | 				log.V(2).Info("Runner container has already stopped but the unregistration attempt failed. "+ | ||||||
|  | 					"This can happen when the runner container crashed due to an unhandled error, OOM, etc. "+ | ||||||
|  | 					"ARC terminates the pod anyway. You'd probably need to manually delete the runner later by calling the GitHub API", | ||||||
|  | 					"runnerExitCode", *code, | ||||||
|  | 					"runnerID", runnerID, | ||||||
|  | 				) | ||||||
|  | 
 | ||||||
|  | 				return nil, nil | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
| 		return &ctrl.Result{}, err | 		return &ctrl.Result{}, err | ||||||
| 	} else if ok { | 	} else if ok { | ||||||
| 		log.Info("Runner has just been unregistered. Removing the runner pod.") | 		log.Info("Runner has just been unregistered. Removing the runner pod.") | ||||||
|  | @ -204,23 +228,17 @@ func setAnnotation(pod *corev1.Pod, key, value string) { | ||||||
| // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
 | // The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
 | ||||||
| // while the shorter the grace period is, the more likely you may encounter the race issue.
 | // while the shorter the grace period is, the more likely you may encounter the race issue.
 | ||||||
| func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) { | func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) { | ||||||
| 	runners, err := client.ListRunners(ctx, enterprise, org, repo) | 	runner, err := getRunner(ctx, client, enterprise, org, repo, name) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return false, err | 		return false, err | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	id := int64(0) | 	if runner == nil || runner.ID == nil { | ||||||
| 	for _, runner := range runners { |  | ||||||
| 		if runner.GetName() == name { |  | ||||||
| 			id = runner.GetID() |  | ||||||
| 			break |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if id == int64(0) { |  | ||||||
| 		return false, nil | 		return false, nil | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	id := *runner.ID | ||||||
|  | 
 | ||||||
| 	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
 | 	// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
 | ||||||
| 	// The reason ARC did so was to let a runner running a job to not stop prematurely.
 | 	// The reason ARC did so was to let a runner running a job to not stop prematurely.
 | ||||||
| 	//
 | 	//
 | ||||||
|  | @ -247,3 +265,18 @@ func unregisterRunner(ctx context.Context, client *github.Client, enterprise, or | ||||||
| 
 | 
 | ||||||
| 	return true, nil | 	return true, nil | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | func getRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (*gogithub.Runner, error) { | ||||||
|  | 	runners, err := client.ListRunners(ctx, enterprise, org, repo) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return nil, err | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	for _, runner := range runners { | ||||||
|  | 		if runner.GetName() == name { | ||||||
|  | 			return runner, nil | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return nil, nil | ||||||
|  | } | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue