Try to unconfig runner before deleting the pod to recreate (#1125)
There is a race condition between ARC and GitHub service about deleting runner pod. - The ARC use REST API to find a particular runner in a pod that is not running any jobs, so it decides to delete the pod. - A job is queued on the GitHub service side, and it sends the job to this idle runner right before ARC deletes the pod. - The ARC delete the runner pod which cause the in-progress job to end up canceled. To avoid this race condition, I am calling `r.unregisterRunner()` before deleting the pod. - `r.unregisterRunner()` will return 204 to indicate the runner is deleted from the GitHub service, we should be safe to delete the pod. - `r.unregisterRunner` will return 400 to indicate the runner is still running a job, so we will leave this runner pod as it is. TODO: I need to do some E2E tests to force the race condition to happen. Ref #911
This commit is contained in:
		
							parent
							
								
									a5ed6bd263
								
							
						
					
					
						commit
						0b9bef2c08
					
				| 
						 | 
				
			
			@ -404,7 +404,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 | 
			
		|||
		return ctrl.Result{}, nil
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Delete current pod if recreation is needed
 | 
			
		||||
	// Try to delete current pod if recreation is needed
 | 
			
		||||
	safeToDeletePod := false
 | 
			
		||||
	ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
 | 
			
		||||
	} else {
 | 
			
		||||
		// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
 | 
			
		||||
		if !ok {
 | 
			
		||||
			log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		safeToDeletePod = true
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if safeToDeletePod {
 | 
			
		||||
		// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
 | 
			
		||||
		// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
 | 
			
		||||
		if err := r.Delete(ctx, &pod); err != nil {
 | 
			
		||||
			log.Error(err, "Failed to delete pod resource")
 | 
			
		||||
			return ctrl.Result{}, err
 | 
			
		||||
| 
						 | 
				
			
			@ -412,6 +428,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 | 
			
		|||
 | 
			
		||||
		r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
 | 
			
		||||
		log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return ctrl.Result{}, nil
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue