Introduce pod deletion timeout and forcefully delete stuck pods (#307)

* If a k8s node becomes unresponsive, the kube controller soft-deletes all of its pods after the eviction timeout (default 5 minutes). * As long as the node stays unresponsive, the pod never leaves its last status, so the runner controller assumes that everything is fine with the pod and does not try to create new pods. * This can result in a situation where a horizontal autoscaler thinks that none of its runners are currently busy and does not schedule any further runners / pods, leaving the runner deployment broken until the runnerreplicaset is deleted or the node comes back online. * Introduce a pod deletion timeout (1 minute) after which the runner controller tries to reboot the runner and create a pod on a working node. * Use forceful deletion and requeue for pods that have been stuck in the terminating state for more than one minute. * Gracefully handle race conditions if the pod finally gets forcefully deleted within …
2021-02-15 01:32:28 +01:00 · 2021-02-15 01:32:28 +01:00 · 9c8d7305f1
parent addcbfa7ee
commit 9c8d7305f1
1 changed files with 33 additions and 1 deletions
--- a/controllers/runner_controller.go
+++ b/controllers/runner_controller.go
@ -185,8 +185,40 @@ func (r *RunnerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 		}

 		if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
+			deletionTimeout := 1 * time.Minute
+			currentTime := time.Now()
+			deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
+
+			if deletionDidTimeout {
+				log.Info(
+					"Pod failed to delete itself in a timely manner. "+
+						"This is typically the case when a Kubernetes node became unreachable "+
+						"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
+					"podDeletionTimestamp", pod.DeletionTimestamp,
+					"currentTime", currentTime,
+					"configuredDeletionTimeout", deletionTimeout,
+				)
+
+				var force int64 = 0
+				// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
+				if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
+					// probably
+					if !kerrors.IsNotFound(err) {
+						log.Error(err, "Failed to forcefully delete pod resource ...")
 						return ctrl.Result{}, err
 					}
+					// forceful deletion finally succeeded
+					return ctrl.Result{Requeue: true}, nil
+				}
+
+				r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name))
+				log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository)
+				// give kube manager a little time to forcefully delete the stuck pod
+				return ctrl.Result{RequeueAfter: 3 * time.Second}, err
+			} else {
+				return ctrl.Result{}, err
+			}
+		}

 		if pod.Status.Phase == corev1.PodRunning {
 			for _, status := range pod.Status.ContainerStatuses {