/*
Copyright 2020 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package actionssummerwindnet

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"github.com/go-logr/logr"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	arcv1alpha1 "github.com/actions/actions-runner-controller/apis/actions.summerwind.net/v1alpha1"
	corev1 "k8s.io/api/core/v1"
)

// RunnerPodReconciler reconciles runner pods
type RunnerPodReconciler struct {
	client.Client
	Log          logr.Logger
	Recorder     record.EventRecorder
	Scheme       *runtime.Scheme
	GitHubClient *MultiGitHubClient
	Name         string

	RegistrationRecheckInterval time.Duration
	RegistrationRecheckJitter   time.Duration

	UnregistrationRetryDelay time.Duration
}

// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch

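// Reconcile manages the lifecycle of a runner pod: it adds the runner pod finalizer (and, in
// container mode, the runner-linked-resources finalizer) while the pod is live, and on deletion
// it gracefully unregisters the runner from GitHub, cleans up runner-linked resources, and
// force-deletes pods that got stuck terminating (e.g. on an unreachable node).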
func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := r.Log.WithValues("runnerpod", req.NamespacedName)

	var runnerPod corev1.Pod
	if err := r.Get(ctx, req.NamespacedName, &runnerPod); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	_, isRunnerPod := runnerPod.Labels[LabelKeyRunner]
	_, isRunnerSetPod := runnerPod.Labels[LabelKeyRunnerSetName]
	_, isRunnerDeploymentPod := runnerPod.Labels[LabelKeyRunnerDeploymentName]

	if !isRunnerPod && !isRunnerSetPod && !isRunnerDeploymentPod {
		return ctrl.Result{}, nil
	}

	var envvars []corev1.EnvVar
	for _, container := range runnerPod.Spec.Containers {
		if container.Name == "runner" {
			envvars = container.Env
		}
	}

	if len(envvars) == 0 {
		return ctrl.Result{}, errors.New("could not determine env vars for runner Pod")
	}

	var enterprise, org, repo string
	var isContainerMode bool

	for _, e := range envvars {
		switch e.Name {
		case EnvVarEnterprise:
			enterprise = e.Value
		case EnvVarOrg:
			org = e.Value
		case EnvVarRepo:
			repo = e.Value
		case "ACTIONS_RUNNER_CONTAINER_HOOKS":
			isContainerMode = true
		}
	}

	ghc, err := r.GitHubClient.InitForRunnerPod(ctx, &runnerPod)
	if err != nil {
		return ctrl.Result{}, err
	}

	if runnerPod.DeletionTimestamp.IsZero() {
		finalizers, added := addFinalizer(runnerPod.Finalizers, runnerPodFinalizerName)

		var cleanupFinalizersAdded bool
		if isContainerMode {
			finalizers, cleanupFinalizersAdded = addFinalizer(finalizers, runnerLinkedResourcesFinalizerName)
		}

		if added || cleanupFinalizersAdded {
			newRunner := runnerPod.DeepCopy()
			newRunner.Finalizers = finalizers

			if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
				log.Error(err, "Failed to update runner")
				return ctrl.Result{}, err
			}

			log.V(2).Info("Added finalizer")

			return ctrl.Result{}, nil
		}
	} else {
		log.V(2).Info("Seen deletion-timestamp is already set")

		// Mark the parent Runner resource for deletion before deleting this runner pod from the cluster.
		// Otherwise the runner controller can recreate the runner pod, thinking it has not created any runner pod yet.
		var (
			key    = types.NamespacedName{Namespace: runnerPod.Namespace, Name: runnerPod.Name}
			runner arcv1alpha1.Runner
		)
		if err := r.Get(ctx, key, &runner); err == nil {
			if runner.Name != "" && runner.DeletionTimestamp == nil {
				log.Info("This runner pod seems to have been deleted directly, bypassing the parent Runner resource. Marking the runner for deletion so that it does not recreate this pod.")
				if err := r.Delete(ctx, &runner); err != nil {
					return ctrl.Result{}, err
				}
			}
		}

		if finalizers, removed := removeFinalizer(runnerPod.Finalizers, runnerLinkedResourcesFinalizerName); removed {
			if err := r.cleanupRunnerLinkedPods(ctx, &runnerPod, log); err != nil {
				log.Info("Runner-linked pods clean up failed due to an error. If this persists, please manually remove the runner-linked pods to unblock ARC", "err", err.Error())
				return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil
			}

			if err := r.cleanupRunnerLinkedSecrets(ctx, &runnerPod, log); err != nil {
				log.Info("Runner-linked secrets clean up failed due to an error. If this persists, please manually remove the runner-linked secrets to unblock ARC", "err", err.Error())
				return ctrl.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil
			}

			patchedPod := runnerPod.DeepCopy()
			patchedPod.Finalizers = finalizers

			if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
				log.Error(err, "Failed to update runner for finalizer linked resources removal")
				return ctrl.Result{}, err
			}

			// Keep the patched object around. Otherwise the subsequent patch request can revive the
			// removed finalizer and trigger an unnecessary reconciliation.
			runnerPod = *patchedPod
		}

		finalizers, removed := removeFinalizer(runnerPod.Finalizers, runnerPodFinalizerName)

		if removed {
			// In a standard scenario, the upstream controller, like runnerset-controller, ensures this runner is gracefully
			// stopped before the deletion timestamp is set. But in case the user manually deleted it for whatever reason,
			// we have to ensure it gracefully stops now.
			updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, ghc, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
			if res != nil {
				return *res, err
			}

			patchedPod := updatedPod.DeepCopy()
			patchedPod.Finalizers = finalizers

			// We commit the removal of the finalizer so that Kubernetes notices it and deletes the pod resource from the cluster.
			if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
				log.Error(err, "Failed to update runner for finalizer removal")
				return ctrl.Result{}, err
			}

			log.V(2).Info("Removed finalizer")

			r.GitHubClient.DeinitForRunnerPod(updatedPod)

			return ctrl.Result{}, nil
		}

		deletionTimeout := 1 * time.Minute
		currentTime := time.Now()
		deletionDidTimeout := currentTime.Sub(runnerPod.DeletionTimestamp.Add(deletionTimeout)) > 0

		if deletionDidTimeout {
			log.Info(
				fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
					"This is typically the case when a Kubernetes node became unreachable "+
					"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
				"podDeletionTimestamp", runnerPod.DeletionTimestamp,
				"currentTime", currentTime,
				"configuredDeletionTimeout", deletionTimeout,
			)

			var force int64 = 0
			// forcefully delete the runner pod as we would otherwise get stuck if the node stays unreachable
			if err := r.Delete(ctx, &runnerPod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
				// A NotFound error means the pod has probably been deleted already.
				if !kerrors.IsNotFound(err) {
					log.Error(err, "Failed to forcefully delete pod resource ...")
					return ctrl.Result{}, err
				}
				// forceful deletion finally succeeded
				return ctrl.Result{Requeue: true}, nil
			}

			r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", runnerPod.Name))
			log.Info("Forcefully deleted runner pod", "repository", repo)

			// give the kube manager a little time to forcefully delete the stuck pod
			return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
		}

		return ctrl.Result{}, nil
	}

	po, res, err := ensureRunnerPodRegistered(ctx, log, ghc, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
	if res != nil {
		return *res, err
	}

	runnerPod = *po

	if _, unregistrationRequested := getAnnotation(&runnerPod, AnnotationKeyUnregistrationRequestTimestamp); unregistrationRequested {
		log.V(2).Info("Progressing unregistration because unregistration-request timestamp is set")

		// At this point we're sure that DeletionTimestamp is not set yet, and the unregistration process has been
		// triggered by an upstream controller like runnerset-controller.
		//
		// In a standard scenario, ARC starts the unregistration process before marking the pod for deletion at all,
		// so that it isn't subject to terminationGracePeriod and can safely take hours to finish its work.
		_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationRetryDelay(), log, ghc, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
		if res != nil {
			return *res, err
		}

		// At this point we are sure that the runner has successfully unregistered, hence it is safe to delete.
		// But we don't delete the pod here. Instead, we let the upstream controller/parent object delete this pod
		// as part of a cascade deletion.
		// This is to avoid a parent object, like a statefulset, recreating the deleted pod.
		// If the pod was recreated, it would start a registration process that may race with the statefulset deleting the pod.
		log.V(2).Info("Unregistration seems complete")

		return ctrl.Result{}, nil
	}

	return ctrl.Result{}, nil
}

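// unregistrationRetryDelay returns the delay to wait before retrying runner unregistration,
// falling back to DefaultUnregistrationRetryDelay when no explicit value is configured.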
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
	retryDelay := DefaultUnregistrationRetryDelay

	if r.UnregistrationRetryDelay > 0 {
		retryDelay = r.UnregistrationRetryDelay
	}

	return retryDelay
}

func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
	name := "runnerpod-controller"
	if r.Name != "" {
		name = r.Name
	}

	r.Recorder = mgr.GetEventRecorderFor(name)

	return ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Pod{}).
		Named(name).
		Complete(r)
}

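// cleanupRunnerLinkedPods deletes all pods in the runner pod's namespace labeled with
// "runner-pod: <runner pod name>", which are typically the workflow pods created by the
// runner's container hooks. Deletions run concurrently; NotFound and Gone errors are ignored.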
func (r *RunnerPodReconciler) cleanupRunnerLinkedPods(ctx context.Context, pod *corev1.Pod, log logr.Logger) error {
	var runnerLinkedPodList corev1.PodList
	if err := r.List(ctx, &runnerLinkedPodList, client.InNamespace(pod.Namespace), client.MatchingLabels(
		map[string]string{
			"runner-pod": pod.Name,
		},
	)); err != nil {
		return fmt.Errorf("failed to list runner-linked pods: %w", err)
	}

	var (
		wg   sync.WaitGroup
		mu   sync.Mutex
		errs []error
	)

	for _, p := range runnerLinkedPodList.Items {
		if !p.DeletionTimestamp.IsZero() {
			continue
		}

		p := p
		wg.Add(1)
		go func() {
			defer wg.Done()

			if err := r.Delete(ctx, &p); err != nil {
				if kerrors.IsNotFound(err) || kerrors.IsGone(err) {
					return
				}

				// errs is shared across goroutines, so guard the append with the mutex.
				mu.Lock()
				defer mu.Unlock()
				errs = append(errs, fmt.Errorf("delete pod %q error: %v", p.Name, err))
			}
		}()
	}
	wg.Wait()

	if len(errs) > 0 {
		for _, err := range errs {
			log.Error(err, "failed to remove runner-linked pod")
		}

		return errors.New("failed to remove some runner linked pods")
	}

	return nil
}

// cleanupRunnerLinkedSecrets deletes all secrets in the runner pod's namespace labeled with
// "runner-pod: <runner pod name>". Deletions run concurrently; NotFound and Gone errors are ignored.
func (r *RunnerPodReconciler) cleanupRunnerLinkedSecrets(ctx context.Context, pod *corev1.Pod, log logr.Logger) error {
	log.V(2).Info("Listing runner-linked secrets to be deleted", "ns", pod.Namespace)

	var runnerLinkedSecretList corev1.SecretList
	if err := r.List(ctx, &runnerLinkedSecretList, client.InNamespace(pod.Namespace), client.MatchingLabels(
		map[string]string{
			"runner-pod": pod.Name,
		},
	)); err != nil {
		return fmt.Errorf("failed to list runner-linked secrets: %w", err)
	}

	var (
		wg   sync.WaitGroup
		mu   sync.Mutex
		errs []error
	)

	for _, s := range runnerLinkedSecretList.Items {
		if !s.DeletionTimestamp.IsZero() {
			continue
		}

		s := s
		wg.Add(1)
		go func() {
			defer wg.Done()

			if err := r.Delete(ctx, &s); err != nil {
				if kerrors.IsNotFound(err) || kerrors.IsGone(err) {
					return
				}

				// errs is shared across goroutines, so guard the append with the mutex.
				mu.Lock()
				defer mu.Unlock()
				errs = append(errs, fmt.Errorf("delete secret %q error: %v", s.Name, err))
			}
		}()
	}
	wg.Wait()

	if len(errs) > 0 {
		for _, err := range errs {
			log.Error(err, "failed to remove runner-linked secret")
		}

		return errors.New("failed to remove some runner linked secrets")
	}

	return nil
}