diff --git a/acceptance/deploy.sh b/acceptance/deploy.sh index 62b253df..a7288588 100755 --- a/acceptance/deploy.sh +++ b/acceptance/deploy.sh @@ -84,8 +84,7 @@ if [ -n "${TEST_REPO}" ]; then cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ENTERPRISE= TEST_ORG= RUNNER_MIN_REPLICAS=${REPO_RUNNER_MIN_REPLICAS} NAME=repo-runnerset envsubst | kubectl apply -f - else echo 'Deploying runnerdeployment and hra. Set USE_RUNNERSET if you want to deploy runnerset instead.' - cat acceptance/testdata/repo.runnerdeploy.yaml | envsubst | kubectl apply -f - - cat acceptance/testdata/repo.hra.yaml | envsubst | kubectl apply -f - + cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_ORG= RUNNER_MIN_REPLICAS=${REPO_RUNNER_MIN_REPLICAS} NAME=repo-runnerdeploy envsubst | kubectl apply -f - fi else echo 'Skipped deploying runnerdeployment and hra. Set TEST_REPO to "yourorg/yourrepo" to deploy.' diff --git a/acceptance/testdata/org.hra.yaml b/acceptance/testdata/org.hra.yaml deleted file mode 100644 index dae10f72..00000000 --- a/acceptance/testdata/org.hra.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: actions.summerwind.dev/v1alpha1 -kind: HorizontalRunnerAutoscaler -metadata: - name: org -spec: - scaleTargetRef: - name: org-runnerdeploy - scaleUpTriggers: - - githubEvent: - checkRun: - types: ["created"] - status: "queued" - amount: 1 - duration: "1m" - scheduledOverrides: - - startTime: "2021-05-11T16:05:00+09:00" - endTime: "2021-05-11T16:40:00+09:00" - minReplicas: 2 - - startTime: "2021-05-01T00:00:00+09:00" - endTime: "2021-05-03T00:00:00+09:00" - recurrenceRule: - frequency: Weekly - untilTime: "2022-05-01T00:00:00+09:00" - minReplicas: 0 - minReplicas: 0 - maxReplicas: 5 - # Used to test that HRA is working for org runners - metrics: - - type: PercentageRunnersBusy - scaleUpThreshold: '0.75' - scaleDownThreshold: '0.3' - scaleUpFactor: '2' - scaleDownFactor: '0.5' - - type: TotalNumberOfQueuedAndInProgressWorkflowRuns - repositoryNames: - - 
${TEST_ORG_REPO} diff --git a/acceptance/testdata/org.runnerdeploy.yaml b/acceptance/testdata/org.runnerdeploy.yaml deleted file mode 100644 index 1640bb31..00000000 --- a/acceptance/testdata/org.runnerdeploy.yaml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: actions.summerwind.dev/v1alpha1 -kind: RunnerDeployment -metadata: - name: org-runnerdeploy -spec: -# replicas: 1 - template: - spec: - organization: ${TEST_ORG} - - # - # Custom runner image - # - image: ${RUNNER_NAME}:${RUNNER_TAG} - imagePullPolicy: IfNotPresent - - # Whether to pass --ephemeral (true) or --once (false, deprecated) - env: - - name: RUNNER_FEATURE_FLAG_EPHEMERAL - value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}" - - # - # dockerd within runner container - # - ## Replace `mumoshu/actions-runner-dind:dev` with your dind image - #dockerdWithinRunnerContainer: true - #image: mumoshu/actions-runner-dind:dev - - # - # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu) - # - #dockerMTU: 1450 - - #Runner group - # labels: - # - "mylabel 1" - # - "mylabel 2" - labels: - - "${RUNNER_LABEL}" - - # - # Non-standard working directory - # - # workDir: "/" diff --git a/acceptance/testdata/repo.hra.yaml b/acceptance/testdata/repo.hra.yaml deleted file mode 100644 index 514d8b74..00000000 --- a/acceptance/testdata/repo.hra.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: actions.summerwind.dev/v1alpha1 -kind: HorizontalRunnerAutoscaler -metadata: - name: actions-runner-aos-autoscaler -spec: - scaleTargetRef: - name: example-runnerdeploy - scaleUpTriggers: - - githubEvent: - checkRun: - types: ["created"] - status: "queued" - amount: 1 - duration: "1m" - minReplicas: 0 - maxReplicas: 5 - metrics: - - type: PercentageRunnersBusy - scaleUpThreshold: '0.75' - scaleDownThreshold: '0.3' - scaleUpFactor: '2' - scaleDownFactor: '0.5' - - type: TotalNumberOfQueuedAndInProgressWorkflowRuns - repositoryNames: - - ${TEST_REPO} diff --git a/acceptance/testdata/repo.runnerdeploy.yaml 
b/acceptance/testdata/repo.runnerdeploy.yaml deleted file mode 100644 index eeba2086..00000000 --- a/acceptance/testdata/repo.runnerdeploy.yaml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: actions.summerwind.dev/v1alpha1 -kind: RunnerDeployment -metadata: - name: example-runnerdeploy -spec: -# replicas: 1 - template: - spec: - repository: ${TEST_REPO} - - # - # Custom runner image - # - image: ${RUNNER_NAME}:${RUNNER_TAG} - imagePullPolicy: IfNotPresent - - # Whether to pass --ephemeral (true) or --once (false, deprecated) - env: - - name: RUNNER_FEATURE_FLAG_EPHEMERAL - value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}" - - # - # dockerd within runner container - # - ## Replace `mumoshu/actions-runner-dind:dev` with your dind image - #dockerdWithinRunnerContainer: true - #image: mumoshu/actions-runner-dind:dev - - # - # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu) - # - #dockerMTU: 1450 - - #Runner group - # labels: - # - "mylabel 1" - # - "mylabel 2" - labels: - - "${RUNNER_LABEL}" - - # - # Non-standard working directory - # - # workDir: "/" diff --git a/acceptance/testdata/repo.runnerset.hra.yaml b/acceptance/testdata/repo.runnerset.hra.yaml deleted file mode 100644 index d7b76e90..00000000 --- a/acceptance/testdata/repo.runnerset.hra.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: actions.summerwind.dev/v1alpha1 -kind: HorizontalRunnerAutoscaler -metadata: - name: example-runnerset -spec: - scaleTargetRef: - kind: RunnerSet - name: example-runnerset - scaleUpTriggers: - - githubEvent: - checkRun: - types: ["created"] - status: "queued" - amount: 1 - duration: "1m" - # RunnerSet doesn't support scale from/to zero yet - minReplicas: 1 - maxReplicas: 5 - # This should be less than 600(seconds, the default) for faster testing - scaleDownDelaySecondsAfterScaleOut: 60 - metrics: - - type: PercentageRunnersBusy - scaleUpThreshold: '0.75' - scaleDownThreshold: '0.3' - scaleUpFactor: '2' - scaleDownFactor: '0.5' - - type: 
TotalNumberOfQueuedAndInProgressWorkflowRuns - repositoryNames: - - ${TEST_REPO} diff --git a/controllers/runner_controller.go b/controllers/runner_controller.go index c4f141d5..7b300777 100644 --- a/controllers/runner_controller.go +++ b/controllers/runner_controller.go @@ -18,15 +18,12 @@ package controllers import ( "context" - "errors" "fmt" "strings" "time" "github.com/actions-runner-controller/actions-runner-controller/hash" "github.com/go-logr/logr" - gogithub "github.com/google/go-github/v39/github" - "k8s.io/apimachinery/pkg/util/wait" kerrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" @@ -92,12 +89,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, client.IgnoreNotFound(err) } - err := runner.Validate() - if err != nil { - log.Info("Failed to validate runner spec", "error", err.Error()) - return ctrl.Result{}, nil - } - if runner.ObjectMeta.DeletionTimestamp.IsZero() { finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName) @@ -125,34 +116,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return r.processRunnerDeletion(runner, ctx, log, &pod) } - registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly) - if registrationOnly && runner.Status.Phase != "" { - // At this point we are sure that the registration-only runner has successfully configured and - // is of `offline` status, because we set runner.Status.Phase to that of the runner pod only after - // successful registration. 
- - var pod corev1.Pod - if err := r.Get(ctx, req.NamespacedName, &pod); err != nil { - if !kerrors.IsNotFound(err) { - log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err)) - - return ctrl.Result{Requeue: true}, nil - } - } else if err := r.Delete(ctx, &pod); err != nil { - if !kerrors.IsNotFound(err) { - log.Info(fmt.Sprintf("Retrying soon as we failed to delete registration-only runner pod: %v", err)) - - return ctrl.Result{Requeue: true}, nil - } - } - - log.Info("Successfully deleted registration-only runner pod to free node and cluster resource") - - // Return here to not recreate the deleted pod, because recreating it is the waste of cluster and node resource, - // and also defeats the original purpose of scale-from/to-zero we're trying to implement by using the registration-only runner. - return ctrl.Result{}, nil - } - var pod corev1.Pod if err := r.Get(ctx, req.NamespacedName, &pod); err != nil { if !kerrors.IsNotFound(err) { @@ -162,281 +125,31 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr return r.processRunnerCreation(ctx, runner, log) } - // Pod already exists - - if !pod.ObjectMeta.DeletionTimestamp.IsZero() { - return r.processRunnerPodDeletion(ctx, runner, log, pod) + phase := string(pod.Status.Phase) + if phase == "" { + phase = "Created" } - // If pod has ended up succeeded we need to restart it - // Happens e.g. when dind is in runner and run completes - stopped := runnerPodOrContainerIsStopped(&pod) - - ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral - - if stopped && ephemeral { - log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.") - - // This is the key to make ephemeral runners to work reliably with webhook-based autoscale. - // See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context. 
- // - // In the next reconcilation loop, this triggers a runner unregistration. - // (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation) - // - // See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true, - // which handles the unregistrationa the removal of the completed pod, and so on. - if err := r.Delete(ctx, &runner); err != nil { - log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.") - return ctrl.Result{RequeueAfter: 10 * time.Second}, nil + if runner.Status.Phase != phase { + if pod.Status.Phase == corev1.PodRunning { + // Seeing this message, you can expect the runner to become `Running` soon. + log.V(1).Info( + "Runner appears to have been registered and running.", + "podCreationTimestamp", pod.CreationTimestamp, + ) } - return ctrl.Result{Requeue: true}, nil - } + updated := runner.DeepCopy() + updated.Status.Phase = phase + updated.Status.Reason = pod.Status.Reason + updated.Status.Message = pod.Status.Message - restart := stopped - - if registrationOnly && stopped { - restart = false - - log.Info( - "Observed that registration-only runner for scaling-from-zero has successfully stopped. 
" + - "Unlike other pods, this one will be recreated only when runner spec changes.", - ) - } - - if updated, err := r.updateRegistrationToken(ctx, runner); err != nil { - return ctrl.Result{}, err - } else if updated { - return ctrl.Result{Requeue: true}, nil - } - - newPod, err := r.newPod(runner) - if err != nil { - log.Error(err, "Could not create pod") - return ctrl.Result{}, err - } - - if registrationOnly { - newPod.Spec.Containers[0].Env = append( - newPod.Spec.Containers[0].Env, - corev1.EnvVar{ - Name: "RUNNER_REGISTRATION_ONLY", - Value: "true", - }, - ) - } - - var registrationRecheckDelay time.Duration - - // all checks done below only decide whether a restart is needed - // if a restart was already decided before, there is no need for the checks - // saving API calls and scary log messages - if !restart { - registrationCheckInterval := time.Minute - if r.RegistrationRecheckInterval > 0 { - registrationCheckInterval = r.RegistrationRecheckInterval - } - - // We want to call ListRunners GitHub Actions API only once per runner per minute. - // This if block, in conjunction with: - // return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil - // achieves that. - if lastCheckTime := runner.Status.LastRegistrationCheckTime; lastCheckTime != nil { - nextCheckTime := lastCheckTime.Add(registrationCheckInterval) - now := time.Now() - - // Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds) - // so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is - // shorter than 1s - requeueAfter := nextCheckTime.Sub(now) - time.Second - if requeueAfter > 0 { - log.Info( - fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter), - "lastRegistrationCheckTime", lastCheckTime, - "registrationCheckInterval", registrationCheckInterval, - ) - - // Without RequeueAfter, the controller may not retry on scheduled. 
Instead, it must wait until the - // next sync period passes, which can be too much later than nextCheckTime. - // - // We need to requeue on this reconcilation even though we have already scheduled the initial - // requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`. - // Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on - // other requeues- so the initial scheduled requeue may have been reset due to requeue on - // spec/status change. - return ctrl.Result{RequeueAfter: requeueAfter}, nil - } - } - - notFound := false - offline := false - - runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) - - currentTime := time.Now() - - if err != nil { - var notFoundException *github.RunnerNotFound - var offlineException *github.RunnerOffline - if errors.As(err, ¬FoundException) { - notFound = true - } else if errors.As(err, &offlineException) { - offline = true - } else { - var e *gogithub.RateLimitError - if errors.As(err, &e) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls", - retryDelayOnGitHubAPIRateLimitError, - ), - ) - - return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err - } - - return ctrl.Result{}, err - } - } - - // See the `newPod` function called above for more information - // about when this hash changes. 
- curHash := pod.Labels[LabelKeyPodTemplateHash] - newHash := newPod.Labels[LabelKeyPodTemplateHash] - - if !runnerBusy && curHash != newHash { - restart = true - } - - registrationTimeout := 10 * time.Minute - durationAfterRegistrationTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout)) - registrationDidTimeout := durationAfterRegistrationTimeout > 0 - - if notFound { - if registrationDidTimeout { - log.Info( - "Runner failed to register itself to GitHub in timely manner. "+ - "Recreating the pod to see if it resolves the issue. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. "+ - "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - restart = true - } else { - log.V(1).Info( - "Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.", - "runnerName", runner.Name, - ) - } - } else if offline { - if registrationOnly { - log.Info( - "Observed that registration-only runner for scaling-from-zero has successfully been registered.", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - } else if registrationDidTimeout { - if runnerBusy { - log.Info( - "Timeout out while waiting for the runner to be online, but observed that it's busy at the same time."+ - "This is a known (unintuitive) behaviour of a runner that is already running a job. Please see https://github.com/actions-runner-controller/actions-runner-controller/issues/911", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - } else { - log.Info( - "Already existing GitHub runner still appears offline . "+ - "Recreating the pod to see if it resolves the issue. 
"+ - "CAUTION: If you see this a lot, you should investigate the root cause. ", - "podCreationTimestamp", pod.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - restart = true - } - } else { - log.V(1).Info( - "Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...", - "runnerName", runner.Name, - ) - } - } - - if (notFound || (offline && !registrationOnly)) && !registrationDidTimeout { - registrationRecheckJitter := 10 * time.Second - if r.RegistrationRecheckJitter > 0 { - registrationRecheckJitter = r.RegistrationRecheckJitter - } - - registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1) + if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { + log.Error(err, "Failed to update runner status for Phase/Reason/Message") + return ctrl.Result{}, err } } - // Don't do anything if there's no need to restart the runner - if !restart { - // This guard enables us to update runner.Status.Phase to `Running` only after - // the runner is registered to GitHub. - if registrationRecheckDelay > 0 { - log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay)) - - updated := runner.DeepCopy() - updated.Status.LastRegistrationCheckTime = &metav1.Time{Time: time.Now()} - - if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { - log.Error(err, "Failed to update runner status for LastRegistrationCheckTime") - return ctrl.Result{}, err - } - - return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil - } - - if runner.Status.Phase != string(pod.Status.Phase) { - if pod.Status.Phase == corev1.PodRunning { - // Seeing this message, you can expect the runner to become `Running` soon. 
- log.Info( - "Runner appears to have registered and running.", - "podCreationTimestamp", pod.CreationTimestamp, - ) - } - - updated := runner.DeepCopy() - updated.Status.Phase = string(pod.Status.Phase) - updated.Status.Reason = pod.Status.Reason - updated.Status.Message = pod.Status.Message - - if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil { - log.Error(err, "Failed to update runner status for Phase/Reason/Message") - return ctrl.Result{}, err - } - } - - return ctrl.Result{}, nil - } - - updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod) - if res != nil { - return *res, err - } - - // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service. - // This should help us avoid race condition between runner pickup job after we think the runner is not busy. 
- if err := r.Delete(ctx, updatedPod); err != nil { - log.Error(err, "Failed to delete pod resource") - return ctrl.Result{}, err - } - - r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name)) - log.Info("Deleted runner pod", "repository", runner.Spec.Repository) - return ctrl.Result{}, nil } @@ -480,11 +193,6 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName) if removed { - _, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod) - if res != nil { - return *res, err - } - newRunner := runner.DeepCopy() newRunner.ObjectMeta.Finalizers = finalizers @@ -499,60 +207,6 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con return ctrl.Result{}, nil } -func (r *RunnerReconciler) unregistrationTimeout() time.Duration { - unregistrationTimeout := DefaultUnregistrationTimeout - - if r.UnregistrationTimeout > 0 { - unregistrationTimeout = r.UnregistrationTimeout - } - return unregistrationTimeout -} - -func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration { - retryDelay := DefaultUnregistrationRetryDelay - - if r.UnregistrationRetryDelay > 0 { - retryDelay = r.UnregistrationRetryDelay - } - return retryDelay -} - -func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) { - deletionTimeout := 1 * time.Minute - currentTime := time.Now() - deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0 - - if deletionDidTimeout { - log.Info( - fmt.Sprintf("Failed to delete pod within %s. 
", deletionTimeout)+ - "This is typically the case when a Kubernetes node became unreachable "+ - "and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.", - "podDeletionTimestamp", pod.DeletionTimestamp, - "currentTime", currentTime, - "configuredDeletionTimeout", deletionTimeout, - ) - - var force int64 = 0 - // forcefully delete runner as we would otherwise get stuck if the node stays unreachable - if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil { - // probably - if !kerrors.IsNotFound(err) { - log.Error(err, "Failed to forcefully delete pod resource ...") - return ctrl.Result{}, err - } - // forceful deletion finally succeeded - return ctrl.Result{Requeue: true}, nil - } - - r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name)) - log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository) - // give kube manager a little time to forcefully delete the stuck pod - return ctrl.Result{RequeueAfter: 3 * time.Second}, nil - } else { - return ctrl.Result{}, nil - } -} - func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) { if updated, err := r.updateRegistrationToken(ctx, runner); err != nil { return ctrl.Result{}, err @@ -584,6 +238,7 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodCreated", fmt.Sprintf("Created pod '%s'", newPod.Name)) log.Info("Created runner pod", "repository", runner.Spec.Repository) + return ctrl.Result{}, nil } @@ -696,7 +351,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) { registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly) - pod, err := newRunnerPod(template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, 
r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly) + pod, err := newRunnerPod(runner.Name, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly) if err != nil { return pod, err } @@ -813,7 +468,7 @@ func mutatePod(pod *corev1.Pod, token string) *corev1.Pod { return updated } -func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) { +func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) { var ( privileged bool = true dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer @@ -822,6 +477,12 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default dockerdInRunnerPrivileged bool = dockerdInRunner ) + template = *template.DeepCopy() + + // This label selector is used by default when rd.Spec.Selector is empty. 
+ template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerName) + template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation) + workDir := runnerSpec.WorkDir if workDir == "" { workDir = "/runner/_work" diff --git a/controllers/runner_pod_owner.go b/controllers/runner_pod_owner.go index 7e26c11d..160fa0d8 100644 --- a/controllers/runner_pod_owner.go +++ b/controllers/runner_pod_owner.go @@ -10,6 +10,7 @@ import ( "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" @@ -26,6 +27,7 @@ type podsForOwner struct { runner *v1alpha1.Runner statefulSet *appsv1.StatefulSet owner owner + object client.Object synced bool pods []corev1.Pod } @@ -52,6 +54,9 @@ func (r *ownerRunner) pods(ctx context.Context, c client.Client) ([]corev1.Pod, var pod corev1.Pod if err := c.Get(ctx, types.NamespacedName{Namespace: r.Runner.Namespace, Name: r.Runner.Name}, &pod); err != nil { + if errors.IsNotFound(err) { + return nil, nil + } r.Log.Error(err, "Failed to get pod managed by runner") return nil, err } @@ -70,7 +75,7 @@ func (r *ownerRunner) withAnnotation(k, v string) client.Object { } func (r *ownerRunner) synced() bool { - return true + return r.Runner.Status.Phase != "" } type ownerStatefulSet struct { @@ -104,7 +109,7 @@ func (s *ownerStatefulSet) pods(ctx context.Context, c client.Client) ([]corev1. 
} func (s *ownerStatefulSet) templateHash() (string, bool) { - return getStatefulSetTemplateHash(s.StatefulSet) + return getRunnerTemplateHash(s.StatefulSet) } func (s *ownerStatefulSet) withAnnotation(k, v string) client.Object { @@ -132,23 +137,26 @@ func getPodsForOwner(ctx context.Context, c client.Client, log logr.Logger, o cl owner owner runner *v1alpha1.Runner statefulSet *appsv1.StatefulSet + object client.Object ) switch v := o.(type) { case *v1alpha1.Runner: owner = &ownerRunner{ - Object: v, Log: log, Runner: v, + Object: v, } runner = v + object = v case *appsv1.StatefulSet: owner = &ownerStatefulSet{ - Object: v, Log: log, StatefulSet: v, + Object: v, } statefulSet = v + object = v default: return nil, fmt.Errorf("BUG: Unsupported runner pods owner %v(%T)", v, v) } @@ -209,19 +217,14 @@ func getPodsForOwner(ctx context.Context, c client.Client, log logr.Logger, o cl runner: runner, statefulSet: statefulSet, owner: owner, + object: object, synced: synced, pods: pods, }, nil } -func getRunnerTemplateHash(r *v1alpha1.Runner) (string, bool) { - hash, ok := r.Labels[LabelKeyRunnerTemplateHash] - - return hash, ok -} - -func getStatefulSetTemplateHash(rs *appsv1.StatefulSet) (string, bool) { - hash, ok := rs.Labels[LabelKeyRunnerTemplateHash] +func getRunnerTemplateHash(r client.Object) (string, bool) { + hash, ok := r.GetLabels()[LabelKeyRunnerTemplateHash] return hash, ok } @@ -235,7 +238,18 @@ type result struct { currentObjects []*podsForOwner } -func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, effectiveTime *metav1.Time, newDesiredReplicas int, desiredTemplateHash string, create client.Object, ephemeral bool, owners []client.Object) (*result, error) { +// Why `create` must be a function rather than a client.Object? That's becase we use it to create one or more objects on scale up. +// +// We use client.Create to create a necessary number of client.Object. client.Create mutates the passed object on a successful creation. 
+// It seems to set .Revision at least, and the existence of .Revision let client.Create fail due to K8s restriction that an object being just created +// can't have .Revision. +// Now, imagine that you are to add 2 runner replicas on scale up. +// We create one resource object per a replica that ends up calling 2 client.Create calls. +// If we were reusing client.Object to be passed to client.Create calls, only the first call suceeeds. +// The second call fails due to the first call mutated the client.Object to have .Revision. +// Passing a factory function of client.Object and creating a brand-new client.Object per a client.Create call resolves this issue, +// allowing us to create two or more replicas in one reconcilation loop without being rejected by K8s. +func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, effectiveTime *metav1.Time, newDesiredReplicas int, create func() client.Object, ephemeral bool, owners []client.Object) (*result, error) { state, err := collectPodsForOwners(ctx, c, log, owners) if err != nil || state == nil { return nil, err @@ -265,6 +279,13 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, // Even though the error message includes "Forbidden", this error's reason is "Invalid". // So we used to match these errors by using errors.IsInvalid. But that's another story... + desiredTemplateHash, ok := getRunnerTemplateHash(create()) + if !ok { + log.Info("Failed to get template hash of desired owner resource. It must be in an invalid state. 
Please manually delete the owner so that it is recreated") + + return nil, nil + } + currentObjects := podsForOwnersPerTemplateHash[desiredTemplateHash] sort.SliceStable(currentObjects, func(i, j int) bool { @@ -289,13 +310,20 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, regTimeout += ss.regTimeout } + numOwners := len(owners) + + var hashes []string + for h, _ := range state.podsForOwners { + hashes = append(hashes, h) + } + log.V(2).Info( "Found some pods across owner(s)", "pending", pending, "running", running, "regTimeout", regTimeout, "desired", newDesiredReplicas, - "owners", len(owners), + "owners", numOwners, ) maybeRunning := pending + running @@ -307,15 +335,39 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, for i := 0; i < num; i++ { // Add more replicas - if err := c.Create(ctx, create); err != nil { + if err := c.Create(ctx, create()); err != nil { return nil, err } } - log.V(2).Info("Created object(s) to add more replicas", "num", num) + log.V(1).Info("Created replica(s)", + "created", num, + "templateHashDesired", desiredTemplateHash, + "replicasDesired", newDesiredReplicas, + "replicasMaybeRunning", maybeRunning, + "templateHashObserved", hashes, + ) return nil, nil } else if newDesiredReplicas <= running { + // If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally, + // you're unlikely to fall into this branch. + // + // That's because all the stakeholders work like this: + // + // 1. A runner pod completes with the runner container exiting with code 0 + // 2. ARC runner controller detects the pod completion, marks the owner(runner or statefulset) resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero) + // 3. GitHub triggers a corresponding workflow_job "complete" webhook event + // 4. 
ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation + // 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations + // 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas + // 7. (We're here) ARC runnerset or runnerreplicaset controller starts reconciling the owner resource (statefulset or runner) + // + // In a normally working ARC installation, the runner that was used to run the workflow job should already have been + // marked for deletion by the runner controller. + // This runnerreplicaset controller doesn't count marked runners into the `running` value, hence you're unlikely to + // fall into this branch when you're using ephemeral runners with webhook-based-autoscaler. + var retained int var delete []*podsForOwner @@ -354,7 +406,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok { updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339)) - if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil { + if err := c.Patch(ctx, updated, client.MergeFrom(ss.object)); err != nil { log.Error(err, fmt.Sprintf("Failed to patch object to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp)) return nil, err } @@ -379,7 +431,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, for _, ss := range sss { if ss.templateHash != desiredTemplateHash { if ss.owner.GetDeletionTimestamp().IsZero() { - if err := c.Delete(ctx, ss.owner); err != nil { + if err := c.Delete(ctx, ss.object); err != nil { log.Error(err, "Unable to delete object") return nil, err } @@ -417,6 +469,12 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, return nil, err } + if 
res.templateHash == "" { + log.Info("validation error: runner pod owner must have template hash", "object", res.object) + + return nil, nil + } + // Statefulset termination process 4/4: Let Kubernetes cascade-delete the statefulset and the pods. // // If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion) @@ -429,7 +487,7 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, // Statefulset termination process 3/4: Set the deletionTimestamp to let Kubernetes start a cascade deletion of the statefulset and the pods. if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); ok { - if err := c.Delete(ctx, res.owner); err != nil { + if err := c.Delete(ctx, res.object); err != nil { log.Error(err, "Failed to delete owner") return nil, err } @@ -454,7 +512,7 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok { updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339)) - if err := c.Patch(ctx, updated, client.MergeFrom(res.owner)); err != nil { + if err := c.Patch(ctx, updated, client.MergeFrom(res.object)); err != nil { log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp)) return nil, err } @@ -494,6 +552,8 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger, } if !res.synced { + log.V(1).Info("Skipped reconcilation because owner is not synced yet", "pods", res.pods) + return nil, nil } diff --git a/controllers/runnerreplicaset_controller.go b/controllers/runnerreplicaset_controller.go index ecaa70ba..3433fc58 100644 --- a/controllers/runnerreplicaset_controller.go +++ b/controllers/runnerreplicaset_controller.go @@ -18,13 +18,10 @@ package 
controllers import ( "context" - "errors" - "fmt" "reflect" "time" "github.com/go-logr/logr" - gogithub "github.com/google/go-github/v39/github" kerrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" @@ -32,7 +29,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1" @@ -72,15 +68,35 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req return ctrl.Result{}, nil } + if rs.ObjectMeta.Labels == nil { + rs.ObjectMeta.Labels = map[string]string{} + } + + // Template hash is usually set by the upstream controller (RunnerDeployment controller) on authoring + // a RunnerReplicaSet resource, but it may be missing when the user directly created RunnerReplicaSet. + // As a template hash is required by the runner replica management, we dynamically add it here without ever persisting it. 
+ if rs.ObjectMeta.Labels[LabelKeyRunnerTemplateHash] == "" { + template := rs.Spec.DeepCopy() + template.Replicas = nil + template.EffectiveTime = nil + templateHash := ComputeHash(template) + + log.Info("Using auto-generated template hash", "value", templateHash) + + rs.ObjectMeta.Labels = CloneAndAddLabel(rs.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash) + rs.Spec.Template.ObjectMeta.Labels = CloneAndAddLabel(rs.Spec.Template.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash) + } + selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector) if err != nil { return ctrl.Result{}, err } + // Get the Runners managed by the target RunnerReplicaSet - var allRunners v1alpha1.RunnerList + var runnerList v1alpha1.RunnerList if err := r.List( ctx, - &allRunners, + &runnerList, client.InNamespace(req.Namespace), client.MatchingLabelsSelector{Selector: selector}, ); err != nil { @@ -89,218 +105,43 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req } } - var ( - current int - ready int - available int - - lastSyncTime *time.Time - ) - - for _, r := range allRunners.Items { - // This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before - // to not treat all the runners in the namespace as its children. - if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) { - // If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion) - // or by runnerreplicaset controller (in case it was deleted in the previous reconcilation loop), - // we don't need to bother calling GitHub API to re-mark the runner for deletion. - // Just hold on, and runners will disappear as long as the runner controller is up and running. 
- if !r.DeletionTimestamp.IsZero() { - continue - } - - if r.Annotations != nil { - if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok { - t, err := time.Parse(time.RFC3339, a) - if err == nil { - if lastSyncTime == nil || lastSyncTime.Before(t) { - lastSyncTime = &t - } - } - } - } - - current += 1 - - if r.Status.Phase == string(corev1.PodRunning) { - ready += 1 - // available is currently the same as ready, as we don't yet have minReadySeconds for runners - available += 1 - } - } - } - - var desired int - + replicas := 1 if rs.Spec.Replicas != nil { - desired = *rs.Spec.Replicas - } else { - desired = 1 - } - - // TODO: remove this registration runner cleanup later (v0.23.0 or v0.24.0) - // - // We had to have a registration-only runner to support scale-from-zero before. - // But since Sep 2021 Actions update on GitHub Cloud and GHES 3.3, it is unneceesary. - // See the below issues for more contexts: - // https://github.com/actions-runner-controller/actions-runner-controller/issues/516 - // https://github.com/actions-runner-controller/actions-runner-controller/issues/859 - // - // In the below block, we have a logic to remove existing registration-only runners as unnecessary. - // This logic is introduced since actions-runner-controller 0.21.0 and probably last one or two minor releases - // so that actions-runner-controller instance in everyone's cluster won't leave dangling registration-only runners. 
- registrationOnlyRunnerNsName := req.NamespacedName - registrationOnlyRunnerNsName.Name = registrationOnlyRunnerNameFor(rs.Name) - registrationOnlyRunner := v1alpha1.Runner{} - registrationOnlyRunnerExists := false - if err := r.Get( - ctx, - registrationOnlyRunnerNsName, - ®istrationOnlyRunner, - ); err != nil { - if !kerrors.IsNotFound(err) { - return ctrl.Result{}, err - } - } else { - registrationOnlyRunnerExists = true - } - - if registrationOnlyRunnerExists { - if err := r.Client.Delete(ctx, ®istrationOnlyRunner); err != nil { - log.Error(err, "Retrying soon because we failed to delete registration-only runner") - - return ctrl.Result{Requeue: true}, nil - } + replicas = *rs.Spec.Replicas } effectiveTime := rs.Spec.EffectiveTime ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral - if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) { - log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready) - } else if current > desired { - // If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally, - // you're unlikely to fall into this branch. - // - // That's becaseu all the stakeholders work like this: - // - // 1. A runner pod completes with the runner container exiting with code 0 - // 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero) - // 3. GitHub triggers a corresponding workflow_job "complete" webhook event - // 4. 
ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation - // 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations - // 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas - // 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet - // - // In a normally working ARC installation, the runner that was used to run the workflow job should already have been - // marked for deletion by the runner controller. - // This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to - // fall into this branch when you're using ephemeral runners with webhook-based-autoscaler. + desired, err := r.newRunner(rs) + if err != nil { + log.Error(err, "Could not create runner") - n := current - desired - - log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready) - - // get runners that are currently offline/not busy/timed-out to register - var deletionCandidates []v1alpha1.Runner - - for _, runner := range allRunners.Items { - busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name) - if err != nil { - notRegistered := false - offline := false - - var notFoundException *github.RunnerNotFound - var offlineException *github.RunnerOffline - if errors.As(err, ¬FoundException) { - log.V(1).Info("Failed to check if runner is busy. 
Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name) - notRegistered = true - } else if errors.As(err, &offlineException) { - offline = true - } else { - var e *gogithub.RateLimitError - if errors.As(err, &e) { - // We log the underlying error when we failed calling GitHub API to list or unregisters, - // or the runner is still busy. - log.Error( - err, - fmt.Sprintf( - "Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls", - retryDelayOnGitHubAPIRateLimitError, - ), - ) - - return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err - } - - return ctrl.Result{}, err - } - - registrationTimeout := 15 * time.Minute - currentTime := time.Now() - registrationDidTimeout := currentTime.Sub(runner.CreationTimestamp.Add(registrationTimeout)) > 0 - - if notRegistered && registrationDidTimeout { - log.Info( - "Runner failed to register itself to GitHub in timely manner. "+ - "Marking the runner for scale down. "+ - "CAUTION: If you see this a lot, you should investigate the root cause. 
"+ - "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288", - "runnerCreationTimestamp", runner.CreationTimestamp, - "currentTime", currentTime, - "configuredRegistrationTimeout", registrationTimeout, - ) - - deletionCandidates = append(deletionCandidates, runner) - } - - // offline runners should always be a great target for scale down - if offline { - deletionCandidates = append(deletionCandidates, runner) - } - } else if !busy { - deletionCandidates = append(deletionCandidates, runner) - } - } - - if len(deletionCandidates) < n { - n = len(deletionCandidates) - } - - log.V(0).Info(fmt.Sprintf("Deleting %d runner(s)", n), "desired", desired, "current", current, "ready", ready) - - for i := 0; i < n; i++ { - if err := r.Client.Delete(ctx, &deletionCandidates[i]); client.IgnoreNotFound(err) != nil { - log.Error(err, "Failed to delete runner resource") - - return ctrl.Result{}, err - } - - r.Recorder.Event(&rs, corev1.EventTypeNormal, "RunnerDeleted", fmt.Sprintf("Deleted runner '%s'", deletionCandidates[i].Name)) - log.Info(fmt.Sprintf("Deleted runner %s", deletionCandidates[i].Name)) - } - } else if desired > current { - n := desired - current - - log.V(0).Info(fmt.Sprintf("Creating %d runner(s)", n), "desired", desired, "available", current, "ready", ready) - - for i := 0; i < n; i++ { - newRunner, err := r.newRunner(rs) - if err != nil { - log.Error(err, "Could not create runner") - - return ctrl.Result{}, err - } - - if err := r.Client.Create(ctx, &newRunner); err != nil { - log.Error(err, "Failed to create runner resource") - - return ctrl.Result{}, err - } - } + return ctrl.Result{}, err } - var status v1alpha1.RunnerReplicaSetStatus + var live []client.Object + for _, r := range runnerList.Items { + r := r + live = append(live, &r) + } + + res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, replicas, func() client.Object { return desired.DeepCopy() }, ephemeral, live) + if err != nil || res == nil { + return 
ctrl.Result{}, err + } + + var ( + status v1alpha1.RunnerReplicaSetStatus + + current, available, ready int + ) + + for _, o := range res.currentObjects { + current += o.total + available += o.running + ready += o.running + } status.Replicas = ¤t status.AvailableReplicas = &available @@ -322,6 +163,8 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req } func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1alpha1.Runner, error) { + // Note that the upstream controller (runnerdeployment) is expected to add + // the "runner template hash" label to the template.meta which is necessary to make this controller work correctly objectMeta := rs.Spec.Template.ObjectMeta.DeepCopy() objectMeta.GenerateName = rs.ObjectMeta.Name + "-" diff --git a/controllers/runnerreplicaset_controller_test.go b/controllers/runnerreplicaset_controller_test.go index 7405650c..dc100d2b 100644 --- a/controllers/runnerreplicaset_controller_test.go +++ b/controllers/runnerreplicaset_controller_test.go @@ -7,7 +7,6 @@ import ( "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -102,12 +101,40 @@ func intPtr(v int) *int { var _ = Context("Inside of a new namespace", func() { ctx := context.TODO() ns := SetupTest(ctx) + name := "example-runnerreplicaset" - Describe("when no existing resources exist", func() { + getRunnerCount := func() int { + runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}} - It("should create a new Runner resource from the specified template, add a another Runner on replicas increased, and removes all the replicas when set to 0", func() { - name := "example-runnerreplicaset" + selector, err := metav1.LabelSelectorAsSelector( + &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "foo": "bar", + }, + }, + ) + if err != nil { + logf.Log.Error(err, "failed to 
create labelselector") + return -1 + } + err = k8sClient.List( + ctx, + &runners, + client.InNamespace(ns.Name), + client.MatchingLabelsSelector{Selector: selector}, + ) + if err != nil { + logf.Log.Error(err, "list runners") + } + + runnersList.Sync(runners.Items) + + return len(runners.Items) + } + + Describe("RunnerReplicaSet", func() { + It("should create a new Runner resource from the specified template", func() { { rs := &actionsv1alpha1.RunnerReplicaSet{ ObjectMeta: metav1.ObjectMeta{ @@ -146,126 +173,99 @@ var _ = Context("Inside of a new namespace", func() { Expect(err).NotTo(HaveOccurred(), "failed to create test RunnerReplicaSet resource") - runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}} - Eventually( - func() int { - selector, err := metav1.LabelSelectorAsSelector( - &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "foo": "bar", - }, - }, - ) - if err != nil { - logf.Log.Error(err, "failed to create labelselector") - return -1 - } - err = k8sClient.List( - ctx, - &runners, - client.InNamespace(ns.Name), - client.MatchingLabelsSelector{Selector: selector}, - ) - if err != nil { - logf.Log.Error(err, "list runners") - return -1 - } - - runnersList.Sync(runners.Items) - - return len(runners.Items) - }, - time.Second*5, time.Millisecond*500).Should(BeEquivalentTo(1)) + getRunnerCount, + time.Second*5, time.Second).Should(BeEquivalentTo(1)) } + }) + It("should create 2 runners when specified 2 replicas", func() { { - // We wrap the update in the Eventually block to avoid the below error that occurs due to concurrent modification - // made by the controller to update .Status.AvailableReplicas and .Status.ReadyReplicas - // Operation cannot be fulfilled on runnerreplicasets.actions.summerwind.dev "example-runnerreplicaset": the object has been modified; please apply your changes to the latest version and try again - Eventually(func() error { - var rs actionsv1alpha1.RunnerReplicaSet - - err := k8sClient.Get(ctx, 
types.NamespacedName{Namespace: ns.Name, Name: name}, &rs) - - Expect(err).NotTo(HaveOccurred(), "failed to get test RunnerReplicaSet resource") - - rs.Spec.Replicas = intPtr(2) - - return k8sClient.Update(ctx, &rs) - }, - time.Second*1, time.Millisecond*500).Should(BeNil()) - - runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}} - - Eventually( - func() int { - selector, err := metav1.LabelSelectorAsSelector( - &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "foo": "bar", - }, - }, - ) - if err != nil { - logf.Log.Error(err, "failed to create labelselector") - return -1 - } - err = k8sClient.List( - ctx, - &runners, - client.InNamespace(ns.Name), - client.MatchingLabelsSelector{Selector: selector}, - ) - if err != nil { - logf.Log.Error(err, "list runners") - } - - runnersList.Sync(runners.Items) - - return len(runners.Items) + rs := &actionsv1alpha1.RunnerReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns.Name, }, - time.Second*5, time.Millisecond*500).Should(BeEquivalentTo(2)) - } - - { - // We wrap the update in the Eventually block to avoid the below error that occurs due to concurrent modification - // made by the controller to update .Status.AvailableReplicas and .Status.ReadyReplicas - // Operation cannot be fulfilled on runnersets.actions.summerwind.dev "example-runnerset": the object has been modified; please apply your changes to the latest version and try again - Eventually(func() error { - var rs actionsv1alpha1.RunnerReplicaSet - - err := k8sClient.Get(ctx, types.NamespacedName{Namespace: ns.Name, Name: name}, &rs) - - Expect(err).NotTo(HaveOccurred(), "failed to get test RunnerReplicaSet resource") - - rs.Spec.Replicas = intPtr(0) - - return k8sClient.Update(ctx, &rs) - }, - time.Second*1, time.Millisecond*500).Should(BeNil()) - - runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}} - - Eventually( - func() int { - selector, err := 
metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + Spec: actionsv1alpha1.RunnerReplicaSetSpec{ + Replicas: intPtr(2), + Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "foo": "bar", }, - }) - Expect(err).ToNot(HaveOccurred()) - - if err := k8sClient.List(ctx, &runners, client.InNamespace(ns.Name), client.MatchingLabelsSelector{Selector: selector}); err != nil { - logf.Log.Error(err, "list runners") - return -1 - } - - runnersList.Sync(runners.Items) - - return len(runners.Items) + }, + Template: actionsv1alpha1.RunnerTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "foo": "bar", + }, + }, + Spec: actionsv1alpha1.RunnerSpec{ + RunnerConfig: actionsv1alpha1.RunnerConfig{ + Repository: "test/valid", + Image: "bar", + }, + RunnerPodSpec: actionsv1alpha1.RunnerPodSpec{ + Env: []corev1.EnvVar{ + {Name: "FOO", Value: "FOOVALUE"}, + }, + }, + }, + }, }, - time.Second*5, time.Millisecond*500).Should(BeEquivalentTo(0)) + } + + err := k8sClient.Create(ctx, rs) + + Expect(err).NotTo(HaveOccurred(), "failed to create test RunnerReplicaSet resource") + + Eventually( + getRunnerCount, + time.Second*5, time.Second).Should(BeEquivalentTo(2)) + } + }) + + It("should not create any runners when specified 0 replicas", func() { + { + rs := &actionsv1alpha1.RunnerReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns.Name, + }, + Spec: actionsv1alpha1.RunnerReplicaSetSpec{ + Replicas: intPtr(0), + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "foo": "bar", + }, + }, + Template: actionsv1alpha1.RunnerTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "foo": "bar", + }, + }, + Spec: actionsv1alpha1.RunnerSpec{ + RunnerConfig: actionsv1alpha1.RunnerConfig{ + Repository: "test/valid", + Image: "bar", + }, + RunnerPodSpec: actionsv1alpha1.RunnerPodSpec{ + Env: []corev1.EnvVar{ + {Name: "FOO", Value: "FOOVALUE"}, + }, + }, + }, + }, + }, + } + + err := k8sClient.Create(ctx, rs) + + 
Expect(err).NotTo(HaveOccurred(), "failed to create test RunnerReplicaSet resource") + + Consistently( + getRunnerCount, + time.Second*5, time.Second).Should(BeEquivalentTo(0)) } }) }) diff --git a/controllers/runnerset_controller.go b/controllers/runnerset_controller.go index 77d2920a..c5e2a9d6 100644 --- a/controllers/runnerset_controller.go +++ b/controllers/runnerset_controller.go @@ -105,13 +105,6 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, err } - desiredTemplateHash, ok := getStatefulSetTemplateHash(desiredStatefulSet) - if !ok { - log.Info("Failed to get template hash of desired statefulset. It must be in an invalid state. Please manually delete the statefulset so that it is recreated") - - return ctrl.Result{}, nil - } - addedReplicas := int32(1) create := desiredStatefulSet.DeepCopy() create.Spec.Replicas = &addedReplicas @@ -136,7 +129,7 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( owners = append(owners, &ss) } - res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, newDesiredReplicas, desiredTemplateHash, create, ephemeral, owners) + res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, newDesiredReplicas, func() client.Object { return create.DeepCopy() }, ephemeral, owners) if err != nil || res == nil { return ctrl.Result{}, err } @@ -192,17 +185,12 @@ func (r *RunnerSetReconciler) newStatefulSet(runnerSet *v1alpha1.RunnerSet) (*ap runnerSetWithOverrides.Labels = append(runnerSetWithOverrides.Labels, l) } - // This label selector is used by default when rd.Spec.Selector is empty. 
- runnerSetWithOverrides.Template.ObjectMeta.Labels = CloneAndAddLabel(runnerSetWithOverrides.Template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerSet.Name) - - runnerSetWithOverrides.Template.ObjectMeta.Labels = CloneAndAddLabel(runnerSetWithOverrides.Template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation) - template := corev1.Pod{ ObjectMeta: runnerSetWithOverrides.StatefulSetSpec.Template.ObjectMeta, Spec: runnerSetWithOverrides.StatefulSetSpec.Template.Spec, } - pod, err := newRunnerPod(template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL, false) + pod, err := newRunnerPod(runnerSet.Name, template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL, false) if err != nil { return nil, err } diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index db3753b4..743c01ce 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -265,6 +265,8 @@ func (e *env) installActionsRunnerController(t *testing.T) { if e.useRunnerSet { scriptEnv = append(scriptEnv, "USE_RUNNERSET=1") + } else { + scriptEnv = append(scriptEnv, "USE_RUNNERSET=false") } varEnv := []string{