refactor: Make RunnerReplicaSet and Runner backed by the same logic that backs RunnerSet
This commit is contained in:
parent c95e84a528
commit 14a878bfae
@@ -84,8 +84,7 @@ if [ -n "${TEST_REPO}" ]; then
     cat acceptance/testdata/runnerset.envsubst.yaml | TEST_ENTERPRISE= TEST_ORG= RUNNER_MIN_REPLICAS=${REPO_RUNNER_MIN_REPLICAS} NAME=repo-runnerset envsubst | kubectl apply -f -
   else
     echo 'Deploying runnerdeployment and hra. Set USE_RUNNERSET if you want to deploy runnerset instead.'
-    cat acceptance/testdata/repo.runnerdeploy.yaml | envsubst | kubectl apply -f -
-    cat acceptance/testdata/repo.hra.yaml | envsubst | kubectl apply -f -
+    cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_ORG= RUNNER_MIN_REPLICAS=${REPO_RUNNER_MIN_REPLICAS} NAME=repo-runnerdeploy envsubst | kubectl apply -f -
   fi
 else
   echo 'Skipped deploying runnerdeployment and hra. Set TEST_REPO to "yourorg/yourrepo" to deploy.'
@@ -1,36 +0,0 @@
-apiVersion: actions.summerwind.dev/v1alpha1
-kind: HorizontalRunnerAutoscaler
-metadata:
-  name: org
-spec:
-  scaleTargetRef:
-    name: org-runnerdeploy
-  scaleUpTriggers:
-  - githubEvent:
-      checkRun:
-        types: ["created"]
-        status: "queued"
-    amount: 1
-    duration: "1m"
-  scheduledOverrides:
-  - startTime: "2021-05-11T16:05:00+09:00"
-    endTime: "2021-05-11T16:40:00+09:00"
-    minReplicas: 2
-  - startTime: "2021-05-01T00:00:00+09:00"
-    endTime: "2021-05-03T00:00:00+09:00"
-    recurrenceRule:
-      frequency: Weekly
-      untilTime: "2022-05-01T00:00:00+09:00"
-    minReplicas: 0
-  minReplicas: 0
-  maxReplicas: 5
-  # Used to test that HRA is working for org runners
-  metrics:
-  - type: PercentageRunnersBusy
-    scaleUpThreshold: '0.75'
-    scaleDownThreshold: '0.3'
-    scaleUpFactor: '2'
-    scaleDownFactor: '0.5'
-  - type: TotalNumberOfQueuedAndInProgressWorkflowRuns
-    repositoryNames:
-    - ${TEST_ORG_REPO}

@@ -1,44 +0,0 @@
-apiVersion: actions.summerwind.dev/v1alpha1
-kind: RunnerDeployment
-metadata:
-  name: org-runnerdeploy
-spec:
-  # replicas: 1
-  template:
-    spec:
-      organization: ${TEST_ORG}
-
-      #
-      # Custom runner image
-      #
-      image: ${RUNNER_NAME}:${RUNNER_TAG}
-      imagePullPolicy: IfNotPresent
-
-      # Whether to pass --ephemeral (true) or --once (false, deprecated)
-      env:
-      - name: RUNNER_FEATURE_FLAG_EPHEMERAL
-        value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
-
-      #
-      # dockerd within runner container
-      #
-      ## Replace `mumoshu/actions-runner-dind:dev` with your dind image
-      #dockerdWithinRunnerContainer: true
-      #image: mumoshu/actions-runner-dind:dev
-
-      #
-      # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
-      #
-      #dockerMTU: 1450
-
-      #Runner group
-      # labels:
-      # - "mylabel 1"
-      # - "mylabel 2"
-      labels:
-      - "${RUNNER_LABEL}"
-
-      #
-      # Non-standard working directory
-      #
-      # workDir: "/"

@@ -1,25 +0,0 @@
-apiVersion: actions.summerwind.dev/v1alpha1
-kind: HorizontalRunnerAutoscaler
-metadata:
-  name: actions-runner-aos-autoscaler
-spec:
-  scaleTargetRef:
-    name: example-runnerdeploy
-  scaleUpTriggers:
-  - githubEvent:
-      checkRun:
-        types: ["created"]
-        status: "queued"
-    amount: 1
-    duration: "1m"
-  minReplicas: 0
-  maxReplicas: 5
-  metrics:
-  - type: PercentageRunnersBusy
-    scaleUpThreshold: '0.75'
-    scaleDownThreshold: '0.3'
-    scaleUpFactor: '2'
-    scaleDownFactor: '0.5'
-  - type: TotalNumberOfQueuedAndInProgressWorkflowRuns
-    repositoryNames:
-    - ${TEST_REPO}

@@ -1,44 +0,0 @@
-apiVersion: actions.summerwind.dev/v1alpha1
-kind: RunnerDeployment
-metadata:
-  name: example-runnerdeploy
-spec:
-  # replicas: 1
-  template:
-    spec:
-      repository: ${TEST_REPO}
-
-      #
-      # Custom runner image
-      #
-      image: ${RUNNER_NAME}:${RUNNER_TAG}
-      imagePullPolicy: IfNotPresent
-
-      # Whether to pass --ephemeral (true) or --once (false, deprecated)
-      env:
-      - name: RUNNER_FEATURE_FLAG_EPHEMERAL
-        value: "${RUNNER_FEATURE_FLAG_EPHEMERAL}"
-
-      #
-      # dockerd within runner container
-      #
-      ## Replace `mumoshu/actions-runner-dind:dev` with your dind image
-      #dockerdWithinRunnerContainer: true
-      #image: mumoshu/actions-runner-dind:dev
-
-      #
-      # Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
-      #
-      #dockerMTU: 1450
-
-      #Runner group
-      # labels:
-      # - "mylabel 1"
-      # - "mylabel 2"
-      labels:
-      - "${RUNNER_LABEL}"
-
-      #
-      # Non-standard working directory
-      #
-      # workDir: "/"

@@ -1,29 +0,0 @@
-apiVersion: actions.summerwind.dev/v1alpha1
-kind: HorizontalRunnerAutoscaler
-metadata:
-  name: example-runnerset
-spec:
-  scaleTargetRef:
-    kind: RunnerSet
-    name: example-runnerset
-  scaleUpTriggers:
-  - githubEvent:
-      checkRun:
-        types: ["created"]
-        status: "queued"
-    amount: 1
-    duration: "1m"
-  # RunnerSet doesn't support scale from/to zero yet
-  minReplicas: 1
-  maxReplicas: 5
-  # This should be less than 600(seconds, the default) for faster testing
-  scaleDownDelaySecondsAfterScaleOut: 60
-  metrics:
-  - type: PercentageRunnersBusy
-    scaleUpThreshold: '0.75'
-    scaleDownThreshold: '0.3'
-    scaleUpFactor: '2'
-    scaleDownFactor: '0.5'
-  - type: TotalNumberOfQueuedAndInProgressWorkflowRuns
-    repositoryNames:
-    - ${TEST_REPO}
@@ -18,15 +18,12 @@ package controllers

 import (
   "context"
-  "errors"
   "fmt"
   "strings"
   "time"

   "github.com/actions-runner-controller/actions-runner-controller/hash"
   "github.com/go-logr/logr"
-  gogithub "github.com/google/go-github/v39/github"
-  "k8s.io/apimachinery/pkg/util/wait"

   kerrors "k8s.io/apimachinery/pkg/api/errors"
   "k8s.io/apimachinery/pkg/runtime"
@@ -92,12 +89,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
     return ctrl.Result{}, client.IgnoreNotFound(err)
   }

-  err := runner.Validate()
-  if err != nil {
-    log.Info("Failed to validate runner spec", "error", err.Error())
-    return ctrl.Result{}, nil
-  }
-
   if runner.ObjectMeta.DeletionTimestamp.IsZero() {
     finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
@@ -125,34 +116,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
     return r.processRunnerDeletion(runner, ctx, log, &pod)
   }

-  registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
-  if registrationOnly && runner.Status.Phase != "" {
-    // At this point we are sure that the registration-only runner has successfully configured and
-    // is of `offline` status, because we set runner.Status.Phase to that of the runner pod only after
-    // successful registration.
-
-    var pod corev1.Pod
-    if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
-      if !kerrors.IsNotFound(err) {
-        log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
-
-        return ctrl.Result{Requeue: true}, nil
-      }
-    } else if err := r.Delete(ctx, &pod); err != nil {
-      if !kerrors.IsNotFound(err) {
-        log.Info(fmt.Sprintf("Retrying soon as we failed to delete registration-only runner pod: %v", err))
-
-        return ctrl.Result{Requeue: true}, nil
-      }
-    }
-
-    log.Info("Successfully deleted registration-only runner pod to free node and cluster resource")
-
-    // Return here to not recreate the deleted pod, because recreating it is the waste of cluster and node resource,
-    // and also defeats the original purpose of scale-from/to-zero we're trying to implement by using the registration-only runner.
-    return ctrl.Result{}, nil
-  }
-
   var pod corev1.Pod
   if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
     if !kerrors.IsNotFound(err) {
@@ -162,254 +125,22 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
     return r.processRunnerCreation(ctx, runner, log)
   }

-  // Pod already exists
-  if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
-    return r.processRunnerPodDeletion(ctx, runner, log, pod)
+  phase := string(pod.Status.Phase)
+  if phase == "" {
+    phase = "Created"
   }

-  // If pod has ended up succeeded we need to restart it
-  // Happens e.g. when dind is in runner and run completes
-  stopped := runnerPodOrContainerIsStopped(&pod)
-
-  ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
-
-  if stopped && ephemeral {
-    log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
-
-    // This is the key to make ephemeral runners to work reliably with webhook-based autoscale.
-    // See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
-    //
-    // In the next reconcilation loop, this triggers a runner unregistration.
-    // (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation)
-    //
-    // See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
-    // which handles the unregistrationa the removal of the completed pod, and so on.
-    if err := r.Delete(ctx, &runner); err != nil {
-      log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
-      return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
-    }
-
-    return ctrl.Result{Requeue: true}, nil
-  }
-
-  restart := stopped
-
-  if registrationOnly && stopped {
-    restart = false
-
-    log.Info(
-      "Observed that registration-only runner for scaling-from-zero has successfully stopped. " +
-        "Unlike other pods, this one will be recreated only when runner spec changes.",
-    )
-  }
-
-  if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
-    return ctrl.Result{}, err
-  } else if updated {
-    return ctrl.Result{Requeue: true}, nil
-  }
-
-  newPod, err := r.newPod(runner)
-  if err != nil {
-    log.Error(err, "Could not create pod")
-    return ctrl.Result{}, err
-  }
-
-  if registrationOnly {
-    newPod.Spec.Containers[0].Env = append(
-      newPod.Spec.Containers[0].Env,
-      corev1.EnvVar{
-        Name:  "RUNNER_REGISTRATION_ONLY",
-        Value: "true",
-      },
-    )
-  }
-
-  var registrationRecheckDelay time.Duration
-
-  // all checks done below only decide whether a restart is needed
-  // if a restart was already decided before, there is no need for the checks
-  // saving API calls and scary log messages
-  if !restart {
-    registrationCheckInterval := time.Minute
-    if r.RegistrationRecheckInterval > 0 {
-      registrationCheckInterval = r.RegistrationRecheckInterval
-    }
-
-    // We want to call ListRunners GitHub Actions API only once per runner per minute.
-    // This if block, in conjunction with:
-    //   return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
-    // achieves that.
-    if lastCheckTime := runner.Status.LastRegistrationCheckTime; lastCheckTime != nil {
-      nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
-      now := time.Now()
-
-      // Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds)
-      // so to avoid excessive, in-effective retry, we heuristically ignore the remaining delay in case it is
-      // shorter than 1s
-      requeueAfter := nextCheckTime.Sub(now) - time.Second
-      if requeueAfter > 0 {
-        log.Info(
-          fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
-          "lastRegistrationCheckTime", lastCheckTime,
-          "registrationCheckInterval", registrationCheckInterval,
-        )
-
-        // Without RequeueAfter, the controller may not retry on scheduled. Instead, it must wait until the
-        // next sync period passes, which can be too much later than nextCheckTime.
-        //
-        // We need to requeue on this reconcilation even though we have already scheduled the initial
-        // requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
-        // Apparently, the workqueue used by controller-runtime seems to deduplicate and resets the delay on
-        // other requeues- so the initial scheduled requeue may have been reset due to requeue on
-        // spec/status change.
-        return ctrl.Result{RequeueAfter: requeueAfter}, nil
-      }
-    }
-
-    notFound := false
-    offline := false
-
-    runnerBusy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-
-    currentTime := time.Now()
-
-    if err != nil {
-      var notFoundException *github.RunnerNotFound
-      var offlineException *github.RunnerOffline
-      if errors.As(err, &notFoundException) {
-        notFound = true
-      } else if errors.As(err, &offlineException) {
-        offline = true
-      } else {
-        var e *gogithub.RateLimitError
-        if errors.As(err, &e) {
-          // We log the underlying error when we failed calling GitHub API to list or unregisters,
-          // or the runner is still busy.
-          log.Error(
-            err,
-            fmt.Sprintf(
-              "Failed to check if runner is busy due to Github API rate limit. Retrying in %s to avoid excessive GitHub API calls",
-              retryDelayOnGitHubAPIRateLimitError,
-            ),
-          )
-
-          return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-        }
-
-        return ctrl.Result{}, err
-      }
-    }
-
-    // See the `newPod` function called above for more information
-    // about when this hash changes.
-    curHash := pod.Labels[LabelKeyPodTemplateHash]
-    newHash := newPod.Labels[LabelKeyPodTemplateHash]
-
-    if !runnerBusy && curHash != newHash {
-      restart = true
-    }
-
-    registrationTimeout := 10 * time.Minute
-    durationAfterRegistrationTimeout := currentTime.Sub(pod.CreationTimestamp.Add(registrationTimeout))
-    registrationDidTimeout := durationAfterRegistrationTimeout > 0
-
-    if notFound {
-      if registrationDidTimeout {
-        log.Info(
-          "Runner failed to register itself to GitHub in timely manner. "+
-            "Recreating the pod to see if it resolves the issue. "+
-            "CAUTION: If you see this a lot, you should investigate the root cause. "+
-            "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
-          "podCreationTimestamp", pod.CreationTimestamp,
-          "currentTime", currentTime,
-          "configuredRegistrationTimeout", registrationTimeout,
-        )
-
-        restart = true
-      } else {
-        log.V(1).Info(
-          "Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
-          "runnerName", runner.Name,
-        )
-      }
-    } else if offline {
-      if registrationOnly {
-        log.Info(
-          "Observed that registration-only runner for scaling-from-zero has successfully been registered.",
-          "podCreationTimestamp", pod.CreationTimestamp,
-          "currentTime", currentTime,
-          "configuredRegistrationTimeout", registrationTimeout,
-        )
-      } else if registrationDidTimeout {
-        if runnerBusy {
-          log.Info(
-            "Timeout out while waiting for the runner to be online, but observed that it's busy at the same time."+
-              "This is a known (unintuitive) behaviour of a runner that is already running a job. Please see https://github.com/actions-runner-controller/actions-runner-controller/issues/911",
-            "podCreationTimestamp", pod.CreationTimestamp,
-            "currentTime", currentTime,
-            "configuredRegistrationTimeout", registrationTimeout,
-          )
-        } else {
-          log.Info(
-            "Already existing GitHub runner still appears offline . "+
-              "Recreating the pod to see if it resolves the issue. "+
-              "CAUTION: If you see this a lot, you should investigate the root cause. ",
-            "podCreationTimestamp", pod.CreationTimestamp,
-            "currentTime", currentTime,
-            "configuredRegistrationTimeout", registrationTimeout,
-          )
-
-          restart = true
-        }
-      } else {
-        log.V(1).Info(
-          "Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
-          "runnerName", runner.Name,
-        )
-      }
-    }
-
-    if (notFound || (offline && !registrationOnly)) && !registrationDidTimeout {
-      registrationRecheckJitter := 10 * time.Second
-      if r.RegistrationRecheckJitter > 0 {
-        registrationRecheckJitter = r.RegistrationRecheckJitter
-      }
-
-      registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
-    }
-  }
-
-  // Don't do anything if there's no need to restart the runner
-  if !restart {
-    // This guard enables us to update runner.Status.Phase to `Running` only after
-    // the runner is registered to GitHub.
-    if registrationRecheckDelay > 0 {
-      log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
-
-      updated := runner.DeepCopy()
-      updated.Status.LastRegistrationCheckTime = &metav1.Time{Time: time.Now()}
-
-      if err := r.Status().Patch(ctx, updated, client.MergeFrom(&runner)); err != nil {
-        log.Error(err, "Failed to update runner status for LastRegistrationCheckTime")
-        return ctrl.Result{}, err
-      }
-
-      return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
-    }
-
-    if runner.Status.Phase != string(pod.Status.Phase) {
+  if runner.Status.Phase != phase {
     if pod.Status.Phase == corev1.PodRunning {
       // Seeing this message, you can expect the runner to become `Running` soon.
-      log.Info(
-        "Runner appears to have registered and running.",
+      log.V(1).Info(
+        "Runner appears to have been registered and running.",
         "podCreationTimestamp", pod.CreationTimestamp,
       )
     }

     updated := runner.DeepCopy()
-    updated.Status.Phase = string(pod.Status.Phase)
+    updated.Status.Phase = phase
     updated.Status.Reason = pod.Status.Reason
     updated.Status.Message = pod.Status.Message

@@ -422,24 +153,6 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
   return ctrl.Result{}, nil
 }

-  updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
-  if res != nil {
-    return *res, err
-  }
-
-  // Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
-  // This should help us avoid race condition between runner pickup job after we think the runner is not busy.
-  if err := r.Delete(ctx, updatedPod); err != nil {
-    log.Error(err, "Failed to delete pod resource")
-    return ctrl.Result{}, err
-  }
-
-  r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
-  log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
-
-  return ctrl.Result{}, nil
-}
-
 func runnerContainerExitCode(pod *corev1.Pod) *int32 {
   for _, status := range pod.Status.ContainerStatuses {
     if status.Name != containerName {
@@ -480,11 +193,6 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
   finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)

   if removed {
-    _, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
-    if res != nil {
-      return *res, err
-    }
-
     newRunner := runner.DeepCopy()
     newRunner.ObjectMeta.Finalizers = finalizers

@@ -499,60 +207,6 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
   return ctrl.Result{}, nil
 }

-func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
-  unregistrationTimeout := DefaultUnregistrationTimeout
-
-  if r.UnregistrationTimeout > 0 {
-    unregistrationTimeout = r.UnregistrationTimeout
-  }
-  return unregistrationTimeout
-}
-
-func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
-  retryDelay := DefaultUnregistrationRetryDelay
-
-  if r.UnregistrationRetryDelay > 0 {
-    retryDelay = r.UnregistrationRetryDelay
-  }
-  return retryDelay
-}
-
-func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
-  deletionTimeout := 1 * time.Minute
-  currentTime := time.Now()
-  deletionDidTimeout := currentTime.Sub(pod.DeletionTimestamp.Add(deletionTimeout)) > 0
-
-  if deletionDidTimeout {
-    log.Info(
-      fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
-        "This is typically the case when a Kubernetes node became unreachable "+
-        "and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
-      "podDeletionTimestamp", pod.DeletionTimestamp,
-      "currentTime", currentTime,
-      "configuredDeletionTimeout", deletionTimeout,
-    )
-
-    var force int64 = 0
-    // forcefully delete runner as we would otherwise get stuck if the node stays unreachable
-    if err := r.Delete(ctx, &pod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
-      // probably
-      if !kerrors.IsNotFound(err) {
-        log.Error(err, "Failed to forcefully delete pod resource ...")
-        return ctrl.Result{}, err
-      }
-      // forceful deletion finally succeeded
-      return ctrl.Result{Requeue: true}, nil
-    }
-
-    r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", pod.Name))
-    log.Info("Forcefully deleted runner pod", "repository", runner.Spec.Repository)
-    // give kube manager a little time to forcefully delete the stuck pod
-    return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
-  } else {
-    return ctrl.Result{}, nil
-  }
-}
-
 func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1alpha1.Runner, log logr.Logger) (reconcile.Result, error) {
   if updated, err := r.updateRegistrationToken(ctx, runner); err != nil {
     return ctrl.Result{}, err
@@ -584,6 +238,7 @@ func (r *RunnerReconciler) processRunnerCreation(ctx context.Context, runner v1a

   r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodCreated", fmt.Sprintf("Created pod '%s'", newPod.Name))
   log.Info("Created runner pod", "repository", runner.Spec.Repository)

   return ctrl.Result{}, nil
 }

@@ -696,7 +351,7 @@ func (r *RunnerReconciler) newPod(runner v1alpha1.Runner) (corev1.Pod, error) {

   registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)

-  pod, err := newRunnerPod(template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly)
+  pod, err := newRunnerPod(runner.Name, template, runner.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubClient.GithubBaseURL, registrationOnly)
   if err != nil {
     return pod, err
   }
@@ -813,7 +468,7 @@ func mutatePod(pod *corev1.Pod, token string) *corev1.Pod {
   return updated
 }

-func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) {
+func newRunnerPod(runnerName string, template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, defaultRunnerImage string, defaultRunnerImagePullSecrets []string, defaultDockerImage, defaultDockerRegistryMirror string, githubBaseURL string, registrationOnly bool) (corev1.Pod, error) {
   var (
     privileged bool = true
     dockerdInRunner bool = runnerSpec.DockerdWithinRunnerContainer != nil && *runnerSpec.DockerdWithinRunnerContainer
@@ -822,6 +477,12 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
     dockerdInRunnerPrivileged bool = dockerdInRunner
   )

+  template = *template.DeepCopy()
+
+  // This label selector is used by default when rd.Spec.Selector is empty.
+  template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerName)
+  template.ObjectMeta.Labels = CloneAndAddLabel(template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation)
+
   workDir := runnerSpec.WorkDir
   if workDir == "" {
     workDir = "/runner/_work"
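The two CloneAndAddLabel calls added above are what later let the shared pod-owner logic find the pods that belong to a given Runner or RunnerSet by label. CloneAndAddLabel's implementation is not part of this diff; the following is only a rough sketch of how such a helper typically behaves (copy the map, add one key), to make the intent of the added lines clearer:

    // Sketch of a CloneAndAddLabel-style helper (illustrative, not the project's exact code).
    // It returns a copy of the label map with one extra key/value and leaves the input untouched,
    // which is why the template can be labeled safely after the DeepCopy above.
    func cloneAndAddLabel(labels map[string]string, key, value string) map[string]string {
        newLabels := make(map[string]string, len(labels)+1)
        for k, v := range labels {
            newLabels[k] = v
        }
        newLabels[key] = value
        return newLabels
    }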
@@ -10,6 +10,7 @@ import (
   "github.com/go-logr/logr"
   appsv1 "k8s.io/api/apps/v1"
   corev1 "k8s.io/api/core/v1"
+  "k8s.io/apimachinery/pkg/api/errors"
   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
   "k8s.io/apimachinery/pkg/types"
   "sigs.k8s.io/controller-runtime/pkg/client"
@@ -26,6 +27,7 @@ type podsForOwner struct {
   runner      *v1alpha1.Runner
   statefulSet *appsv1.StatefulSet
   owner       owner
+  object      client.Object
   synced      bool
   pods        []corev1.Pod
 }
@@ -52,6 +54,9 @@ func (r *ownerRunner) pods(ctx context.Context, c client.Client) ([]corev1.Pod,
   var pod corev1.Pod

   if err := c.Get(ctx, types.NamespacedName{Namespace: r.Runner.Namespace, Name: r.Runner.Name}, &pod); err != nil {
+    if errors.IsNotFound(err) {
+      return nil, nil
+    }
     r.Log.Error(err, "Failed to get pod managed by runner")
     return nil, err
   }
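The errors.IsNotFound guard added above makes a missing runner pod mean "no pods yet" instead of a reconciliation error. A small self-contained sketch of the same pattern with a controller-runtime client (names here are illustrative, not taken from the project):

    package example

    import (
        "context"

        corev1 "k8s.io/api/core/v1"
        apierrors "k8s.io/apimachinery/pkg/api/errors"
        "k8s.io/apimachinery/pkg/types"
        "sigs.k8s.io/controller-runtime/pkg/client"
    )

    // getPodIfExists returns (nil, nil) when the pod does not exist, so the caller
    // can treat "not created yet" as an empty result rather than a failure.
    func getPodIfExists(ctx context.Context, c client.Client, key types.NamespacedName) (*corev1.Pod, error) {
        var pod corev1.Pod
        if err := c.Get(ctx, key, &pod); err != nil {
            if apierrors.IsNotFound(err) {
                return nil, nil
            }
            return nil, err
        }
        return &pod, nil
    }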
@@ -70,7 +75,7 @@ func (r *ownerRunner) withAnnotation(k, v string) client.Object {
 }

 func (r *ownerRunner) synced() bool {
-  return true
+  return r.Runner.Status.Phase != ""
 }

 type ownerStatefulSet struct {
@@ -104,7 +109,7 @@ func (s *ownerStatefulSet) pods(ctx context.Context, c client.Client) ([]corev1.
 }

 func (s *ownerStatefulSet) templateHash() (string, bool) {
-  return getStatefulSetTemplateHash(s.StatefulSet)
+  return getRunnerTemplateHash(s.StatefulSet)
 }

 func (s *ownerStatefulSet) withAnnotation(k, v string) client.Object {
@@ -132,23 +137,26 @@ func getPodsForOwner(ctx context.Context, c client.Client, log logr.Logger, o cl
     owner       owner
     runner      *v1alpha1.Runner
     statefulSet *appsv1.StatefulSet
+    object      client.Object
   )

   switch v := o.(type) {
   case *v1alpha1.Runner:
     owner = &ownerRunner{
-      Object: v,
       Log:    log,
       Runner: v,
+      Object: v,
     }
     runner = v
+    object = v
   case *appsv1.StatefulSet:
     owner = &ownerStatefulSet{
-      Object: v,
       Log:         log,
       StatefulSet: v,
+      Object:      v,
     }
     statefulSet = v
+    object = v
   default:
     return nil, fmt.Errorf("BUG: Unsupported runner pods owner %v(%T)", v, v)
   }
@@ -209,19 +217,14 @@ func getPodsForOwner(ctx context.Context, c client.Client, log logr.Logger, o cl
     runner:      runner,
     statefulSet: statefulSet,
     owner:       owner,
+    object:      object,
     synced:      synced,
     pods:        pods,
   }, nil
 }

-func getRunnerTemplateHash(r *v1alpha1.Runner) (string, bool) {
-  hash, ok := r.Labels[LabelKeyRunnerTemplateHash]
-
-  return hash, ok
-}
-
-func getStatefulSetTemplateHash(rs *appsv1.StatefulSet) (string, bool) {
-  hash, ok := rs.Labels[LabelKeyRunnerTemplateHash]
+func getRunnerTemplateHash(r client.Object) (string, bool) {
+  hash, ok := r.GetLabels()[LabelKeyRunnerTemplateHash]

   return hash, ok
 }
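Switching getRunnerTemplateHash to take a client.Object is what lets one helper serve both owner kinds: *v1alpha1.Runner and *appsv1.StatefulSet both implement client.Object, so the label can be read through GetLabels() in either case. A minimal illustration of that idea (hypothetical helper name, not the project's code):

    // templateHashOf reads a label from any Kubernetes object via the generic
    // client.Object interface, so Runners and StatefulSets can share one code path.
    func templateHashOf(obj client.Object, labelKey string) (string, bool) {
        hash, ok := obj.GetLabels()[labelKey]
        return hash, ok
    }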
@@ -235,7 +238,18 @@ type result struct {
   currentObjects []*podsForOwner
 }

-func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, effectiveTime *metav1.Time, newDesiredReplicas int, desiredTemplateHash string, create client.Object, ephemeral bool, owners []client.Object) (*result, error) {
+// Why `create` must be a function rather than a client.Object? That's becase we use it to create one or more objects on scale up.
+//
+// We use client.Create to create a necessary number of client.Object. client.Create mutates the passed object on a successful creation.
+// It seems to set .Revision at least, and the existence of .Revision let client.Create fail due to K8s restriction that an object being just created
+// can't have .Revision.
+// Now, imagine that you are to add 2 runner replicas on scale up.
+// We create one resource object per a replica that ends up calling 2 client.Create calls.
+// If we were reusing client.Object to be passed to client.Create calls, only the first call suceeeds.
+// The second call fails due to the first call mutated the client.Object to have .Revision.
+// Passing a factory function of client.Object and creating a brand-new client.Object per a client.Create call resolves this issue,
+// allowing us to create two or more replicas in one reconcilation loop without being rejected by K8s.
+func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger, effectiveTime *metav1.Time, newDesiredReplicas int, create func() client.Object, ephemeral bool, owners []client.Object) (*result, error) {
   state, err := collectPodsForOwners(ctx, c, log, owners)
   if err != nil || state == nil {
     return nil, err
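The comment block above is the rationale for changing `create` from a client.Object into a factory. A short sketch of the difference under that assumption (illustrative only; createReplicas and newObject are made-up names, not the project's code):

    // Reusing one object across Create calls fails: the first successful Create
    // fills in server-assigned fields such as resourceVersion/UID, and the API
    // server rejects a second Create of that same, now-populated object.
    //
    // Building a brand-new object per call avoids that, which is what the
    // func() client.Object parameter enables:
    func createReplicas(ctx context.Context, c client.Client, n int, newObject func() client.Object) error {
        for i := 0; i < n; i++ {
            if err := c.Create(ctx, newObject()); err != nil {
                return err
            }
        }
        return nil
    }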
@@ -265,6 +279,13 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
   // Even though the error message includes "Forbidden", this error's reason is "Invalid".
   // So we used to match these errors by using errors.IsInvalid. But that's another story...

+  desiredTemplateHash, ok := getRunnerTemplateHash(create())
+  if !ok {
+    log.Info("Failed to get template hash of desired owner resource. It must be in an invalid state. Please manually delete the owner so that it is recreated")
+
+    return nil, nil
+  }
+
   currentObjects := podsForOwnersPerTemplateHash[desiredTemplateHash]

   sort.SliceStable(currentObjects, func(i, j int) bool {
@@ -289,13 +310,20 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
     regTimeout += ss.regTimeout
   }

+  numOwners := len(owners)
+
+  var hashes []string
+  for h, _ := range state.podsForOwners {
+    hashes = append(hashes, h)
+  }
+
   log.V(2).Info(
     "Found some pods across owner(s)",
     "pending", pending,
     "running", running,
     "regTimeout", regTimeout,
     "desired", newDesiredReplicas,
-    "owners", len(owners),
+    "owners", numOwners,
   )

   maybeRunning := pending + running
@@ -307,15 +335,39 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,

     for i := 0; i < num; i++ {
       // Add more replicas
-      if err := c.Create(ctx, create); err != nil {
+      if err := c.Create(ctx, create()); err != nil {
        return nil, err
      }
    }

-    log.V(2).Info("Created object(s) to add more replicas", "num", num)
+    log.V(1).Info("Created replica(s)",
+      "created", num,
+      "templateHashDesired", desiredTemplateHash,
+      "replicasDesired", newDesiredReplicas,
+      "replicasMaybeRunning", maybeRunning,
+      "templateHashObserved", hashes,
+    )

     return nil, nil
   } else if newDesiredReplicas <= running {
+    // If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
+    // you're unlikely to fall into this branch.
+    //
+    // That's because all the stakeholders work like this:
+    //
+    // 1. A runner pod completes with the runner container exiting with code 0
+    // 2. ARC runner controller detects the pod completion, marks the owner(runner or statefulset) resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
+    // 3. GitHub triggers a corresponding workflow_job "complete" webhook event
+    // 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation
+    // 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
+    // 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
+    // 7. (We're here) ARC runnerset or runnerreplicaset controller starts reconciling the owner resource (statefulset or runner)
+    //
+    // In a normally working ARC installation, the runner that was used to run the workflow job should already have been
+    // marked for deletion by the runner controller.
+    // This runnerreplicaset controller doesn't count marked runners into the `running` value, hence you're unlikely to
+    // fall into this branch when you're using ephemeral runners with webhook-based-autoscaler.
+
     var retained int

     var delete []*podsForOwner
@@ -354,7 +406,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
       if _, ok := getAnnotation(ss.owner, AnnotationKeyUnregistrationRequestTimestamp); !ok {
         updated := ss.owner.withAnnotation(AnnotationKeyUnregistrationRequestTimestamp, time.Now().Format(time.RFC3339))

-        if err := c.Patch(ctx, updated, client.MergeFrom(ss.owner)); err != nil {
+        if err := c.Patch(ctx, updated, client.MergeFrom(ss.object)); err != nil {
           log.Error(err, fmt.Sprintf("Failed to patch object to have %s annotation", AnnotationKeyUnregistrationRequestTimestamp))
           return nil, err
         }
@@ -379,7 +431,7 @@ func syncRunnerPodsOwners(ctx context.Context, c client.Client, log logr.Logger,
     for _, ss := range sss {
       if ss.templateHash != desiredTemplateHash {
         if ss.owner.GetDeletionTimestamp().IsZero() {
-          if err := c.Delete(ctx, ss.owner); err != nil {
+          if err := c.Delete(ctx, ss.object); err != nil {
             log.Error(err, "Unable to delete object")
             return nil, err
           }
@@ -417,6 +469,12 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
       return nil, err
     }

+    if res.templateHash == "" {
+      log.Info("validation error: runner pod owner must have template hash", "object", res.object)
+
+      return nil, nil
+    }
+
     // Statefulset termination process 4/4: Let Kubernetes cascade-delete the statefulset and the pods.
     //
     // If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
@@ -429,7 +487,7 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,

       // Statefulset termination process 3/4: Set the deletionTimestamp to let Kubernetes start a cascade deletion of the statefulset and the pods.
       if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); ok {
-        if err := c.Delete(ctx, res.owner); err != nil {
+        if err := c.Delete(ctx, res.object); err != nil {
           log.Error(err, "Failed to delete owner")
           return nil, err
         }
@@ -454,7 +512,7 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
       if _, ok := getAnnotation(res.owner, AnnotationKeyUnregistrationCompleteTimestamp); !ok {
         updated := res.owner.withAnnotation(AnnotationKeyUnregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))

-        if err := c.Patch(ctx, updated, client.MergeFrom(res.owner)); err != nil {
+        if err := c.Patch(ctx, updated, client.MergeFrom(res.object)); err != nil {
           log.Error(err, fmt.Sprintf("Failed to patch owner to have %s annotation", AnnotationKeyUnregistrationCompleteTimestamp))
           return nil, err
         }
@@ -494,6 +552,8 @@ func collectPodsForOwners(ctx context.Context, c client.Client, log logr.Logger,
     }

     if !res.synced {
+      log.V(1).Info("Skipped reconcilation because owner is not synced yet", "pods", res.pods)
+
       return nil, nil
     }

@@ -18,13 +18,10 @@ package controllers

 import (
   "context"
-  "errors"
-  "fmt"
   "reflect"
   "time"

   "github.com/go-logr/logr"
-  gogithub "github.com/google/go-github/v39/github"

   kerrors "k8s.io/apimachinery/pkg/api/errors"
   "k8s.io/apimachinery/pkg/runtime"
@@ -32,7 +29,6 @@ import (
   ctrl "sigs.k8s.io/controller-runtime"
   "sigs.k8s.io/controller-runtime/pkg/client"

-  corev1 "k8s.io/api/core/v1"
   metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

   "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
@@ -72,15 +68,35 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
     return ctrl.Result{}, nil
   }

+  if rs.ObjectMeta.Labels == nil {
+    rs.ObjectMeta.Labels = map[string]string{}
+  }
+
+  // Template hash is usually set by the upstream controller(RunnerDeplloyment controller) on authoring
+  // RunerReplicaset resource, but it may be missing when the user directly created RunnerReplicaSet.
+  // As a template hash is required by by the runner replica management, we dynamically add it here without ever persisting it.
+  if rs.ObjectMeta.Labels[LabelKeyRunnerTemplateHash] == "" {
+    template := rs.Spec.DeepCopy()
+    template.Replicas = nil
+    template.EffectiveTime = nil
+    templateHash := ComputeHash(template)
+
+    log.Info("Using auto-generated template hash", "value", templateHash)
+
+    rs.ObjectMeta.Labels = CloneAndAddLabel(rs.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash)
+    rs.Spec.Template.ObjectMeta.Labels = CloneAndAddLabel(rs.Spec.Template.ObjectMeta.Labels, LabelKeyRunnerTemplateHash, templateHash)
+  }
+
   selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
   if err != nil {
     return ctrl.Result{}, err
   }

   // Get the Runners managed by the target RunnerReplicaSet
-  var allRunners v1alpha1.RunnerList
+  var runnerList v1alpha1.RunnerList
   if err := r.List(
     ctx,
-    &allRunners,
+    &runnerList,
     client.InNamespace(req.Namespace),
     client.MatchingLabelsSelector{Selector: selector},
   ); err != nil {
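The block above backfills the template-hash label when a RunnerReplicaSet was created by hand rather than by the RunnerDeployment controller; Replicas and EffectiveTime are cleared first so that scale or timing changes alone never change the hash. ComputeHash itself is not shown in this diff; the sketch below only illustrates the common approach of hashing a dump of the object and encoding it into a label-safe string (an assumption about its shape, not the project's exact implementation):

    import (
        "fmt"
        "hash/fnv"

        "k8s.io/apimachinery/pkg/util/rand"
    )

    // computeTemplateHash is an illustrative stand-in for ComputeHash: hash a
    // printable dump of the (already-normalized) template and encode the sum so
    // it is safe to use as a Kubernetes label value.
    func computeTemplateHash(template interface{}) string {
        hasher := fnv.New32a()
        fmt.Fprintf(hasher, "%#v", template)
        return rand.SafeEncodeString(fmt.Sprint(hasher.Sum32()))
    }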
@@ -89,218 +105,43 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
     }
   }

-  var (
-    current int
-    ready int
-    available int
-
-    lastSyncTime *time.Time
-  )
-
-  for _, r := range allRunners.Items {
-    // This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
-    // to not treat all the runners in the namespace as its children.
-    if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
-      // If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
-      // or by runnerreplicaset controller (in case it was deleted in the previous reconcilation loop),
-      // we don't need to bother calling GitHub API to re-mark the runner for deletion.
-      // Just hold on, and runners will disappear as long as the runner controller is up and running.
-      if !r.DeletionTimestamp.IsZero() {
-        continue
-      }
-
-      if r.Annotations != nil {
-        if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
-          t, err := time.Parse(time.RFC3339, a)
-          if err == nil {
-            if lastSyncTime == nil || lastSyncTime.Before(t) {
-              lastSyncTime = &t
-            }
-          }
-        }
-      }
-
-      current += 1
-
-      if r.Status.Phase == string(corev1.PodRunning) {
-        ready += 1
-        // available is currently the same as ready, as we don't yet have minReadySeconds for runners
-        available += 1
-      }
-    }
-  }
-
-  var desired int
-
+  replicas := 1
   if rs.Spec.Replicas != nil {
-    desired = *rs.Spec.Replicas
+    replicas = *rs.Spec.Replicas
-  } else {
-    desired = 1
-  }
-
-  // TODO: remove this registration runner cleanup later (v0.23.0 or v0.24.0)
-  //
-  // We had to have a registration-only runner to support scale-from-zero before.
-  // But since Sep 2021 Actions update on GitHub Cloud and GHES 3.3, it is unneceesary.
-  // See the below issues for more contexts:
-  // https://github.com/actions-runner-controller/actions-runner-controller/issues/516
-  // https://github.com/actions-runner-controller/actions-runner-controller/issues/859
-  //
-  // In the below block, we have a logic to remove existing registration-only runners as unnecessary.
-  // This logic is introduced since actions-runner-controller 0.21.0 and probably last one or two minor releases
-  // so that actions-runner-controller instance in everyone's cluster won't leave dangling registration-only runners.
-  registrationOnlyRunnerNsName := req.NamespacedName
-  registrationOnlyRunnerNsName.Name = registrationOnlyRunnerNameFor(rs.Name)
-  registrationOnlyRunner := v1alpha1.Runner{}
-  registrationOnlyRunnerExists := false
-  if err := r.Get(
-    ctx,
-    registrationOnlyRunnerNsName,
-    &registrationOnlyRunner,
-  ); err != nil {
-    if !kerrors.IsNotFound(err) {
-      return ctrl.Result{}, err
-    }
-  } else {
-    registrationOnlyRunnerExists = true
-  }
-
-  if registrationOnlyRunnerExists {
-    if err := r.Client.Delete(ctx, &registrationOnlyRunner); err != nil {
-      log.Error(err, "Retrying soon because we failed to delete registration-only runner")
-
-      return ctrl.Result{Requeue: true}, nil
-    }
   }

   effectiveTime := rs.Spec.EffectiveTime
   ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral

-  if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
+  desired, err := r.newRunner(rs)
-    log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
-  } else if current > desired {
-    // If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
-    // you're unlikely to fall into this branch.
-    //
-    // That's becaseu all the stakeholders work like this:
-    //
-    // 1. A runner pod completes with the runner container exiting with code 0
-    // 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
-    // 3. GitHub triggers a corresponding workflow_job "complete" webhook event
-    // 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation
-    // 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
-    // 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
-    // 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
-    //
-    // In a normally working ARC installation, the runner that was used to run the workflow job should already have been
-    // marked for deletion by the runner controller.
-    // This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to
-    // fall into this branch when you're using ephemeral runners with webhook-based-autoscaler.
-
-    n := current - desired
-
-    log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready)
-
-    // get runners that are currently offline/not busy/timed-out to register
-    var deletionCandidates []v1alpha1.Runner
-
-    for _, runner := range allRunners.Items {
-      busy, err := r.GitHubClient.IsRunnerBusy(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
-      if err != nil {
-        notRegistered := false
-        offline := false
-
-        var notFoundException *github.RunnerNotFound
-        var offlineException *github.RunnerOffline
-        if errors.As(err, &notFoundException) {
-          log.V(1).Info("Failed to check if runner is busy. Either this runner has never been successfully registered to GitHub or it still needs more time.", "runnerName", runner.Name)
-          notRegistered = true
-        } else if errors.As(err, &offlineException) {
-          offline = true
-        } else {
-          var e *gogithub.RateLimitError
-          if errors.As(err, &e) {
-            // We log the underlying error when we failed calling GitHub API to list or unregisters,
-            // or the runner is still busy.
-            log.Error(
-              err,
-              fmt.Sprintf(
-                "Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls",
-                retryDelayOnGitHubAPIRateLimitError,
-              ),
-            )
-
-            return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
-          }
-
-          return ctrl.Result{}, err
-        }
-
-        registrationTimeout := 15 * time.Minute
-        currentTime := time.Now()
-        registrationDidTimeout := currentTime.Sub(runner.CreationTimestamp.Add(registrationTimeout)) > 0
-
-        if notRegistered && registrationDidTimeout {
-          log.Info(
-            "Runner failed to register itself to GitHub in timely manner. "+
-              "Marking the runner for scale down. "+
-              "CAUTION: If you see this a lot, you should investigate the root cause. "+
-              "See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
-            "runnerCreationTimestamp", runner.CreationTimestamp,
-            "currentTime", currentTime,
-            "configuredRegistrationTimeout", registrationTimeout,
-          )
-
-          deletionCandidates = append(deletionCandidates, runner)
-        }
-
-        // offline runners should always be a great target for scale down
-        if offline {
-          deletionCandidates = append(deletionCandidates, runner)
-        }
-      } else if !busy {
|
|
||||||
deletionCandidates = append(deletionCandidates, runner)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(deletionCandidates) < n {
|
|
||||||
n = len(deletionCandidates)
|
|
||||||
}
|
|
||||||
|
|
||||||
log.V(0).Info(fmt.Sprintf("Deleting %d runner(s)", n), "desired", desired, "current", current, "ready", ready)
|
|
||||||
|
|
||||||
for i := 0; i < n; i++ {
|
|
||||||
if err := r.Client.Delete(ctx, &deletionCandidates[i]); client.IgnoreNotFound(err) != nil {
|
|
||||||
log.Error(err, "Failed to delete runner resource")
|
|
||||||
|
|
||||||
return ctrl.Result{}, err
|
|
||||||
}
|
|
||||||
|
|
||||||
r.Recorder.Event(&rs, corev1.EventTypeNormal, "RunnerDeleted", fmt.Sprintf("Deleted runner '%s'", deletionCandidates[i].Name))
|
|
||||||
log.Info(fmt.Sprintf("Deleted runner %s", deletionCandidates[i].Name))
|
|
||||||
}
|
|
||||||
} else if desired > current {
|
|
||||||
n := desired - current
|
|
||||||
|
|
||||||
log.V(0).Info(fmt.Sprintf("Creating %d runner(s)", n), "desired", desired, "available", current, "ready", ready)
|
|
||||||
|
|
||||||
for i := 0; i < n; i++ {
|
|
||||||
newRunner, err := r.newRunner(rs)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error(err, "Could not create runner")
|
log.Error(err, "Could not create runner")
|
||||||
|
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := r.Client.Create(ctx, &newRunner); err != nil {
|
var live []client.Object
|
||||||
log.Error(err, "Failed to create runner resource")
|
for _, r := range runnerList.Items {
|
||||||
|
r := r
|
||||||
|
live = append(live, &r)
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, replicas, func() client.Object { return desired.DeepCopy() }, ephemeral, live)
|
||||||
|
if err != nil || res == nil {
|
||||||
return ctrl.Result{}, err
|
return ctrl.Result{}, err
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var status v1alpha1.RunnerReplicaSetStatus
|
var (
|
||||||
|
status v1alpha1.RunnerReplicaSetStatus
|
||||||
|
|
||||||
|
current, available, ready int
|
||||||
|
)
|
||||||
|
|
||||||
|
for _, o := range res.currentObjects {
|
||||||
|
current += o.total
|
||||||
|
available += o.running
|
||||||
|
ready += o.running
|
||||||
|
}
|
||||||
|
|
||||||
status.Replicas = ¤t
|
status.Replicas = ¤t
|
||||||
status.AvailableReplicas = &available
|
status.AvailableReplicas = &available
|
||||||
|
|
@ -322,6 +163,8 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1alpha1.Runner, error) {
|
func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1alpha1.Runner, error) {
|
||||||
|
// Note that the upstream controller (runnerdeployment) is expected to add
|
||||||
|
// the "runner template hash" label to the template.meta which is necessary to make this controller work correctly
|
||||||
objectMeta := rs.Spec.Template.ObjectMeta.DeepCopy()
|
objectMeta := rs.Spec.Template.ObjectMeta.DeepCopy()
|
||||||
|
|
||||||
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
|
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
|
||||||
|
|
|
||||||
|
|
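To make the new status computation above easier to follow, here is a minimal, self-contained Go sketch of the aggregation step. The `podsForOwner` and `syncResult` types are simplified stand-ins, not the project's real types; only the `total`/`running` fields and the summation mirror what the diff shows.

package main

import "fmt"

// podsForOwner is an illustrative stand-in for the per-owner bookkeeping that
// syncRunnerPodsOwners returns in the controller; field names mirror the diff.
type podsForOwner struct {
	total   int // runners owned by this object
	running int // runners whose pod is up and running
}

// syncResult is an assumed wrapper around the owners the sync routine saw.
type syncResult struct {
	currentObjects []podsForOwner
}

// aggregate mirrors the status computation in the refactored Reconcile:
// Replicas counts everything, while AvailableReplicas and ReadyReplicas
// both count running runners.
func aggregate(res syncResult) (current, available, ready int) {
	for _, o := range res.currentObjects {
		current += o.total
		available += o.running
		ready += o.running
	}
	return current, available, ready
}

func main() {
	res := syncResult{currentObjects: []podsForOwner{
		{total: 2, running: 1},
		{total: 1, running: 1},
	}}

	current, available, ready := aggregate(res)
	fmt.Printf("replicas=%d available=%d ready=%d\n", current, available, ready)
}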
@@ -7,7 +7,6 @@ import (
  "time"

  corev1 "k8s.io/api/core/v1"
-  "k8s.io/apimachinery/pkg/types"
  "k8s.io/client-go/kubernetes/scheme"
  ctrl "sigs.k8s.io/controller-runtime"
  logf "sigs.k8s.io/controller-runtime/pkg/log"

@@ -102,12 +101,40 @@ func intPtr(v int) *int {
var _ = Context("Inside of a new namespace", func() {
  ctx := context.TODO()
  ns := SetupTest(ctx)

-  Describe("when no existing resources exist", func() {

-    It("should create a new Runner resource from the specified template, add another Runner when replicas is increased, and remove all the replicas when set to 0", func() {
      name := "example-runnerreplicaset"

+  getRunnerCount := func() int {
+    runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}}

+    selector, err := metav1.LabelSelectorAsSelector(
+      &metav1.LabelSelector{
+        MatchLabels: map[string]string{
+          "foo": "bar",
+        },
+      },
+    )
+    if err != nil {
+      logf.Log.Error(err, "failed to create labelselector")
+      return -1
+    }

+    err = k8sClient.List(
+      ctx,
+      &runners,
+      client.InNamespace(ns.Name),
+      client.MatchingLabelsSelector{Selector: selector},
+    )
+    if err != nil {
+      logf.Log.Error(err, "list runners")
+    }

+    runnersList.Sync(runners.Items)

+    return len(runners.Items)
+  }

+  Describe("RunnerReplicaSet", func() {
+    It("should create a new Runner resource from the specified template", func() {
      {
        rs := &actionsv1alpha1.RunnerReplicaSet{
          ObjectMeta: metav1.ObjectMeta{

@@ -146,126 +173,99 @@ var _ = Context("Inside of a new namespace", func() {

        Expect(err).NotTo(HaveOccurred(), "failed to create test RunnerReplicaSet resource")

-        runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}}

        Eventually(
-          func() int {
+          getRunnerCount,
-            selector, err := metav1.LabelSelectorAsSelector(
+          time.Second*5, time.Second).Should(BeEquivalentTo(1))
-              &metav1.LabelSelector{
-                MatchLabels: map[string]string{
-                  "foo": "bar",
-                },
-              },
-            )
-            if err != nil {
-              logf.Log.Error(err, "failed to create labelselector")
-              return -1
-            }
-            err = k8sClient.List(
-              ctx,
-              &runners,
-              client.InNamespace(ns.Name),
-              client.MatchingLabelsSelector{Selector: selector},
-            )
-            if err != nil {
-              logf.Log.Error(err, "list runners")
-              return -1
-            }

-            runnersList.Sync(runners.Items)

-            return len(runners.Items)
-          },
-          time.Second*5, time.Millisecond*500).Should(BeEquivalentTo(1))
      }
+    })

+    It("should create 2 runners when specified 2 replicas", func() {
      {
-        // We wrap the update in the Eventually block to avoid the below error that occurs due to concurrent modification
+        rs := &actionsv1alpha1.RunnerReplicaSet{
-        // made by the controller to update .Status.AvailableReplicas and .Status.ReadyReplicas
+          ObjectMeta: metav1.ObjectMeta{
-        // Operation cannot be fulfilled on runnerreplicasets.actions.summerwind.dev "example-runnerreplicaset": the object has been modified; please apply your changes to the latest version and try again
+            Name:      name,
-        Eventually(func() error {
+            Namespace: ns.Name,
-          var rs actionsv1alpha1.RunnerReplicaSet

-          err := k8sClient.Get(ctx, types.NamespacedName{Namespace: ns.Name, Name: name}, &rs)

-          Expect(err).NotTo(HaveOccurred(), "failed to get test RunnerReplicaSet resource")

-          rs.Spec.Replicas = intPtr(2)

-          return k8sClient.Update(ctx, &rs)
          },
-          time.Second*1, time.Millisecond*500).Should(BeNil())
+          Spec: actionsv1alpha1.RunnerReplicaSetSpec{
+            Replicas: intPtr(2),
-        runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}}
+            Selector: &metav1.LabelSelector{

-        Eventually(
-          func() int {
-            selector, err := metav1.LabelSelectorAsSelector(
-              &metav1.LabelSelector{
              MatchLabels: map[string]string{
                "foo": "bar",
              },
            },
-            )
+            Template: actionsv1alpha1.RunnerTemplate{
-            if err != nil {
+              ObjectMeta: metav1.ObjectMeta{
-              logf.Log.Error(err, "failed to create labelselector")
+                Labels: map[string]string{
-              return -1
+                  "foo": "bar",
-            }
+                },
-            err = k8sClient.List(
+              },
-              ctx,
+              Spec: actionsv1alpha1.RunnerSpec{
-              &runners,
+                RunnerConfig: actionsv1alpha1.RunnerConfig{
-              client.InNamespace(ns.Name),
+                  Repository: "test/valid",
-              client.MatchingLabelsSelector{Selector: selector},
+                  Image:      "bar",
-            )
+                },
-            if err != nil {
+                RunnerPodSpec: actionsv1alpha1.RunnerPodSpec{
-              logf.Log.Error(err, "list runners")
+                  Env: []corev1.EnvVar{
-            }
+                    {Name: "FOO", Value: "FOOVALUE"},
+                  },
-            runnersList.Sync(runners.Items)
+                },
+              },
-            return len(runners.Items)
+            },
          },
-          time.Second*5, time.Millisecond*500).Should(BeEquivalentTo(2))
        }

+        err := k8sClient.Create(ctx, rs)

+        Expect(err).NotTo(HaveOccurred(), "failed to create test RunnerReplicaSet resource")

+        Eventually(
+          getRunnerCount,
+          time.Second*5, time.Second).Should(BeEquivalentTo(2))
+      }
+    })

+    It("should not create any runners when specified 0 replicas", func() {
      {
-        // We wrap the update in the Eventually block to avoid the below error that occurs due to concurrent modification
+        rs := &actionsv1alpha1.RunnerReplicaSet{
-        // made by the controller to update .Status.AvailableReplicas and .Status.ReadyReplicas
+          ObjectMeta: metav1.ObjectMeta{
-        // Operation cannot be fulfilled on runnersets.actions.summerwind.dev "example-runnerset": the object has been modified; please apply your changes to the latest version and try again
+            Name:      name,
-        Eventually(func() error {
+            Namespace: ns.Name,
-          var rs actionsv1alpha1.RunnerReplicaSet

-          err := k8sClient.Get(ctx, types.NamespacedName{Namespace: ns.Name, Name: name}, &rs)

-          Expect(err).NotTo(HaveOccurred(), "failed to get test RunnerReplicaSet resource")

-          rs.Spec.Replicas = intPtr(0)

-          return k8sClient.Update(ctx, &rs)
          },
-          time.Second*1, time.Millisecond*500).Should(BeNil())
+          Spec: actionsv1alpha1.RunnerReplicaSetSpec{
+            Replicas: intPtr(0),
-        runners := actionsv1alpha1.RunnerList{Items: []actionsv1alpha1.Runner{}}
+            Selector: &metav1.LabelSelector{

-        Eventually(
-          func() int {
-            selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
              MatchLabels: map[string]string{
                "foo": "bar",
              },
-            })
-            Expect(err).ToNot(HaveOccurred())

-            if err := k8sClient.List(ctx, &runners, client.InNamespace(ns.Name), client.MatchingLabelsSelector{Selector: selector}); err != nil {
-              logf.Log.Error(err, "list runners")
-              return -1
-            }

-            runnersList.Sync(runners.Items)

-            return len(runners.Items)
            },
-          time.Second*5, time.Millisecond*500).Should(BeEquivalentTo(0))
+            Template: actionsv1alpha1.RunnerTemplate{
+              ObjectMeta: metav1.ObjectMeta{
+                Labels: map[string]string{
+                  "foo": "bar",
+                },
+              },
+              Spec: actionsv1alpha1.RunnerSpec{
+                RunnerConfig: actionsv1alpha1.RunnerConfig{
+                  Repository: "test/valid",
+                  Image:      "bar",
+                },
+                RunnerPodSpec: actionsv1alpha1.RunnerPodSpec{
+                  Env: []corev1.EnvVar{
+                    {Name: "FOO", Value: "FOOVALUE"},
+                  },
+                },
+              },
+            },
+          },
+        }

+        err := k8sClient.Create(ctx, rs)

+        Expect(err).NotTo(HaveOccurred(), "failed to create test RunnerReplicaSet resource")

+        Consistently(
+          getRunnerCount,
+          time.Second*5, time.Second).Should(BeEquivalentTo(0))
      }
    })
  })

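The tests above share the `getRunnerCount` helper and assert on it through Gomega's `Eventually` and `Consistently`. As a rough sketch of those semantics only (this is not Gomega's implementation, and the helper names here are made up), an Eventually-style check polls until the expected count appears or a timeout expires, while a Consistently-style check requires the count to hold for the whole window:

package main

import (
	"fmt"
	"time"
)

// eventuallyEquals keeps polling get() until it returns want or the timeout
// elapses, roughly what Eventually(getRunnerCount, timeout, interval) does.
func eventuallyEquals(get func() int, want int, timeout, interval time.Duration) bool {
	deadline := time.Now().Add(timeout)
	for {
		if get() == want {
			return true
		}
		if time.Now().After(deadline) {
			return false
		}
		time.Sleep(interval)
	}
}

// consistentlyEquals requires get() to keep returning want for the whole
// window, roughly what Consistently(getRunnerCount, window, interval) asserts.
func consistentlyEquals(get func() int, want int, window, interval time.Duration) bool {
	deadline := time.Now().Add(window)
	for time.Now().Before(deadline) {
		if get() != want {
			return false
		}
		time.Sleep(interval)
	}
	return true
}

func main() {
	calls := 0
	getRunnerCount := func() int {
		calls++
		if calls >= 3 { // pretend the controller caught up after a few polls
			return 1
		}
		return 0
	}

	fmt.Println("eventually 1:", eventuallyEquals(getRunnerCount, 1, time.Second, 10*time.Millisecond))
	fmt.Println("consistently 1:", consistentlyEquals(getRunnerCount, 1, 100*time.Millisecond, 10*time.Millisecond))
}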
@@ -105,13 +105,6 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
    return ctrl.Result{}, err
  }

-  desiredTemplateHash, ok := getStatefulSetTemplateHash(desiredStatefulSet)
-  if !ok {
-    log.Info("Failed to get template hash of desired statefulset. It must be in an invalid state. Please manually delete the statefulset so that it is recreated")

-    return ctrl.Result{}, nil
-  }

  addedReplicas := int32(1)
  create := desiredStatefulSet.DeepCopy()
  create.Spec.Replicas = &addedReplicas

@@ -136,7 +129,7 @@ func (r *RunnerSetReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
    owners = append(owners, &ss)
  }

-  res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, newDesiredReplicas, desiredTemplateHash, create, ephemeral, owners)
+  res, err := syncRunnerPodsOwners(ctx, r.Client, log, effectiveTime, newDesiredReplicas, func() client.Object { return create.DeepCopy() }, ephemeral, owners)
  if err != nil || res == nil {
    return ctrl.Result{}, err
  }

@@ -192,17 +185,12 @@ func (r *RunnerSetReconciler) newStatefulSet(runnerSet *v1alpha1.RunnerSet) (*ap
    runnerSetWithOverrides.Labels = append(runnerSetWithOverrides.Labels, l)
  }

-  // This label selector is used by default when rd.Spec.Selector is empty.
-  runnerSetWithOverrides.Template.ObjectMeta.Labels = CloneAndAddLabel(runnerSetWithOverrides.Template.ObjectMeta.Labels, LabelKeyRunnerSetName, runnerSet.Name)

-  runnerSetWithOverrides.Template.ObjectMeta.Labels = CloneAndAddLabel(runnerSetWithOverrides.Template.ObjectMeta.Labels, LabelKeyPodMutation, LabelValuePodMutation)

  template := corev1.Pod{
    ObjectMeta: runnerSetWithOverrides.StatefulSetSpec.Template.ObjectMeta,
    Spec:       runnerSetWithOverrides.StatefulSetSpec.Template.Spec,
  }

-  pod, err := newRunnerPod(template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL, false)
+  pod, err := newRunnerPod(runnerSet.Name, template, runnerSet.Spec.RunnerConfig, r.RunnerImage, r.RunnerImagePullSecrets, r.DockerImage, r.DockerRegistryMirror, r.GitHubBaseURL, false)
  if err != nil {
    return nil, err
  }

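A notable detail in both updated call sites is that `syncRunnerPodsOwners` now receives a factory closure, `func() client.Object { return create.DeepCopy() }` (or `desired.DeepCopy()` in the RunnerReplicaSet controller), instead of a single pre-built object. A plausible reading, not spelled out in the diff, is that the shared sync logic may need more than one owner object, and getting a fresh deep copy per call avoids mutating one shared template. A minimal Go sketch of that pattern, with a toy type and DeepCopy standing in for the real Kubernetes objects:

package main

import "fmt"

// object is a toy stand-in for client.Object; only what the sketch needs.
type object struct {
	Name   string
	Labels map[string]string
}

// DeepCopy mimics the generated DeepCopy of Kubernetes API types: the copy
// shares no maps with the original, so callers can mutate it freely.
func (o *object) DeepCopy() *object {
	labels := make(map[string]string, len(o.Labels))
	for k, v := range o.Labels {
		labels[k] = v
	}
	return &object{Name: o.Name, Labels: labels}
}

// createOwners stands in for the shared sync logic: it may need several owner
// objects, so it asks the factory for a fresh one each time.
func createOwners(n int, newObject func() *object) []*object {
	owners := make([]*object, 0, n)
	for i := 0; i < n; i++ {
		o := newObject()
		o.Labels["index"] = fmt.Sprint(i) // safe: each call got its own copy
		owners = append(owners, o)
	}
	return owners
}

func main() {
	template := &object{Name: "runnerset", Labels: map[string]string{"app": "runner"}}

	// Passing a factory, as the refactored controllers do, means the template
	// itself is never handed out and never mutated by the sync logic.
	owners := createOwners(2, func() *object { return template.DeepCopy() })

	fmt.Println("template labels:", template.Labels)
	for _, o := range owners {
		fmt.Println("owner labels:", o.Labels)
	}
}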
@@ -265,6 +265,8 @@ func (e *env) installActionsRunnerController(t *testing.T) {

  if e.useRunnerSet {
    scriptEnv = append(scriptEnv, "USE_RUNNERSET=1")
+  } else {
+    scriptEnv = append(scriptEnv, "USE_RUNNERSET=false")
  }

  varEnv := []string{