RunnerSet: Automatic recovery from registration timeout and deregistration on pod termination (#652)
Ref #629 Ref #613 Ref #612
parent 98da4c2adb
commit acb906164b
@@ -16,6 +16,8 @@ spec:
   # RunnerSet doesn't support scale from/to zero yet
   minReplicas: 1
   maxReplicas: 5
+  # This should be less than 600 (seconds, the default) for faster testing
+  scaleDownDelaySecondsAfterScaleOut: 60
   metrics:
   - type: PercentageRunnersBusy
     scaleUpThreshold: '0.75'
@@ -14,6 +14,10 @@ spec:
   serviceName: example-runnerset
 
   #replicas: 1
 
+  # From my limited testing, `ephemeral: true` is more reliable.
+  # Sometimes, updating already deployed runners from `ephemeral: false` to `ephemeral: true` seems to
+  # result in queued jobs hanging forever.
   ephemeral: false
 
   repository: ${TEST_REPO}
@@ -17,6 +17,10 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 )
 
+const (
+	AnnotationKeyTokenExpirationDate = "actions-runner-controller/token-expires-at"
+)
+
 // +kubebuilder:webhook:path=/mutate-runner-set-pod,mutating=true,failurePolicy=ignore,groups="",resources=pods,verbs=create,versions=v1,name=mutate-runner-pod.webhook.actions.summerwind.dev,sideEffects=None,admissionReviewVersions=v1beta1
 
 type PodRunnerTokenInjector struct {
@@ -72,7 +76,7 @@ func (t *PodRunnerTokenInjector) Handle(ctx context.Context, req admission.Reque
 	updated := mutatePod(&pod, *rt.Token)
 
-	updated.Annotations["actions-runner-controller/token-expires-at"] = ts
+	updated.Annotations[AnnotationKeyTokenExpirationDate] = ts
 
 	if pod.Spec.RestartPolicy != corev1.RestartPolicyOnFailure {
 		updated.Spec.RestartPolicy = corev1.RestartPolicyOnFailure
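The annotation written above records when the injected registration token expires. As a side note, a consumer could read it back to decide whether the token needs refreshing; the helper below is only an illustrative sketch (the `tokenExpired` name is made up here, and the RFC 3339 format is an assumption based on how the controller formats other timestamps), assumed to live in the same `controllers` package as the constant above:

package controllers

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
)

// tokenExpired is a hypothetical helper (not part of this commit): it reports
// whether the registration token recorded on the pod has already expired,
// assuming the annotation value is an RFC 3339 timestamp.
func tokenExpired(pod *corev1.Pod, now time.Time) (bool, error) {
	ts, ok := pod.Annotations[AnnotationKeyTokenExpirationDate]
	if !ok {
		// No annotation recorded; we cannot tell, so report "not expired".
		return false, nil
	}
	expiresAt, err := time.Parse(time.RFC3339, ts)
	if err != nil {
		return false, fmt.Errorf("parsing %s: %w", AnnotationKeyTokenExpirationDate, err)
	}
	return now.After(expiresAt), nil
}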
@@ -51,6 +51,10 @@ const (
 	// This is an annotation internal to actions-runner-controller and can change in backward-incompatible ways
 	annotationKeyRegistrationOnly = "actions-runner-controller/registration-only"
 
+	EnvVarOrg        = "RUNNER_ORG"
+	EnvVarRepo       = "RUNNER_REPO"
+	EnvVarEnterprise = "RUNNER_ENTERPRISE"
 )
 
 // RunnerReconciler reconciles a Runner object
@@ -89,7 +93,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 	}
 
 	if runner.ObjectMeta.DeletionTimestamp.IsZero() {
-		finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers)
+		finalizers, added := addFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
 
 		if added {
 			newRunner := runner.DeepCopy()
@@ -103,7 +107,7 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 			return ctrl.Result{}, nil
 		}
 	} else {
-		finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers)
+		finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
 
 		if removed {
 			if len(runner.Status.Registration.Token) > 0 {
@@ -741,15 +745,15 @@ func newRunnerPod(template corev1.Pod, runnerSpec v1alpha1.RunnerConfig, default
 
 	env := []corev1.EnvVar{
 		{
-			Name:  "RUNNER_ORG",
+			Name:  EnvVarOrg,
 			Value: runnerSpec.Organization,
 		},
 		{
-			Name:  "RUNNER_REPO",
+			Name:  EnvVarRepo,
 			Value: runnerSpec.Repository,
 		},
 		{
-			Name:  "RUNNER_ENTERPRISE",
+			Name:  EnvVarEnterprise,
 			Value: runnerSpec.Enterprise,
 		},
 		{
@@ -1029,7 +1033,7 @@ func (r *RunnerReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		Complete(r)
 }
 
-func addFinalizer(finalizers []string) ([]string, bool) {
+func addFinalizer(finalizers []string, finalizerName string) ([]string, bool) {
 	exists := false
 	for _, name := range finalizers {
 		if name == finalizerName {
@@ -1044,7 +1048,7 @@ func addFinalizer(finalizers []string) ([]string, bool) {
 	return append(finalizers, finalizerName), true
 }
 
-func removeFinalizer(finalizers []string) ([]string, bool) {
+func removeFinalizer(finalizers []string, finalizerName string) ([]string, bool) {
 	removed := false
 	result := []string{}
 
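The two hunks above show only the new signatures: the helpers now take the finalizer name as a parameter so they can be reused by the new runner-pod controller. A minimal sketch of what the generalized helpers plausibly look like, reconstructed from the fragments visible here rather than copied from the repository, is:

package controllers

// addFinalizer returns the finalizer list with finalizerName appended,
// plus true when it actually had to add it.
func addFinalizer(finalizers []string, finalizerName string) ([]string, bool) {
	exists := false
	for _, name := range finalizers {
		if name == finalizerName {
			exists = true
		}
	}
	if exists {
		return finalizers, false
	}
	return append(finalizers, finalizerName), true
}

// removeFinalizer returns the finalizer list without finalizerName,
// plus true when it actually removed an entry.
func removeFinalizer(finalizers []string, finalizerName string) ([]string, bool) {
	removed := false
	result := []string{}
	for _, name := range finalizers {
		if name == finalizerName {
			removed = true
			continue
		}
		result = append(result, name)
	}
	return result, removed
}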
@@ -0,0 +1,431 @@
+/*
+Copyright 2020 The actions-runner-controller authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controllers
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	gogithub "github.com/google/go-github/v33/github"
+	"k8s.io/apimachinery/pkg/util/wait"
+
+	"github.com/go-logr/logr"
+	kerrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/tools/record"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	corev1 "k8s.io/api/core/v1"
+
+	"github.com/actions-runner-controller/actions-runner-controller/github"
+)
+
+// RunnerPodReconciler reconciles a Runner object
+type RunnerPodReconciler struct {
+	client.Client
+	Log                         logr.Logger
+	Recorder                    record.EventRecorder
+	Scheme                      *runtime.Scheme
+	GitHubClient                *github.Client
+	Name                        string
+	RegistrationRecheckInterval time.Duration
+	RegistrationRecheckJitter   time.Duration
+}
+
+const (
+	// This name requires at least one slash to work.
+	// See https://github.com/google/knative-gcp/issues/378
+	runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"
+
+	AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
+)
+
+// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
+// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
+
+func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := r.Log.WithValues("runnerpod", req.NamespacedName)
+
+	var runnerPod corev1.Pod
+	if err := r.Get(ctx, req.NamespacedName, &runnerPod); err != nil {
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	_, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName]
+	if !isRunnerPod {
+		return ctrl.Result{}, nil
+	}
+
+	var enterprise, org, repo string
+
+	envvars := runnerPod.Spec.Containers[0].Env
+	for _, e := range envvars {
+		switch e.Name {
+		case EnvVarEnterprise:
+			enterprise = e.Value
+		case EnvVarOrg:
+			org = e.Value
+		case EnvVarRepo:
+			repo = e.Value
+		}
+	}
+
+	if runnerPod.ObjectMeta.DeletionTimestamp.IsZero() {
+		finalizers, added := addFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
+
+		if added {
+			newRunner := runnerPod.DeepCopy()
+			newRunner.ObjectMeta.Finalizers = finalizers
+
+			if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
+				log.Error(err, "Failed to update runner")
+				return ctrl.Result{}, err
+			}
+
+			return ctrl.Result{}, nil
+		}
+	} else {
+		finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
+
+		if removed {
+			ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
+			if err != nil {
+				if errors.Is(err, &gogithub.RateLimitError{}) {
+					// We log the underlying error when we fail to call the GitHub API to list or unregister runners,
+					// or when the runner is still busy.
+					log.Error(
+						err,
+						fmt.Sprintf(
+							"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
+							retryDelayOnGitHubAPIRateLimitError,
+						),
+					)
+
+					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
+				}
+
+				return ctrl.Result{}, err
+			}
+
+			if !ok {
+				log.V(1).Info("Runner no longer exists on GitHub")
+			}
+
+			newRunner := runnerPod.DeepCopy()
+			newRunner.ObjectMeta.Finalizers = finalizers
+
+			if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
+				log.Error(err, "Failed to update runner for finalizer removal")
+				return ctrl.Result{}, err
+			}
+
+			log.Info("Removed runner from GitHub", "repository", repo, "organization", org)
+		}
+
+		deletionTimeout := 1 * time.Minute
+		currentTime := time.Now()
+		deletionDidTimeout := currentTime.Sub(runnerPod.DeletionTimestamp.Add(deletionTimeout)) > 0
+
+		if deletionDidTimeout {
+			log.Info(
+				fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
+					"This is typically the case when a Kubernetes node became unreachable "+
+					"and the kube controller started evicting nodes. Forcefully deleting the pod to not get stuck.",
+				"podDeletionTimestamp", runnerPod.DeletionTimestamp,
+				"currentTime", currentTime,
+				"configuredDeletionTimeout", deletionTimeout,
+			)
+
+			var force int64 = 0
+			// forcefully delete runner as we would otherwise get stuck if the node stays unreachable
+			if err := r.Delete(ctx, &runnerPod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
+				// probably
+				if !kerrors.IsNotFound(err) {
+					log.Error(err, "Failed to forcefully delete pod resource ...")
+					return ctrl.Result{}, err
+				}
+				// forceful deletion finally succeeded
+				return ctrl.Result{Requeue: true}, nil
+			}
+
+			r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", runnerPod.Name))
+			log.Info("Forcefully deleted runner pod", "repository", repo)
+			// give kube manager a little time to forcefully delete the stuck pod
+			return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
+		}
+
+		return ctrl.Result{}, nil
+	}
+
+	// If the pod has ended up succeeded we need to restart it.
+	// Happens e.g. when dind is in the runner and the run completes.
+	stopped := runnerPod.Status.Phase == corev1.PodSucceeded
+
+	if !stopped {
+		if runnerPod.Status.Phase == corev1.PodRunning {
+			for _, status := range runnerPod.Status.ContainerStatuses {
+				if status.Name != containerName {
+					continue
+				}
+
+				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
+					stopped = true
+				}
+			}
+		}
+	}
+
+	restart := stopped
+
+	var registrationRecheckDelay time.Duration
+
+	// All checks done below only decide whether a restart is needed.
+	// If a restart was already decided before, there is no need for the checks,
+	// saving API calls and scary log messages.
+	if !restart {
+		registrationCheckInterval := time.Minute
+		if r.RegistrationRecheckInterval > 0 {
+			registrationCheckInterval = r.RegistrationRecheckInterval
+		}
+
+		lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]
+
+		var lastCheckTime *time.Time
+
+		if lastCheckTimeStr != "" {
+			t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
+			if err != nil {
+				log.Error(err, "failed to parse last check time %q", lastCheckTimeStr)
+				return ctrl.Result{}, nil
+			}
+
+			lastCheckTime = &t
+		}
+
+		// We want to call the ListRunners GitHub Actions API only once per runner per minute.
+		// This if block, in conjunction with:
+		//   return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
+		// achieves that.
+		if lastCheckTime != nil {
+			nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
+			now := time.Now()
+
+			// Requeue scheduled by RequeueAfter can happen a bit earlier (like dozens of milliseconds),
+			// so to avoid excessive, ineffective retries, we heuristically ignore the remaining delay in case it is
+			// shorter than 1s.
+			requeueAfter := nextCheckTime.Sub(now) - time.Second
+			if requeueAfter > 0 {
+				log.Info(
+					fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
+					"lastRegistrationCheckTime", lastCheckTime,
+					"registrationCheckInterval", registrationCheckInterval,
+				)
+
+				// Without RequeueAfter, the controller may not retry as scheduled. Instead, it would have to wait until the
+				// next sync period passes, which can be much later than nextCheckTime.
+				//
+				// We need to requeue on this reconciliation even though we have already scheduled the initial
+				// requeue previously with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
+				// Apparently, the workqueue used by controller-runtime deduplicates and resets the delay on
+				// other requeues, so the initially scheduled requeue may have been reset due to a requeue on
+				// a spec/status change.
+				return ctrl.Result{RequeueAfter: requeueAfter}, nil
+			}
+		}
+
+		notFound := false
+		offline := false
+
+		_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)
+
+		currentTime := time.Now()
+
+		if err != nil {
+			var notFoundException *github.RunnerNotFound
+			var offlineException *github.RunnerOffline
+			if errors.As(err, &notFoundException) {
+				notFound = true
+			} else if errors.As(err, &offlineException) {
+				offline = true
+			} else {
+				var e *gogithub.RateLimitError
+				if errors.As(err, &e) {
+					// We log the underlying error when we fail to call the GitHub API to list or unregister runners,
+					// or when the runner is still busy.
+					log.Error(
+						err,
+						fmt.Sprintf(
+							"Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls",
+							retryDelayOnGitHubAPIRateLimitError,
+						),
+					)
+
+					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
+				}
+
+				return ctrl.Result{}, err
+			}
+		}
+
+		registrationTimeout := 10 * time.Minute
+		durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
+		registrationDidTimeout := durationAfterRegistrationTimeout > 0
+
+		if notFound {
+			if registrationDidTimeout {
+				log.Info(
+					"Runner failed to register itself to GitHub in a timely manner. "+
+						"Recreating the pod to see if it resolves the issue. "+
+						"CAUTION: If you see this a lot, you should investigate the root cause. "+
+						"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
+					"podCreationTimestamp", runnerPod.CreationTimestamp,
+					"currentTime", currentTime,
+					"configuredRegistrationTimeout", registrationTimeout,
+				)
+
+				restart = true
+			} else {
+				log.V(1).Info(
+					"Runner pod exists but we failed to check if runner is busy. Apparently it still needs more time.",
+					"runnerName", runnerPod.Name,
+				)
+			}
+		} else if offline {
+			if registrationDidTimeout {
+				log.Info(
+					"Already existing GitHub runner still appears offline. "+
+						"Recreating the pod to see if it resolves the issue. "+
+						"CAUTION: If you see this a lot, you should investigate the root cause.",
+					"podCreationTimestamp", runnerPod.CreationTimestamp,
+					"currentTime", currentTime,
+					"configuredRegistrationTimeout", registrationTimeout,
+				)
+
+				restart = true
+			} else {
+				log.V(1).Info(
+					"Runner pod exists but the GitHub runner appears to be still offline. Waiting for runner to get online ...",
+					"runnerName", runnerPod.Name,
+				)
+			}
+		}
+
+		if (notFound || offline) && !registrationDidTimeout {
+			registrationRecheckJitter := 10 * time.Second
+			if r.RegistrationRecheckJitter > 0 {
+				registrationRecheckJitter = r.RegistrationRecheckJitter
+			}
+
+			registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
+		}
+	}
+
+	// Don't do anything if there's no need to restart the runner
+	if !restart {
+		// This guard enables us to update runner.Status.Phase to `Running` only after
+		// the runner is registered to GitHub.
+		if registrationRecheckDelay > 0 {
+			log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))
+
+			updated := runnerPod.DeepCopy()
+			t := time.Now().Format(time.RFC3339)
+			updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t
+
+			if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
+				log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
+				return ctrl.Result{}, err
+			}
+
+			return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
+		}
+
+		// Seeing this message, you can expect the runner to become `Running` soon.
+		log.Info(
+			"Runner appears to have registered and is running.",
+			"podCreationTimestamp", runnerPod.CreationTimestamp,
+		)
+
+		return ctrl.Result{}, nil
+	}
+
+	// Delete the current pod if recreation is needed
+	if err := r.Delete(ctx, &runnerPod); err != nil {
+		log.Error(err, "Failed to delete pod resource")
+		return ctrl.Result{}, err
+	}
+
+	r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
+	log.Info("Deleted runner pod", "name", runnerPod.Name)
+
+	return ctrl.Result{}, nil
+}
+
+func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
+	runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
+	if err != nil {
+		return false, err
+	}
+
+	var busy bool
+
+	id := int64(0)
+	for _, runner := range runners {
+		if runner.GetName() == name {
+			// Sometimes a runner can be stuck "busy" even though it is already "offline".
+			// Thus, removing the condition on status here could block the runner pod from being terminated forever.
+			busy = runner.GetBusy()
+			if runner.GetStatus() != "offline" && busy {
+				r.Log.Info("This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy", "name", runner.GetName(), "status", runner.GetStatus(), "busy", runner.GetBusy())
+				return false, fmt.Errorf("runner is busy")
+			}
+			id = runner.GetID()
+			break
+		}
+	}
+
+	if id == int64(0) {
+		return false, nil
+	}
+
+	// Sometimes a runner can be stuck "busy" even though it is already "offline".
+	// Trying to remove the offline but busy runner can result in errors like the following:
+	//   failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
+	if !busy {
+		if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
+			return false, err
+		}
+	}
+
+	return true, nil
+}
+
+func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	name := "runnerpod-controller"
+	if r.Name != "" {
+		name = r.Name
+	}
+
+	r.Recorder = mgr.GetEventRecorderFor(name)
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&corev1.Pod{}).
+		Named(name).
+		Complete(r)
+}
main.go (12 additions)
@@ -215,6 +215,18 @@ func main() {
 		CacheDuration: gitHubAPICacheDuration,
 	}
 
+	runnerPodReconciler := &controllers.RunnerPodReconciler{
+		Client:       mgr.GetClient(),
+		Log:          log.WithName("runnerpod"),
+		Scheme:       mgr.GetScheme(),
+		GitHubClient: ghClient,
+	}
+
+	if err = runnerPodReconciler.SetupWithManager(mgr); err != nil {
+		log.Error(err, "unable to create controller", "controller", "RunnerPod")
+		os.Exit(1)
+	}
+
 	if err = horizontalRunnerAutoscaler.SetupWithManager(mgr); err != nil {
 		log.Error(err, "unable to create controller", "controller", "HorizontalRunnerAutoscaler")
 		os.Exit(1)
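The wiring above leaves RegistrationRecheckInterval and RegistrationRecheckJitter unset, so the new reconciler falls back to its built-in defaults (one minute between registration checks and ten seconds of jitter, per the controller code). A hypothetical variation for faster rechecks, e.g. in a test cluster, could set the fields explicitly; the values below are illustrative only and assume `time` is imported in main.go:

	// Hypothetical variation of the block above; the interval and jitter values
	// are illustrative and not taken from the commit.
	runnerPodReconciler := &controllers.RunnerPodReconciler{
		Client:                      mgr.GetClient(),
		Log:                         log.WithName("runnerpod"),
		Scheme:                      mgr.GetScheme(),
		GitHubClient:                ghClient,
		RegistrationRecheckInterval: 30 * time.Second,
		RegistrationRecheckJitter:   5 * time.Second,
	}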