/*
Copyright 2020 The actions-runner-controller authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
	"context"
	"errors"
	"fmt"
	"time"

	gogithub "github.com/google/go-github/v39/github"
	"k8s.io/apimachinery/pkg/util/wait"

	"github.com/go-logr/logr"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	corev1 "k8s.io/api/core/v1"

	"github.com/actions-runner-controller/actions-runner-controller/github"
)

// RunnerPodReconciler reconciles a Runner object
type RunnerPodReconciler struct {
	client.Client
	Log                         logr.Logger
	Recorder                    record.EventRecorder
	Scheme                      *runtime.Scheme
	GitHubClient                *github.Client
	Name                        string
	RegistrationRecheckInterval time.Duration
	RegistrationRecheckJitter   time.Duration
}

const (
	// This name requires at least one slash to work.
	// See https://github.com/google/knative-gcp/issues/378
	runnerPodFinalizerName = "actions.summerwind.dev/runner-pod"

	AnnotationKeyLastRegistrationCheckTime = "actions-runner-controller/last-registration-check-time"
)

// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch

func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := r.Log.WithValues("runnerpod", req.NamespacedName)

	var runnerPod corev1.Pod
	if err := r.Get(ctx, req.NamespacedName, &runnerPod); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	_, isRunnerPod := runnerPod.Labels[LabelKeyRunnerSetName]
	if !isRunnerPod {
		return ctrl.Result{}, nil
	}

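	// The runner's GitHub scope is derived from the environment variables set on the pod's
	// first container (EnvVarEnterprise, EnvVarOrg, and EnvVarRepo are defined elsewhere in this package).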
	var enterprise, org, repo string

	envvars := runnerPod.Spec.Containers[0].Env
	for _, e := range envvars {
		switch e.Name {
		case EnvVarEnterprise:
			enterprise = e.Value
		case EnvVarOrg:
			org = e.Value
		case EnvVarRepo:
			repo = e.Value
		}
	}

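	// While the pod is alive, make sure our finalizer is present so that we get a chance to
	// unregister the runner from GitHub before the pod actually goes away. Once deletion has
	// been requested, unregister the runner and then drop the finalizer.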
	if runnerPod.ObjectMeta.DeletionTimestamp.IsZero() {
		finalizers, added := addFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)

		if added {
			newRunner := runnerPod.DeepCopy()
			newRunner.ObjectMeta.Finalizers = finalizers

			if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
				log.Error(err, "Failed to update runner")
				return ctrl.Result{}, err
			}

			return ctrl.Result{}, nil
		}
	} else {
		finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)

		if removed {
			ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
			if err != nil {
				if errors.Is(err, &gogithub.RateLimitError{}) {
					// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
					// or when the runner is still busy.
					log.Error(
						err,
						fmt.Sprintf(
							"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
							retryDelayOnGitHubAPIRateLimitError,
						),
					)

					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
				}

				return ctrl.Result{}, err
			}

			if !ok {
				log.V(1).Info("Runner no longer exists on GitHub")
			}

			newRunner := runnerPod.DeepCopy()
			newRunner.ObjectMeta.Finalizers = finalizers

			if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
				log.Error(err, "Failed to update runner for finalizer removal")
				return ctrl.Result{}, err
			}

			log.Info("Removed runner from GitHub", "repository", repo, "organization", org)
		}

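		// Whether or not we just dropped our finalizer, check how long the pod has been stuck
		// terminating and force-delete it once the timeout below has passed, so that an
		// unreachable node cannot hold the pod forever.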
		deletionTimeout := 1 * time.Minute
		currentTime := time.Now()
		deletionDidTimeout := currentTime.Sub(runnerPod.DeletionTimestamp.Add(deletionTimeout)) > 0

		if deletionDidTimeout {
			log.Info(
				fmt.Sprintf("Failed to delete pod within %s. ", deletionTimeout)+
					"This is typically the case when a Kubernetes node became unreachable "+
					"and the kube controller started evicting pods. Forcefully deleting the pod to not get stuck.",
				"podDeletionTimestamp", runnerPod.DeletionTimestamp,
				"currentTime", currentTime,
				"configuredDeletionTimeout", deletionTimeout,
			)

			var force int64 = 0
			// forcefully delete the runner pod as we would otherwise get stuck if the node stays unreachable
			if err := r.Delete(ctx, &runnerPod, &client.DeleteOptions{GracePeriodSeconds: &force}); err != nil {
				// A NotFound error probably means the pod is already gone; anything else is a real failure.
				if !kerrors.IsNotFound(err) {
					log.Error(err, "Failed to forcefully delete pod resource ...")
					return ctrl.Result{}, err
				}
				// forceful deletion finally succeeded
				return ctrl.Result{Requeue: true}, nil
			}

			r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Forcefully deleted pod '%s'", runnerPod.Name))
			log.Info("Forcefully deleted runner pod", "repository", repo)
			// give the kube manager a little time to forcefully delete the stuck pod
			return ctrl.Result{RequeueAfter: 3 * time.Second}, nil
		}

		return ctrl.Result{}, nil
	}

	// If the pod has ended up Succeeded, we need to restart it.
	// This happens e.g. when dind runs inside the runner pod and the run completes.
	stopped := runnerPod.Status.Phase == corev1.PodSucceeded

	if !stopped {
		if runnerPod.Status.Phase == corev1.PodRunning {
			for _, status := range runnerPod.Status.ContainerStatuses {
				if status.Name != containerName {
					continue
				}

				if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
					stopped = true
				}
			}
		}
	}

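	// Restarting here simply means deleting the pod; recreating it is left to whatever controller
	// owns the pod (for a RunnerSet this is presumably the backing StatefulSet).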
	restart := stopped

	var registrationRecheckDelay time.Duration

	// All the checks below only decide whether a restart is needed.
	// If a restart was already decided above, there is no need to run them,
	// which saves API calls and scary log messages.
	if !restart {
		registrationCheckInterval := time.Minute
		if r.RegistrationRecheckInterval > 0 {
			registrationCheckInterval = r.RegistrationRecheckInterval
		}

		lastCheckTimeStr := runnerPod.Annotations[AnnotationKeyLastRegistrationCheckTime]

		var lastCheckTime *time.Time

		if lastCheckTimeStr != "" {
			t, err := time.Parse(time.RFC3339, lastCheckTimeStr)
			if err != nil {
				log.Error(err, "failed to parse last registration check time", "lastRegistrationCheckTime", lastCheckTimeStr)
				return ctrl.Result{}, nil
			}

			lastCheckTime = &t
		}

		// We want to call the ListRunners GitHub Actions API only once per runner per minute.
		// This if block, in conjunction with:
		//   return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
		// achieves that.
		if lastCheckTime != nil {
			nextCheckTime := lastCheckTime.Add(registrationCheckInterval)
			now := time.Now()

			// A requeue scheduled via RequeueAfter can fire a bit early (by dozens of milliseconds),
			// so to avoid excessive, ineffective retries we heuristically ignore the remaining delay
			// when it is shorter than 1s.
			requeueAfter := nextCheckTime.Sub(now) - time.Second
			if requeueAfter > 0 {
				log.Info(
					fmt.Sprintf("Skipped registration check because it's deferred until %s. Retrying in %s at latest", nextCheckTime, requeueAfter),
					"lastRegistrationCheckTime", lastCheckTime,
					"registrationCheckInterval", registrationCheckInterval,
				)

				// Without RequeueAfter, the controller may not retry when scheduled. Instead, it would have to
				// wait until the next sync period passes, which can be much later than nextCheckTime.
				//
				// We need to requeue on this reconciliation even though we already scheduled the initial
				// requeue with `return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil`.
				// Apparently, the workqueue used by controller-runtime deduplicates items and resets the delay
				// on subsequent requeues, so the initially scheduled requeue may have been reset by a requeue
				// triggered by a spec/status change.
				return ctrl.Result{RequeueAfter: requeueAfter}, nil
			}
		}

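		// Probe the GitHub API for the runner's registration status. The concrete error type tells
		// us whether the runner has not registered yet (RunnerNotFound) or has registered but is
		// currently offline (RunnerOffline).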
		notFound := false
		offline := false

		_, err := r.GitHubClient.IsRunnerBusy(ctx, enterprise, org, repo, runnerPod.Name)

		currentTime := time.Now()

		if err != nil {
			var notFoundException *github.RunnerNotFound
			var offlineException *github.RunnerOffline
			if errors.As(err, &notFoundException) {
				notFound = true
			} else if errors.As(err, &offlineException) {
				offline = true
			} else {
				var e *gogithub.RateLimitError
				if errors.As(err, &e) {
					// We log the underlying error when we fail calling the GitHub API to list or unregister runners,
					// or when the runner is still busy.
					log.Error(
						err,
						fmt.Sprintf(
							"Failed to check if runner is busy due to GitHub API rate limit. Retrying in %s to avoid excessive GitHub API calls",
							retryDelayOnGitHubAPIRateLimitError,
						),
					)

					return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
				}

				return ctrl.Result{}, err
			}
		}

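		// Give the runner a fixed grace period, measured from pod creation, to register itself
		// with GitHub before we consider recreating the pod.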
		registrationTimeout := 10 * time.Minute
		durationAfterRegistrationTimeout := currentTime.Sub(runnerPod.CreationTimestamp.Add(registrationTimeout))
		registrationDidTimeout := durationAfterRegistrationTimeout > 0

		if notFound {
			if registrationDidTimeout {
				log.Info(
					"Runner failed to register itself to GitHub in a timely manner. "+
						"Recreating the pod to see if it resolves the issue. "+
						"CAUTION: If you see this a lot, you should investigate the root cause. "+
						"See https://github.com/actions-runner-controller/actions-runner-controller/issues/288",
					"podCreationTimestamp", runnerPod.CreationTimestamp,
					"currentTime", currentTime,
					"configuredRegistrationTimeout", registrationTimeout,
				)

				restart = true
			} else {
				log.V(1).Info(
					"Runner pod exists but we failed to check if the runner is busy. Apparently it still needs more time.",
					"runnerName", runnerPod.Name,
				)
			}
		} else if offline {
			if registrationDidTimeout {
				log.Info(
					"Already existing GitHub runner still appears offline. "+
						"Recreating the pod to see if it resolves the issue. "+
						"CAUTION: If you see this a lot, you should investigate the root cause.",
					"podCreationTimestamp", runnerPod.CreationTimestamp,
					"currentTime", currentTime,
					"configuredRegistrationTimeout", registrationTimeout,
				)

				restart = true
			} else {
				log.V(1).Info(
					"Runner pod exists but the GitHub runner still appears to be offline. Waiting for the runner to come online ...",
					"runnerName", runnerPod.Name,
				)
			}
		}

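		// Still within the registration grace period, so schedule another check. The jitter spreads
		// the GitHub API calls of many runner pods over time instead of hitting the API in lockstep.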
		if (notFound || offline) && !registrationDidTimeout {
			registrationRecheckJitter := 10 * time.Second
			if r.RegistrationRecheckJitter > 0 {
				registrationRecheckJitter = r.RegistrationRecheckJitter
			}

			registrationRecheckDelay = registrationCheckInterval + wait.Jitter(registrationRecheckJitter, 0.1)
		}
	}

	// Don't do anything if there's no need to restart the runner.
	if !restart {
		// This guard enables us to update runner.Status.Phase to `Running` only after
		// the runner is registered to GitHub.
		if registrationRecheckDelay > 0 {
			log.V(1).Info(fmt.Sprintf("Rechecking the runner registration in %s", registrationRecheckDelay))

			updated := runnerPod.DeepCopy()
			t := time.Now().Format(time.RFC3339)
			updated.Annotations[AnnotationKeyLastRegistrationCheckTime] = t

			if err := r.Patch(ctx, updated, client.MergeFrom(&runnerPod)); err != nil {
				log.Error(err, "Failed to update runner pod annotation for LastRegistrationCheckTime")
				return ctrl.Result{}, err
			}

			return ctrl.Result{RequeueAfter: registrationRecheckDelay}, nil
		}

		// Seeing this message, you can expect the runner to become `Running` soon.
		log.Info(
			"Runner appears to have registered and is running.",
			"podCreationTimestamp", runnerPod.CreationTimestamp,
		)

		return ctrl.Result{}, nil
	}

	// Delete the current pod if recreation is needed.
	if err := r.Delete(ctx, &runnerPod); err != nil {
		log.Error(err, "Failed to delete pod resource")
		return ctrl.Result{}, err
	}

	r.Recorder.Event(&runnerPod, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", runnerPod.Name))
	log.Info("Deleted runner pod", "name", runnerPod.Name)

	return ctrl.Result{}, nil
}

func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
	runners, err := r.GitHubClient.ListRunners(ctx, enterprise, org, repo)
	if err != nil {
		return false, err
	}

	var busy bool

	id := int64(0)
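	// Look up the runner by name to learn its numeric ID and busy state; the GitHub API identifies
	// runners by ID, while we only know the runner (pod) name.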
	for _, runner := range runners {
		if runner.GetName() == name {
			// Sometimes a runner can get stuck "busy" even though it is already "offline".
			// Dropping the status condition and checking busy alone could therefore block the
			// runner pod from being terminated forever.
			busy = runner.GetBusy()
			if runner.GetStatus() != "offline" && busy {
				r.Log.Info("This runner will delay the runner pod deletion and the runner deregistration until it becomes either offline or non-busy", "name", runner.GetName(), "status", runner.GetStatus(), "busy", runner.GetBusy())
				return false, fmt.Errorf("runner is busy")
			}
			id = runner.GetID()
			break
		}
	}

	if id == int64(0) {
		return false, nil
	}

	// Sometimes a runner can get stuck "busy" even though it is already "offline".
	// Trying to remove such an offline-but-busy runner can result in errors like the following:
	//   failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
	if !busy {
		if err := r.GitHubClient.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
			return false, err
		}
	}

	return true, nil
}

func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
	name := "runnerpod-controller"
	if r.Name != "" {
		name = r.Name
	}

	r.Recorder = mgr.GetEventRecorderFor(name)

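	// The controller watches all Pods; pods that do not carry the runner-set label are filtered
	// out at the top of Reconcile.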
	return ctrl.NewControllerManagedBy(mgr).
		For(&corev1.Pod{}).
		Named(name).
		Complete(r)
}