Merge pull request #1127 from actions-runner-controller/github-api-cache

Enhances ARC(both the controller-manager and github-webhook-server) to cache any GitHub API responses with HTTP GET and an appropriate Cache-Control header.

Ref #920

## Cache Implementation

`gregjones/httpcache` has been chosen as a library to implement this feature, as it is as recommended in `go-github`'s documentation:

https://github.com/google/go-github#conditional-requests

`gregjones/httpcache` supports a number of cache backends like `diskcache`, `s3cache`, and so on:

https://github.com/gregjones/httpcache#cache-backends

We stick to the built-in in-memory cache as a starter. Probably this will never becomes an issue as long as various HTTP responses for all the GitHub API calls that ARC makes, list-runners, list-workflow-jobs, list-runner-groups, etc., doesn't overflow the in-memory cache.

`httpcache` has an known unfixed issue that it doesn't update cache on chunked responses. But we assume that the APIs that we call doesn't use chunked responses. See #1503 for more information on that.

## Ephemeral runner pods are no longer recreated

The addition of the cache layer resulted in a slow down of a scale-down process and a trade-off between making the runner pod termination process fragile to various race conditions(shorter grace period before runner deletion) or delaying runner pod deletion depending on how long the grace period is(longer grace period). A grace period needs to be at least longer than 60s (which is the same as cache duration of ListRunners API) to not prematurely delete a runner pod that was just created.

But once I disabled automatic recreation of ephemeral runner pod, it turned out to be no more of an issue when it's being scaled via workflow_job webhook.

Ephemeral runner resources are still automatically added on demand by RunnerDeployment via RunnerReplicaSet(I've added `EffectiveTime` fields to our CRDs but that's an implementation detail so let's omit). A good side-effect of disabling ephemeral runner pod recreations is that ARC will no longer create redundant ephemeral runners when used with webhook-based autoscaler.

Basically, autoscaling still works as everyone might expect. It's just better than before overall.
This commit is contained in:
Yusuke Kuoka 2022-02-28 08:37:26 +09:00 committed by GitHub
commit 686d40c20d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
32 changed files with 707 additions and 209 deletions

View File

@ -37,7 +37,10 @@ fi
tool=${ACCEPTANCE_TEST_DEPLOYMENT_TOOL}
TEST_ID=${TEST_ID:-default}
if [ "${tool}" == "helm" ]; then
set -v
helm upgrade --install actions-runner-controller \
charts/actions-runner-controller \
-n actions-runner-system \
@ -46,7 +49,10 @@ if [ "${tool}" == "helm" ]; then
--set authSecret.create=false \
--set image.repository=${NAME} \
--set image.tag=${VERSION} \
--set podAnnotations.test-id=${TEST_ID} \
--set githubWebhookServer.podAnnotations.test-id=${TEST_ID} \
-f ${VALUES_FILE}
set +v
# To prevent `CustomResourceDefinition.apiextensions.k8s.io "runners.actions.summerwind.dev" is invalid: metadata.annotations: Too long: must have at most 262144 bytes`
# errors
kubectl create -f charts/actions-runner-controller/crds || kubectl replace -f charts/actions-runner-controller/crds
@ -77,7 +83,7 @@ else
fi
if [ -n "${TEST_ORG}" ]; then
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= NAME=org-runnerdeploy envsubst | kubectl apply -f -
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerdeploy envsubst | kubectl apply -f -
if [ -n "${TEST_ORG_GROUP}" ]; then
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl apply -f -

View File

@ -28,6 +28,7 @@ spec:
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
#dockerdWithinRunnerContainer: true
#image: mumoshu/actions-runner-dind:dev
dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER}
#
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
@ -56,6 +57,7 @@ spec:
scaleUpTriggers:
- githubEvent: {}
amount: 1
duration: "1m"
minReplicas: 0
duration: "10m"
minReplicas: ${RUNNER_MIN_REPLICAS}
maxReplicas: 10
scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT}

View File

@ -1,7 +1,8 @@
# Set actions-runner-controller settings for testing
githubAPICacheDuration: 10s
logLevel: "-3"
githubWebhookServer:
logLevel: debug
logLevel: "-3"
enabled: true
labels: {}
replicaCount: 1

View File

@ -107,6 +107,9 @@ type CapacityReservation struct {
Name string `json:"name,omitempty"`
ExpirationTime metav1.Time `json:"expirationTime,omitempty"`
Replicas int `json:"replicas,omitempty"`
// +optional
EffectiveTime metav1.Time `json:"effectiveTime,omitempty"`
}
type ScaleTargetRef struct {

View File

@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct {
// +nullable
Replicas *int `json:"replicas,omitempty"`
// EffectiveTime is the time the upstream controller requested to sync Replicas.
// It is usually populated by the webhook-based autoscaler via HRA.
// The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
//
// +optional
// +nullable
EffectiveTime *metav1.Time `json:"effectiveTime"`
// +optional
// +nullable
Selector *metav1.LabelSelector `json:"selector"`

View File

@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct {
// +nullable
Replicas *int `json:"replicas,omitempty"`
// EffectiveTime is the time the upstream controller requested to sync Replicas.
// It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment.
// The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners
// based on potentially outdated Replicas value.
//
// +optional
// +nullable
EffectiveTime *metav1.Time `json:"effectiveTime"`
// +optional
// +nullable
Selector *metav1.LabelSelector `json:"selector"`

View File

@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry {
func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
*out = *in
in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
in.EffectiveTime.DeepCopyInto(&out.EffectiveTime)
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) {
*out = new(int)
**out = **in
}
if in.EffectiveTime != nil {
in, out := &in.EffectiveTime, &out.EffectiveTime
*out = (*in).DeepCopy()
}
if in.Selector != nil {
in, out := &in.Selector, &out.Selector
*out = new(metav1.LabelSelector)
@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) {
*out = new(int)
**out = **in
}
if in.EffectiveTime != nil {
in, out := &in.EffectiveTime, &out.EffectiveTime
*out = (*in).DeepCopy()
}
if in.Selector != nil {
in, out := &in.Selector, &out.Selector
*out = new(metav1.LabelSelector)

View File

@ -49,6 +49,9 @@ spec:
items:
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
properties:
effectiveTime:
format: date-time
type: string
expirationTime:
format: date-time
type: string

View File

@ -48,6 +48,11 @@ spec:
spec:
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
properties:
effectiveTime:
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
format: date-time
nullable: true
type: string
replicas:
nullable: true
type: integer

View File

@ -45,6 +45,11 @@ spec:
spec:
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
properties:
effectiveTime:
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value.
format: date-time
nullable: true
type: string
replicas:
nullable: true
type: integer

View File

@ -14,6 +14,7 @@ spec:
metadata:
{{- with .Values.podAnnotations }}
annotations:
kubectl.kubernetes.io/default-logs-container: "manager"
{{- toYaml . | nindent 8 }}
{{- end }}
labels:

View File

@ -15,6 +15,7 @@ spec:
metadata:
{{- with .Values.githubWebhookServer.podAnnotations }}
annotations:
kubectl.kubernetes.io/default-logs-container: "github-webhook-server"
{{- toYaml . | nindent 8 }}
{{- end }}
labels:

View File

@ -29,15 +29,14 @@ import (
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
"github.com/actions-runner-controller/actions-runner-controller/controllers"
"github.com/actions-runner-controller/actions-runner-controller/github"
"github.com/actions-runner-controller/actions-runner-controller/logging"
"github.com/kelseyhightower/envconfig"
zaplib "go.uber.org/zap"
"k8s.io/apimachinery/pkg/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth/exec"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
_ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
// +kubebuilder:scaffold:imports
)
@ -47,11 +46,6 @@ var (
)
const (
logLevelDebug = "debug"
logLevelInfo = "info"
logLevelWarn = "warn"
logLevelError = "error"
webhookSecretTokenEnvName = "GITHUB_WEBHOOK_SECRET_TOKEN"
)
@ -97,7 +91,7 @@ func main() {
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
"Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change")
flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
flag.StringVar(&webhookSecretToken, "github-webhook-secret-token", "", "The personal access token of GitHub.")
flag.StringVar(&c.Token, "github-token", c.Token, "The personal access token of GitHub.")
flag.Int64Var(&c.AppID, "github-app-id", c.AppID, "The application ID of GitHub App.")
@ -126,23 +120,7 @@ func main() {
setupLog.Info("-watch-namespace is %q. Only HorizontalRunnerAutoscalers in %q are watched, cached, and considered as scale targets.")
}
logger := zap.New(func(o *zap.Options) {
switch logLevel {
case logLevelDebug:
o.Development = true
lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2)
o.Level = &lvl
case logLevelInfo:
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
o.Level = &lvl
case logLevelWarn:
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
o.Level = &lvl
case logLevelError:
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
o.Level = &lvl
}
})
logger := logging.NewLogger(logLevel)
ctrl.SetLogger(logger)
@ -152,6 +130,8 @@ func main() {
// That is, all runner groups managed by ARC are assumed to be visible to any repositories,
// which is wrong when you have one or more non-default runner groups in your organization or enterprise.
if len(c.Token) > 0 || (c.AppID > 0 && c.AppInstallationID > 0 && c.AppPrivateKey != "") || (len(c.BasicauthUsername) > 0 && len(c.BasicauthPassword) > 0) {
c.Log = &logger
ghClient, err = c.NewClient()
if err != nil {
fmt.Fprintln(os.Stderr, "Error: Client creation failed.", err)

View File

@ -49,6 +49,9 @@ spec:
items:
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
properties:
effectiveTime:
format: date-time
type: string
expirationTime:
format: date-time
type: string

View File

@ -48,6 +48,11 @@ spec:
spec:
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
properties:
effectiveTime:
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
format: date-time
nullable: true
type: string
replicas:
nullable: true
type: integer

View File

@ -45,6 +45,11 @@ spec:
spec:
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
properties:
effectiveTime:
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value.
format: date-time
nullable: true
type: string
replicas:
nullable: true
type: integer

View File

@ -770,8 +770,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
capacityReservations := getValidCapacityReservations(copy)
if amount > 0 {
now := time.Now()
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
EffectiveTime: metav1.Time{Time: now},
ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
Replicas: amount,
})
} else if amount < 0 {
@ -790,10 +792,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
copy.Spec.CapacityReservations = reservations
}
autoscaler.Log.Info(
before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations)
expired := before - len(capacityReservations)
after := len(copy.Spec.CapacityReservations)
autoscaler.Log.V(1).Info(
"Patching hra for capacityReservations update",
"before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
"after", copy.Spec.CapacityReservations,
"before", before,
"expired", expired,
"amount", amount,
"after", after,
)
if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {

View File

@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral
var effectiveTime *time.Time
for _, r := range hra.Spec.CapacityReservations {
t := r.EffectiveTime
if effectiveTime == nil || effectiveTime.Before(t.Time) {
effectiveTime = &t.Time
}
}
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
if currentDesiredReplicas != newDesiredReplicas {
copy := rd.DeepCopy()
copy.Spec.Replicas = &newDesiredReplicas
if ephemeral && effectiveTime != nil {
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
}
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
}
} else if ephemeral && effectiveTime != nil {
copy := rd.DeepCopy()
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
}

View File

@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
Name: controllerName("runner"),
RegistrationRecheckInterval: time.Millisecond,
RegistrationRecheckJitter: time.Millisecond,
UnregistrationTimeout: 1 * time.Second,
UnregistrationRetryDelay: 1 * time.Second,
}
err = runnerController.SetupWithManager(mgr)
Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller")

View File

@ -72,6 +72,9 @@ type RunnerReconciler struct {
Name string
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
UnregistrationTimeout time.Duration
UnregistrationRetryDelay time.Duration
}
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete
@ -110,8 +113,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, nil
}
} else {
var p *corev1.Pod
{
var pod corev1.Pod
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
if !kerrors.IsNotFound(err) {
log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
return ctrl.Result{Requeue: true}, nil
}
} else {
p = &pod
}
}
// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
return r.processRunnerDeletion(runner, ctx, log)
return r.processRunnerDeletion(runner, ctx, log, p)
}
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
@ -159,20 +177,27 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := pod.Status.Phase == corev1.PodSucceeded
stopped := runnerPodOrContainerIsStopped(&pod)
if !stopped {
if pod.Status.Phase == corev1.PodRunning {
for _, status := range pod.Status.ContainerStatuses {
if status.Name != containerName {
continue
}
ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
stopped = true
}
}
if stopped && ephemeral {
log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
// This is the key to make ephemeral runners to work reliably with webhook-based autoscale.
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
//
// In the next reconcilation loop, this triggers a runner unregistration.
// (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation)
//
// See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
// which handles the unregistrationa the removal of the completed pod, and so on.
if err := r.Delete(ctx, &runner); err != nil {
log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}
return ctrl.Result{Requeue: true}, nil
}
restart := stopped
@ -404,64 +429,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return ctrl.Result{}, nil
}
// Try to delete current pod if recreation is needed
safeToDeletePod := false
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
} else {
// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
if !ok {
log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
}
safeToDeletePod = true
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
if res != nil {
return *res, err
}
if safeToDeletePod {
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
if err := r.Delete(ctx, &pod); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
if err := r.Delete(ctx, updatedPod); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
return ctrl.Result{}, nil
}
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
stopped := pod.Status.Phase == corev1.PodSucceeded
if !stopped {
if pod.Status.Phase == corev1.PodRunning {
for _, status := range pod.Status.ContainerStatuses {
if status.Name != containerName {
continue
}
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
stopped = true
}
}
}
}
return stopped
}
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
if removed {
if len(runner.Status.Registration.Token) > 0 {
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
}
if !ok {
log.V(1).Info("Runner no longer exists on GitHub")
}
} else {
log.V(1).Info("Runner was never registered on GitHub")
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
if res != nil {
return *res, err
}
newRunner := runner.DeepCopy()
@ -478,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
return ctrl.Result{}, nil
}
func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
unregistrationTimeout := DefaultUnregistrationTimeout
if r.UnregistrationTimeout > 0 {
unregistrationTimeout = r.UnregistrationTimeout
}
return unregistrationTimeout
}
func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
retryDelay := DefaultUnregistrationRetryDelay
if r.UnregistrationRetryDelay > 0 {
retryDelay = r.UnregistrationRetryDelay
}
return retryDelay
}
func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
deletionTimeout := 1 * time.Minute
currentTime := time.Now()

View File

@ -0,0 +1,249 @@
package controllers
import (
"context"
"errors"
"fmt"
"time"
"github.com/actions-runner-controller/actions-runner-controller/github"
"github.com/go-logr/logr"
gogithub "github.com/google/go-github/v39/github"
corev1 "k8s.io/api/core/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
unregistrationStartTimestamp = "unregistration-start-timestamp"
// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
// and RemoveRunner API (to actually unregister the runner) calls.
// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
// up to 60 seconds (or even more depending on the situation).
DefaultUnregistrationTimeout = 60 * time.Second
// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
DefaultUnregistrationRetryDelay = 30 * time.Second
)
// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
// we can delete the runner pod without disrupting a workflow job.
//
// This function returns a non-nil pointer to corev1.Pod as the first return value
// if the runner is considered to have gracefully stopped, hence it's pod is safe for deletion.
//
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
// This function is designed to complete a length graceful stop process in a unblocking way.
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value.
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry.
func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
if pod != nil {
if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
updated := pod.DeepCopy()
setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
return nil, &ctrl.Result{}, err
}
pod = updated
log.Info("Runner has started unregistration")
} else {
log.Info("Runner has already started unregistration")
}
}
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
return nil, res, err
}
if pod != nil {
if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
updated := pod.DeepCopy()
setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
return nil, &ctrl.Result{}, err
}
pod = updated
log.Info("Runner has completed unregistration")
} else {
log.Info("Runner has already completed unregistration")
}
}
return pod, nil, nil
}
// If the first return value is nil, it's safe to delete the runner pod.
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
log.Error(err, "Failed to unregister runner before deleting the pod.")
return &ctrl.Result{}, err
} else if ok {
log.Info("Runner has just been unregistered. Removing the runner pod.")
} else if pod == nil {
// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
// However, that doesn't always mean the pod can be safely removed.
//
// If the pod does not exist for the runner,
// it may be due to that the runner pod has never been created.
// In that case we can safely assume that the runner will never be registered.
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
// If it's already unregistered in the previous reconcilation loop,
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
log.Info("Runner pod is marked as already unregistered.")
} else if runnerPodOrContainerIsStopped(pod) {
// If it's an ephemeral runner with the actions/runner container exited with 0,
// we can safely assume that it has unregistered itself from GitHub Actions
// so it's natural that RemoveRunner fails due to 404.
// If pod has ended up succeeded we need to restart it
// Happens e.g. when dind is in runner and run completes
log.Info("Runner pod has been stopped with a successful status.")
} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
t, err := time.Parse(time.RFC3339, ts)
if err != nil {
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
return &ctrl.Result{RequeueAfter: retryDelay}, err
}
log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
} else {
// A runner and a runner pod that is created by this version of ARC should match
// any of the above branches.
//
// But we leave this match all branch for potential backward-compatibility.
// The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process,
// and retry later.
log.V(1).Info("Runner unregistration is being retried later.")
return &ctrl.Result{RequeueAfter: retryDelay}, nil
}
return nil, nil
}
func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
if pod.Annotations == nil {
return "", false
}
v, ok := pod.Annotations[key]
return v, ok
}
func setAnnotation(pod *corev1.Pod, key, value string) {
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
pod.Annotations[key] = value
}
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
//
// Case 1. (true, nil) when it has successfully unregistered the runner.
// Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered(hence we couldn't see it's existence from GitHub Actions API yet)
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by GitHub API.
//
// When the returned values is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
// In other words, all those three sub-cases cannot be distinguished by this function alone.
//
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run).
// You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere,
// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
//
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever.
// There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side.
//
// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up.
// If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
//
// Once successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
// as long as you recreate the runner pod.
//
// If it was "2-3.", you need a workaround to avoid the race condition.
//
// You shall introduce a "grace period" mechanism, similar or equal to that is required for "Case 2-2.", so that you ever
// start the runner pod deletion only after it's more and more likely that the runner pod is not coming up.
//
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
// There isn't a single right grace period that works for everyone.
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
// while the shorter the grace period is, the more likely you may encounter the race issue.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
// The reason ARC did so was to let a runner running a job to not stop prematurely.
//
// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
// so ARC doesn't need to do anything special for a graceful runner stop.
// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
// the runner pod is safe for deletion.
//
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// # NOTES
//
// - It can be "status=offline" at the same time but that's another story.
// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
// determine if the runner is busy can be more outdated than before, as those responeses are now cached for 60 seconds.
// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
// change from 60 seconds.
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}

View File

@ -47,6 +47,9 @@ type RunnerPodReconciler struct {
Name string
RegistrationRecheckInterval time.Duration
RegistrationRecheckJitter time.Duration
UnregistrationTimeout time.Duration
UnregistrationRetryDelay time.Duration
}
const (
@ -105,33 +108,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
if removed {
ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
if err != nil {
if errors.Is(err, &gogithub.RateLimitError{}) {
// We log the underlying error when we failed calling GitHub API to list or unregisters,
// or the runner is still busy.
log.Error(
err,
fmt.Sprintf(
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
retryDelayOnGitHubAPIRateLimitError,
),
)
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
}
return ctrl.Result{}, err
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
if !ok {
log.V(1).Info("Runner no longer exists on GitHub")
}
patchedPod := updatedPod.DeepCopy()
patchedPod.ObjectMeta.Finalizers = finalizers
newRunner := runnerPod.DeepCopy()
newRunner.ObjectMeta.Finalizers = finalizers
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
// We commit the removal of the finalizer so that Kuberenetes notices it and delete the pod resource from the cluster.
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
log.Error(err, "Failed to update runner for finalizer removal")
return ctrl.Result{}, err
}
@ -365,8 +351,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
if res != nil {
return *res, err
}
// Delete current pod if recreation is needed
if err := r.Delete(ctx, &runnerPod); err != nil {
if err := r.Delete(ctx, updated); err != nil {
log.Error(err, "Failed to delete pod resource")
return ctrl.Result{}, err
}
@ -377,8 +368,22 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
return ctrl.Result{}, nil
}
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
unregistrationTimeout := DefaultUnregistrationTimeout
if r.UnregistrationTimeout > 0 {
unregistrationTimeout = r.UnregistrationTimeout
}
return unregistrationTimeout
}
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
retryDelay := DefaultUnregistrationRetryDelay
if r.UnregistrationRetryDelay > 0 {
retryDelay = r.UnregistrationRetryDelay
}
return retryDelay
}
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {

View File

@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
if currentDesiredReplicas != newDesiredReplicas {
newestSet.Spec.Replicas = &newDesiredReplicas
newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime
if err := r.Client.Update(ctx, newestSet); err != nil {
log.Error(err, "Failed to update runnerreplicaset resource")
@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str
Labels: newRSTemplate.ObjectMeta.Labels,
},
Spec: v1alpha1.RunnerReplicaSetSpec{
Replicas: rd.Spec.Replicas,
Selector: newRSSelector,
Template: newRSTemplate,
Replicas: rd.Spec.Replicas,
Selector: newRSSelector,
Template: newRSTemplate,
EffectiveTime: rd.Spec.EffectiveTime,
},
}

View File

@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct {
Name string
}
const (
SyncTimeAnnotationKey = "sync-time"
)
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}
var myRunners []v1alpha1.Runner
var (
current int
ready int
available int
lastSyncTime *time.Time
)
for _, r := range allRunners.Items {
// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
// to not treat all the runners in the namespace as its children.
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
myRunners = append(myRunners, r)
// If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
// or by runnerreplicaset controller (in case it was deleted in the previous reconcilation loop),
// we don't need to bother calling GitHub API to re-mark the runner for deletion.
// Just hold on, and runners will disappear as long as the runner controller is up and running.
if !r.DeletionTimestamp.IsZero() {
continue
}
if r.Annotations != nil {
if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
t, err := time.Parse(time.RFC3339, a)
if err == nil {
if lastSyncTime == nil || lastSyncTime.Before(t) {
lastSyncTime = &t
}
}
}
}
current += 1
@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
}
if current > desired {
effectiveTime := rs.Spec.EffectiveTime
ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
} else if current > desired {
// If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
// you're unlikely to fall into this branch.
//
// That's becaseu all the stakeholders work like this:
//
// 1. A runner pod completes with the runner container exiting with code 0
// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
//
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
// marked for deletion by the runner controller.
// This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to
// fall into this branch when you're using ephemeral runners with webhook-based-autoscaler.
n := current - desired
log.V(0).Info(fmt.Sprintf("Deleting %d runners", n), "desired", desired, "current", current, "ready", ready)
@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
objectMeta.Namespace = rs.ObjectMeta.Namespace
if objectMeta.Annotations == nil {
objectMeta.Annotations = map[string]string{}
}
objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339)
runner := v1alpha1.Runner{
TypeMeta: metav1.TypeMeta{},

View File

@ -1,49 +0,0 @@
package controllers
import (
"context"
"fmt"
"github.com/actions-runner-controller/actions-runner-controller/github"
)
// unregisterRunner unregisters the runner from GitHub Actions by name.
//
// This function returns:
// - (true, nil) when it has successfully unregistered the runner.
// - (false, nil) when the runner has been already unregistered.
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
// an error returned by GitHub API.
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
runners, err := client.ListRunners(ctx, enterprise, org, repo)
if err != nil {
return false, err
}
id := int64(0)
for _, runner := range runners {
if runner.GetName() == name {
// Note that sometimes a runner can stuck "busy" even though it is already "offline".
// But we assume that it's not actually offline and still running a job.
if runner.GetBusy() {
return false, fmt.Errorf("runner is busy")
}
id = runner.GetID()
break
}
}
if id == int64(0) {
return false, nil
}
// Trying to remove a busy runner can result in errors like the following:
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
//
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
return false, err
}
return true, nil
}

View File

@ -11,8 +11,11 @@ import (
"time"
"github.com/actions-runner-controller/actions-runner-controller/github/metrics"
"github.com/actions-runner-controller/actions-runner-controller/logging"
"github.com/bradleyfalzon/ghinstallation"
"github.com/go-logr/logr"
"github.com/google/go-github/v39/github"
"github.com/gregjones/httpcache"
"golang.org/x/oauth2"
)
@ -28,6 +31,8 @@ type Config struct {
BasicauthUsername string `split_words:"true"`
BasicauthPassword string `split_words:"true"`
RunnerGitHubURL string `split_words:"true"`
Log *logr.Logger
}
// Client wraps GitHub client with some additional
@ -82,8 +87,11 @@ func (c *Config) NewClient() (*Client, error) {
transport = tr
}
transport = metrics.Transport{Transport: transport}
httpClient := &http.Client{Transport: transport}
cached := httpcache.NewTransport(httpcache.NewMemoryCache())
cached.Transport = transport
loggingTransport := logging.Transport{Transport: cached, Log: c.Log}
metricsTransport := metrics.Transport{Transport: loggingTransport}
httpClient := &http.Client{Transport: metricsTransport}
var client *github.Client
var githubBaseURL string

1
go.mod
View File

@ -41,6 +41,7 @@ require (
github.com/google/gofuzz v1.1.0 // indirect
github.com/google/uuid v1.1.2 // indirect
github.com/googleapis/gnostic v0.5.5 // indirect
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
github.com/imdario/mergo v0.3.12 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect

2
go.sum
View File

@ -259,6 +259,8 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA=
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y=
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=

54
logging/logger.go Normal file
View File

@ -0,0 +1,54 @@
package logging
import (
"fmt"
"os"
"strconv"
"github.com/go-logr/logr"
zaplib "go.uber.org/zap"
"go.uber.org/zap/zapcore"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
)
const (
LogLevelDebug = "debug"
LogLevelInfo = "info"
LogLevelWarn = "warn"
LogLevelError = "error"
)
func NewLogger(logLevel string) logr.Logger {
log := zap.New(func(o *zap.Options) {
switch logLevel {
case LogLevelDebug:
o.Development = true
lvl := zaplib.NewAtomicLevelAt(zaplib.DebugLevel) // maps to logr's V(1)
o.Level = &lvl
case LogLevelInfo:
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
o.Level = &lvl
case LogLevelWarn:
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
o.Level = &lvl
case LogLevelError:
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
o.Level = &lvl
default:
// We use bitsize of 8 as zapcore.Level is a type alias to int8
levelInt, err := strconv.ParseInt(logLevel, 10, 8)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to parse --log-level=%s: %v", logLevel, err)
os.Exit(1)
}
// For example, --log-level=debug a.k.a --log-level=-1 maps to zaplib.DebugLevel, which is associated to logr's V(1)
// --log-level=-2 maps the specific custom log level that is associated to logr's V(2).
level := zapcore.Level(levelInt)
atomicLevel := zaplib.NewAtomicLevelAt(level)
o.Level = &atomicLevel
}
})
return log
}

51
logging/transport.go Normal file
View File

@ -0,0 +1,51 @@
// Package logging provides various logging helpers for ARC
package logging
import (
"net/http"
"github.com/go-logr/logr"
"github.com/gregjones/httpcache"
)
const (
// https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting
headerRateLimitRemaining = "X-RateLimit-Remaining"
)
// Transport wraps a transport with metrics monitoring
type Transport struct {
Transport http.RoundTripper
Log *logr.Logger
}
func (t Transport) RoundTrip(req *http.Request) (*http.Response, error) {
resp, err := t.Transport.RoundTrip(req)
if resp != nil {
t.log(req, resp)
}
return resp, err
}
func (t Transport) log(req *http.Request, resp *http.Response) {
if t.Log == nil {
return
}
var args []interface{}
marked := resp.Header.Get(httpcache.XFromCache) == "1"
args = append(args, "from_cache", marked, "method", req.Method, "url", req.URL.String())
if !marked {
// Do not log outdated rate limit remaining value
remaining := resp.Header.Get(headerRateLimitRemaining)
args = append(args, "ratelimit_remaining", remaining)
}
t.Log.V(3).Info("Seen HTTP response", args...)
}

28
main.go
View File

@ -26,24 +26,18 @@ import (
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
"github.com/actions-runner-controller/actions-runner-controller/controllers"
"github.com/actions-runner-controller/actions-runner-controller/github"
"github.com/actions-runner-controller/actions-runner-controller/logging"
"github.com/kelseyhightower/envconfig"
zaplib "go.uber.org/zap"
"k8s.io/apimachinery/pkg/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
// +kubebuilder:scaffold:imports
)
const (
defaultRunnerImage = "summerwind/actions-runner:latest"
defaultDockerImage = "docker:dind"
logLevelDebug = "debug"
logLevelInfo = "info"
logLevelWarn = "warn"
logLevelError = "error"
)
var (
@ -120,24 +114,12 @@ func main() {
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change. . If you're tweaking this in order to make autoscaling more responsive, you'll probably want to tweak github-api-cache-duration, too")
flag.Var(&commonRunnerLabels, "common-runner-labels", "Runner labels in the K1=V1,K2=V2,... format that are inherited all the runners created by the controller. See https://github.com/actions-runner-controller/actions-runner-controller/issues/321 for more information")
flag.StringVar(&namespace, "watch-namespace", "", "The namespace to watch for custom resources. Set to empty for letting it watch for all namespaces.")
flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
flag.Parse()
logger := zap.New(func(o *zap.Options) {
switch logLevel {
case logLevelDebug:
o.Development = true
case logLevelInfo:
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
o.Level = &lvl
case logLevelWarn:
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
o.Level = &lvl
case logLevelError:
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
o.Level = &lvl
}
})
logger := logging.NewLogger(logLevel)
c.Log = &logger
ghClient, err = c.NewClient()
if err != nil {

View File

@ -17,12 +17,15 @@ var (
controllerImageTag = "e2e"
controllerImage = testing.Img(controllerImageRepo, controllerImageTag)
runnerImageRepo = "actionsrunnercontrollere2e/actions-runner"
runnerDindImageRepo = "actionsrunnercontrollere2e/actions-runner-dind"
runnerImageTag = "e2e"
runnerImage = testing.Img(runnerImageRepo, runnerImageTag)
runnerDindImage = testing.Img(runnerDindImageRepo, runnerImageTag)
prebuildImages = []testing.ContainerImage{
controllerImage,
runnerImage,
runnerDindImage,
}
builds = []testing.DockerBuild{
@ -36,6 +39,11 @@ var (
Args: []testing.BuildArg{},
Image: runnerImage,
},
{
Dockerfile: "../../runner/Dockerfile.dindrunner",
Args: []testing.BuildArg{},
Image: runnerDindImage,
},
}
certManagerVersion = "v1.1.1"
@ -52,7 +60,6 @@ var (
"SYNC_PERIOD=" + "10s",
"NAME=" + controllerImageRepo,
"VERSION=" + controllerImageTag,
"RUNNER_NAME=" + runnerImageRepo,
"RUNNER_TAG=" + runnerImageTag,
}
@ -167,11 +174,15 @@ type env struct {
useRunnerSet bool
testID string
testName string
repoToCommit string
runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
githubTokenWebhook string
testEnterprise string
featureFlagEphemeral bool
scaleDownDelaySecondsAfterScaleOut int64
minReplicas int64
dockerdWithinRunnerContainer bool
testJobs []job
}
@ -184,11 +195,12 @@ func initTestEnv(t *testing.T) *env {
id := e.ID()
testID := t.Name() + " " + id
testName := t.Name() + " " + id
t.Logf("Using test id %s", testID)
t.Logf("Initializing test with name %s", testName)
e.testID = testID
e.testID = id
e.testName = testName
e.runnerLabel = "test-" + id
e.githubToken = testing.Getenv(t, "GITHUB_TOKEN")
e.githubTokenWebhook = testing.Getenv(t, "WEBHOOK_GITHUB_TOKEN")
@ -197,9 +209,17 @@ func initTestEnv(t *testing.T) *env {
e.testOrg = testing.Getenv(t, "TEST_ORG", "")
e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 10)
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL"))
e.featureFlagEphemeral = ephemeral
e.scaleDownDelaySecondsAfterScaleOut, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT", "10"), 10, 32)
e.minReplicas, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_MIN_REPLICAS", "1"), 10, 32)
var err error
e.dockerdWithinRunnerContainer, err = strconv.ParseBool(testing.Getenv(t, "TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER", "false"))
if err != nil {
panic(fmt.Sprintf("unable to parse bool from TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER: %v", err))
}
return e
}
@ -254,7 +274,22 @@ func (e *env) installActionsRunnerController(t *testing.T) {
"GITHUB_TOKEN=" + e.githubToken,
"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
"RUNNER_LABEL=" + e.runnerLabel,
"TEST_ID=" + e.testID,
fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral),
fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
}
if e.dockerdWithinRunnerContainer {
varEnv = append(varEnv,
"RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=true",
"RUNNER_NAME="+runnerDindImageRepo,
)
} else {
varEnv = append(varEnv,
"RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=false",
"RUNNER_NAME="+runnerImageRepo,
)
}
scriptEnv = append(scriptEnv, varEnv...)
@ -273,7 +308,7 @@ func (e *env) createControllerNamespaceAndServiceAccount(t *testing.T) {
func (e *env) installActionsWorkflow(t *testing.T) {
t.Helper()
installActionsWorkflow(t, e.testID, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs)
installActionsWorkflow(t, e.testName, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs)
}
func (e *env) verifyActionsWorkflowRun(t *testing.T) {
@ -302,13 +337,13 @@ func createTestJobs(id, testResultCMNamePrefix string, numJobs int) []job {
const Branch = "main"
func installActionsWorkflow(t *testing.T, testID, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) {
func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
wfName := "E2E " + testID
wfName := "E2E " + testName
wf := testing.Workflow{
Name: wfName,
On: testing.On{