Merge branch 'master' into improve-logs
This commit is contained in:
commit
eff0c7364f
|
|
@ -15,7 +15,7 @@ on:
|
|||
- '!**.md'
|
||||
|
||||
env:
|
||||
RUNNER_VERSION: 2.287.1
|
||||
RUNNER_VERSION: 2.288.0
|
||||
DOCKER_VERSION: 20.10.12
|
||||
DOCKERHUB_USERNAME: summerwind
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,10 @@ fi
|
|||
|
||||
tool=${ACCEPTANCE_TEST_DEPLOYMENT_TOOL}
|
||||
|
||||
TEST_ID=${TEST_ID:-default}
|
||||
|
||||
if [ "${tool}" == "helm" ]; then
|
||||
set -v
|
||||
helm upgrade --install actions-runner-controller \
|
||||
charts/actions-runner-controller \
|
||||
-n actions-runner-system \
|
||||
|
|
@ -46,7 +49,10 @@ if [ "${tool}" == "helm" ]; then
|
|||
--set authSecret.create=false \
|
||||
--set image.repository=${NAME} \
|
||||
--set image.tag=${VERSION} \
|
||||
--set podAnnotations.test-id=${TEST_ID} \
|
||||
--set githubWebhookServer.podAnnotations.test-id=${TEST_ID} \
|
||||
-f ${VALUES_FILE}
|
||||
set +v
|
||||
# To prevent `CustomResourceDefinition.apiextensions.k8s.io "runners.actions.summerwind.dev" is invalid: metadata.annotations: Too long: must have at most 262144 bytes`
|
||||
# errors
|
||||
kubectl create -f charts/actions-runner-controller/crds || kubectl replace -f charts/actions-runner-controller/crds
|
||||
|
|
@ -77,7 +83,7 @@ else
|
|||
fi
|
||||
|
||||
if [ -n "${TEST_ORG}" ]; then
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= NAME=org-runnerdeploy envsubst | kubectl apply -f -
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= RUNNER_MIN_REPLICAS=${ORG_RUNNER_MIN_REPLICAS} NAME=org-runnerdeploy envsubst | kubectl apply -f -
|
||||
|
||||
if [ -n "${TEST_ORG_GROUP}" ]; then
|
||||
cat acceptance/testdata/runnerdeploy.envsubst.yaml | TEST_ENTERPRISE= TEST_REPO= TEST_GROUP=${TEST_ORG_GROUP} NAME=orggroup-runnerdeploy envsubst | kubectl apply -f -
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ spec:
|
|||
## Replace `mumoshu/actions-runner-dind:dev` with your dind image
|
||||
#dockerdWithinRunnerContainer: true
|
||||
#image: mumoshu/actions-runner-dind:dev
|
||||
dockerdWithinRunnerContainer: ${RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER}
|
||||
|
||||
#
|
||||
# Set the MTU used by dockerd-managed network interfaces (including docker-build-ubuntu)
|
||||
|
|
@ -56,6 +57,7 @@ spec:
|
|||
scaleUpTriggers:
|
||||
- githubEvent: {}
|
||||
amount: 1
|
||||
duration: "1m"
|
||||
minReplicas: 0
|
||||
duration: "10m"
|
||||
minReplicas: ${RUNNER_MIN_REPLICAS}
|
||||
maxReplicas: 10
|
||||
scaleDownDelaySecondsAfterScaleOut: ${RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
# Set actions-runner-controller settings for testing
|
||||
githubAPICacheDuration: 10s
|
||||
logLevel: "-3"
|
||||
githubWebhookServer:
|
||||
logLevel: debug
|
||||
logLevel: "-3"
|
||||
enabled: true
|
||||
labels: {}
|
||||
replicaCount: 1
|
||||
|
|
|
|||
|
|
@ -107,6 +107,9 @@ type CapacityReservation struct {
|
|||
Name string `json:"name,omitempty"`
|
||||
ExpirationTime metav1.Time `json:"expirationTime,omitempty"`
|
||||
Replicas int `json:"replicas,omitempty"`
|
||||
|
||||
// +optional
|
||||
EffectiveTime metav1.Time `json:"effectiveTime,omitempty"`
|
||||
}
|
||||
|
||||
type ScaleTargetRef struct {
|
||||
|
|
|
|||
|
|
@ -31,6 +31,14 @@ type RunnerDeploymentSpec struct {
|
|||
// +nullable
|
||||
Replicas *int `json:"replicas,omitempty"`
|
||||
|
||||
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||
// It is usually populated by the webhook-based autoscaler via HRA.
|
||||
// The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||
//
|
||||
// +optional
|
||||
// +nullable
|
||||
EffectiveTime *metav1.Time `json:"effectiveTime"`
|
||||
|
||||
// +optional
|
||||
// +nullable
|
||||
Selector *metav1.LabelSelector `json:"selector"`
|
||||
|
|
|
|||
|
|
@ -26,6 +26,15 @@ type RunnerReplicaSetSpec struct {
|
|||
// +nullable
|
||||
Replicas *int `json:"replicas,omitempty"`
|
||||
|
||||
// EffectiveTime is the time the upstream controller requested to sync Replicas.
|
||||
// It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment.
|
||||
// The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners
|
||||
// based on potentially outdated Replicas value.
|
||||
//
|
||||
// +optional
|
||||
// +nullable
|
||||
EffectiveTime *metav1.Time `json:"effectiveTime"`
|
||||
|
||||
// +optional
|
||||
// +nullable
|
||||
Selector *metav1.LabelSelector `json:"selector"`
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ func (in *CacheEntry) DeepCopy() *CacheEntry {
|
|||
func (in *CapacityReservation) DeepCopyInto(out *CapacityReservation) {
|
||||
*out = *in
|
||||
in.ExpirationTime.DeepCopyInto(&out.ExpirationTime)
|
||||
in.EffectiveTime.DeepCopyInto(&out.EffectiveTime)
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CapacityReservation.
|
||||
|
|
@ -498,6 +499,10 @@ func (in *RunnerDeploymentSpec) DeepCopyInto(out *RunnerDeploymentSpec) {
|
|||
*out = new(int)
|
||||
**out = **in
|
||||
}
|
||||
if in.EffectiveTime != nil {
|
||||
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||
*out = (*in).DeepCopy()
|
||||
}
|
||||
if in.Selector != nil {
|
||||
in, out := &in.Selector, &out.Selector
|
||||
*out = new(metav1.LabelSelector)
|
||||
|
|
@ -812,6 +817,10 @@ func (in *RunnerReplicaSetSpec) DeepCopyInto(out *RunnerReplicaSetSpec) {
|
|||
*out = new(int)
|
||||
**out = **in
|
||||
}
|
||||
if in.EffectiveTime != nil {
|
||||
in, out := &in.EffectiveTime, &out.EffectiveTime
|
||||
*out = (*in).DeepCopy()
|
||||
}
|
||||
if in.Selector != nil {
|
||||
in, out := &in.Selector, &out.Selector
|
||||
*out = new(metav1.LabelSelector)
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ All additional docs are kept in the `docs/` folder, this README is solely for do
|
|||
| `authSecret.github_basicauth_username` | Username for GitHub basic auth to use instead of PAT or GitHub APP in case it's running behind a proxy API | |
|
||||
| `authSecret.github_basicauth_password` | Password for GitHub basic auth to use instead of PAT or GitHub APP in case it's running behind a proxy API | |
|
||||
| `dockerRegistryMirror` | The default Docker Registry Mirror used by runners. | |
|
||||
| `hostNetwork` | The "hostNetwork" of the controller container | false |
|
||||
| `image.repository` | The "repository/image" of the controller container | summerwind/actions-runner-controller |
|
||||
| `image.tag` | The tag of the controller container | |
|
||||
| `image.actionsRunnerRepositoryAndTag` | The "repository/image" of the actions runner container | summerwind/actions-runner:latest |
|
||||
|
|
|
|||
|
|
@ -49,6 +49,9 @@ spec:
|
|||
items:
|
||||
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
||||
properties:
|
||||
effectiveTime:
|
||||
format: date-time
|
||||
type: string
|
||||
expirationTime:
|
||||
format: date-time
|
||||
type: string
|
||||
|
|
|
|||
|
|
@ -48,6 +48,11 @@ spec:
|
|||
spec:
|
||||
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
|
||||
properties:
|
||||
effectiveTime:
|
||||
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||
format: date-time
|
||||
nullable: true
|
||||
type: string
|
||||
replicas:
|
||||
nullable: true
|
||||
type: integer
|
||||
|
|
|
|||
|
|
@ -45,6 +45,11 @@ spec:
|
|||
spec:
|
||||
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
|
||||
properties:
|
||||
effectiveTime:
|
||||
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value.
|
||||
format: date-time
|
||||
nullable: true
|
||||
type: string
|
||||
replicas:
|
||||
nullable: true
|
||||
type: integer
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ spec:
|
|||
metadata:
|
||||
{{- with .Values.podAnnotations }}
|
||||
annotations:
|
||||
kubectl.kubernetes.io/default-logs-container: "manager"
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
|
|
@ -199,3 +200,6 @@ spec:
|
|||
topologySpreadConstraints:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.hostNetwork }}
|
||||
hostNetwork: {{ .Values.hostNetwork }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ spec:
|
|||
metadata:
|
||||
{{- with .Values.githubWebhookServer.podAnnotations }}
|
||||
annotations:
|
||||
kubectl.kubernetes.io/default-logs-container: "github-webhook-server"
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
|
|
|
|||
|
|
@ -165,6 +165,10 @@ admissionWebHooks:
|
|||
{}
|
||||
#caBundle: "Ci0tLS0tQk...<base64-encoded PEM bundle containing the CA that signed the webhook's serving certificate>...tLS0K"
|
||||
|
||||
# There may be alternatives to setting `hostNetwork: true`, see
|
||||
# https://github.com/actions-runner-controller/actions-runner-controller/issues/1005#issuecomment-993097155
|
||||
#hostNetwork: true
|
||||
|
||||
githubWebhookServer:
|
||||
enabled: false
|
||||
replicaCount: 1
|
||||
|
|
|
|||
|
|
@ -29,15 +29,14 @@ import (
|
|||
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/controllers"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/logging"
|
||||
"github.com/kelseyhightower/envconfig"
|
||||
zaplib "go.uber.org/zap"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/exec"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
// +kubebuilder:scaffold:imports
|
||||
)
|
||||
|
||||
|
|
@ -47,11 +46,6 @@ var (
|
|||
)
|
||||
|
||||
const (
|
||||
logLevelDebug = "debug"
|
||||
logLevelInfo = "info"
|
||||
logLevelWarn = "warn"
|
||||
logLevelError = "error"
|
||||
|
||||
webhookSecretTokenEnvName = "GITHUB_WEBHOOK_SECRET_TOKEN"
|
||||
)
|
||||
|
||||
|
|
@ -97,7 +91,7 @@ func main() {
|
|||
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
|
||||
"Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.")
|
||||
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change")
|
||||
flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
|
||||
flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
|
||||
flag.StringVar(&webhookSecretToken, "github-webhook-secret-token", "", "The personal access token of GitHub.")
|
||||
flag.StringVar(&c.Token, "github-token", c.Token, "The personal access token of GitHub.")
|
||||
flag.Int64Var(&c.AppID, "github-app-id", c.AppID, "The application ID of GitHub App.")
|
||||
|
|
@ -126,23 +120,7 @@ func main() {
|
|||
setupLog.Info("-watch-namespace is %q. Only HorizontalRunnerAutoscalers in %q are watched, cached, and considered as scale targets.")
|
||||
}
|
||||
|
||||
logger := zap.New(func(o *zap.Options) {
|
||||
switch logLevel {
|
||||
case logLevelDebug:
|
||||
o.Development = true
|
||||
lvl := zaplib.NewAtomicLevelAt(-2) // maps to logr's V(2)
|
||||
o.Level = &lvl
|
||||
case logLevelInfo:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
|
||||
o.Level = &lvl
|
||||
case logLevelWarn:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
|
||||
o.Level = &lvl
|
||||
case logLevelError:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
|
||||
o.Level = &lvl
|
||||
}
|
||||
})
|
||||
logger := logging.NewLogger(logLevel)
|
||||
|
||||
ctrl.SetLogger(logger)
|
||||
|
||||
|
|
@ -152,6 +130,8 @@ func main() {
|
|||
// That is, all runner groups managed by ARC are assumed to be visible to any repositories,
|
||||
// which is wrong when you have one or more non-default runner groups in your organization or enterprise.
|
||||
if len(c.Token) > 0 || (c.AppID > 0 && c.AppInstallationID > 0 && c.AppPrivateKey != "") || (len(c.BasicauthUsername) > 0 && len(c.BasicauthPassword) > 0) {
|
||||
c.Log = &logger
|
||||
|
||||
ghClient, err = c.NewClient()
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, "Error: Client creation failed.", err)
|
||||
|
|
|
|||
|
|
@ -49,6 +49,9 @@ spec:
|
|||
items:
|
||||
description: CapacityReservation specifies the number of replicas temporarily added to the scale target until ExpirationTime.
|
||||
properties:
|
||||
effectiveTime:
|
||||
format: date-time
|
||||
type: string
|
||||
expirationTime:
|
||||
format: date-time
|
||||
type: string
|
||||
|
|
|
|||
|
|
@ -48,6 +48,11 @@ spec:
|
|||
spec:
|
||||
description: RunnerDeploymentSpec defines the desired state of RunnerDeployment
|
||||
properties:
|
||||
effectiveTime:
|
||||
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA. The value is inherited to RunnerRepicaSet(s) and used to prevent ephemeral runners from unnecessarily recreated.
|
||||
format: date-time
|
||||
nullable: true
|
||||
type: string
|
||||
replicas:
|
||||
nullable: true
|
||||
type: integer
|
||||
|
|
|
|||
|
|
@ -45,6 +45,11 @@ spec:
|
|||
spec:
|
||||
description: RunnerReplicaSetSpec defines the desired state of RunnerReplicaSet
|
||||
properties:
|
||||
effectiveTime:
|
||||
description: EffectiveTime is the time the upstream controller requested to sync Replicas. It is usually populated by the webhook-based autoscaler via HRA and RunnerDeployment. The value is used to prevent runnerreplicaset controller from unnecessarily recreating ephemeral runners based on potentially outdated Replicas value.
|
||||
format: date-time
|
||||
nullable: true
|
||||
type: string
|
||||
replicas:
|
||||
nullable: true
|
||||
type: integer
|
||||
|
|
|
|||
|
|
@ -768,8 +768,10 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
|
|||
capacityReservations := getValidCapacityReservations(copy)
|
||||
|
||||
if amount > 0 {
|
||||
now := time.Now()
|
||||
copy.Spec.CapacityReservations = append(capacityReservations, v1alpha1.CapacityReservation{
|
||||
ExpirationTime: metav1.Time{Time: time.Now().Add(target.ScaleUpTrigger.Duration.Duration)},
|
||||
EffectiveTime: metav1.Time{Time: now},
|
||||
ExpirationTime: metav1.Time{Time: now.Add(target.ScaleUpTrigger.Duration.Duration)},
|
||||
Replicas: amount,
|
||||
})
|
||||
} else if amount < 0 {
|
||||
|
|
@ -788,10 +790,16 @@ func (autoscaler *HorizontalRunnerAutoscalerGitHubWebhook) tryScale(ctx context.
|
|||
copy.Spec.CapacityReservations = reservations
|
||||
}
|
||||
|
||||
autoscaler.Log.Info(
|
||||
before := len(target.HorizontalRunnerAutoscaler.Spec.CapacityReservations)
|
||||
expired := before - len(capacityReservations)
|
||||
after := len(copy.Spec.CapacityReservations)
|
||||
|
||||
autoscaler.Log.V(1).Info(
|
||||
fmt.Sprintf("Patching hra %s for capacityReservations update", target.HorizontalRunnerAutoscaler.Name),
|
||||
"before", target.HorizontalRunnerAutoscaler.Spec.CapacityReservations,
|
||||
"after", copy.Spec.CapacityReservations,
|
||||
"before", before,
|
||||
"expired", expired,
|
||||
"amount", amount,
|
||||
"after", after,
|
||||
)
|
||||
|
||||
if err := autoscaler.Client.Patch(ctx, copy, client.MergeFrom(&target.HorizontalRunnerAutoscaler)); err != nil {
|
||||
|
|
|
|||
|
|
@ -99,11 +99,33 @@ func (r *HorizontalRunnerAutoscalerReconciler) Reconcile(ctx context.Context, re
|
|||
return r.reconcile(ctx, req, log, hra, st, func(newDesiredReplicas int) error {
|
||||
currentDesiredReplicas := getIntOrDefault(rd.Spec.Replicas, defaultReplicas)
|
||||
|
||||
ephemeral := rd.Spec.Template.Spec.Ephemeral == nil || *rd.Spec.Template.Spec.Ephemeral
|
||||
|
||||
var effectiveTime *time.Time
|
||||
|
||||
for _, r := range hra.Spec.CapacityReservations {
|
||||
t := r.EffectiveTime
|
||||
if effectiveTime == nil || effectiveTime.Before(t.Time) {
|
||||
effectiveTime = &t.Time
|
||||
}
|
||||
}
|
||||
|
||||
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
|
||||
if currentDesiredReplicas != newDesiredReplicas {
|
||||
copy := rd.DeepCopy()
|
||||
copy.Spec.Replicas = &newDesiredReplicas
|
||||
|
||||
if ephemeral && effectiveTime != nil {
|
||||
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||
}
|
||||
|
||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
||||
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
||||
}
|
||||
} else if ephemeral && effectiveTime != nil {
|
||||
copy := rd.DeepCopy()
|
||||
copy.Spec.EffectiveTime = &metav1.Time{Time: *effectiveTime}
|
||||
|
||||
if err := r.Client.Patch(ctx, copy, client.MergeFrom(&rd)); err != nil {
|
||||
return fmt.Errorf("patching runnerdeployment to have %d replicas: %w", newDesiredReplicas, err)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -110,6 +110,8 @@ func SetupIntegrationTest(ctx2 context.Context) *testEnvironment {
|
|||
Name: controllerName("runner"),
|
||||
RegistrationRecheckInterval: time.Millisecond,
|
||||
RegistrationRecheckJitter: time.Millisecond,
|
||||
UnregistrationTimeout: 1 * time.Second,
|
||||
UnregistrationRetryDelay: 1 * time.Second,
|
||||
}
|
||||
err = runnerController.SetupWithManager(mgr)
|
||||
Expect(err).NotTo(HaveOccurred(), "failed to setup runner controller")
|
||||
|
|
|
|||
|
|
@ -72,6 +72,9 @@ type RunnerReconciler struct {
|
|||
Name string
|
||||
RegistrationRecheckInterval time.Duration
|
||||
RegistrationRecheckJitter time.Duration
|
||||
|
||||
UnregistrationTimeout time.Duration
|
||||
UnregistrationRetryDelay time.Duration
|
||||
}
|
||||
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runners,verbs=get;list;watch;create;update;patch;delete
|
||||
|
|
@ -110,8 +113,23 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
|||
return ctrl.Result{}, nil
|
||||
}
|
||||
} else {
|
||||
var p *corev1.Pod
|
||||
|
||||
{
|
||||
var pod corev1.Pod
|
||||
if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
|
||||
if !kerrors.IsNotFound(err) {
|
||||
log.Info(fmt.Sprintf("Retrying soon as we failed to get registration-only runner pod: %v", err))
|
||||
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
} else {
|
||||
p = &pod
|
||||
}
|
||||
}
|
||||
|
||||
// Request to remove a runner. DeletionTimestamp was set in the runner - we need to unregister runner
|
||||
return r.processRunnerDeletion(runner, ctx, log)
|
||||
return r.processRunnerDeletion(runner, ctx, log, p)
|
||||
}
|
||||
|
||||
registrationOnly := metav1.HasAnnotation(runner.ObjectMeta, annotationKeyRegistrationOnly)
|
||||
|
|
@ -159,20 +177,27 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
|||
|
||||
// If pod has ended up succeeded we need to restart it
|
||||
// Happens e.g. when dind is in runner and run completes
|
||||
stopped := pod.Status.Phase == corev1.PodSucceeded
|
||||
stopped := runnerPodOrContainerIsStopped(&pod)
|
||||
|
||||
if !stopped {
|
||||
if pod.Status.Phase == corev1.PodRunning {
|
||||
for _, status := range pod.Status.ContainerStatuses {
|
||||
if status.Name != containerName {
|
||||
continue
|
||||
}
|
||||
ephemeral := runner.Spec.Ephemeral == nil || *runner.Spec.Ephemeral
|
||||
|
||||
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
|
||||
stopped = true
|
||||
}
|
||||
}
|
||||
if stopped && ephemeral {
|
||||
log.V(1).Info("Ephemeral runner has been stopped successfully. Marking this runner for deletion.")
|
||||
|
||||
// This is the key to make ephemeral runners to work reliably with webhook-based autoscale.
|
||||
// See https://github.com/actions-runner-controller/actions-runner-controller/issues/911#issuecomment-1046161384 for more context.
|
||||
//
|
||||
// In the next reconcilation loop, this triggers a runner unregistration.
|
||||
// (Note that the unregistration can fail safely because an ephemeral runner usually unregisters itself from GitHub but we do it just for confirmation)
|
||||
//
|
||||
// See the code path above that is executed when `runner.ObjectMeta.DeletionTimestamp.IsZero()` isn't true,
|
||||
// which handles the unregistrationa the removal of the completed pod, and so on.
|
||||
if err := r.Delete(ctx, &runner); err != nil {
|
||||
log.V(1).Error(err, "Retrying to mark this runner for deletion in 10 seconds.")
|
||||
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
|
||||
}
|
||||
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
|
||||
restart := stopped
|
||||
|
|
@ -404,64 +429,53 @@ func (r *RunnerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
|
|||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
// Try to delete current pod if recreation is needed
|
||||
safeToDeletePod := false
|
||||
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
||||
if err != nil {
|
||||
log.Error(err, "Failed to unregister runner before deleting the pod.", "runner", runner.Name)
|
||||
} else {
|
||||
// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
|
||||
if !ok {
|
||||
log.Info("Runner no longer exists on GitHub", "runner", runner.Name)
|
||||
}
|
||||
|
||||
safeToDeletePod = true
|
||||
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, &pod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
if safeToDeletePod {
|
||||
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
|
||||
// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
|
||||
if err := r.Delete(ctx, &pod); err != nil {
|
||||
log.Error(err, "Failed to delete pod resource")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
|
||||
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
|
||||
// Only delete the pod if we successfully unregistered the runner or the runner is already deleted from the service.
|
||||
// This should help us avoid race condition between runner pickup job after we think the runner is not busy.
|
||||
if err := r.Delete(ctx, updatedPod); err != nil {
|
||||
log.Error(err, "Failed to delete pod resource")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
r.Recorder.Event(&runner, corev1.EventTypeNormal, "PodDeleted", fmt.Sprintf("Deleted pod '%s'", newPod.Name))
|
||||
log.Info("Deleted runner pod", "repository", runner.Spec.Repository)
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger) (reconcile.Result, error) {
|
||||
func runnerPodOrContainerIsStopped(pod *corev1.Pod) bool {
|
||||
// If pod has ended up succeeded we need to restart it
|
||||
// Happens e.g. when dind is in runner and run completes
|
||||
stopped := pod.Status.Phase == corev1.PodSucceeded
|
||||
|
||||
if !stopped {
|
||||
if pod.Status.Phase == corev1.PodRunning {
|
||||
for _, status := range pod.Status.ContainerStatuses {
|
||||
if status.Name != containerName {
|
||||
continue
|
||||
}
|
||||
|
||||
if status.State.Terminated != nil && status.State.Terminated.ExitCode == 0 {
|
||||
stopped = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return stopped
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx context.Context, log logr.Logger, pod *corev1.Pod) (reconcile.Result, error) {
|
||||
finalizers, removed := removeFinalizer(runner.ObjectMeta.Finalizers, finalizerName)
|
||||
|
||||
if removed {
|
||||
if len(runner.Status.Registration.Token) > 0 {
|
||||
ok, err := r.unregisterRunner(ctx, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name)
|
||||
if err != nil {
|
||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
if !ok {
|
||||
log.V(1).Info("Runner no longer exists on GitHub")
|
||||
}
|
||||
} else {
|
||||
log.V(1).Info("Runner was never registered on GitHub")
|
||||
_, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, runner.Spec.Enterprise, runner.Spec.Organization, runner.Spec.Repository, runner.Name, pod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
newRunner := runner.DeepCopy()
|
||||
|
|
@ -478,6 +492,24 @@ func (r *RunnerReconciler) processRunnerDeletion(runner v1alpha1.Runner, ctx con
|
|||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) unregistrationTimeout() time.Duration {
|
||||
unregistrationTimeout := DefaultUnregistrationTimeout
|
||||
|
||||
if r.UnregistrationTimeout > 0 {
|
||||
unregistrationTimeout = r.UnregistrationTimeout
|
||||
}
|
||||
return unregistrationTimeout
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) unregistrationRetryDelay() time.Duration {
|
||||
retryDelay := DefaultUnregistrationRetryDelay
|
||||
|
||||
if r.UnregistrationRetryDelay > 0 {
|
||||
retryDelay = r.UnregistrationRetryDelay
|
||||
}
|
||||
return retryDelay
|
||||
}
|
||||
|
||||
func (r *RunnerReconciler) processRunnerPodDeletion(ctx context.Context, runner v1alpha1.Runner, log logr.Logger, pod corev1.Pod) (reconcile.Result, error) {
|
||||
deletionTimeout := 1 * time.Minute
|
||||
currentTime := time.Now()
|
||||
|
|
|
|||
|
|
@ -0,0 +1,249 @@
|
|||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
"github.com/go-logr/logr"
|
||||
gogithub "github.com/google/go-github/v39/github"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/client"
|
||||
)
|
||||
|
||||
const (
|
||||
unregistrationCompleteTimestamp = "unregistration-complete-timestamp"
|
||||
unregistrationStartTimestamp = "unregistration-start-timestamp"
|
||||
|
||||
// DefaultUnregistrationTimeout is the duration until ARC gives up retrying the combo of ListRunners API (to detect the runner ID by name)
|
||||
// and RemoveRunner API (to actually unregister the runner) calls.
|
||||
// This needs to be longer than 60 seconds because a part of the combo, the ListRunners API, seems to use the Cache-Control header of max-age=60s
|
||||
// and that instructs our cache library httpcache to cache responses for 60 seconds, which results in ARC unable to see the runner in the ListRunners response
|
||||
// up to 60 seconds (or even more depending on the situation).
|
||||
DefaultUnregistrationTimeout = 60 * time.Second
|
||||
|
||||
// This can be any value but a larger value can make an unregistration timeout longer than configured in practice.
|
||||
DefaultUnregistrationRetryDelay = 30 * time.Second
|
||||
)
|
||||
|
||||
// tickRunnerGracefulStop reconciles the runner and the runner pod in a way so that
|
||||
// we can delete the runner pod without disrupting a workflow job.
|
||||
//
|
||||
// This function returns a non-nil pointer to corev1.Pod as the first return value
|
||||
// if the runner is considered to have gracefully stopped, hence it's pod is safe for deletion.
|
||||
//
|
||||
// It's a "tick" operation so a graceful stop can take multiple calls to complete.
|
||||
// This function is designed to complete a length graceful stop process in a unblocking way.
|
||||
// When it wants to be retried later, the function returns a non-nil *ctrl.Result as the second return value, may or may not populating the error in the second return value.
|
||||
// The caller is expected to return the returned ctrl.Result and error to postpone the current reconcilation loop and trigger a scheduled retry.
|
||||
func tickRunnerGracefulStop(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, c client.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*corev1.Pod, *ctrl.Result, error) {
|
||||
if pod != nil {
|
||||
if _, ok := getAnnotation(pod, unregistrationStartTimestamp); !ok {
|
||||
updated := pod.DeepCopy()
|
||||
setAnnotation(updated, unregistrationStartTimestamp, time.Now().Format(time.RFC3339))
|
||||
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
|
||||
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationStartTimestamp))
|
||||
return nil, &ctrl.Result{}, err
|
||||
}
|
||||
pod = updated
|
||||
|
||||
log.Info("Runner has started unregistration")
|
||||
} else {
|
||||
log.Info("Runner has already started unregistration")
|
||||
}
|
||||
}
|
||||
|
||||
if res, err := ensureRunnerUnregistration(ctx, unregistrationTimeout, retryDelay, log, ghClient, enterprise, organization, repository, runner, pod); res != nil {
|
||||
return nil, res, err
|
||||
}
|
||||
|
||||
if pod != nil {
|
||||
if _, ok := getAnnotation(pod, unregistrationCompleteTimestamp); !ok {
|
||||
updated := pod.DeepCopy()
|
||||
setAnnotation(updated, unregistrationCompleteTimestamp, time.Now().Format(time.RFC3339))
|
||||
if err := c.Patch(ctx, updated, client.MergeFrom(pod)); err != nil {
|
||||
log.Error(err, fmt.Sprintf("Failed to patch pod to have %s annotation", unregistrationCompleteTimestamp))
|
||||
return nil, &ctrl.Result{}, err
|
||||
}
|
||||
pod = updated
|
||||
|
||||
log.Info("Runner has completed unregistration")
|
||||
} else {
|
||||
log.Info("Runner has already completed unregistration")
|
||||
}
|
||||
}
|
||||
|
||||
return pod, nil, nil
|
||||
}
|
||||
|
||||
// If the first return value is nil, it's safe to delete the runner pod.
|
||||
func ensureRunnerUnregistration(ctx context.Context, unregistrationTimeout time.Duration, retryDelay time.Duration, log logr.Logger, ghClient *github.Client, enterprise, organization, repository, runner string, pod *corev1.Pod) (*ctrl.Result, error) {
|
||||
ok, err := unregisterRunner(ctx, ghClient, enterprise, organization, repository, runner)
|
||||
if err != nil {
|
||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
log.Error(err, "Failed to unregister runner before deleting the pod.")
|
||||
|
||||
return &ctrl.Result{}, err
|
||||
} else if ok {
|
||||
log.Info("Runner has just been unregistered. Removing the runner pod.")
|
||||
} else if pod == nil {
|
||||
// `r.unregisterRunner()` will returns `false, nil` if the runner is not found on GitHub.
|
||||
// However, that doesn't always mean the pod can be safely removed.
|
||||
//
|
||||
// If the pod does not exist for the runner,
|
||||
// it may be due to that the runner pod has never been created.
|
||||
// In that case we can safely assume that the runner will never be registered.
|
||||
|
||||
log.Info("Runner was not found on GitHub and the runner pod was not found on Kuberntes.")
|
||||
} else if pod.Annotations[unregistrationCompleteTimestamp] != "" {
|
||||
// If it's already unregistered in the previous reconcilation loop,
|
||||
// you can safely assume that it won't get registered again so it's safe to delete the runner pod.
|
||||
log.Info("Runner pod is marked as already unregistered.")
|
||||
} else if runnerPodOrContainerIsStopped(pod) {
|
||||
// If it's an ephemeral runner with the actions/runner container exited with 0,
|
||||
// we can safely assume that it has unregistered itself from GitHub Actions
|
||||
// so it's natural that RemoveRunner fails due to 404.
|
||||
|
||||
// If pod has ended up succeeded we need to restart it
|
||||
// Happens e.g. when dind is in runner and run completes
|
||||
log.Info("Runner pod has been stopped with a successful status.")
|
||||
} else if ts := pod.Annotations[unregistrationStartTimestamp]; ts != "" {
|
||||
t, err := time.Parse(time.RFC3339, ts)
|
||||
if err != nil {
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, err
|
||||
}
|
||||
|
||||
if r := time.Until(t.Add(unregistrationTimeout)); r > 0 {
|
||||
log.Info("Runner unregistration is in-progress.", "timeout", unregistrationTimeout, "remaining", r)
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, err
|
||||
}
|
||||
|
||||
log.Info("Runner unregistration has been timed out. The runner pod will be deleted soon.", "timeout", unregistrationTimeout)
|
||||
} else {
|
||||
// A runner and a runner pod that is created by this version of ARC should match
|
||||
// any of the above branches.
|
||||
//
|
||||
// But we leave this match all branch for potential backward-compatibility.
|
||||
// The caller is expected to take appropriate actions, like annotating the pod as started the unregistration process,
|
||||
// and retry later.
|
||||
log.V(1).Info("Runner unregistration is being retried later.")
|
||||
|
||||
return &ctrl.Result{RequeueAfter: retryDelay}, nil
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func getAnnotation(pod *corev1.Pod, key string) (string, bool) {
|
||||
if pod.Annotations == nil {
|
||||
return "", false
|
||||
}
|
||||
|
||||
v, ok := pod.Annotations[key]
|
||||
|
||||
return v, ok
|
||||
}
|
||||
|
||||
func setAnnotation(pod *corev1.Pod, key, value string) {
|
||||
if pod.Annotations == nil {
|
||||
pod.Annotations = map[string]string{}
|
||||
}
|
||||
|
||||
pod.Annotations[key] = value
|
||||
}
|
||||
|
||||
// unregisterRunner unregisters the runner from GitHub Actions by name.
|
||||
//
|
||||
// This function returns:
|
||||
//
|
||||
// Case 1. (true, nil) when it has successfully unregistered the runner.
|
||||
// Case 2. (false, nil) when (2-1.) the runner has been already unregistered OR (2-2.) the runner will never be created OR (2-3.) the runner is not created yet and it is about to be registered(hence we couldn't see it's existence from GitHub Actions API yet)
|
||||
// Case 3. (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
|
||||
// an error returned by GitHub API.
|
||||
//
|
||||
// When the returned values is "Case 2. (false, nil)", the caller must handle the three possible sub-cases appropriately.
|
||||
// In other words, all those three sub-cases cannot be distinguished by this function alone.
|
||||
//
|
||||
// - Case "2-1." can happen when e.g. ARC has successfully unregistered in a previous reconcilation loop or it was an ephemeral runner that finished it's job run(an ephemeral runner is designed to stop after a job run).
|
||||
// You'd need to maintain the runner state(i.e. if it's already unregistered or not) somewhere,
|
||||
// so that you can either not call this function at all if the runner state says it's already unregistered, or determine that it's case "2-1." when you got (false, nil).
|
||||
//
|
||||
// - Case "2-2." can happen when e.g. the runner registration token was somehow broken so that `config.sh` within the runner container was never meant to succeed.
|
||||
// Waiting and retrying forever on this case is not a solution, because `config.sh` won't succeed with a wrong token hence the runner gets stuck in this state forever.
|
||||
// There isn't a perfect solution to this, but a practical workaround would be implement a "grace period" in the caller side.
|
||||
//
|
||||
// - Case "2-3." can happen when e.g. ARC recreated an ephemral runner pod in a previous reconcilation loop and then it was requested to delete the runner before the runner comes up.
|
||||
// If handled inappropriately, this can cause a race condition betweeen a deletion of the runner pod and GitHub scheduling a workflow job onto the runner.
|
||||
//
|
||||
// Once successfully detected case "2-1." or "2-2.", you can safely delete the runner pod because you know that the runner won't come back
|
||||
// as long as you recreate the runner pod.
|
||||
//
|
||||
// If it was "2-3.", you need a workaround to avoid the race condition.
|
||||
//
|
||||
// You shall introduce a "grace period" mechanism, similar or equal to that is required for "Case 2-2.", so that you ever
|
||||
// start the runner pod deletion only after it's more and more likely that the runner pod is not coming up.
|
||||
//
|
||||
// Beware though, you need extra care to set an appropriate grace period depending on your environment.
|
||||
// There isn't a single right grace period that works for everyone.
|
||||
// The longer the grace period is, the earlier a cluster resource shortage can occur due to throttoled runner pod deletions,
|
||||
// while the shorter the grace period is, the more likely you may encounter the race issue.
|
||||
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
|
||||
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
id := int64(0)
|
||||
for _, runner := range runners {
|
||||
if runner.GetName() == name {
|
||||
id = runner.GetID()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if id == int64(0) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// For the record, historically ARC did not try to call RemoveRunner on a busy runner, but it's no longer true.
|
||||
// The reason ARC did so was to let a runner running a job to not stop prematurely.
|
||||
//
|
||||
// However, we learned that RemoveRunner already has an ability to prevent stopping a busy runner,
|
||||
// so ARC doesn't need to do anything special for a graceful runner stop.
|
||||
// It can just call RemoveRunner, and if it returned 200 you're guaranteed that the runner will not automatically come back and
|
||||
// the runner pod is safe for deletion.
|
||||
//
|
||||
// Trying to remove a busy runner can result in errors like the following:
|
||||
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
|
||||
//
|
||||
// # NOTES
|
||||
//
|
||||
// - It can be "status=offline" at the same time but that's another story.
|
||||
// - After https://github.com/actions-runner-controller/actions-runner-controller/pull/1127, ListRunners responses that are used to
|
||||
// determine if the runner is busy can be more outdated than before, as those responeses are now cached for 60 seconds.
|
||||
// - Note that 60 seconds is controlled by the Cache-Control response header provided by GitHub so we don't have a strict control on it but we assume it won't
|
||||
// change from 60 seconds.
|
||||
//
|
||||
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
|
||||
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
|
@ -47,6 +47,9 @@ type RunnerPodReconciler struct {
|
|||
Name string
|
||||
RegistrationRecheckInterval time.Duration
|
||||
RegistrationRecheckJitter time.Duration
|
||||
|
||||
UnregistrationTimeout time.Duration
|
||||
UnregistrationRetryDelay time.Duration
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
@ -105,33 +108,16 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
|||
finalizers, removed := removeFinalizer(runnerPod.ObjectMeta.Finalizers, runnerPodFinalizerName)
|
||||
|
||||
if removed {
|
||||
ok, err := r.unregisterRunner(ctx, enterprise, org, repo, runnerPod.Name)
|
||||
if err != nil {
|
||||
if errors.Is(err, &gogithub.RateLimitError{}) {
|
||||
// We log the underlying error when we failed calling GitHub API to list or unregisters,
|
||||
// or the runner is still busy.
|
||||
log.Error(
|
||||
err,
|
||||
fmt.Sprintf(
|
||||
"Failed to unregister runner due to GitHub API rate limits. Delaying retry for %s to avoid excessive GitHub API calls",
|
||||
retryDelayOnGitHubAPIRateLimitError,
|
||||
),
|
||||
)
|
||||
|
||||
return ctrl.Result{RequeueAfter: retryDelayOnGitHubAPIRateLimitError}, err
|
||||
}
|
||||
|
||||
return ctrl.Result{}, err
|
||||
updatedPod, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
if !ok {
|
||||
log.V(1).Info("Runner no longer exists on GitHub")
|
||||
}
|
||||
patchedPod := updatedPod.DeepCopy()
|
||||
patchedPod.ObjectMeta.Finalizers = finalizers
|
||||
|
||||
newRunner := runnerPod.DeepCopy()
|
||||
newRunner.ObjectMeta.Finalizers = finalizers
|
||||
|
||||
if err := r.Patch(ctx, newRunner, client.MergeFrom(&runnerPod)); err != nil {
|
||||
// We commit the removal of the finalizer so that Kuberenetes notices it and delete the pod resource from the cluster.
|
||||
if err := r.Patch(ctx, patchedPod, client.MergeFrom(&runnerPod)); err != nil {
|
||||
log.Error(err, "Failed to update runner for finalizer removal")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
|
@ -365,8 +351,13 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
|||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
updated, res, err := tickRunnerGracefulStop(ctx, r.unregistrationTimeout(), r.unregistrationRetryDelay(), log, r.GitHubClient, r.Client, enterprise, org, repo, runnerPod.Name, &runnerPod)
|
||||
if res != nil {
|
||||
return *res, err
|
||||
}
|
||||
|
||||
// Delete current pod if recreation is needed
|
||||
if err := r.Delete(ctx, &runnerPod); err != nil {
|
||||
if err := r.Delete(ctx, updated); err != nil {
|
||||
log.Error(err, "Failed to delete pod resource")
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
|
@ -377,8 +368,22 @@ func (r *RunnerPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
|
|||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) unregisterRunner(ctx context.Context, enterprise, org, repo, name string) (bool, error) {
|
||||
return unregisterRunner(ctx, r.GitHubClient, enterprise, org, repo, name)
|
||||
func (r *RunnerPodReconciler) unregistrationTimeout() time.Duration {
|
||||
unregistrationTimeout := DefaultUnregistrationTimeout
|
||||
|
||||
if r.UnregistrationTimeout > 0 {
|
||||
unregistrationTimeout = r.UnregistrationTimeout
|
||||
}
|
||||
return unregistrationTimeout
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) unregistrationRetryDelay() time.Duration {
|
||||
retryDelay := DefaultUnregistrationRetryDelay
|
||||
|
||||
if r.UnregistrationRetryDelay > 0 {
|
||||
retryDelay = r.UnregistrationRetryDelay
|
||||
}
|
||||
return retryDelay
|
||||
}
|
||||
|
||||
func (r *RunnerPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
|
||||
|
|
|
|||
|
|
@ -177,6 +177,7 @@ func (r *RunnerDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
|||
// Please add more conditions that we can in-place update the newest runnerreplicaset without disruption
|
||||
if currentDesiredReplicas != newDesiredReplicas {
|
||||
newestSet.Spec.Replicas = &newDesiredReplicas
|
||||
newestSet.Spec.EffectiveTime = rd.Spec.EffectiveTime
|
||||
|
||||
if err := r.Client.Update(ctx, newestSet); err != nil {
|
||||
log.Error(err, "Failed to update runnerreplicaset resource")
|
||||
|
|
@ -417,9 +418,10 @@ func newRunnerReplicaSet(rd *v1alpha1.RunnerDeployment, commonRunnerLabels []str
|
|||
Labels: newRSTemplate.ObjectMeta.Labels,
|
||||
},
|
||||
Spec: v1alpha1.RunnerReplicaSetSpec{
|
||||
Replicas: rd.Spec.Replicas,
|
||||
Selector: newRSSelector,
|
||||
Template: newRSTemplate,
|
||||
Replicas: rd.Spec.Replicas,
|
||||
Selector: newRSSelector,
|
||||
Template: newRSTemplate,
|
||||
EffectiveTime: rd.Spec.EffectiveTime,
|
||||
},
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,6 +49,10 @@ type RunnerReplicaSetReconciler struct {
|
|||
Name string
|
||||
}
|
||||
|
||||
const (
|
||||
SyncTimeAnnotationKey = "sync-time"
|
||||
)
|
||||
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/finalizers,verbs=get;list;watch;create;update;patch;delete
|
||||
// +kubebuilder:rbac:groups=actions.summerwind.dev,resources=runnerreplicasets/status,verbs=get;update;patch
|
||||
|
|
@ -85,19 +89,36 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
|||
}
|
||||
}
|
||||
|
||||
var myRunners []v1alpha1.Runner
|
||||
|
||||
var (
|
||||
current int
|
||||
ready int
|
||||
available int
|
||||
|
||||
lastSyncTime *time.Time
|
||||
)
|
||||
|
||||
for _, r := range allRunners.Items {
|
||||
// This guard is required to avoid the RunnerReplicaSet created by the controller v0.17.0 or before
|
||||
// to not treat all the runners in the namespace as its children.
|
||||
if metav1.IsControlledBy(&r, &rs) && !metav1.HasAnnotation(r.ObjectMeta, annotationKeyRegistrationOnly) {
|
||||
myRunners = append(myRunners, r)
|
||||
// If the runner is already marked for deletion(=has a non-zero deletion timestamp) by the runner controller (can be caused by an ephemeral runner completion)
|
||||
// or by runnerreplicaset controller (in case it was deleted in the previous reconcilation loop),
|
||||
// we don't need to bother calling GitHub API to re-mark the runner for deletion.
|
||||
// Just hold on, and runners will disappear as long as the runner controller is up and running.
|
||||
if !r.DeletionTimestamp.IsZero() {
|
||||
continue
|
||||
}
|
||||
|
||||
if r.Annotations != nil {
|
||||
if a, ok := r.Annotations[SyncTimeAnnotationKey]; ok {
|
||||
t, err := time.Parse(time.RFC3339, a)
|
||||
if err == nil {
|
||||
if lastSyncTime == nil || lastSyncTime.Before(t) {
|
||||
lastSyncTime = &t
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
current += 1
|
||||
|
||||
|
|
@ -152,7 +173,30 @@ func (r *RunnerReplicaSetReconciler) Reconcile(ctx context.Context, req ctrl.Req
|
|||
}
|
||||
}
|
||||
|
||||
if current > desired {
|
||||
effectiveTime := rs.Spec.EffectiveTime
|
||||
ephemeral := rs.Spec.Template.Spec.Ephemeral == nil || *rs.Spec.Template.Spec.Ephemeral
|
||||
|
||||
if current < desired && ephemeral && lastSyncTime != nil && effectiveTime != nil && lastSyncTime.After(effectiveTime.Time) {
|
||||
log.V(1).Info("Detected that some ephemeral runners have disappeared. Usually this is due to that ephemeral runner completions so ARC does not create new runners until EffectiveTime is updated.", "lastSyncTime", metav1.Time{Time: *lastSyncTime}, "effectiveTime", *effectiveTime, "desired", desired, "available", current, "ready", ready)
|
||||
} else if current > desired {
|
||||
// If you use ephemeral runners with webhook-based autoscaler and the runner controller is working normally,
|
||||
// you're unlikely to fall into this branch.
|
||||
//
|
||||
// That's becaseu all the stakeholders work like this:
|
||||
//
|
||||
// 1. A runner pod completes with the runner container exiting with code 0
|
||||
// 2. ARC runner controller detects the pod completion, marks the runner resource on k8s for deletion (=Runner.DeletionTimestamp becomes non-zero)
|
||||
// 3. GitHub triggers a corresponding workflow_job "complete" webhook event
|
||||
// 4. ARC github-webhook-server (webhook-based autoscaler) receives the webhook event updates HRA with removing the oldest capacity reservation
|
||||
// 5. ARC horizontalrunnerautoscaler updates RunnerDeployment's desired replicas based on capacity reservations
|
||||
// 6. ARC runnerdeployment controller updates RunnerReplicaSet's desired replicas
|
||||
// 7. (We're here) ARC runnerreplicaset controller (this controller) starts reconciling the RunnerReplicaSet
|
||||
//
|
||||
// In a normally working ARC installation, the runner that was used to run the workflow job should already have been
|
||||
// marked for deletion by the runner controller.
|
||||
// This runnerreplicaset controller doesn't count marked runners into the `current` value, hence you're unlikely to
|
||||
// fall into this branch when you're using ephemeral runners with webhook-based-autoscaler.
|
||||
|
||||
n := current - desired
|
||||
|
||||
log.V(0).Info(fmt.Sprintf("Deleting %d runners from RunnerReplicaSet %s", n, req.NamespacedName), "desired", desired, "current", current, "ready", ready)
|
||||
|
|
@ -282,6 +326,10 @@ func (r *RunnerReplicaSetReconciler) newRunner(rs v1alpha1.RunnerReplicaSet) (v1
|
|||
|
||||
objectMeta.GenerateName = rs.ObjectMeta.Name + "-"
|
||||
objectMeta.Namespace = rs.ObjectMeta.Namespace
|
||||
if objectMeta.Annotations == nil {
|
||||
objectMeta.Annotations = map[string]string{}
|
||||
}
|
||||
objectMeta.Annotations[SyncTimeAnnotationKey] = time.Now().Format(time.RFC3339)
|
||||
|
||||
runner := v1alpha1.Runner{
|
||||
TypeMeta: metav1.TypeMeta{},
|
||||
|
|
|
|||
|
|
@ -1,49 +0,0 @@
|
|||
package controllers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
)
|
||||
|
||||
// unregisterRunner unregisters the runner from GitHub Actions by name.
|
||||
//
|
||||
// This function returns:
|
||||
// - (true, nil) when it has successfully unregistered the runner.
|
||||
// - (false, nil) when the runner has been already unregistered.
|
||||
// - (false, err) when it postponed unregistration due to the runner being busy, or it tried to unregister the runner but failed due to
|
||||
// an error returned by GitHub API.
|
||||
func unregisterRunner(ctx context.Context, client *github.Client, enterprise, org, repo, name string) (bool, error) {
|
||||
runners, err := client.ListRunners(ctx, enterprise, org, repo)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
id := int64(0)
|
||||
for _, runner := range runners {
|
||||
if runner.GetName() == name {
|
||||
// Note that sometimes a runner can stuck "busy" even though it is already "offline".
|
||||
// But we assume that it's not actually offline and still running a job.
|
||||
if runner.GetBusy() {
|
||||
return false, fmt.Errorf("runner is busy")
|
||||
}
|
||||
id = runner.GetID()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if id == int64(0) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Trying to remove a busy runner can result in errors like the following:
|
||||
// failed to remove runner: DELETE https://api.github.com/repos/actions-runner-controller/mumoshu-actions-test/actions/runners/47: 422 Bad request - Runner \"example-runnerset-0\" is still running a job\" []
|
||||
//
|
||||
// TODO: Probably we can just remove the runner by ID without seeing if the runner is busy, by treating it as busy when a remove-runner call failed with 422?
|
||||
if err := client.RemoveRunner(ctx, enterprise, org, repo, id); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}
|
||||
|
|
@ -11,8 +11,11 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github/metrics"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/logging"
|
||||
"github.com/bradleyfalzon/ghinstallation"
|
||||
"github.com/go-logr/logr"
|
||||
"github.com/google/go-github/v39/github"
|
||||
"github.com/gregjones/httpcache"
|
||||
"golang.org/x/oauth2"
|
||||
)
|
||||
|
||||
|
|
@ -28,6 +31,8 @@ type Config struct {
|
|||
BasicauthUsername string `split_words:"true"`
|
||||
BasicauthPassword string `split_words:"true"`
|
||||
RunnerGitHubURL string `split_words:"true"`
|
||||
|
||||
Log *logr.Logger
|
||||
}
|
||||
|
||||
// Client wraps GitHub client with some additional
|
||||
|
|
@ -46,7 +51,6 @@ type BasicAuthTransport struct {
|
|||
|
||||
func (p BasicAuthTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||
req.SetBasicAuth(p.Username, p.Password)
|
||||
req.Header.Set("User-Agent", "actions-runner-controller")
|
||||
return http.DefaultTransport.RoundTrip(req)
|
||||
}
|
||||
|
||||
|
|
@ -82,8 +86,11 @@ func (c *Config) NewClient() (*Client, error) {
|
|||
transport = tr
|
||||
}
|
||||
|
||||
transport = metrics.Transport{Transport: transport}
|
||||
httpClient := &http.Client{Transport: transport}
|
||||
cached := httpcache.NewTransport(httpcache.NewMemoryCache())
|
||||
cached.Transport = transport
|
||||
loggingTransport := logging.Transport{Transport: cached, Log: c.Log}
|
||||
metricsTransport := metrics.Transport{Transport: loggingTransport}
|
||||
httpClient := &http.Client{Transport: metricsTransport}
|
||||
|
||||
var client *github.Client
|
||||
var githubBaseURL string
|
||||
|
|
@ -128,6 +135,8 @@ func (c *Config) NewClient() (*Client, error) {
|
|||
}
|
||||
}
|
||||
|
||||
client.UserAgent = "actions-runner-controller"
|
||||
|
||||
return &Client{
|
||||
Client: client,
|
||||
regTokens: map[string]*github.RegistrationToken{},
|
||||
|
|
|
|||
|
|
@ -152,3 +152,10 @@ func TestCleanup(t *testing.T) {
|
|||
t.Errorf("expired token still exists")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUserAgent(t *testing.T) {
|
||||
client := newTestClient()
|
||||
if client.UserAgent != "actions-runner-controller" {
|
||||
t.Errorf("UserAgent should be set to actions-runner-controller")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
1
go.mod
1
go.mod
|
|
@ -41,6 +41,7 @@ require (
|
|||
github.com/google/gofuzz v1.1.0 // indirect
|
||||
github.com/google/uuid v1.1.2 // indirect
|
||||
github.com/googleapis/gnostic v0.5.5 // indirect
|
||||
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 // indirect
|
||||
github.com/imdario/mergo v0.3.12 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect
|
||||
|
|
|
|||
2
go.sum
2
go.sum
|
|
@ -259,6 +259,8 @@ github.com/gorilla/mux v1.8.0 h1:i40aqfkR1h2SlN9hojwV5ZA91wcXFOvkdNIeFDP5koI=
|
|||
github.com/gorilla/mux v1.8.0/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
|
||||
github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
|
||||
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA=
|
||||
github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
|
||||
github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs=
|
||||
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y=
|
||||
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
|
||||
|
|
|
|||
|
|
@ -0,0 +1,54 @@
|
|||
package logging
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
zaplib "go.uber.org/zap"
|
||||
"go.uber.org/zap/zapcore"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
)
|
||||
|
||||
const (
|
||||
LogLevelDebug = "debug"
|
||||
LogLevelInfo = "info"
|
||||
LogLevelWarn = "warn"
|
||||
LogLevelError = "error"
|
||||
)
|
||||
|
||||
func NewLogger(logLevel string) logr.Logger {
|
||||
log := zap.New(func(o *zap.Options) {
|
||||
switch logLevel {
|
||||
case LogLevelDebug:
|
||||
o.Development = true
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.DebugLevel) // maps to logr's V(1)
|
||||
o.Level = &lvl
|
||||
case LogLevelInfo:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
|
||||
o.Level = &lvl
|
||||
case LogLevelWarn:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
|
||||
o.Level = &lvl
|
||||
case LogLevelError:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
|
||||
o.Level = &lvl
|
||||
default:
|
||||
// We use bitsize of 8 as zapcore.Level is a type alias to int8
|
||||
levelInt, err := strconv.ParseInt(logLevel, 10, 8)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to parse --log-level=%s: %v", logLevel, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// For example, --log-level=debug a.k.a --log-level=-1 maps to zaplib.DebugLevel, which is associated to logr's V(1)
|
||||
// --log-level=-2 maps the specific custom log level that is associated to logr's V(2).
|
||||
level := zapcore.Level(levelInt)
|
||||
atomicLevel := zaplib.NewAtomicLevelAt(level)
|
||||
o.Level = &atomicLevel
|
||||
}
|
||||
})
|
||||
|
||||
return log
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
// Package logging provides various logging helpers for ARC
|
||||
package logging
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
"github.com/gregjones/httpcache"
|
||||
)
|
||||
|
||||
const (
|
||||
// https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting
|
||||
headerRateLimitRemaining = "X-RateLimit-Remaining"
|
||||
)
|
||||
|
||||
// Transport wraps a transport with metrics monitoring
|
||||
type Transport struct {
|
||||
Transport http.RoundTripper
|
||||
|
||||
Log *logr.Logger
|
||||
}
|
||||
|
||||
func (t Transport) RoundTrip(req *http.Request) (*http.Response, error) {
|
||||
resp, err := t.Transport.RoundTrip(req)
|
||||
if resp != nil {
|
||||
t.log(req, resp)
|
||||
}
|
||||
return resp, err
|
||||
}
|
||||
|
||||
func (t Transport) log(req *http.Request, resp *http.Response) {
|
||||
if t.Log == nil {
|
||||
return
|
||||
}
|
||||
|
||||
var args []interface{}
|
||||
|
||||
marked := resp.Header.Get(httpcache.XFromCache) == "1"
|
||||
|
||||
args = append(args, "from_cache", marked, "method", req.Method, "url", req.URL.String())
|
||||
|
||||
if !marked {
|
||||
// Do not log outdated rate limit remaining value
|
||||
|
||||
remaining := resp.Header.Get(headerRateLimitRemaining)
|
||||
|
||||
args = append(args, "ratelimit_remaining", remaining)
|
||||
}
|
||||
|
||||
t.Log.V(3).Info("Seen HTTP response", args...)
|
||||
}
|
||||
28
main.go
28
main.go
|
|
@ -26,24 +26,18 @@ import (
|
|||
actionsv1alpha1 "github.com/actions-runner-controller/actions-runner-controller/api/v1alpha1"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/controllers"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/github"
|
||||
"github.com/actions-runner-controller/actions-runner-controller/logging"
|
||||
"github.com/kelseyhightower/envconfig"
|
||||
zaplib "go.uber.org/zap"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
|
||||
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
|
||||
ctrl "sigs.k8s.io/controller-runtime"
|
||||
"sigs.k8s.io/controller-runtime/pkg/log/zap"
|
||||
// +kubebuilder:scaffold:imports
|
||||
)
|
||||
|
||||
const (
|
||||
defaultRunnerImage = "summerwind/actions-runner:latest"
|
||||
defaultDockerImage = "docker:dind"
|
||||
|
||||
logLevelDebug = "debug"
|
||||
logLevelInfo = "info"
|
||||
logLevelWarn = "warn"
|
||||
logLevelError = "error"
|
||||
)
|
||||
|
||||
var (
|
||||
|
|
@ -120,24 +114,12 @@ func main() {
|
|||
flag.DurationVar(&syncPeriod, "sync-period", 10*time.Minute, "Determines the minimum frequency at which K8s resources managed by this controller are reconciled. When you use autoscaling, set to a lower value like 10 minute, because this corresponds to the minimum time to react on demand change. . If you're tweaking this in order to make autoscaling more responsive, you'll probably want to tweak github-api-cache-duration, too")
|
||||
flag.Var(&commonRunnerLabels, "common-runner-labels", "Runner labels in the K1=V1,K2=V2,... format that are inherited all the runners created by the controller. See https://github.com/actions-runner-controller/actions-runner-controller/issues/321 for more information")
|
||||
flag.StringVar(&namespace, "watch-namespace", "", "The namespace to watch for custom resources. Set to empty for letting it watch for all namespaces.")
|
||||
flag.StringVar(&logLevel, "log-level", logLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
|
||||
flag.StringVar(&logLevel, "log-level", logging.LogLevelDebug, `The verbosity of the logging. Valid values are "debug", "info", "warn", "error". Defaults to "debug".`)
|
||||
flag.Parse()
|
||||
|
||||
logger := zap.New(func(o *zap.Options) {
|
||||
switch logLevel {
|
||||
case logLevelDebug:
|
||||
o.Development = true
|
||||
case logLevelInfo:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.InfoLevel)
|
||||
o.Level = &lvl
|
||||
case logLevelWarn:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.WarnLevel)
|
||||
o.Level = &lvl
|
||||
case logLevelError:
|
||||
lvl := zaplib.NewAtomicLevelAt(zaplib.ErrorLevel)
|
||||
o.Level = &lvl
|
||||
}
|
||||
})
|
||||
logger := logging.NewLogger(logLevel)
|
||||
|
||||
c.Log = &logger
|
||||
|
||||
ghClient, err = c.NewClient()
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -17,12 +17,15 @@ var (
|
|||
controllerImageTag = "e2e"
|
||||
controllerImage = testing.Img(controllerImageRepo, controllerImageTag)
|
||||
runnerImageRepo = "actionsrunnercontrollere2e/actions-runner"
|
||||
runnerDindImageRepo = "actionsrunnercontrollere2e/actions-runner-dind"
|
||||
runnerImageTag = "e2e"
|
||||
runnerImage = testing.Img(runnerImageRepo, runnerImageTag)
|
||||
runnerDindImage = testing.Img(runnerDindImageRepo, runnerImageTag)
|
||||
|
||||
prebuildImages = []testing.ContainerImage{
|
||||
controllerImage,
|
||||
runnerImage,
|
||||
runnerDindImage,
|
||||
}
|
||||
|
||||
builds = []testing.DockerBuild{
|
||||
|
|
@ -36,6 +39,11 @@ var (
|
|||
Args: []testing.BuildArg{},
|
||||
Image: runnerImage,
|
||||
},
|
||||
{
|
||||
Dockerfile: "../../runner/Dockerfile.dindrunner",
|
||||
Args: []testing.BuildArg{},
|
||||
Image: runnerDindImage,
|
||||
},
|
||||
}
|
||||
|
||||
certManagerVersion = "v1.1.1"
|
||||
|
|
@ -52,7 +60,6 @@ var (
|
|||
"SYNC_PERIOD=" + "10s",
|
||||
"NAME=" + controllerImageRepo,
|
||||
"VERSION=" + controllerImageTag,
|
||||
"RUNNER_NAME=" + runnerImageRepo,
|
||||
"RUNNER_TAG=" + runnerImageTag,
|
||||
}
|
||||
|
||||
|
|
@ -167,11 +174,15 @@ type env struct {
|
|||
useRunnerSet bool
|
||||
|
||||
testID string
|
||||
testName string
|
||||
repoToCommit string
|
||||
runnerLabel, githubToken, testRepo, testOrg, testOrgRepo string
|
||||
githubTokenWebhook string
|
||||
testEnterprise string
|
||||
featureFlagEphemeral bool
|
||||
scaleDownDelaySecondsAfterScaleOut int64
|
||||
minReplicas int64
|
||||
dockerdWithinRunnerContainer bool
|
||||
testJobs []job
|
||||
}
|
||||
|
||||
|
|
@ -184,11 +195,12 @@ func initTestEnv(t *testing.T) *env {
|
|||
|
||||
id := e.ID()
|
||||
|
||||
testID := t.Name() + " " + id
|
||||
testName := t.Name() + " " + id
|
||||
|
||||
t.Logf("Using test id %s", testID)
|
||||
t.Logf("Initializing test with name %s", testName)
|
||||
|
||||
e.testID = testID
|
||||
e.testID = id
|
||||
e.testName = testName
|
||||
e.runnerLabel = "test-" + id
|
||||
e.githubToken = testing.Getenv(t, "GITHUB_TOKEN")
|
||||
e.githubTokenWebhook = testing.Getenv(t, "WEBHOOK_GITHUB_TOKEN")
|
||||
|
|
@ -197,9 +209,17 @@ func initTestEnv(t *testing.T) *env {
|
|||
e.testOrg = testing.Getenv(t, "TEST_ORG", "")
|
||||
e.testOrgRepo = testing.Getenv(t, "TEST_ORG_REPO", "")
|
||||
e.testEnterprise = testing.Getenv(t, "TEST_ENTERPRISE")
|
||||
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 10)
|
||||
e.testJobs = createTestJobs(id, testResultCMNamePrefix, 100)
|
||||
ephemeral, _ := strconv.ParseBool(testing.Getenv(t, "TEST_FEATURE_FLAG_EPHEMERAL"))
|
||||
e.featureFlagEphemeral = ephemeral
|
||||
e.scaleDownDelaySecondsAfterScaleOut, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT", "10"), 10, 32)
|
||||
e.minReplicas, _ = strconv.ParseInt(testing.Getenv(t, "TEST_RUNNER_MIN_REPLICAS", "1"), 10, 32)
|
||||
|
||||
var err error
|
||||
e.dockerdWithinRunnerContainer, err = strconv.ParseBool(testing.Getenv(t, "TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER", "false"))
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("unable to parse bool from TEST_RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER: %v", err))
|
||||
}
|
||||
|
||||
return e
|
||||
}
|
||||
|
|
@ -254,7 +274,22 @@ func (e *env) installActionsRunnerController(t *testing.T) {
|
|||
"GITHUB_TOKEN=" + e.githubToken,
|
||||
"WEBHOOK_GITHUB_TOKEN=" + e.githubTokenWebhook,
|
||||
"RUNNER_LABEL=" + e.runnerLabel,
|
||||
"TEST_ID=" + e.testID,
|
||||
fmt.Sprintf("RUNNER_FEATURE_FLAG_EPHEMERAL=%v", e.featureFlagEphemeral),
|
||||
fmt.Sprintf("RUNNER_SCALE_DOWN_DELAY_SECONDS_AFTER_SCALE_OUT=%d", e.scaleDownDelaySecondsAfterScaleOut),
|
||||
fmt.Sprintf("ORG_RUNNER_MIN_REPLICAS=%d", e.minReplicas),
|
||||
}
|
||||
|
||||
if e.dockerdWithinRunnerContainer {
|
||||
varEnv = append(varEnv,
|
||||
"RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=true",
|
||||
"RUNNER_NAME="+runnerDindImageRepo,
|
||||
)
|
||||
} else {
|
||||
varEnv = append(varEnv,
|
||||
"RUNNER_DOCKERD_WITHIN_RUNNER_CONTAINER=false",
|
||||
"RUNNER_NAME="+runnerImageRepo,
|
||||
)
|
||||
}
|
||||
|
||||
scriptEnv = append(scriptEnv, varEnv...)
|
||||
|
|
@ -273,7 +308,7 @@ func (e *env) createControllerNamespaceAndServiceAccount(t *testing.T) {
|
|||
func (e *env) installActionsWorkflow(t *testing.T) {
|
||||
t.Helper()
|
||||
|
||||
installActionsWorkflow(t, e.testID, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs)
|
||||
installActionsWorkflow(t, e.testName, e.runnerLabel, testResultCMNamePrefix, e.repoToCommit, e.testJobs)
|
||||
}
|
||||
|
||||
func (e *env) verifyActionsWorkflowRun(t *testing.T) {
|
||||
|
|
@ -302,13 +337,13 @@ func createTestJobs(id, testResultCMNamePrefix string, numJobs int) []job {
|
|||
|
||||
const Branch = "main"
|
||||
|
||||
func installActionsWorkflow(t *testing.T, testID, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) {
|
||||
func installActionsWorkflow(t *testing.T, testName, runnerLabel, testResultCMNamePrefix, testRepo string, testJobs []job) {
|
||||
t.Helper()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
wfName := "E2E " + testID
|
||||
wfName := "E2E " + testName
|
||||
wf := testing.Workflow{
|
||||
Name: wfName,
|
||||
On: testing.On{
|
||||
|
|
|
|||
Loading…
Reference in New Issue