Merge branch 'master' into feature/build-logs-stdout

This commit is contained in:
robert lestak 2026-02-11 09:25:57 -08:00 committed by GitHub
commit a2c410311c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 341 additions and 40 deletions

View File

@ -16,7 +16,7 @@ env:
TARGET_ORG: actions-runner-controller
TARGET_REPO: arc_e2e_test_dummy
IMAGE_NAME: "arc-test-image"
IMAGE_VERSION: "0.13.0"
IMAGE_VERSION: "0.13.1"
concurrency:
# This will make sure we only apply the concurrency limits on pull requests

View File

@ -6,7 +6,7 @@ endif
DOCKER_USER ?= $(shell echo ${DOCKER_IMAGE_NAME} | cut -d / -f1)
VERSION ?= dev
COMMIT_SHA = $(shell git rev-parse HEAD)
RUNNER_VERSION ?= 2.330.0
RUNNER_VERSION ?= 2.331.0
TARGETPLATFORM ?= $(shell arch)
RUNNER_NAME ?= ${DOCKER_USER}/actions-runner
RUNNER_TAG ?= ${VERSION}

View File

@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.13.0
version: 0.13.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.13.0"
appVersion: "0.13.1"
home: https://github.com/actions/actions-runner-controller

View File

@ -15,13 +15,13 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.13.0
version: 0.13.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.13.0"
appVersion: "0.13.1"
home: https://github.com/actions/actions-runner-controller

View File

@ -313,37 +313,38 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
cs := runnerContainerStatus(pod)
switch {
case pod.Status.Phase == corev1.PodFailed: // All containers are stopped
switch {
case pod.Status.Reason == "Evicted":
log.Info("Pod evicted; Deleting ephemeral runner or pod",
"podPhase", pod.Status.Phase,
"podReason", pod.Status.Reason,
"podMessage", pod.Status.Message,
)
log.Info("Pod is in failed phase, inspecting runner container status",
"podReason", pod.Status.Reason,
"podMessage", pod.Status.Message,
"podConditions", pod.Status.Conditions,
)
// If the runner pod did not have a chance to start, the terminated state may not be set.
// Therefore, we should try to restart it.
if cs == nil || cs.State.Terminated == nil {
log.Info("Runner container does not have state set, deleting pod as failed so it can be restarted")
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
}
case strings.HasPrefix(pod.Status.Reason, "OutOf"): // most likely a transient issue.
log.Info("Pod failed with reason starting with OutOf. Deleting ephemeral runner or pod",
"podPhase", pod.Status.Phase,
"podReason", pod.Status.Reason,
"podMessage", pod.Status.Message,
)
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
default:
log.Info("Pod is in failed phase; updating ephemeral runner status",
"podPhase", pod.Status.Phase,
"podReason", pod.Status.Reason,
"podMessage", pod.Status.Message,
)
if err := r.updateRunStatusFromPod(ctx, ephemeralRunner, pod, log); err != nil {
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
if cs.State.Terminated.ExitCode == 0 {
log.Info("Runner container has succeeded but pod is in failed phase; Assume successful exit")
// If the pod is in a failed state, that means that at least one container exited with non-zero exit code.
// If the runner container exits with 0, we assume that the runner has finished successfully.
// If a side-car container exits with a non-zero code, it shouldn't affect the runner. The runner's exit code
// drives the controller's inference of whether the job has succeeded or failed.
if err := r.Delete(ctx, ephemeralRunner); err != nil {
log.Error(err, "Failed to delete ephemeral runner after successful completion")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
log.Error(
errors.New("ephemeral runner container has failed, with runner container exit code non-zero"),
"Ephemeral runner container has failed, and runner container termination exit code is non-zero",
"containerTerminatedState", cs.State.Terminated,
)
return ctrl.Result{}, r.deleteEphemeralRunnerOrPod(ctx, ephemeralRunner, pod, log)
case cs == nil:
// starting, no container state yet
log.Info("Waiting for runner container status to be available")
@ -397,6 +398,7 @@ func (r *EphemeralRunnerReconciler) deleteEphemeralRunnerOrPod(ctx context.Conte
log.Info("Removed the runner from the service")
return nil
}
if err := r.deletePodAsFailed(ctx, ephemeralRunner, pod, log); err != nil {
log.Error(err, "Failed to delete runner pod on failure")
return err
@ -570,6 +572,8 @@ func (r *EphemeralRunnerReconciler) markAsFailed(ctx context.Context, ephemeralR
// deletePodAsFailed is responsible for deleting the pod and updating the .Status.Failures for tracking failure count.
// It should not be responsible for setting the status to Failed.
//
// It should be called by deleteEphemeralRunnerOrPod which is responsible for deciding whether to delete the EphemeralRunner or just the Pod.
func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, pod *corev1.Pod, log logr.Logger) error {
if pod.DeletionTimestamp.IsZero() {
log.Info("Deleting the ephemeral runner pod", "podId", pod.UID)

View File

@ -261,6 +261,238 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})
// Scenario: the pod is moved to the Failed phase with no container statuses
// after a job ID has been written to the runner status. The test passes once
// a Get on the EphemeralRunner reports NotFound.
It("It should delete ephemeral runner when pod failed before runner state is recorded and job assigned", func() {
	key := client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}

	// Load the runner and stamp a job ID onto its status.
	runner := &v1alpha1.EphemeralRunner{}
	Eventually(func() error {
		return k8sClient.Get(ctx, key, runner)
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")

	runner.Status.JobID = "1"
	Expect(k8sClient.Status().Update(ctx, runner)).To(BeNil(), "failed to update ephemeral runner status")

	// Wait until the job ID can be read back before touching the pod.
	readJobID := func() (string, error) {
		latest := &v1alpha1.EphemeralRunner{}
		if err := k8sClient.Get(ctx, key, latest); err != nil {
			return "", err
		}
		return latest.Status.JobID, nil
	}
	Eventually(readJobID, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo("1"))

	// The pod shares the runner's name and namespace.
	runnerPod := &corev1.Pod{}
	Eventually(func() (bool, error) {
		err := k8sClient.Get(ctx, key, runnerPod)
		return err == nil, err
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))

	// Fail the pod without recording any container state.
	runnerPod.Status.Phase = corev1.PodFailed
	runnerPod.Status.ContainerStatuses = nil
	Expect(k8sClient.Status().Update(ctx, runnerPod)).To(BeNil(), "Failed to update pod status")

	runnerGone := func() bool {
		probe := &v1alpha1.EphemeralRunner{}
		return kerrors.IsNotFound(k8sClient.Get(ctx, key, probe))
	}
	Eventually(runnerGone, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})
// Scenario: the pod is moved to the Failed phase with no container statuses
// and this test sets no job ID. The test passes once the runner status holds
// exactly one failure entry and a pod with a different UID exists under the
// same name.
It("It should delete ephemeral runner when pod failed before runner state is recorded and job not assigned", func() {
	key := client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}

	original := &corev1.Pod{}
	Eventually(func() (bool, error) {
		err := k8sClient.Get(ctx, key, original)
		return err == nil, err
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(true))

	// Remember the UID so a re-created pod can be told apart from this one.
	previousUID := original.UID

	original.Status.Phase = corev1.PodFailed
	original.Status.ContainerStatuses = nil
	Expect(k8sClient.Status().Update(ctx, original)).To(BeNil(), "Failed to update pod status")

	failureCount := func() (int, error) {
		current := &v1alpha1.EphemeralRunner{}
		if err := k8sClient.Get(ctx, key, current); err != nil {
			return 0, err
		}
		return len(current.Status.Failures), nil
	}
	Eventually(failureCount, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(1))

	podReplaced := func() (bool, error) {
		replacement := &corev1.Pod{}
		if err := k8sClient.Get(ctx, key, replacement); err != nil {
			return false, err
		}
		return replacement.UID != previousUID, nil
	}
	Eventually(podReplaced, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Pod should be re-created")
})
// Scenario: the pod is in the Failed phase but the runner container reports a
// terminated state with exit code 0, and a job ID has been written to the
// runner status. The test passes once a Get on the EphemeralRunner reports
// NotFound.
It("It should treat pod failed with runner container exit 0 as success with job id", func() {
	key := client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}

	runner := &v1alpha1.EphemeralRunner{}
	Eventually(func() error {
		return k8sClient.Get(ctx, key, runner)
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")

	runner.Status.JobID = "1"
	Expect(k8sClient.Status().Update(ctx, runner)).To(BeNil(), "failed to update ephemeral runner status")

	runnerPod := &corev1.Pod{}
	Eventually(func() error {
		return k8sClient.Get(ctx, key, runnerPod)
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")

	// Runner container terminated cleanly even though the pod phase is Failed.
	cleanExit := corev1.ContainerStatus{
		Name: v1alpha1.EphemeralRunnerContainerName,
		State: corev1.ContainerState{
			Terminated: &corev1.ContainerStateTerminated{ExitCode: 0},
		},
	}
	runnerPod.Status.Phase = corev1.PodFailed
	runnerPod.Status.ContainerStatuses = append(runnerPod.Status.ContainerStatuses, cleanExit)
	Expect(k8sClient.Status().Update(ctx, runnerPod)).To(BeNil(), "Failed to update pod status")

	runnerGone := func() bool {
		probe := &v1alpha1.EphemeralRunner{}
		return kerrors.IsNotFound(k8sClient.Get(ctx, key, probe))
	}
	Eventually(runnerGone, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})
// Scenario: same as the exit-code-0 case, but this test never sets a job ID
// on the runner. The test passes once a Get on the EphemeralRunner reports
// NotFound.
It("It should treat pod failed with runner container exit 0 as success with no job id", func() {
	key := client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}

	runnerPod := &corev1.Pod{}
	Eventually(func() error {
		return k8sClient.Get(ctx, key, runnerPod)
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")

	// Runner container terminated cleanly even though the pod phase is Failed.
	cleanExit := corev1.ContainerStatus{
		Name: v1alpha1.EphemeralRunnerContainerName,
		State: corev1.ContainerState{
			Terminated: &corev1.ContainerStateTerminated{ExitCode: 0},
		},
	}
	runnerPod.Status.Phase = corev1.PodFailed
	runnerPod.Status.ContainerStatuses = append(runnerPod.Status.ContainerStatuses, cleanExit)
	Expect(k8sClient.Status().Update(ctx, runnerPod)).To(BeNil(), "Failed to update pod status")

	runnerGone := func() bool {
		probe := &v1alpha1.EphemeralRunner{}
		return kerrors.IsNotFound(k8sClient.Get(ctx, key, probe))
	}
	Eventually(runnerGone, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Ephemeral runner should eventually be deleted")
})
// Scenario: the pod is in the Failed phase and the runner container reports a
// terminated state with exit code 1; this test sets no job ID. The test passes
// once the runner status holds exactly one failure entry and a pod with a
// different UID exists under the same name.
It("It should mark as failed when job is not assigned and pod is failed", func() {
	key := client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}

	// Only used to wait until the runner can be fetched.
	runner := &v1alpha1.EphemeralRunner{}
	Eventually(func() error {
		return k8sClient.Get(ctx, key, runner)
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get ephemeral runner")

	original := &corev1.Pod{}
	Eventually(func() error {
		return k8sClient.Get(ctx, key, original)
	}, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(Succeed(), "failed to get pod")

	// Remember the UID so a re-created pod can be told apart from this one.
	previousUID := original.UID

	failedExit := corev1.ContainerStatus{
		Name: v1alpha1.EphemeralRunnerContainerName,
		State: corev1.ContainerState{
			Terminated: &corev1.ContainerStateTerminated{ExitCode: 1},
		},
	}
	original.Status.Phase = corev1.PodFailed
	original.Status.ContainerStatuses = append(original.Status.ContainerStatuses, failedExit)
	Expect(k8sClient.Status().Update(ctx, original)).To(BeNil(), "Failed to update pod status")

	failureCount := func() (int, error) {
		current := &v1alpha1.EphemeralRunner{}
		if err := k8sClient.Get(ctx, key, current); err != nil {
			return 0, err
		}
		return len(current.Status.Failures), nil
	}
	Eventually(failureCount, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeEquivalentTo(1))

	podReplaced := func() (bool, error) {
		replacement := &corev1.Pod{}
		if err := k8sClient.Get(ctx, key, replacement); err != nil {
			return false, err
		}
		return replacement.UID != previousUID, nil
	}
	Eventually(podReplaced, ephemeralRunnerTimeout, ephemeralRunnerInterval).Should(BeTrue(), "Pod should be re-created")
})
It("It should failed if a pod template is invalid", func() {
invalideEphemeralRunner := newExampleRunner("invalid-ephemeral-runner", autoscalingNS.Name, configSecret.Name)
invalideEphemeralRunner.Spec.Spec.PriorityClassName = "notexist"

View File

@ -2,7 +2,7 @@ package actionssummerwindnet
import (
"context"
"crypto/sha1"
"crypto/sha256"
"encoding/hex"
"fmt"
"sort"
@ -176,7 +176,7 @@ func (c *MultiGitHubClient) initClientForSecret(secret *corev1.Secret, dependent
sort.SliceStable(ks, func(i, j int) bool { return ks[i] < ks[j] })
hash := sha1.New()
hash := sha256.New()
for _, k := range ks {
hash.Write(secret.Data[k])
}

View File

@ -43,6 +43,33 @@ You can follow [this troubleshooting guide](https://docs.github.com/en/actions/h
## Changelog
### 0.13.1
1. Make restart pod more flexible to different failure scenarios [#4340](https://github.com/actions/actions-runner-controller/pull/4340)
1. Bump golangci/golangci-lint-action from 9.1.0 to 9.2.0 in the actions group [#4335](https://github.com/actions/actions-runner-controller/pull/4335)
1. Bump the gomod group across 1 directory with 10 updates [#4338](https://github.com/actions/actions-runner-controller/pull/4338)
1. Re-schedule if the failed reason starts with OutOf [#4336](https://github.com/actions/actions-runner-controller/pull/4336)
1. Restart the listener if pod is evicted [#4332](https://github.com/actions/actions-runner-controller/pull/4332)
1. Typo in test name caused test to not execute [#4330](https://github.com/actions/actions-runner-controller/pull/4330)
1. Bump the actions group with 3 updates [#4328](https://github.com/actions/actions-runner-controller/pull/4328)
1. Remove old e2e tests [#4325](https://github.com/actions/actions-runner-controller/pull/4325)
1. Bump the actions group across 1 directory with 4 updates [#4309](https://github.com/actions/actions-runner-controller/pull/4309)
1. Bump golang.org/x/crypto from 0.43.0 to 0.45.0 [#4318](https://github.com/actions/actions-runner-controller/pull/4318)
1. Add support for giving kubernetes mode scaleset service account additional permissions [#4282](https://github.com/actions/actions-runner-controller/pull/4282)
1. Bump the gomod group across 1 directory with 11 updates [#4317](https://github.com/actions/actions-runner-controller/pull/4317)
1. Code style changes on the controller [#4324](https://github.com/actions/actions-runner-controller/pull/4324)
1. Add ephemeral runner finalizer during creation and check finalizer without requeue [#4320](https://github.com/actions/actions-runner-controller/pull/4320)
1. e2e: move from deprecated openebs charts to new registry [#4321](https://github.com/actions/actions-runner-controller/pull/4321)
1. Create e2e test suite [#3136](https://github.com/actions/actions-runner-controller/pull/3136)
1. Handle resource quota on status forbidden by retrying [#4305](https://github.com/actions/actions-runner-controller/pull/4305)
1. Use combination of namespace, GitHub URL, and runner group when hashing the listener name [#4299](https://github.com/actions/actions-runner-controller/pull/4299)
1. Bump kubebuilder tools in the workflow [#4300](https://github.com/actions/actions-runner-controller/pull/4300)
1. Bump timeout for min runners workflow to 30s [#4306](https://github.com/actions/actions-runner-controller/pull/4306)
1. Fix for code scanning alert no. 5: Workflow does not contain permissions [#4292](https://github.com/actions/actions-runner-controller/pull/4292)
1. Delete listener resources without requeueing on each call [#4289](https://github.com/actions/actions-runner-controller/pull/4289)
1. Fix first interaction action [#4290](https://github.com/actions/actions-runner-controller/pull/4290)
1. Bump github/codeql-action from 3 to 4 in the actions group [#4281](https://github.com/actions/actions-runner-controller/pull/4281)
### 0.13.0
1. Remove workflow actions version comments since upgrades are done via dependabot [#4161](https://github.com/actions/actions-runner-controller/pull/4161)
@ -58,7 +85,6 @@ You can follow [this troubleshooting guide](https://docs.github.com/en/actions/h
1. Bump the gomod group across 1 directory with 4 updates [#4277](https://github.com/actions/actions-runner-controller/pull/4277)
1. Bump all dependencies [#4266](https://github.com/actions/actions-runner-controller/pull/4266)
### 0.12.1
1. Fix indentation of startupProbe attributes in dind sidecar [#4126](https://github.com/actions/actions-runner-controller/pull/4126)

View File

@ -274,6 +274,10 @@ func (c *Client) Identifier() string {
func (c *Client) Do(req *http.Request) (*http.Response, error) {
resp, err := c.Client.Do(req)
if err != nil {
// If we have a response even with an error, include the status code
if resp != nil {
return nil, fmt.Errorf("client request failed with status code %d: %w", resp.StatusCode, err)
}
return nil, fmt.Errorf("client request failed: %w", err)
}
@ -856,7 +860,8 @@ func (c *Client) GenerateJitRunnerConfig(ctx context.Context, jitRunnerSetting *
resp, err := c.Do(req)
if err != nil {
return nil, fmt.Errorf("failed to issue the request: %w", err)
// Include the URL and method in the error for better debugging
return nil, fmt.Errorf("failed to issue the request %s %s: %w", req.Method, req.URL.String(), err)
}
if resp.StatusCode != http.StatusOK {

View File

@ -2,6 +2,7 @@ package actions_test
import (
"context"
"errors"
"net/http"
"testing"
"time"
@ -58,4 +59,37 @@ func TestGenerateJitRunnerConfig(t *testing.T) {
assert.NotNil(t, err)
assert.Equalf(t, actualRetry, expectedRetry, "A retry was expected after the first request but got: %v", actualRetry)
})
// Subtest: a server that always answers 500, with retries set to zero, must
// produce an error whose text names the HTTP method and the request path.
t.Run("Error includes HTTP method and URL when request fails", func(t *testing.T) {
	alwaysFail := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	})
	server := newActionsServer(t, alwaysFail)

	client, err := actions.NewClient(
		server.configURLForOrg("my-org"),
		auth,
		actions.WithRetryMax(0), // zero retries
		actions.WithRetryWaitMax(1*time.Millisecond),
	)
	require.NoError(t, err)

	settings := &actions.RunnerScaleSetJitRunnerSetting{}
	_, err = client.GenerateJitRunnerConfig(ctx, settings, 1)
	require.NotNil(t, err)

	// The message must carry enough request detail to debug the failure.
	msg := err.Error()
	assert.Contains(t, msg, "POST", "Error message should include HTTP method")
	assert.Contains(t, msg, "generatejitconfig", "Error message should include URL path")

	// The failure may surface as an *actions.ActionsError or as a plain
	// wrapped error; the status code is only checked in the former case.
	var actionsErr *actions.ActionsError
	if !errors.As(err, &actionsErr) {
		return
	}
	assert.Equal(t, http.StatusInternalServerError, actionsErr.StatusCode)
})
}

View File

@ -36,7 +36,7 @@ type ActionsError struct {
}
// Error implements the error interface. The message contains the status code,
// the quoted activity ID and the wrapped error, in that order.
func (e *ActionsError) Error() string {
	return fmt.Sprintf("actions error: StatusCode %d, ActivityId %q: %v", e.StatusCode, e.ActivityID, e.Err)
}
func (e *ActionsError) Unwrap() error {
@ -112,7 +112,7 @@ type MessageQueueTokenExpiredError struct {
}
// Error implements the error interface. The message contains the quoted
// activity ID, the status code and the error message, in that order.
func (e *MessageQueueTokenExpiredError) Error() string {
	return fmt.Sprintf("MessageQueueTokenExpiredError: ActivityId %q, StatusCode %d: %s", e.activityID, e.statusCode, e.msg)
}
type HttpClientSideError struct {

View File

@ -22,7 +22,7 @@ func TestActionsError(t *testing.T) {
s := err.Error()
assert.Contains(t, s, "StatusCode 404")
assert.Contains(t, s, "AcivityId \"activity-id\"")
assert.Contains(t, s, "ActivityId \"activity-id\"")
assert.Contains(t, s, "example error description")
})

View File

@ -6,7 +6,7 @@ DIND_ROOTLESS_RUNNER_NAME ?= ${DOCKER_USER}/actions-runner-dind-rootless
OS_IMAGE ?= ubuntu-22.04
TARGETPLATFORM ?= $(shell arch)
RUNNER_VERSION ?= 2.330.0
RUNNER_VERSION ?= 2.331.0
RUNNER_CONTAINER_HOOKS_VERSION ?= 0.8.0
DOCKER_VERSION ?= 28.0.4

View File

@ -1,2 +1,2 @@
RUNNER_VERSION=2.330.0
RUNNER_VERSION=2.331.0
RUNNER_CONTAINER_HOOKS_VERSION=0.8.0

View File

@ -36,7 +36,7 @@ var (
testResultCMNamePrefix = "test-result-"
RunnerVersion = "2.330.0"
RunnerVersion = "2.331.0"
RunnerContainerHooksVersion = "0.8.0"
)