Skip to content

fix: pod with a restart policy of Never or OnFailure stuck at 'Progressing' (#15317) #709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ import (
"github.com/argoproj/gitops-engine/pkg/utils/kube"
)

const (
AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy"
)

func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
gvk := obj.GroupVersionKind()
switch gvk {
Expand Down Expand Up @@ -93,9 +97,9 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
}

return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil

case corev1.PodRunning:
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) {
// if pod is ready, it is automatically healthy
if podutils.IsPodReady(pod) {
return &HealthStatus{
Expand All @@ -117,10 +121,18 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}, nil
case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
// pods set with a restart policy of OnFailure or Never, have a finite life.
}
policy := pod.Spec.RestartPolicy
if _, ok := pod.Annotations[AnnotationIgnoreRestartPolicy]; ok || policy == corev1.RestartPolicyAlways {
return getHealthStatus(pod)
}

if policy == corev1.RestartPolicyOnFailure || policy == corev1.RestartPolicyNever {
// Most pods set with a restart policy of OnFailure or Never, have a finite life.
// These pods are typically resource hooks. Thus, we consider these as Progressing
// instead of healthy.
// instead of healthy. If this is unwanted, e.g., when the pod is managed by an
// operator and therefore has a restart policy of OnFailure or Never, then use the
// the AnnotationIgnoreRestartPolicy annotation.
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
Expand Down
1 change: 1 addition & 0 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-error.yaml", HealthStatusDegraded)
assertAppHealth(t, "./testdata/pod-running-restart-always.yaml", HealthStatusHealthy)
assertAppHealth(t, "./testdata/pod-running-restart-never.yaml", HealthStatusProgressing)
assertAppHealth(t, "./testdata/pod-running-restart-never-with-ignore-annotation.yaml", HealthStatusHealthy)
assertAppHealth(t, "./testdata/pod-running-restart-onfailure.yaml", HealthStatusProgressing)
assertAppHealth(t, "./testdata/pod-failed.yaml", HealthStatusDegraded)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", HealthStatusHealthy)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
apiVersion: v1
kind: Pod
metadata:
creationTimestamp: 2018-12-02T09:15:16Z
name: my-pod
namespace: argocd
resourceVersion: "151053"
selfLink: /api/v1/namespaces/argocd/pods/my-pod
uid: c86e909c-f612-11e8-a057-fe5f49266390
annotations:
argocd.argoproj.io/ignore-restart-policy: "true"
spec:
containers:
- command:
- sh
- -c
- sleep 10
image: alpine:3.21
imagePullPolicy: Always
name: main
resources:
requests:
ephemeral-storage: "100Mi"
memory: "128Mi"
cpu: "250m"
limits:
memory: "256Mi"
cpu: "500m"
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: default-token-f9jvj
readOnly: true
dnsPolicy: ClusterFirst
nodeName: minikube
restartPolicy: Never
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
automountServiceAccountToken: false
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- name: default-token-f9jvj
secret:
defaultMode: 420
secretName: default-token-f9jvj
status:
conditions:
- lastProbeTime: null
lastTransitionTime: 2018-12-02T09:15:16Z
status: "True"
type: Initialized
- lastProbeTime: null
lastTransitionTime: 2018-12-02T09:15:19Z
status: "True"
type: Ready
- lastProbeTime: null
lastTransitionTime: 2018-12-02T09:15:16Z
status: "True"
type: PodScheduled
containerStatuses:
- containerID: containerd://adc73c2c0ae3f1fd9bf294abd834e740042ee375de680c0cfcdd90d863a73b8b
image: alpine:3.21
imageID: docker.io/library/alpine@sha256:a8560b36e8b8210634f77d9f7f9efd7ffa463e380b75e2e74aff4511df3ef88c
lastState: {}
name: main
ready: true
restartCount: 0
state:
running:
startedAt: 2018-12-02T09:15:19Z
hostIP: 192.168.64.41
phase: Running
podIP: 172.17.0.9
qosClass: BestEffort
startTime: 2018-12-02T09:15:16Z
Loading