fix: stuck at 'Progressing' #15317

RoelofKuijpers · RoelofKuijpers · commit c6f17b796996 · 2025-04-09T10:00:15.000+02:00
Signed-off-by: Roelof Kuijpers <roelof.kuijpers@energyessentials.nl>
diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go
@@ -12,6 +12,10 @@ import (
 	"github.com/argoproj/gitops-engine/pkg/utils/kube"
 )
 
+const (
+	AnnotationIgnoreRestartPolicy = "argocd.argoproj.io/ignore-restart-policy"
+)
+
 func getPodHealth(obj *unstructured.Unstructured) (*HealthStatus, error) {
 	gvk := obj.GroupVersionKind()
 	switch gvk {
@@ -93,9 +97,9 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 		}
 
 		return &HealthStatus{Status: HealthStatusDegraded, Message: ""}, nil
+
 	case corev1.PodRunning:
-		switch pod.Spec.RestartPolicy {
-		case corev1.RestartPolicyAlways:
+		getHealthStatus := func(pod *corev1.Pod) (*HealthStatus, error) {
 			// if pod is ready, it is automatically healthy
 			if podutils.IsPodReady(pod) {
 				return &HealthStatus{
@@ -117,14 +121,24 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
 				Status:  HealthStatusProgressing,
 				Message: pod.Status.Message,
 			}, nil
-		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
-			// pods set with a restart policy of OnFailure or Never, have a finite life.
-			// These pods are typically resource hooks. Thus, we consider these as Progressing
-			// instead of healthy.
-			return &HealthStatus{
-				Status:  HealthStatusProgressing,
-				Message: pod.Status.Message,
-			}, nil
+		}
+		if _, hook := pod.Annotations[AnnotationIgnoreRestartPolicy]; hook {
+			return getHealthStatus(pod)
+		} else {
+			switch pod.Spec.RestartPolicy {
+			case corev1.RestartPolicyAlways:
+				return getHealthStatus(pod)
+			case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
+				// Most pods with a restart policy of OnFailure or Never have a finite life.
+				// These pods are typically resource hooks, so we consider them Progressing
+				// instead of healthy. If this is unwanted, e.g. when the pod is managed by an
+				// operator and therefore has a restart policy of OnFailure or Never, set the
+				// AnnotationIgnoreRestartPolicy annotation on the pod.
+				return &HealthStatus{
+					Status:  HealthStatusProgressing,
+					Message: pod.Status.Message,
+				}, nil
+			}
 		}
 	}
 	return &HealthStatus{
diff --git a/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml b/pkg/health/testdata/pod-running-restart-never-with-ignore-annotation.yaml
@@ -0,0 +1,79 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  creationTimestamp: 2018-12-02T09:15:16Z
+  name: my-pod
+  namespace: argocd
+  resourceVersion: "151053"
+  selfLink: /api/v1/namespaces/argocd/pods/my-pod
+  uid: c86e909c-f612-11e8-a057-fe5f49266390
+  annotations:
+    argocd.argoproj.io/ignore-restart-policy: "true"
+spec:
+  containers:
+  - command:
+    - sh
+    - -c
+    - sleep 10
+    image: alpine:latest
+    imagePullPolicy: Always
+    name: main
+    resources: {}
+    terminationMessagePath: /dev/termination-log
+    terminationMessagePolicy: File
+    volumeMounts:
+    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
+      name: default-token-f9jvj
+      readOnly: true
+  dnsPolicy: ClusterFirst
+  nodeName: minikube
+  restartPolicy: Never
+  schedulerName: default-scheduler
+  securityContext: {}
+  serviceAccount: default
+  serviceAccountName: default
+  terminationGracePeriodSeconds: 30
+  tolerations:
+  - effect: NoExecute
+    key: node.kubernetes.io/not-ready
+    operator: Exists
+    tolerationSeconds: 300
+  - effect: NoExecute
+    key: node.kubernetes.io/unreachable
+    operator: Exists
+    tolerationSeconds: 300
+  volumes:
+  - name: default-token-f9jvj
+    secret:
+      defaultMode: 420
+      secretName: default-token-f9jvj
+status:
+  conditions:
+  - lastProbeTime: null
+    lastTransitionTime: 2018-12-02T09:15:16Z
+    status: "True"
+    type: Initialized
+  - lastProbeTime: null
+    lastTransitionTime: 2018-12-02T09:15:19Z
+    status: "True"
+    type: Ready
+  - lastProbeTime: null
+    lastTransitionTime: 2018-12-02T09:15:16Z
+    status: "True"
+    type: PodScheduled
+  containerStatuses:
+  - containerID: docker://acfb261d6c1fe8c543438a202de62cb06c137fa93a2d59262d764470e96f3195
+    image: alpine:latest
+    imageID: docker-pullable://alpine@sha256:621c2f39f8133acb8e64023a94dbdf0d5ca81896102b9e57c0dc184cadaf5528
+    lastState: {}
+    name: main
+    ready: true
+    restartCount: 0
+    state:
+      running:
+        startedAt: 2018-12-02T09:15:19Z
+  hostIP: 192.168.64.41
+  phase: Running
+  podIP: 172.17.0.9
+  qosClass: BestEffort
+  startTime: 2018-12-02T09:15:16Z