fix: remove unnecessary --self-heal-backoff-cooldown-seconds flag from app controller (#25579)

Signed-off-by: Alexander Matyushentsev <AMatyushentsev@gmail.com>
2026-02-20 01:28:45 +01:00 · 2025-12-10 10:39:00 -08:00
parent e79a2bd6ea
commit 8b2e0e1aec
5 changed files with 9 additions and 69 deletions
--- a/cmd/argocd-application-controller/commands/argocd_application_controller.go
+++ b/cmd/argocd-application-controller/commands/argocd_application_controller.go
@@ -202,7 +202,6 @@ func NewCommand() *cobra.Command {
 				time.Duration(appResyncJitter)*time.Second,
 				time.Duration(selfHealTimeoutSeconds)*time.Second,
 				selfHealBackoff,
-				time.Duration(selfHealBackoffCooldownSeconds)*time.Second,
 				time.Duration(syncTimeout)*time.Second,
 				time.Duration(repoErrorGracePeriod)*time.Second,
 				metricsPort,
@@ -275,6 +274,7 @@ func NewCommand() *cobra.Command {
 	command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
 	command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
 	command.Flags().IntVar(&selfHealBackoffCooldownSeconds, "self-heal-backoff-cooldown-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS", 330, 0, math.MaxInt32), "Specifies period of time the app needs to stay synced before the self heal backoff can reset")
+	errors.CheckError(command.Flags().MarkDeprecated("self-heal-backoff-cooldown-seconds", "This flag is deprecated and has no effect."))
 	command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).")
 	command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
 	command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
--- a/controller/appcontroller.go
+++ b/controller/appcontroller.go
@@ -128,7 +128,6 @@ type ApplicationController struct {
 	statusRefreshJitter           time.Duration
 	selfHealTimeout               time.Duration
 	selfHealBackoff               *wait.Backoff
-	selfHealBackoffCooldown       time.Duration
 	syncTimeout                   time.Duration
 	db                            db.ArgoDB
 	settingsMgr                   *settings_util.SettingsManager
@@ -164,7 +163,6 @@ func NewApplicationController(
 	appResyncJitter time.Duration,
 	selfHealTimeout time.Duration,
 	selfHealBackoff *wait.Backoff,
-	selfHealBackoffCooldown time.Duration,
 	syncTimeout time.Duration,
 	repoErrorGracePeriod time.Duration,
 	metricsPort int,
@@ -211,7 +209,6 @@ func NewApplicationController(
 		settingsMgr:                       settingsMgr,
 		selfHealTimeout:                   selfHealTimeout,
 		selfHealBackoff:                   selfHealBackoff,
-		selfHealBackoffCooldown:           selfHealBackoffCooldown,
 		syncTimeout:                       syncTimeout,
 		clusterSharding:                   clusterSharding,
 		projByNameCache:                   sync.Map{},
@@ -2249,12 +2246,8 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
 		// Self heal will trigger a new sync operation when the desired state changes and cause the application to
 		// be OutOfSync when it was previously synced Successfully. This means SelfHeal should only ever be attempted
 		// when the revisions have not changed, and where the previous sync to these revision was successful
-
-		// Only carry SelfHealAttemptsCount to be increased when the selfHealBackoffCooldown has not elapsed yet
-		if !ctrl.selfHealBackoffCooldownElapsed(app) {
-			if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
-				op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
-			}
+		if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
+			op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
 		}

 		if remainingTime := ctrl.selfHealRemainingBackoff(app, int(op.Sync.SelfHealAttemptsCount)); remainingTime > 0 {
@@ -2390,19 +2383,6 @@ func (ctrl *ApplicationController) selfHealRemainingBackoff(app *appv1.Applicati
 	return retryAfter
 }

-// selfHealBackoffCooldownElapsed returns true when the last successful sync has occurred since longer
-// than then self heal cooldown. This means that the application has been in sync for long enough to
-// reset the self healing backoff to its initial state
-func (ctrl *ApplicationController) selfHealBackoffCooldownElapsed(app *appv1.Application) bool {
-	if app.Status.OperationState == nil || app.Status.OperationState.FinishedAt == nil {
-		// Something is in progress, or about to be. In that case, selfHeal attempt should be zero anyway
-		return true
-	}
-
-	timeSinceLastOperation := time.Since(app.Status.OperationState.FinishedAt.Time)
-	return timeSinceLastOperation >= ctrl.selfHealBackoffCooldown && app.Status.OperationState.Phase.Successful()
-}
-
 // isAppNamespaceAllowed returns whether the application is allowed in the
 // namespace it's residing in.
 func (ctrl *ApplicationController) isAppNamespaceAllowed(app *appv1.Application) bool {
--- a/controller/appcontroller_test.go
+++ b/controller/appcontroller_test.go
@@ -179,7 +179,6 @@ func newFakeControllerWithResync(ctx context.Context, data *fakeData, appResyncP
 		time.Second,
 		time.Minute,
 		nil,
-		time.Minute,
 		0,
 		time.Second*10,
 		common.DefaultPortArgoCDMetrics,
@@ -3148,46 +3147,3 @@ func TestSelfHealRemainingBackoff(t *testing.T) {
 		})
 	}
 }
-
-func TestSelfHealBackoffCooldownElapsed(t *testing.T) {
-	cooldown := time.Second * 30
-	ctrl := newFakeController(t.Context(), &fakeData{}, nil)
-	ctrl.selfHealBackoffCooldown = cooldown
-
-	app := &v1alpha1.Application{
-		Status: v1alpha1.ApplicationStatus{
-			OperationState: &v1alpha1.OperationState{
-				Phase: synccommon.OperationSucceeded,
-			},
-		},
-	}
-
-	t.Run("operation not completed", func(t *testing.T) {
-		app := app.DeepCopy()
-		app.Status.OperationState.FinishedAt = nil
-		elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
-		assert.True(t, elapsed)
-	})
-
-	t.Run("successful operation finised after cooldown", func(t *testing.T) {
-		app := app.DeepCopy()
-		app.Status.OperationState.FinishedAt = &metav1.Time{Time: time.Now().Add(-cooldown)}
-		elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
-		assert.True(t, elapsed)
-	})
-
-	t.Run("unsuccessful operation finised after cooldown", func(t *testing.T) {
-		app := app.DeepCopy()
-		app.Status.OperationState.Phase = synccommon.OperationFailed
-		app.Status.OperationState.FinishedAt = &metav1.Time{Time: time.Now().Add(-cooldown)}
-		elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
-		assert.False(t, elapsed)
-	})
-
-	t.Run("successful operation finised before cooldown", func(t *testing.T) {
-		app := app.DeepCopy()
-		app.Status.OperationState.FinishedAt = &metav1.Time{Time: time.Now()}
-		elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
-		assert.False(t, elapsed)
-	})
-}
--- a/docs/operator-manual/server-commands/argocd-application-controller.md
+++ b/docs/operator-manual/server-commands/argocd-application-controller.md
@@ -71,7 +71,6 @@ argocd-application-controller [flags]
      --repo-server-timeout-seconds int                           Repo server RPC call timeout seconds. (default 60)
      --request-timeout string                                    The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0")
      --self-heal-backoff-cap-seconds int                         Specifies max timeout of exponential backoff between application self heal attempts (default 300)
-      --self-heal-backoff-cooldown-seconds int                    Specifies period of time the app needs to stay synced before the self heal backoff can reset (default 330)
      --self-heal-backoff-factor int                              Specifies factor of exponential timeout between application self heal attempts (default 3)
      --self-heal-backoff-timeout-seconds int                     Specifies initial timeout of exponential backoff between self heal attempts (default 2)
      --self-heal-timeout-seconds int                             Specifies timeout between application self heal attempts
--- a/docs/operator-manual/upgrading/3.2-3.3.md
+++ b/docs/operator-manual/upgrading/3.2-3.3.md
@@ -31,4 +31,9 @@ It no longer returns the `resourceOverrides` field which is considered sensitive
 The new environment variable `ARGOCD_K8S_SERVER_SIDE_TIMEOUT` can be used to control the K8s server side timeout of API requests.
 In 3.2 and before this change, the K8s server side timeout was controlled by `ARGOCD_K8S_TCP_TIMEOUT` 
 which is also used to control the TCP timeout when communicating with the K8s API server. 
-From now onwards, the Kubernetes server-side timeout is controlled by a separate environment variable.
+From now onwards, the Kubernetes server-side timeout is controlled by a separate environment variable.
+
+### Deprecated `--self-heal-backoff-cooldown-seconds` flag
+
+The `--self-heal-backoff-cooldown-seconds` flag of the `argocd-application-controller` is deprecated: it no longer has
+any effect and will be removed in a future release.