fix: remove unnecessary --self-heal-backoff-cooldown-seconds flag from app controller (#25579)

Signed-off-by: Alexander Matyushentsev <AMatyushentsev@gmail.com>
This commit is contained in:
Alexander Matyushentsev
2025-12-10 10:39:00 -08:00
committed by GitHub
parent e79a2bd6ea
commit 8b2e0e1aec
5 changed files with 9 additions and 69 deletions

View File

@@ -202,7 +202,6 @@ func NewCommand() *cobra.Command {
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
selfHealBackoff,
time.Duration(selfHealBackoffCooldownSeconds)*time.Second,
time.Duration(syncTimeout)*time.Second,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
@@ -275,6 +274,7 @@ func NewCommand() *cobra.Command {
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCooldownSeconds, "self-heal-backoff-cooldown-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_COOLDOWN_SECONDS", 330, 0, math.MaxInt32), "Specifies period of time the app needs to stay synced before the self heal backoff can reset")
errors.CheckError(command.Flags().MarkDeprecated("self-heal-backoff-cooldown-seconds", "This flag is deprecated and has no effect."))
command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).")
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")

View File

@@ -128,7 +128,6 @@ type ApplicationController struct {
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
selfHealBackoff *wait.Backoff
selfHealBackoffCooldown time.Duration
syncTimeout time.Duration
db db.ArgoDB
settingsMgr *settings_util.SettingsManager
@@ -164,7 +163,6 @@ func NewApplicationController(
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
selfHealBackoff *wait.Backoff,
selfHealBackoffCooldown time.Duration,
syncTimeout time.Duration,
repoErrorGracePeriod time.Duration,
metricsPort int,
@@ -211,7 +209,6 @@ func NewApplicationController(
settingsMgr: settingsMgr,
selfHealTimeout: selfHealTimeout,
selfHealBackoff: selfHealBackoff,
selfHealBackoffCooldown: selfHealBackoffCooldown,
syncTimeout: syncTimeout,
clusterSharding: clusterSharding,
projByNameCache: sync.Map{},
@@ -2249,12 +2246,8 @@ func (ctrl *ApplicationController) autoSync(app *appv1.Application, syncStatus *
// Self heal will trigger a new sync operation when the desired state changes and cause the application to
// be OutOfSync when it was previously synced Successfully. This means SelfHeal should only ever be attempted
// when the revisions have not changed, and where the previous sync to these revision was successful
// Only carry SelfHealAttemptsCount to be increased when the selfHealBackoffCooldown has not elapsed yet
if !ctrl.selfHealBackoffCooldownElapsed(app) {
if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
}
if app.Status.OperationState != nil && app.Status.OperationState.Operation.Sync != nil {
op.Sync.SelfHealAttemptsCount = app.Status.OperationState.Operation.Sync.SelfHealAttemptsCount
}
if remainingTime := ctrl.selfHealRemainingBackoff(app, int(op.Sync.SelfHealAttemptsCount)); remainingTime > 0 {
@@ -2390,19 +2383,6 @@ func (ctrl *ApplicationController) selfHealRemainingBackoff(app *appv1.Applicati
return retryAfter
}
// selfHealBackoffCooldownElapsed returns true when the last successful sync has occurred since longer
// than then self heal cooldown. This means that the application has been in sync for long enough to
// reset the self healing backoff to its initial state
func (ctrl *ApplicationController) selfHealBackoffCooldownElapsed(app *appv1.Application) bool {
if app.Status.OperationState == nil || app.Status.OperationState.FinishedAt == nil {
// Something is in progress, or about to be. In that case, selfHeal attempt should be zero anyway
return true
}
timeSinceLastOperation := time.Since(app.Status.OperationState.FinishedAt.Time)
return timeSinceLastOperation >= ctrl.selfHealBackoffCooldown && app.Status.OperationState.Phase.Successful()
}
// isAppNamespaceAllowed returns whether the application is allowed in the
// namespace it's residing in.
func (ctrl *ApplicationController) isAppNamespaceAllowed(app *appv1.Application) bool {

View File

@@ -179,7 +179,6 @@ func newFakeControllerWithResync(ctx context.Context, data *fakeData, appResyncP
time.Second,
time.Minute,
nil,
time.Minute,
0,
time.Second*10,
common.DefaultPortArgoCDMetrics,
@@ -3148,46 +3147,3 @@ func TestSelfHealRemainingBackoff(t *testing.T) {
})
}
}
func TestSelfHealBackoffCooldownElapsed(t *testing.T) {
cooldown := time.Second * 30
ctrl := newFakeController(t.Context(), &fakeData{}, nil)
ctrl.selfHealBackoffCooldown = cooldown
app := &v1alpha1.Application{
Status: v1alpha1.ApplicationStatus{
OperationState: &v1alpha1.OperationState{
Phase: synccommon.OperationSucceeded,
},
},
}
t.Run("operation not completed", func(t *testing.T) {
app := app.DeepCopy()
app.Status.OperationState.FinishedAt = nil
elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
assert.True(t, elapsed)
})
t.Run("successful operation finised after cooldown", func(t *testing.T) {
app := app.DeepCopy()
app.Status.OperationState.FinishedAt = &metav1.Time{Time: time.Now().Add(-cooldown)}
elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
assert.True(t, elapsed)
})
t.Run("unsuccessful operation finised after cooldown", func(t *testing.T) {
app := app.DeepCopy()
app.Status.OperationState.Phase = synccommon.OperationFailed
app.Status.OperationState.FinishedAt = &metav1.Time{Time: time.Now().Add(-cooldown)}
elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
assert.False(t, elapsed)
})
t.Run("successful operation finised before cooldown", func(t *testing.T) {
app := app.DeepCopy()
app.Status.OperationState.FinishedAt = &metav1.Time{Time: time.Now()}
elapsed := ctrl.selfHealBackoffCooldownElapsed(app)
assert.False(t, elapsed)
})
}

View File

@@ -71,7 +71,6 @@ argocd-application-controller [flags]
--repo-server-timeout-seconds int Repo server RPC call timeout seconds. (default 60)
--request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0")
--self-heal-backoff-cap-seconds int Specifies max timeout of exponential backoff between application self heal attempts (default 300)
--self-heal-backoff-cooldown-seconds int Specifies period of time the app needs to stay synced before the self heal backoff can reset (default 330)
--self-heal-backoff-factor int Specifies factor of exponential timeout between application self heal attempts (default 3)
--self-heal-backoff-timeout-seconds int Specifies initial timeout of exponential backoff between self heal attempts (default 2)
--self-heal-timeout-seconds int Specifies timeout between application self heal attempts

View File

@@ -31,4 +31,9 @@ It no longer returns the `resourceOverrides` field which is considered sensitive
The new environment variable `ARGOCD_K8S_SERVER_SIDE_TIMEOUT` can be used to control the K8s server side timeout of API requests.
In 3.2 and before this change, the K8s server side timeout was controlled by `ARGOCD_K8S_TCP_TIMEOUT`
which is also used to control the TCP timeout when communicating with the K8s API server.
From now onwards, the Kubernetes server-side timeout is controlled by a separate environment variable.
From now onwards, the Kubernetes server-side timeout is controlled by a separate environment variable.
### Deprecated --self-heal-backoff-cooldown-seconds flag
The `--self-heal-backoff-cooldown-seconds` flag of the `argocd-application-controller` has been deprecated and will be
removed in a future release.