feat(appcontroller): store application health status in redis by default (#10312) (#21532)

Signed-off-by: rumstead <37445536+rumstead@users.noreply.github.com>
This commit is contained in:
rumstead
2025-03-04 06:29:23 -05:00
committed by GitHub
parent f2490fccdd
commit 029927b25e
14 changed files with 140 additions and 50 deletions

View File

@@ -7,31 +7,17 @@ import (
func BuildResourceStatus(statusMap map[string]argov1alpha1.ResourceStatus, apps []argov1alpha1.Application) map[string]argov1alpha1.ResourceStatus {
appMap := map[string]argov1alpha1.Application{}
for _, app := range apps {
appCopy := app
appMap[app.Name] = app
gvk := app.GroupVersionKind()
// Create status if it does not exist
status, ok := statusMap[app.Name]
if !ok {
status = argov1alpha1.ResourceStatus{
Group: gvk.Group,
Version: gvk.Version,
Kind: gvk.Kind,
Name: app.Name,
Namespace: app.Namespace,
Status: app.Status.Sync.Status,
Health: &appCopy.Status.Health,
}
}
var status argov1alpha1.ResourceStatus
status.Group = gvk.Group
status.Version = gvk.Version
status.Kind = gvk.Kind
status.Name = app.Name
status.Namespace = app.Namespace
status.Status = app.Status.Sync.Status
status.Health = &appCopy.Status.Health
status.Health = app.Status.Health.DeepCopy()
statusMap[app.Name] = status
}

View File

@@ -280,7 +280,7 @@ func NewCommand() *cobra.Command {
command.Flags().StringToStringVar(&otlpHeaders, "otlp-headers", env.ParseStringToStringFromEnv("ARGOCD_APPLICATION_CONTROLLER_OTLP_HEADERS", map[string]string{}, ","), "List of OpenTelemetry collector extra headers sent with traces, headers are comma-separated key-value pairs(e.g. key1=value1,key2=value2)")
command.Flags().StringSliceVar(&otlpAttrs, "otlp-attrs", env.StringsFromEnv("ARGOCD_APPLICATION_CONTROLLER_OTLP_ATTRS", []string{}, ","), "List of OpenTelemetry collector extra attrs when send traces, each attribute is separated by a colon(e.g. key:value)")
command.Flags().StringSliceVar(&applicationNamespaces, "application-namespaces", env.StringsFromEnv("ARGOCD_APPLICATION_NAMESPACES", []string{}, ","), "List of additional namespaces that applications are allowed to be reconciled from")
command.Flags().BoolVar(&persistResourceHealth, "persist-resource-health", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH", true), "Enables storing the managed resources health in the Application CRD")
command.Flags().BoolVar(&persistResourceHealth, "persist-resource-health", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_PERSIST_RESOURCE_HEALTH", false), "Enables storing the managed resources health in the Application CRD")
command.Flags().StringVar(&shardingAlgorithm, "sharding-method", env.StringFromEnv(common.EnvControllerShardingAlgorithm, common.DefaultShardingAlgorithm), "Enables choice of sharding method. Supported sharding methods are : [legacy, round-robin, consistent-hashing] ")
// global queue rate limit config
command.Flags().Int64Var(&workqueueRateLimit.BucketSize, "wq-bucket-size", env.ParseInt64FromEnv("WORKQUEUE_BUCKET_SIZE", 500, 1, math.MaxInt64), "Set Workqueue Rate Limiter Bucket Size, default 500")

View File

@@ -69,11 +69,10 @@ data:
# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
# Specifies if resource health should be persisted in app CRD (default true)
# Changing this to `false` significantly reduce number of Application CRD updates and improves controller performance.
# However, disabling resource health by default might affect applications that communicate with Applications CRD directly
# so we have to defer switching this to `false` by default till v3.0 release.
controller.resource.health.persist: "true"
# Specifies if resource health should be persisted in the application CR (default false).
# Setting this to true will store the health status of each resource in the application CR,
# increasing the number of updates to the CR and putting more load on the application controller
controller.resource.health.persist: "false"
# Cache expiration default (default 24h0m0s)
controller.default.cache.expiration: "24h0m0s"
# Sharding algorithm used to balance clusters across application controller shards (default "legacy")

View File

@@ -54,7 +54,7 @@ argocd-application-controller [flags]
--otlp-headers stringToString List of OpenTelemetry collector extra headers sent with traces, headers are comma-separated key-value pairs(e.g. key1=value1,key2=value2) (default [])
--otlp-insecure OpenTelemetry collector insecure mode (default true)
--password string Password for basic authentication to the API server
--persist-resource-health Enables storing the managed resources health in the Application CRD (default true)
--persist-resource-health Enables storing the managed resources health in the Application CRD
--proxy-url string If provided, this URL will be used to connect via proxy
--redis string Redis server hostname and port (e.g. argocd-redis:6379).
--redis-ca-certificate string Path to Redis server CA certificate (e.g. /etc/certs/redis/ca.crt). If not specified, system trusted CAs will be used for server certificate validation.

View File

@@ -21,7 +21,8 @@ to allow the `update/*` or `delete/*` actions on the application to give permiss
The v2 behavior can be preserved by setting the config value `server.rbac.disableApplicationFineGrainedRBACInheritance`
to `false` in the Argo CD ConfigMap `argocd-cm`.
Read the [RBAC documentation](../rbac.md#fine-grained-permissions-for-updatedelete-action) for more detailed inforamtion.
Read the [RBAC documentation](../rbac.md#fine-grained-permissions-for-updatedelete-action) for more detailed
information.
### Logs RBAC enforcement as a first-class RBAC citizen
@@ -55,14 +56,17 @@ labels before upgrading.
### Changes to RBAC with Dex SSO Authentication
When using Dex, the `sub` claim returned in the authentication was used as the subject for RBAC. That value depends on
the Dex internal implementation and should not be considered an immutable value that represent the subject.
the Dex internal implementation and should not be considered an immutable value that represents the subject.
The new behavior will request the `federated:id` [scope](https://dexidp.io/docs/configuration/custom-scopes-claims-clients/) from Dex, and the new value used as the RBAC subject will be based
The new behavior will request the
`federated:id` [scope](https://dexidp.io/docs/configuration/custom-scopes-claims-clients/) from Dex, and the new value
used as the RBAC subject will be based
on the `federated_claims.user_id` claim instead of the `sub` claim.
If you were using the Dex sub claim in RBAC policies, you will need to update them to maintain the same access.
You can know the correct `user_id` to use by decoding the current `sub` claims defined in your policies. You can also configure which
You can know the correct `user_id` to use by decoding the current `sub` claims defined in your policies. You can also
configure which
value is used as `user_id` for some [connectors](https://dexidp.io/docs/connectors/).
```sh
@@ -190,6 +194,77 @@ Application, it will not delete the previously managed resources.
It is recommended to perform any cleanup or migration to existing in-cluster Application before upgrading
when in-cluster is disabled. To perform cleanup post-migration, the in-cluster will need to be enabled temporarily.
### Health status in the Application CR
The health status of each object used to be persisted under `/status` in the Application CR by default.
Any health churn in the resources deployed by the Application puts load on the application controller.
Now, the health status is stored externally.
You can revert this behavior by setting the `controller.resource.health.persist` to `true` in the Argo CD
`argocd-cmd-params-cm.yaml` Config Map.
Example of a status field in the Application CR persisting health:
```yaml
status:
health:
status: Healthy
resources:
- group: apps
health:
status: Healthy
kind: Deployment
name: my-app
namespace: foo
status: OutOfSync
version: v1
sync:
status: OutOfSync
```
Example of a status field in the Application CR _not_ persisting health:
```yaml
status:
health:
status: Healthy
resourceHealthSource: appTree
resources:
- group: apps
kind: Deployment
name: my-app
namespace: foo
status: OutOfSync
version: v1
sync:
status: OutOfSync
```
#### Detection
1. Check the `argocd-cmd-params-cm.yaml` ConfigMap for `controller.resource.health.persist`.
If the value is empty or true, the health status is persisted in the Application CR.
```sh
kubectl get cm argocd-cmd-params-cm -n argocd -o jsonpath='{.data.controller\.resource\.health\.persist}'
```
2. Check any Application CR for the `resourceHealthSource` field.
If you see a blank value, the health status is persisted in the Application CR.
```sh
kubectl get applications.argoproj.io <my app> -n argocd -o jsonpath='{.status.resourceHealthSource}'
```
#### Migration
Any tools or CLI commands parsing the `.status.resources[].health` need to be updated to use the argocd cli/API to get the health status.
```sh
argocd app get <my app> -o json
```
### Empty Environment Variables in Plugins
In Argo CD 3.0, empty environment variables are now passed to config management plugins.

View File

@@ -12,6 +12,8 @@ import (
"strings"
"time"
"github.com/argoproj/gitops-engine/pkg/health"
cacheutil "github.com/argoproj/argo-cd/v3/util/cache"
kubecache "github.com/argoproj/gitops-engine/pkg/cache"
@@ -46,6 +48,7 @@ import (
"github.com/argoproj/argo-cd/v3/reposerver/apiclient"
servercache "github.com/argoproj/argo-cd/v3/server/cache"
"github.com/argoproj/argo-cd/v3/server/deeplinks"
"github.com/argoproj/argo-cd/v3/util"
"github.com/argoproj/argo-cd/v3/util/argo"
"github.com/argoproj/argo-cd/v3/util/collections"
"github.com/argoproj/argo-cd/v3/util/db"
@@ -252,9 +255,15 @@ func (s *Server) getApplicationEnforceRBACClient(ctx context.Context, action, pr
if !s.isNamespaceEnabled(namespaceOrDefault) {
return nil, security.NamespaceNotPermittedError(namespaceOrDefault)
}
return s.appclientset.ArgoprojV1alpha1().Applications(namespaceOrDefault).Get(ctx, name, metav1.GetOptions{
app, err := s.appclientset.ArgoprojV1alpha1().Applications(namespaceOrDefault).Get(ctx, name, metav1.GetOptions{
ResourceVersion: resourceVersion,
})
if err != nil {
return nil, err
}
// Objects returned by the lister must be treated as read-only.
// To allow us to modify the app later, make a copy
return app.DeepCopy(), nil
})
}
@@ -371,6 +380,9 @@ func (s *Server) Create(ctx context.Context, q *application.ApplicationCreateReq
if err != nil {
return nil, status.Errorf(codes.Internal, "unable to check existing application details (%s): %v", appNs, err)
}
// Objects returned from listers have to be treated as read-only
// Take a deep copy so we can edit it below
existing = existing.DeepCopy()
if _, err := argo.GetDestinationCluster(ctx, existing.Spec.Destination, s.db); err != nil {
return nil, status.Errorf(codes.InvalidArgument, "application destination spec for %s is invalid: %s", existing.Name, err.Error())
@@ -808,7 +820,7 @@ func (s *Server) Get(ctx context.Context, q *application.ApplicationQuery) (*v1a
annotations = make(map[string]string)
}
if _, ok := annotations[v1alpha1.AnnotationKeyRefresh]; !ok {
return &event.Application, nil
return event.Application.DeepCopy(), nil
}
}
}
@@ -1214,6 +1226,7 @@ func (s *Server) Watch(q *application.ApplicationQuery, ws application.Applicati
if err != nil {
return fmt.Errorf("error listing apps with selector: %w", err)
}
apps = util.SliceCopy(apps)
sort.Slice(apps, func(i, j int) bool {
return apps[i].QualifiedName() < apps[j].QualifiedName()
})
@@ -2663,7 +2676,14 @@ func (s *Server) inferResourcesStatusHealth(app *v1alpha1.Application) {
if err := s.cache.GetAppResourcesTree(app.Name, tree); err == nil {
healthByKey := map[kube.ResourceKey]*v1alpha1.HealthStatus{}
for _, node := range tree.Nodes {
healthByKey[kube.NewResourceKey(node.Group, node.Kind, node.Namespace, node.Name)] = node.Health
if node.Health != nil {
healthByKey[kube.NewResourceKey(node.Group, node.Kind, node.Namespace, node.Name)] = node.Health
} else if node.ResourceVersion == "" && node.ResourceRef.UID == "" && node.CreatedAt == nil {
healthByKey[kube.NewResourceKey(node.Group, node.Kind, node.Namespace, node.Name)] = &v1alpha1.HealthStatus{
Status: health.HealthStatusMissing,
Message: "Resource has not been created",
}
}
}
for i, res := range app.Status.Resources {
res.Health = healthByKey[kube.NewResourceKey(res.Group, res.Kind, res.Namespace, res.Name)]

View File

@@ -51,7 +51,7 @@ func TestDeletingAppByLabel(t *testing.T) {
CreateApp("--label=foo=bar").
Sync().
Then().
Expect(SyncStatusIs(SyncStatusCode(SyncStatusCodeSynced))).
Expect(SyncStatusIs(SyncStatusCodeSynced)).
When().
IgnoreErrors().
DeleteBySelector("foo=baz").

View File

@@ -7,10 +7,13 @@ import (
"github.com/argoproj/gitops-engine/pkg/health"
log "github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
applicationpkg "github.com/argoproj/argo-cd/v3/pkg/apiclient/application"
. "github.com/argoproj/argo-cd/v3/pkg/apis/application/v1alpha1"
"github.com/argoproj/argo-cd/v3/test/e2e/fixture"
"github.com/argoproj/argo-cd/v3/util/errors"
util "github.com/argoproj/argo-cd/v3/util/io"
)
// this implements the "then" part of given/when/then
@@ -112,7 +115,16 @@ func (c *Consequences) get() (*Application, error) {
}
func (c *Consequences) resource(kind, name, namespace string) ResourceStatus {
for _, r := range c.app().Status.Resources {
closer, client, err := fixture.ArgoCDClientset.NewApplicationClient()
errors.CheckError(err)
defer util.Close(closer)
app, err := client.Get(context.Background(), &applicationpkg.ApplicationQuery{
Name: ptr.To(c.context.AppName()),
Projects: []string{c.context.project},
AppNamespace: ptr.To(c.context.appNamespace),
})
errors.CheckError(err)
for _, r := range app.Status.Resources {
if r.Kind == kind && r.Name == name && (namespace == "" || namespace == r.Namespace) {
return r
}

View File

@@ -99,9 +99,9 @@ func TestPreSyncHookFailure(t *testing.T) {
IgnoreErrors().
Sync().
Then().
Expect(Error("hook Failed PreSync", "")).
Expect(Error("hook Failed Synced PreSync container \"main\" failed", "")).
// make sure resource are also printed
Expect(Error("pod OutOfSync Missing", "")).
Expect(Error("pod OutOfSync Missing", "")).
Expect(OperationPhaseIs(OperationFailed)).
// if a pre-sync hook fails, we should not start the main sync
Expect(SyncStatusIs(SyncStatusCodeOutOfSync)).

View File

@@ -248,7 +248,7 @@ func RefreshApp(appIf v1alpha1.ApplicationInterface, name string, refreshType ar
app, err := appIf.Patch(context.Background(), name, types.MergePatchType, patch, metav1.PatchOptions{})
if err == nil {
log.Infof("Requested app '%s' refresh", name)
return app, nil
return app.DeepCopy(), nil
}
if !apierrors.IsConflict(err) {
return nil, fmt.Errorf("error patching annotations in application %q: %w", name, err)
@@ -814,6 +814,7 @@ func SetAppOperation(appIf v1alpha1.ApplicationInterface, appName string, op *ar
if err != nil {
return nil, fmt.Errorf("error getting application %q: %w", appName, err)
}
a = a.DeepCopy()
if a.Operation != nil {
return nil, ErrAnotherOperationInProgress
}

View File

@@ -42,7 +42,7 @@ func (db *db) listSecretsByType(types ...string) ([]*corev1.Secret, error) {
// SecretNamespaceLister lists all Secrets in the indexer for a given namespace.
// Objects returned by the lister must be treated as read-only.
// To allow us to modify the secrets, make a copy
secrets = util.SecretCopy(secrets)
secrets = util.SliceCopy(secrets)
return secrets, nil
}

View File

@@ -780,7 +780,7 @@ func (mgr *SettingsManager) getSecrets() ([]*corev1.Secret, error) {
// SecretNamespaceLister lists all Secrets in the indexer for a given namespace.
// Objects returned by the lister must be treated as read-only.
// To allow us to modify the secrets, make a copy
secrets = util.SecretCopy(secrets)
secrets = util.SliceCopy(secrets)
return secrets, nil
}

View File

@@ -7,7 +7,7 @@ import (
"encoding/hex"
"fmt"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
)
// MakeSignature generates a cryptographically-secure pseudo-random token, based on a given number of random bytes, for signing purposes.
@@ -22,16 +22,13 @@ func MakeSignature(size int) ([]byte, error) {
return b, err
}
// SecretCopy generates a deep copy of a slice containing secrets
//
// This function takes a slice of pointers to Secrets and returns a new slice
// containing deep copies of the original secrets.
func SecretCopy(secrets []*corev1.Secret) []*corev1.Secret {
secretsCopy := make([]*corev1.Secret, len(secrets))
for i, secret := range secrets {
secretsCopy[i] = secret.DeepCopy()
// SliceCopy generates a deep copy of a slice containing any type that implements the runtime.Object interface.
func SliceCopy[T runtime.Object](items []T) []T {
itemsCopy := make([]T, len(items))
for i, item := range items {
itemsCopy[i] = item.DeepCopyObject().(T)
}
return secretsCopy
return itemsCopy
}
// GenerateCacheKey generates a cache key based on a format string and arguments

View File

@@ -42,7 +42,7 @@ func TestParseRevision(t *testing.T) {
}
}
func TestSecretCopy(t *testing.T) {
func TestSliceCopy(t *testing.T) {
type args struct {
secrets []*corev1.Secret
}
@@ -72,8 +72,8 @@ func TestSecretCopy(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
secretsCopy := util.SecretCopy(tt.args.secrets)
assert.Equalf(t, tt.want, secretsCopy, "SecretCopy(%v)", tt.args.secrets)
secretsCopy := util.SliceCopy(tt.args.secrets)
assert.Equalf(t, tt.want, secretsCopy, "SliceCopy(%v)", tt.args.secrets)
for i := range tt.args.secrets {
assert.NotSame(t, secretsCopy[i], tt.args.secrets[i])
}