feat: add Prometheus health check (#10508)

* Create Prometheus health check

Signed-off-by: Roel van den Berg <roel.vandenberg@infosupport.com>

* remove details from our test environment

Co-authored-by: Rouke Broersma <mobrockers@gmail.com>
Signed-off-by: roelvdberg <email@roelvdberg.nl>

Signed-off-by: Roel van den Berg <roel.vandenberg@infosupport.com>
Signed-off-by: roelvdberg <email@roelvdberg.nl>
Co-authored-by: Roel van den Berg <roel.vandenberg@infosupport.com>
Co-authored-by: Rouke Broersma <mobrockers@gmail.com>
roelvdberg authored 2022-10-01 01:37:47 +02:00, committed by GitHub
commit 8a38b8b03d, parent e8dd95c37a
5 changed files with 440 additions and 0 deletions

resource_customizations/monitoring.coreos.com/Prometheus/health.lua

@@ -0,0 +1,23 @@
-- Default: the resource is Progressing until the status conditions say otherwise.
hs = { status = "Progressing", message = "Waiting for initialization" }
if obj.status ~= nil then
  if obj.status.conditions ~= nil then
    for i, condition in ipairs(obj.status.conditions) do
      -- An "Available" condition that is not "True" means not all pods are ready:
      -- keep Progressing while pods are still coming up, otherwise mark Degraded.
      if condition.type == "Available" and condition.status ~= "True" then
        if condition.reason == "SomePodsNotReady" then
          hs.status = "Progressing"
        else
          hs.status = "Degraded"
        end
        -- Surface the operator's message if present, falling back to the reason.
        hs.message = condition.message or condition.reason
      end
      if condition.type == "Available" and condition.status == "True" then
        hs.status = "Healthy"
        hs.message = "All instances are available"
      end
    end
  end
end
return hs
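
Once released, this script ships as Argo CD's built-in health check for the Prometheus CRD from prometheus-operator. On Argo CD versions that predate it, an equivalent check can be registered through the argocd-cm ConfigMap under a resource.customizations.health.<group>_<kind> key. A minimal sketch, assuming the default "argocd" install namespace; the inlined script is the same logic as above with the nil checks collapsed:

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cm
  namespace: argocd   # assumption: default Argo CD install namespace
  labels:
    app.kubernetes.io/part-of: argocd
data:
  # group "monitoring.coreos.com" + kind "Prometheus" -> key suffix below
  resource.customizations.health.monitoring.coreos.com_Prometheus: |
    hs = { status = "Progressing", message = "Waiting for initialization" }
    if obj.status ~= nil and obj.status.conditions ~= nil then
      for i, condition in ipairs(obj.status.conditions) do
        if condition.type == "Available" and condition.status ~= "True" then
          if condition.reason == "SomePodsNotReady" then
            hs.status = "Progressing"
          else
            hs.status = "Degraded"
          end
          hs.message = condition.message or condition.reason
        end
        if condition.type == "Available" and condition.status == "True" then
          hs.status = "Healthy"
          hs.message = "All instances are available"
        end
      end
    end
    return hs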

resource_customizations/monitoring.coreos.com/Prometheus/health_test.yaml

@@ -0,0 +1,13 @@
tests:
- healthStatus:
    status: Healthy
    message: "All instances are available"
  inputPath: testdata/healthy.yaml
- healthStatus:
    status: Progressing
    message: "SomePodsNotReady"
  inputPath: testdata/progressing.yaml
- healthStatus:
    status: Degraded
    message: "shard 0: pod prometheus-prometheus-stack-kube-prom-prometheus-0: 0/5 nodes are available: 2 node(s) didn't match Pod's node affinity/selector, 3 node(s) were unschedulable.\nshard 0: pod prometheus-prometheus-stack-kube-prom-prometheus-1: 0/5 nodes are available: 2 node(s) didn't match Pod's node affinity/selector, 3 node(s) were unschedulable."
  inputPath: testdata/degraded.yaml
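
These expectations are exercised by Argo CD's Lua health-check tests, which evaluate each inputPath fixture against health.lua and compare the result to the expected healthStatus. For a quick check of the same logic outside that harness, here is a standalone sketch with obj stubbed by hand — the table literals only approximate the fixtures' status blocks, and the degraded message is abbreviated. Runs under any Lua 5.x interpreter:

-- health() mirrors health.lua, wrapped in a function so it can be called
-- against several stubbed objects.
local function health(obj)
  local hs = { status = "Progressing", message = "Waiting for initialization" }
  if obj.status ~= nil and obj.status.conditions ~= nil then
    for _, condition in ipairs(obj.status.conditions) do
      if condition.type == "Available" and condition.status ~= "True" then
        if condition.reason == "SomePodsNotReady" then
          hs.status = "Progressing"
        else
          hs.status = "Degraded"
        end
        hs.message = condition.message or condition.reason
      end
      if condition.type == "Available" and condition.status == "True" then
        hs.status = "Healthy"
        hs.message = "All instances are available"
      end
    end
  end
  return hs
end

-- testdata/healthy.yaml: Available=True -> Healthy
local r = health({ status = { conditions = {
  { type = "Available", status = "True" } } } })
print(r.status, r.message)  -- Healthy   All instances are available

-- testdata/progressing.yaml: Available not "True", reason SomePodsNotReady -> Progressing
r = health({ status = { conditions = {
  { type = "Available", status = "Degraded", reason = "SomePodsNotReady" } } } })
print(r.status, r.message)  -- Progressing   SomePodsNotReady

-- testdata/degraded.yaml: Available=False with any other reason -> Degraded,
-- surfacing the condition message (abbreviated here) to the user.
r = health({ status = { conditions = {
  { type = "Available", status = "False", reason = "NoPodReady",
    message = "shard 0: 0/5 nodes are available" } } } })
print(r.status, r.message)  -- Degraded   shard 0: 0/5 nodes are available

-- No status yet -> default Progressing / "Waiting for initialization"
r = health({})
print(r.status, r.message)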

resource_customizations/monitoring.coreos.com/Prometheus/testdata/degraded.yaml

@@ -0,0 +1,142 @@
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  annotations:
    argocd.argoproj.io/tracking-id: >-
      prometheus-stack:monitoring.coreos.com/Prometheus:prometheus/prometheus-stack-kube-prom-prometheus
  creationTimestamp: '2021-12-09T15:51:10Z'
  generation: 46
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/instance: prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: 39.11.0
    chart: kube-prometheus-stack-39.11.0
    heritage: Helm
    release: prometheus-stack
  name: prometheus-stack-kube-prom-prometheus
  namespace: prometheus
  resourceVersion: '200320271'
  uid: 6f2e1016-926d-44e7-945b-dec4c975595b
spec:
  additionalScrapeConfigs:
    key: prometheus-additional.yaml
    name: additional-scrape-configs
  alerting:
    alertmanagers:
      - apiVersion: v2
        name: prometheus-stack-kube-prom-alertmanager
        namespace: prometheus
        pathPrefix: /
        port: http-web
  containers:
    - name: prometheus
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
    - name: config-reloader
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
  enableAdminAPI: false
  evaluationInterval: 30s
  externalUrl: 'http://prometheus-stack-kube-prom-prometheus.prometheus:9090'
  image: 'quay.io/prometheus/prometheus:v2.37.0'
  imagePullSecrets:
    - name: mcps-registry-image-pull-secret
  initContainers:
    - name: init-config-reloader
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
  listenLocal: false
  logFormat: logfmt
  logLevel: info
  paused: false
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  portName: http-web
  probeNamespaceSelector: {}
  probeSelector: {}
  replicas: 2
  resources:
    requests:
      memory: 700Mi
  retention: 6h
  routePrefix: /
  ruleNamespaceSelector: {}
  ruleSelector: {}
  scrapeInterval: 10s
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: prometheus-stack-kube-prom-prometheus
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  shards: 1
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi
        storageClassName: default
  topologySpreadConstraints:
    - labelSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
      maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
    - labelSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
      maxSkew: 1
      topologyKey: topology.kubernetes.io/zone
      whenUnsatisfiable: DoNotSchedule
  version: v2.37.0
status:
  availableReplicas: 0
  conditions:
    - lastTransitionTime: '2022-09-02T14:55:03Z'
      message: >-
        shard 0: pod prometheus-prometheus-stack-kube-prom-prometheus-0: 0/5
        nodes are available: 2 node(s) didn't match Pod's node
        affinity/selector, 3 node(s) were unschedulable.

        shard 0: pod prometheus-prometheus-stack-kube-prom-prometheus-1: 0/5
        nodes are available: 2 node(s) didn't match Pod's node
        affinity/selector, 3 node(s) were unschedulable.
      reason: NoPodReady
      status: 'False'
      type: Available
    - lastTransitionTime: '2022-09-02T09:57:03Z'
      status: 'True'
      type: Reconciled
  paused: false
  replicas: 2
  shardStatuses:
    - availableReplicas: 0
      replicas: 2
      shardID: '0'
      unavailableReplicas: 2
      updatedReplicas: 2
  unavailableReplicas: 2
  updatedReplicas: 2

resource_customizations/monitoring.coreos.com/Prometheus/testdata/healthy.yaml

@@ -0,0 +1,130 @@
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  annotations:
    argocd.argoproj.io/tracking-id: prometheus-stack:monitoring.coreos.com/Prometheus:prometheus/prometheus-stack-kube-prom-prometheus
  creationTimestamp: "2021-12-09T15:51:10Z"
  generation: 46
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/instance: prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: 39.11.0
    chart: kube-prometheus-stack-39.11.0
    heritage: Helm
    release: prometheus-stack
  name: prometheus-stack-kube-prom-prometheus
  namespace: prometheus
  resourceVersion: "200165695"
  uid: 6f2e1016-926d-44e7-945b-dec4c975595b
spec:
  additionalScrapeConfigs:
    key: prometheus-additional.yaml
    name: additional-scrape-configs
  alerting:
    alertmanagers:
      - apiVersion: v2
        name: prometheus-stack-kube-prom-alertmanager
        namespace: prometheus
        pathPrefix: /
        port: http-web
  containers:
    - name: prometheus
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
    - name: config-reloader
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
  enableAdminAPI: false
  evaluationInterval: 30s
  externalUrl: http://prometheus-stack-kube-prom-prometheus.prometheus:9090
  image: quay.io/prometheus/prometheus:v2.37.0
  initContainers:
    - name: init-config-reloader
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
  listenLocal: false
  logFormat: logfmt
  logLevel: info
  paused: false
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  portName: http-web
  probeNamespaceSelector: {}
  probeSelector: {}
  replicas: 2
  resources:
    requests:
      memory: 700Mi
  retention: 6h
  routePrefix: /
  ruleNamespaceSelector: {}
  ruleSelector: {}
  scrapeInterval: 10s
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: prometheus-stack-kube-prom-prometheus
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  shards: 1
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi
        storageClassName: default
  topologySpreadConstraints:
    - labelSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
      maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
    - labelSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
      maxSkew: 1
      topologyKey: topology.kubernetes.io/zone
      whenUnsatisfiable: DoNotSchedule
  version: v2.37.0
status:
  availableReplicas: 2
  conditions:
    - lastTransitionTime: "2022-09-01T19:54:00Z"
      status: "True"
      type: Available
    - lastTransitionTime: "2022-09-02T09:57:03Z"
      status: "True"
      type: Reconciled
  paused: false
  replicas: 2
  shardStatuses:
    - availableReplicas: 2
      replicas: 2
      shardID: "0"
      unavailableReplicas: 0
      updatedReplicas: 2
  unavailableReplicas: 0
  updatedReplicas: 2

resource_customizations/monitoring.coreos.com/Prometheus/testdata/progressing.yaml

@@ -0,0 +1,132 @@
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  annotations:
    argocd.argoproj.io/tracking-id: >-
      prometheus-stack:monitoring.coreos.com/Prometheus:prometheus/prometheus-stack-kube-prom-prometheus
  creationTimestamp: '2021-12-09T15:51:10Z'
  generation: 46
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/instance: prometheus-stack
    app.kubernetes.io/managed-by: Helm
    app.kubernetes.io/part-of: kube-prometheus-stack
    app.kubernetes.io/version: 39.11.0
    chart: kube-prometheus-stack-39.11.0
    heritage: Helm
    release: prometheus-stack
  name: prometheus-stack-kube-prom-prometheus
  namespace: prometheus
  resourceVersion: '200307978'
  uid: 6f2e1016-926d-44e7-945b-dec4c975595b
spec:
  additionalScrapeConfigs:
    key: prometheus-additional.yaml
    name: additional-scrape-configs
  alerting:
    alertmanagers:
      - apiVersion: v2
        name: prometheus-stack-kube-prom-alertmanager
        namespace: prometheus
        pathPrefix: /
        port: http-web
  containers:
    - name: prometheus
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
    - name: config-reloader
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
  enableAdminAPI: false
  evaluationInterval: 30s
  externalUrl: 'http://prometheus-stack-kube-prom-prometheus.prometheus:9090'
  image: 'quay.io/prometheus/prometheus:v2.37.0'
  initContainers:
    - name: init-config-reloader
      securityContext:
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        readOnlyRootFilesystem: true
        runAsNonRoot: true
  listenLocal: false
  logFormat: logfmt
  logLevel: info
  paused: false
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  portName: http-web
  probeNamespaceSelector: {}
  probeSelector: {}
  replicas: 2
  resources:
    requests:
      memory: 700Mi
  retention: 6h
  routePrefix: /
  ruleNamespaceSelector: {}
  ruleSelector: {}
  scrapeInterval: 10s
  securityContext:
    fsGroup: 2000
    runAsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: prometheus-stack-kube-prom-prometheus
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  shards: 1
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 100Gi
        storageClassName: default
  topologySpreadConstraints:
    - labelSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
      maxSkew: 1
      topologyKey: kubernetes.io/hostname
      whenUnsatisfiable: ScheduleAnyway
    - labelSelector:
        matchLabels:
          app.kubernetes.io/name: prometheus
      maxSkew: 1
      topologyKey: topology.kubernetes.io/zone
      whenUnsatisfiable: DoNotSchedule
  version: v2.37.0
status:
  availableReplicas: 1
  conditions:
    - lastTransitionTime: '2022-09-02T14:34:39Z'
      reason: SomePodsNotReady
      status: Degraded
      type: Available
    - lastTransitionTime: '2022-09-02T09:57:03Z'
      status: 'True'
      type: Reconciled
  paused: false
  replicas: 2
  shardStatuses:
    - availableReplicas: 1
      replicas: 2
      shardID: '0'
      unavailableReplicas: 1
      updatedReplicas: 1
  unavailableReplicas: 1
  updatedReplicas: 1