feat: update health checks for Numaflow resources (#25698)

Signed-off-by: Dillen Padhiar <dillen_padhiar@intuit.com>
This commit is contained in:
Dillen Padhiar
2025-12-18 06:05:53 -08:00
committed by GitHub
parent b1a93b4756
commit 6f0de8b858
12 changed files with 272 additions and 9 deletions

View File

@@ -6,7 +6,7 @@ actions["force-promote"] = {
-- force-promote
local forcePromote = false
if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress") then
if obj.metadata.labels ~= nil and (obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress" or obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "trial") then
forcePromote = true
end
if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/force-promote"] == "true") then

View File

@@ -11,11 +11,14 @@ if obj.status ~= nil then
end
end
-- True only when the object carries the label
-- numaplane.numaproj.io/progressive-result-state with the value "failed";
-- the nil guard keeps the lookup safe when the object has no labels table.
-- NOTE(review): assigned without `local`, so this is a global variable; confirm
-- nothing outside this chunk relies on that before scoping it.
-- NOTE(review): the `healthy ~= {}` test in the condition that follows compares
-- against a freshly constructed table; tables compare by reference, so that
-- sub-expression is always true and only `healthy.status == "False"` decides.
progressiveFailure = (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/progressive-result-state"] == "failed")
if obj.metadata.generation == obj.status.observedGeneration then
if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" then
if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" or progressiveFailure then
hs.status = "Degraded"
if obj.status.phase == "Failed" then
hs.message = obj.status.message
elseif progressiveFailure then
hs.message = "Failed progressive upgrade"
else
hs.message = healthy.message
end

View File

@@ -10,4 +10,8 @@ tests:
- healthStatus:
status: Degraded
message: "Waiting for 3 pods to be ready...\n"
inputPath: testdata/degraded.yaml
inputPath: testdata/degraded.yaml
- healthStatus:
status: Degraded
message: "Failed progressive upgrade"
inputPath: testdata/degraded-progressive.yaml

View File

@@ -0,0 +1,80 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"InterStepBufferService","metadata":{"annotations":{},"name":"default","namespace":"numaflow-system"},"spec":{"jetstream":{"persistence":{"volumeSize":"3Gi"},"version":"latest"}}}
creationTimestamp: "2024-10-08T18:21:09Z"
finalizers:
- isbsvc-controller
generation: 1
name: default
namespace: numaflow-system
resourceVersion: "357862"
uid: e175db66-3918-4ef8-993d-12b37eb9a964
# The label below is the only failure signal in this manifest: every status
# condition listed further down is "True" and the status phase is Running.
# NOTE(review): presumably this is the degraded-progressive test input; the
# file name is not visible in this view, so confirm.
labels:
numaplane.numaproj.io/progressive-result-state: "failed"
spec:
jetstream:
persistence:
volumeSize: 3Gi
replicas: 3
version: latest
status:
conditions:
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: |
partitioned roll out complete: 3 new pods have been updated...
reason: Healthy
status: "True"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-default-js-client-auth
user:
key: client-auth-user
name: isbsvc-default-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-default-js-svc.numaflow-system.svc:4222
observedGeneration: 1
phase: Running
type: jetstream

View File

@@ -45,7 +45,7 @@ end
-- force-promote
local forcePromote = false
if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress") then
if obj.metadata.labels ~= nil and (obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress" or obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "trial") then
forcePromote = true
end
if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/force-promote"] == "true") then

View File

@@ -10,11 +10,14 @@ if obj.status ~= nil then
end
end
-- True only when the object carries the label
-- numaplane.numaproj.io/progressive-result-state with the value "failed";
-- the nil guard keeps the lookup safe when the object has no labels table.
-- NOTE(review): assigned without `local`, so this is a global variable; confirm
-- nothing outside this chunk relies on that before scoping it.
-- NOTE(review): the `healthy ~= {}` test in the condition that follows compares
-- against a freshly constructed table; tables compare by reference, so that
-- sub-expression is always true and only `healthy.status == "False"` decides.
progressiveFailure = (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/progressive-result-state"] == "failed")
if obj.metadata.generation == obj.status.observedGeneration then
if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" then
if (healthy ~= {} and healthy.status == "False") or (obj.status.phase == "Failed") or progressiveFailure then
hs.status = "Degraded"
if obj.status.phase == "Failed" then
hs.message = obj.status.message
elseif progressiveFailure then
hs.message = "Failed progressive upgrade"
else
hs.message = "Subresources are unhealthy"
end

View File

@@ -14,4 +14,8 @@ tests:
- healthStatus:
status: Healthy
message: "MonoVertex is paused"
inputPath: testdata/paused.yaml
inputPath: testdata/paused.yaml
- healthStatus:
status: Degraded
message: "Failed progressive upgrade"
inputPath: testdata/degraded-progressive.yaml

View File

@@ -0,0 +1,62 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: MonoVertex
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"MonoVertex","metadata":{"annotations":{},"name":"simple-mono-vertex","namespace":"numaflow-system"},"spec":{"sink":{"udsink":{"container":{"image":"quay.io/numaio/numaflow-java/simple-sink:stable"}}},"source":{"transformer":{"container":{"image":"quay.io/numaio/numaflow-rs/source-transformer-now:stable"}},"udsource":{"container":{"image":"quay.io/numaio/numaflow-java/source-simple-source:stable"}}}}}
creationTimestamp: "2024-10-08T20:34:32Z"
generation: 1
name: simple-mono-vertex
namespace: numaflow-system
resourceVersion: "367420"
uid: 7bc9291a-9c80-4ec1-8b06-46fac8f7e507
# The label below is the only failure signal in this manifest: every status
# condition listed further down is "True" and the status phase is Running.
# NOTE(review): presumably this is the degraded-progressive test input; the
# file name is not visible in this view, so confirm.
labels:
numaplane.numaproj.io/progressive-result-state: "failed"
spec:
lifecycle:
desiredPhase: Running
replicas: 1
sink:
udsink:
container:
image: quay.io/numaio/numaflow-java/simple-sink:stable
source:
transformer:
container:
image: quay.io/numaio/numaflow-rs/source-transformer-now:stable
udsource:
container:
image: quay.io/numaio/numaflow-java/source-simple-source:stable
updateStrategy:
rollingUpdate:
maxUnavailable: 25%
type: RollingUpdate
status:
conditions:
- lastTransitionTime: "2024-10-08T20:34:36Z"
message: Successful
reason: Successful
status: "True"
type: DaemonHealthy
- lastTransitionTime: "2024-10-08T20:34:32Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
- lastTransitionTime: "2024-10-08T21:58:03Z"
message: All pods are healthy
reason: Running
status: "True"
type: PodsHealthy
currentHash: 8ed34d9058faa60997ee13083ccb3d80691df37b45a34eaa347af99f237e8df6
desiredReplicas: 1
lastScaledAt: "2024-10-08T20:34:32Z"
lastUpdated: "2024-10-08T21:58:13Z"
observedGeneration: 1
phase: Running
readyReplicas: 1
replicas: 1
selector: app.kubernetes.io/component=mono-vertex,numaflow.numaproj.io/mono-vertex-name=simple-mono-vertex
updateHash: 8ed34d9058faa60997ee13083ccb3d80691df37b45a34eaa347af99f237e8df6
updatedReadyReplicas: 1
updatedReplicas: 1

View File

@@ -45,7 +45,7 @@ end
-- force-promote
local forcePromote = false
if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress") then
if obj.metadata.labels ~= nil and (obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress" or obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "trial") then
forcePromote = true
end
if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/force-promote"] == "true") then

View File

@@ -10,11 +10,14 @@ if obj.status ~= nil then
end
end
-- True only when the object carries the label
-- numaplane.numaproj.io/progressive-result-state with the value "failed";
-- the nil guard keeps the lookup safe when the object has no labels table.
-- NOTE(review): assigned without `local`, so this is a global variable; confirm
-- nothing outside this chunk relies on that before scoping it.
-- NOTE(review): the `healthy ~= {}` test in the condition that follows compares
-- against a freshly constructed table; tables compare by reference, so that
-- sub-expression is always true and only `healthy.status == "False"` decides.
progressiveFailure = (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/progressive-result-state"] == "failed")
if obj.metadata.generation == obj.status.observedGeneration then
if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" then
if (healthy ~= {} and healthy.status == "False") or (obj.status.phase == "Failed") or progressiveFailure then
hs.status = "Degraded"
if obj.status.phase == "Failed" then
hs.message = obj.status.message
elseif progressiveFailure then
hs.message = "Failed progressive upgrade"
else
hs.message = "Subresources are unhealthy"
end

View File

@@ -14,4 +14,8 @@ tests:
- healthStatus:
status: Healthy
message: "Pipeline is paused"
inputPath: testdata/paused.yaml
inputPath: testdata/paused.yaml
- healthStatus:
status: Degraded
message: "Failed progressive upgrade"
inputPath: testdata/degraded-progressive.yaml

View File

@@ -0,0 +1,100 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: Pipeline
metadata:
creationTimestamp: "2024-10-08T18:22:18Z"
finalizers:
- pipeline-controller
generation: 1
name: simple-pipeline
namespace: numaflow-system
resourceVersion: "358080"
uid: bb6cc91c-eb05-4fe7-9380-63b9532a85db
# The label below is the only failure signal in this manifest: every status
# condition listed further down is "True" and the status phase is Running.
# NOTE(review): presumably this is the degraded-progressive test input; the
# file name is not visible in this view, so confirm.
labels:
numaplane.numaproj.io/progressive-result-state: "failed"
spec:
edges:
- from: in
to: cat
- from: cat
to: out
lifecycle:
deleteGracePeriodSeconds: 30
desiredPhase: Running
pauseGracePeriodSeconds: 30
limits:
bufferMaxLength: 30000
bufferUsageLimit: 80
readBatchSize: 500
readTimeout: 1s
vertices:
- name: in
scale:
min: 1
source:
generator:
duration: 1s
jitter: 0s
msgSize: 8
rpu: 5
updateStrategy:
rollingUpdate:
maxUnavailable: 25%
type: RollingUpdate
- name: cat
scale:
min: 1
udf:
builtin:
name: cat
updateStrategy:
rollingUpdate:
maxUnavailable: 25%
type: RollingUpdate
- name: out
scale:
min: 1
sink:
log: {}
updateStrategy:
rollingUpdate:
maxUnavailable: 25%
type: RollingUpdate
watermark:
disabled: false
maxDelay: 0s
status:
conditions:
- lastTransitionTime: "2024-10-08T18:22:49Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T18:22:49Z"
message: Successful
reason: Successful
status: "True"
type: DaemonServiceHealthy
- lastTransitionTime: "2024-10-08T18:22:49Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
- lastTransitionTime: "2024-10-08T18:22:49Z"
message: No Side Inputs attached to the pipeline
reason: NoSideInputs
status: "True"
type: SideInputsManagersHealthy
- lastTransitionTime: "2024-10-08T18:22:49Z"
message: All vertices are healthy
reason: Successful
status: "True"
type: VerticesHealthy
lastUpdated: "2024-10-08T18:22:49Z"
mapUDFCount: 1
observedGeneration: 1
phase: Running
reduceUDFCount: 0
sinkCount: 1
sourceCount: 1
udfCount: 1
vertexCount: 3