diff --git a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/actions/discovery.lua b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/actions/discovery.lua index 8f6cea0665..7bfa5fb281 100644 --- a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/actions/discovery.lua +++ b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/actions/discovery.lua @@ -6,7 +6,7 @@ actions["force-promote"] = { -- force-promote local forcePromote = false -if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress") then +if obj.metadata.labels ~= nil and (obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress" or obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "trial") then forcePromote = true end if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/force-promote"] == "true") then diff --git a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health.lua b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health.lua index 8010a34924..ba60faa07b 100644 --- a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health.lua +++ b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health.lua @@ -11,11 +11,14 @@ if obj.status ~= nil then end end + progressiveFailure = (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/progressive-result-state"] == "failed") if obj.metadata.generation == obj.status.observedGeneration then - if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" then + if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" or progressiveFailure then hs.status = "Degraded" if obj.status.phase == "Failed" then hs.message = obj.status.message + elseif progressiveFailure then + hs.message = "Failed progressive upgrade" else hs.message = healthy.message end diff --git a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health_test.yaml b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health_test.yaml index 1c4ff922da..4b82e62e3d 100644 --- a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health_test.yaml +++ b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/health_test.yaml @@ -10,4 +10,8 @@ tests: - healthStatus: status: Degraded message: "Waiting for 3 pods to be ready...\n" - inputPath: testdata/degraded.yaml \ No newline at end of file + inputPath: testdata/degraded.yaml +- healthStatus: + status: Degraded + message: "Failed progressive upgrade" + inputPath: testdata/degraded-progressive.yaml \ No newline at end of file diff --git a/resource_customizations/numaflow.numaproj.io/InterStepBufferService/testdata/degraded-progressive.yaml b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/testdata/degraded-progressive.yaml new file mode 100644 index 0000000000..3d9fc0f562 --- /dev/null +++ b/resource_customizations/numaflow.numaproj.io/InterStepBufferService/testdata/degraded-progressive.yaml @@ -0,0 +1,80 @@ +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: InterStepBufferService +metadata: + annotations: + kubectl.kubernetes.io/last-applied-configuration: | + {"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"InterStepBufferService","metadata":{"annotations":{},"name":"default","namespace":"numaflow-system"},"spec":{"jetstream":{"persistence":{"volumeSize":"3Gi"},"version":"latest"}}} + creationTimestamp: "2024-10-08T18:21:09Z" + finalizers: + - isbsvc-controller + generation: 1 + name: default + namespace: numaflow-system + resourceVersion: "357862" + uid: e175db66-3918-4ef8-993d-12b37eb9a964 + labels: + numaplane.numaproj.io/progressive-result-state: "failed" +spec: + jetstream: + persistence: + volumeSize: 3Gi + replicas: 3 + version: latest +status: + conditions: + - lastTransitionTime: "2024-10-08T18:21:53Z" + message: | + partitioned roll out complete: 3 new pods have been updated... + reason: Healthy + status: "True" + type: ChildrenResourcesHealthy + - lastTransitionTime: "2024-10-08T18:21:53Z" + message: Successful + reason: Successful + status: "True" + type: Configured + - lastTransitionTime: "2024-10-08T18:21:53Z" + message: Successful + reason: Successful + status: "True" + type: Deployed + config: + jetstream: + auth: + basic: + password: + key: client-auth-password + name: isbsvc-default-js-client-auth + user: + key: client-auth-user + name: isbsvc-default-js-client-auth + streamConfig: | + consumer: + ackwait: 60s + maxackpending: 25000 + otbucket: + history: 1 + maxbytes: 0 + maxvaluesize: 0 + replicas: 3 + storage: 0 + ttl: 3h + procbucket: + history: 1 + maxbytes: 0 + maxvaluesize: 0 + replicas: 3 + storage: 0 + ttl: 72h + stream: + duplicates: 60s + maxage: 72h + maxbytes: -1 + maxmsgs: 100000 + replicas: 3 + retention: 0 + storage: 0 + url: nats://isbsvc-default-js-svc.numaflow-system.svc:4222 + observedGeneration: 1 + phase: Running + type: jetstream \ No newline at end of file diff --git a/resource_customizations/numaflow.numaproj.io/MonoVertex/actions/discovery.lua b/resource_customizations/numaflow.numaproj.io/MonoVertex/actions/discovery.lua index 075a05bf58..6bc517eb59 100644 --- a/resource_customizations/numaflow.numaproj.io/MonoVertex/actions/discovery.lua +++ b/resource_customizations/numaflow.numaproj.io/MonoVertex/actions/discovery.lua @@ -45,7 +45,7 @@ end -- force-promote local forcePromote = false -if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress") then +if obj.metadata.labels ~= nil and (obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress" or obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "trial") then forcePromote = true end if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/force-promote"] == "true") then diff --git a/resource_customizations/numaflow.numaproj.io/MonoVertex/health.lua b/resource_customizations/numaflow.numaproj.io/MonoVertex/health.lua index a36b9dd0a2..8f11dec810 100644 --- a/resource_customizations/numaflow.numaproj.io/MonoVertex/health.lua +++ b/resource_customizations/numaflow.numaproj.io/MonoVertex/health.lua @@ -10,11 +10,14 @@ if obj.status ~= nil then end end + progressiveFailure = (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/progressive-result-state"] == "failed") if obj.metadata.generation == obj.status.observedGeneration then - if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" then + if (healthy ~= {} and healthy.status == "False") or (obj.status.phase == "Failed") or progressiveFailure then hs.status = "Degraded" if obj.status.phase == "Failed" then hs.message = obj.status.message + elseif progressiveFailure then + hs.message = "Failed progressive upgrade" else hs.message = "Subresources are unhealthy" end diff --git a/resource_customizations/numaflow.numaproj.io/MonoVertex/health_test.yaml b/resource_customizations/numaflow.numaproj.io/MonoVertex/health_test.yaml index e9bda38cb8..d15ae4ffe8 100644 --- a/resource_customizations/numaflow.numaproj.io/MonoVertex/health_test.yaml +++ b/resource_customizations/numaflow.numaproj.io/MonoVertex/health_test.yaml @@ -14,4 +14,8 @@ tests: - healthStatus: status: Healthy message: "MonoVertex is paused" - inputPath: testdata/paused.yaml \ No newline at end of file + inputPath: testdata/paused.yaml +- healthStatus: + status: Degraded + message: "Failed progressive upgrade" + inputPath: testdata/degraded-progressive.yaml \ No newline at end of file diff --git a/resource_customizations/numaflow.numaproj.io/MonoVertex/testdata/degraded-progressive.yaml b/resource_customizations/numaflow.numaproj.io/MonoVertex/testdata/degraded-progressive.yaml new file mode 100644 index 0000000000..a07885aa6d --- /dev/null +++ b/resource_customizations/numaflow.numaproj.io/MonoVertex/testdata/degraded-progressive.yaml @@ -0,0 +1,62 @@ +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: MonoVertex +metadata: + annotations: + kubectl.kubernetes.io/last-applied-configuration: | + {"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"MonoVertex","metadata":{"annotations":{},"name":"simple-mono-vertex","namespace":"numaflow-system"},"spec":{"sink":{"udsink":{"container":{"image":"quay.io/numaio/numaflow-java/simple-sink:stable"}}},"source":{"transformer":{"container":{"image":"quay.io/numaio/numaflow-rs/source-transformer-now:stable"}},"udsource":{"container":{"image":"quay.io/numaio/numaflow-java/source-simple-source:stable"}}}}} + creationTimestamp: "2024-10-08T20:34:32Z" + generation: 1 + name: simple-mono-vertex + namespace: numaflow-system + resourceVersion: "367420" + uid: 7bc9291a-9c80-4ec1-8b06-46fac8f7e507 + labels: + numaplane.numaproj.io/progressive-result-state: "failed" +spec: + lifecycle: + desiredPhase: Running + replicas: 1 + sink: + udsink: + container: + image: quay.io/numaio/numaflow-java/simple-sink:stable + source: + transformer: + container: + image: quay.io/numaio/numaflow-rs/source-transformer-now:stable + udsource: + container: + image: quay.io/numaio/numaflow-java/source-simple-source:stable + updateStrategy: + rollingUpdate: + maxUnavailable: 25% + type: RollingUpdate +status: + conditions: + - lastTransitionTime: "2024-10-08T20:34:36Z" + message: Successful + reason: Successful + status: "True" + type: DaemonHealthy + - lastTransitionTime: "2024-10-08T20:34:32Z" + message: Successful + reason: Successful + status: "True" + type: Deployed + - lastTransitionTime: "2024-10-08T21:58:03Z" + message: All pods are healthy + reason: Running + status: "True" + type: PodsHealthy + currentHash: 8ed34d9058faa60997ee13083ccb3d80691df37b45a34eaa347af99f237e8df6 + desiredReplicas: 1 + lastScaledAt: "2024-10-08T20:34:32Z" + lastUpdated: "2024-10-08T21:58:13Z" + observedGeneration: 1 + phase: Running + readyReplicas: 1 + replicas: 1 + selector: app.kubernetes.io/component=mono-vertex,numaflow.numaproj.io/mono-vertex-name=simple-mono-vertex + updateHash: 8ed34d9058faa60997ee13083ccb3d80691df37b45a34eaa347af99f237e8df6 + updatedReadyReplicas: 1 + updatedReplicas: 1 \ No newline at end of file diff --git a/resource_customizations/numaflow.numaproj.io/Pipeline/actions/discovery.lua b/resource_customizations/numaflow.numaproj.io/Pipeline/actions/discovery.lua index 075a05bf58..6bc517eb59 100644 --- a/resource_customizations/numaflow.numaproj.io/Pipeline/actions/discovery.lua +++ b/resource_customizations/numaflow.numaproj.io/Pipeline/actions/discovery.lua @@ -45,7 +45,7 @@ end -- force-promote local forcePromote = false -if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress") then +if obj.metadata.labels ~= nil and (obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "in-progress" or obj.metadata.labels["numaplane.numaproj.io/upgrade-state"] == "trial") then forcePromote = true end if (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/force-promote"] == "true") then diff --git a/resource_customizations/numaflow.numaproj.io/Pipeline/health.lua b/resource_customizations/numaflow.numaproj.io/Pipeline/health.lua index b634298a38..ed8dc106ef 100644 --- a/resource_customizations/numaflow.numaproj.io/Pipeline/health.lua +++ b/resource_customizations/numaflow.numaproj.io/Pipeline/health.lua @@ -10,11 +10,14 @@ if obj.status ~= nil then end end + progressiveFailure = (obj.metadata.labels ~= nil and obj.metadata.labels["numaplane.numaproj.io/progressive-result-state"] == "failed") if obj.metadata.generation == obj.status.observedGeneration then - if (healthy ~= {} and healthy.status == "False") or obj.status.phase == "Failed" then + if (healthy ~= {} and healthy.status == "False") or (obj.status.phase == "Failed") or progressiveFailure then hs.status = "Degraded" if obj.status.phase == "Failed" then hs.message = obj.status.message + elseif progressiveFailure then + hs.message = "Failed progressive upgrade" else hs.message = "Subresources are unhealthy" end diff --git a/resource_customizations/numaflow.numaproj.io/Pipeline/health_test.yaml b/resource_customizations/numaflow.numaproj.io/Pipeline/health_test.yaml index 3faf80baaf..8bd5e10a2c 100644 --- a/resource_customizations/numaflow.numaproj.io/Pipeline/health_test.yaml +++ b/resource_customizations/numaflow.numaproj.io/Pipeline/health_test.yaml @@ -14,4 +14,8 @@ tests: - healthStatus: status: Healthy message: "Pipeline is paused" - inputPath: testdata/paused.yaml \ No newline at end of file + inputPath: testdata/paused.yaml +- healthStatus: + status: Degraded + message: "Failed progressive upgrade" + inputPath: testdata/degraded-progressive.yaml \ No newline at end of file diff --git a/resource_customizations/numaflow.numaproj.io/Pipeline/testdata/degraded-progressive.yaml b/resource_customizations/numaflow.numaproj.io/Pipeline/testdata/degraded-progressive.yaml new file mode 100644 index 0000000000..eec1933d8b --- /dev/null +++ b/resource_customizations/numaflow.numaproj.io/Pipeline/testdata/degraded-progressive.yaml @@ -0,0 +1,100 @@ +apiVersion: numaflow.numaproj.io/v1alpha1 +kind: Pipeline +metadata: + creationTimestamp: "2024-10-08T18:22:18Z" + finalizers: + - pipeline-controller + generation: 1 + name: simple-pipeline + namespace: numaflow-system + resourceVersion: "358080" + uid: bb6cc91c-eb05-4fe7-9380-63b9532a85db + labels: + numaplane.numaproj.io/progressive-result-state: "failed" +spec: + edges: + - from: in + to: cat + - from: cat + to: out + lifecycle: + deleteGracePeriodSeconds: 30 + desiredPhase: Running + pauseGracePeriodSeconds: 30 + limits: + bufferMaxLength: 30000 + bufferUsageLimit: 80 + readBatchSize: 500 + readTimeout: 1s + vertices: + - name: in + scale: + min: 1 + source: + generator: + duration: 1s + jitter: 0s + msgSize: 8 + rpu: 5 + updateStrategy: + rollingUpdate: + maxUnavailable: 25% + type: RollingUpdate + - name: cat + scale: + min: 1 + udf: + builtin: + name: cat + updateStrategy: + rollingUpdate: + maxUnavailable: 25% + type: RollingUpdate + - name: out + scale: + min: 1 + sink: + log: {} + updateStrategy: + rollingUpdate: + maxUnavailable: 25% + type: RollingUpdate + watermark: + disabled: false + maxDelay: 0s +status: + conditions: + - lastTransitionTime: "2024-10-08T18:22:49Z" + message: Successful + reason: Successful + status: "True" + type: Configured + - lastTransitionTime: "2024-10-08T18:22:49Z" + message: Successful + reason: Successful + status: "True" + type: DaemonServiceHealthy + - lastTransitionTime: "2024-10-08T18:22:49Z" + message: Successful + reason: Successful + status: "True" + type: Deployed + - lastTransitionTime: "2024-10-08T18:22:49Z" + message: No Side Inputs attached to the pipeline + reason: NoSideInputs + status: "True" + type: SideInputsManagersHealthy + - lastTransitionTime: "2024-10-08T18:22:49Z" + message: All vertices are healthy + reason: Successful + status: "True" + type: VerticesHealthy + lastUpdated: "2024-10-08T18:22:49Z" + mapUDFCount: 1 + observedGeneration: 1 + phase: Running + reduceUDFCount: 0 + sinkCount: 1 + sourceCount: 1 + udfCount: 1 + vertexCount: 3 \ No newline at end of file