feat: Add health checks for Coralogix (#23853)

Signed-off-by: Daniel Leinov <daniellei@jfrog.com>
Co-authored-by: Pasha Kostohrys <pasha.kostohrys@gmail.com>
This commit is contained in:
daniel-leinov
2025-07-24 13:03:55 +03:00
committed by GitHub
parent 0a1572b9d9
commit dcf1965c52
10 changed files with 337 additions and 0 deletions

View File

@@ -0,0 +1,21 @@
-- Health assessment driven by the resource's "RemoteSynced" status condition.
--
-- Input:  `obj`, the resource manifest as a table, supplied by the host that
--         runs this script (presumably Argo CD's custom health check
--         runtime -- confirm against the surrounding repository).
-- Output: a table with `status` and `message` fields:
--   * Healthy     - a RemoteSynced condition has status "True"
--   * Degraded    - a RemoteSynced condition has status "False"
--   * Progressing - no status/conditions yet, no RemoteSynced condition, or
--                   the condition carries any other status value
--
-- `hs` is declared local so the result table does not leak into the global
-- environment of the Lua state executing the script.
local hs = {
  status = "Progressing",
  message = "Waiting for status to be updated",
}

if obj.status ~= nil and obj.status.conditions ~= nil then
  -- The index is not needed; only the condition entries are inspected.
  for _, condition in ipairs(obj.status.conditions) do
    if condition.type == "RemoteSynced" then
      if condition.status == "True" then
        hs.status = "Healthy"
        hs.message = "Resource is ready"
        return hs
      elseif condition.status == "False" then
        hs.status = "Degraded"
        -- Report the condition's own message; fall back to its reason (or a
        -- fixed text) so a Degraded result never carries a nil message.
        hs.message = condition.message
          or condition.reason
          or "RemoteSynced condition is False"
        return hs
      end
    end
  end
end

-- Nothing conclusive was found: keep the default Progressing result.
return hs

View File

@@ -0,0 +1,16 @@
# Expected health results, one entry per fixture under testdata/.
tests:
  # Fixture whose RemoteSynced condition is "False"; the expected message is
  # the condition's message (folded into a single line).
  - inputPath: testdata/degraded_alert.yaml
    healthStatus:
      status: Degraded
      message: >-
        error on extracting alert properties:
        failed to expand notification group:
        failed to expand webhooks settings:
        failed to expand webhook setting:
        failed to expand integration:
        failed to convert name to integration ID:
        webhook critical-alerts-webhook not found
  # Fixture without a status section.
  - inputPath: testdata/progressing_alert.yaml
    healthStatus:
      status: Progressing
      message: 'Waiting for status to be updated'
  # Fixture whose RemoteSynced condition is "True".
  - inputPath: testdata/healthy_alert.yaml
    healthStatus:
      status: Healthy
      message: 'Resource is ready'

View File

@@ -0,0 +1,58 @@
# Fixture: Alert whose RemoteSynced condition is "False" because one of the
# referenced webhooks (critical-alerts-webhook) could not be resolved.
apiVersion: coralogix.com/v1beta1
kind: Alert
metadata:
  name: bitbucketcontainernotrunning-test
spec:
  alertType:
    metricThreshold:
      metricFilter:
        promql: 'sum({namespace="bitbucket",pod=~"bitbucket-k8s-.*",condition="false"}) by (pod)'
      missingValues:
        replaceWithZero: true
      rules:
        - condition:
            conditionType: moreThan
            forOverPct: 100
            ofTheLast:
              specificValue: 5m
            threshold: 0
          override:
            priority: p1
  description: 'Bitbucket one of the container is not running'
  entityLabels:
    app: bitbucket
  name: Bitbucketcontainernotrunning-test
  notificationGroup:
    groupByKeys:
      - pod
    webhooks:
      - integration:
          integrationRef:
            backendRef:
              name: opsgenie-example
        notifyOn: triggeredAndResolved
        retriggeringPeriod:
          minutes: 60
      # This is the webhook named in the failure message below.
      - integration:
          integrationRef:
            backendRef:
              name: critical-alerts-webhook
        notifyOn: triggeredAndResolved
        retriggeringPeriod:
          minutes: 60
  priority: p1
status:
  conditions:
    - type: RemoteSynced
      status: "False"
      reason: RemoteCreationFailed
      observedGeneration: 1
      lastTransitionTime: "2025-07-17T07:39:54Z"
      # Folded scalar: the line breaks below become single spaces.
      message: >-
        error on extracting alert properties:
        failed to expand notification group:
        failed to expand webhooks settings:
        failed to expand webhook setting:
        failed to expand integration:
        failed to convert name to integration ID:
        webhook critical-alerts-webhook not found

View File

@@ -0,0 +1,53 @@
# Fixture: Alert whose RemoteSynced condition is "True".
apiVersion: coralogix.com/v1beta1
kind: Alert
metadata:
  name: bitbucketcontainernotrunning-test
spec:
  alertType:
    metricThreshold:
      metricFilter:
        promql: 'sum({namespace="bitbucket",pod=~"bitbucket-k8s-.*",condition="false"}) by (pod)'
      missingValues:
        replaceWithZero: true
      rules:
        - condition:
            conditionType: moreThan
            forOverPct: 100
            ofTheLast:
              specificValue: 5m
            threshold: 0
          override:
            priority: p1
  description: 'Bitbucket one of the container is not running'
  entityLabels:
    app: bitbucket
  name: Bitbucketcontainernotrunning-test
  notificationGroup:
    groupByKeys:
      - pod
    webhooks:
      - integration:
          integrationRef:
            backendRef:
              name: opsgenie-example
        notifyOn: triggeredAndResolved
        retriggeringPeriod:
          minutes: 60
      - integration:
          integrationRef:
            backendRef:
              name: critical-alerts-webhook
        notifyOn: triggeredAndResolved
        retriggeringPeriod:
          minutes: 60
  priority: p1
status:
  conditions:
    - type: RemoteSynced
      status: "True"
      reason: RemoteSyncedSuccessfully
      observedGeneration: 3
      lastTransitionTime: "2025-07-17T07:39:55Z"
      message: 'Remote resource synced'

View File

@@ -0,0 +1,46 @@
# Fixture: Alert that has no status section at all (nothing reported yet).
apiVersion: coralogix.com/v1beta1
kind: Alert
metadata:
  name: bitbucketcontainernotrunning-test
spec:
  alertType:
    metricThreshold:
      metricFilter:
        promql: 'sum({namespace="bitbucket",pod=~"bitbucket-k8s-.*",condition="false"}) by (pod)'
      missingValues:
        replaceWithZero: true
      rules:
        - condition:
            conditionType: moreThan
            forOverPct: 100
            ofTheLast:
              specificValue: 5m
            threshold: 0
          override:
            priority: p1
  description: 'Bitbucket one of the container is not running'
  entityLabels:
    app: bitbucket
  name: Bitbucketcontainernotrunning-test
  notificationGroup:
    groupByKeys:
      - pod
    webhooks:
      - integration:
          integrationRef:
            backendRef:
              name: opsgenie-example
        notifyOn: triggeredAndResolved
        retriggeringPeriod:
          minutes: 60
      - integration:
          integrationRef:
            backendRef:
              name: critical-alerts-webhook
        notifyOn: triggeredAndResolved
        retriggeringPeriod:
          minutes: 60
  priority: p1

View File

@@ -0,0 +1,21 @@
-- Health assessment driven by the resource's "RemoteSynced" status condition.
--
-- Input:  `obj`, the resource manifest as a table, supplied by the host that
--         runs this script (presumably Argo CD's custom health check
--         runtime -- confirm against the surrounding repository).
-- Output: a table with `status` and `message` fields:
--   * Healthy     - a RemoteSynced condition has status "True"
--   * Degraded    - a RemoteSynced condition has status "False"
--   * Progressing - no status/conditions yet, no RemoteSynced condition, or
--                   the condition carries any other status value
--
-- `hs` is declared local so the result table does not leak into the global
-- environment of the Lua state executing the script.
local hs = {
  status = "Progressing",
  message = "Waiting for status to be updated",
}

if obj.status ~= nil and obj.status.conditions ~= nil then
  -- The index is not needed; only the condition entries are inspected.
  for _, condition in ipairs(obj.status.conditions) do
    if condition.type == "RemoteSynced" then
      if condition.status == "True" then
        hs.status = "Healthy"
        hs.message = "Resource is ready"
        return hs
      elseif condition.status == "False" then
        hs.status = "Degraded"
        -- Report the condition's own message; fall back to its reason (or a
        -- fixed text) so a Degraded result never carries a nil message.
        hs.message = condition.message
          or condition.reason
          or "RemoteSynced condition is False"
        return hs
      end
    end
  end
end

-- Nothing conclusive was found: keep the default Progressing result.
return hs

View File

@@ -0,0 +1,41 @@
# Expected health results, one entry per fixture under testdata/.
tests:
  - healthStatus:
      status: Degraded
      # Literal block scalar (|-): the expected message is multi-line JSON, so
      # every line break must be kept exactly as written. A folded scalar
      # would only keep them through the "more-indented line" rule, which
      # breaks as soon as the text is re-indented or reflowed.
      message: |-
        error on creating remote recordingRuleGroupSet: SDK API error from /com.coralogixapis.metrics_rule_manager.v1.RuleGroupSets/Create for feature group recording-rules: rpc error: code = InvalidArgument desc = {
          "groups": {
            "0": {
              "rules": {
                "0": {
                  "record": [
                    {
                      "code": "length",
                      "message": null,
                      "params": {
                        "value": "",
                        "min": 1
                      }
                    },
                    {
                      "code": "invalid_promql",
                      "message": "SingleExpr: unexpected token ; want \"\"(\", \"{\", \"-\", \"+\"\"",
                      "params": {
                        "value": ""
                      }
                    }
                  ]
                }
              }
            }
          }
        }
    inputPath: testdata/degraded_recording_rule.yaml
  - healthStatus:
      status: Progressing
      message: "Waiting for status to be updated"
    inputPath: testdata/progressing_recording_rule.yaml
  - healthStatus:
      status: Healthy
      message: "Resource is ready"
    inputPath: testdata/healthy_recording_rule.yaml

View File

@@ -0,0 +1,47 @@
# Fixture: RecordingRuleGroupSet whose RemoteSynced condition is "False".
# The rule below has an `expr` but no `record` key; the failure message
# reports validation errors for "record" with an empty value.
apiVersion: coralogix.com/v1alpha1
kind: RecordingRuleGroupSet
metadata:
  name: rules
spec:
  groups:
    - name: k8s_rules
      rules:
        - expr: >-
            sum(rate(container_cpu_usage_seconds_total{job="kubelet",
            metrics_path="/metrics/cadvisor",
            image!="",
            container!="POD"}[5m]))
            by (namespace)
status:
  conditions:
    - type: RemoteSynced
      status: 'False'
      reason: RemoteCreationFailed
      observedGeneration: 1
      lastTransitionTime: '2025-07-17T14:41:18Z'
      # Literal block scalar: line breaks inside the JSON payload are kept.
      message: |-
        error on creating remote recordingRuleGroupSet: SDK API error from /com.coralogixapis.metrics_rule_manager.v1.RuleGroupSets/Create for feature group recording-rules: rpc error: code = InvalidArgument desc = {
          "groups": {
            "0": {
              "rules": {
                "0": {
                  "record": [
                    {
                      "code": "length",
                      "message": null,
                      "params": {
                        "value": "",
                        "min": 1
                      }
                    },
                    {
                      "code": "invalid_promql",
                      "message": "SingleExpr: unexpected token ; want \"\"(\", \"{\", \"-\", \"+\"\"",
                      "params": {
                        "value": ""
                      }
                    }
                  ]
                }
              }
            }
          }
        }

View File

@@ -0,0 +1,21 @@
# Fixture: RecordingRuleGroupSet whose RemoteSynced condition is "True".
apiVersion: coralogix.com/v1alpha1
kind: RecordingRuleGroupSet
metadata:
  name: rules
spec:
  groups:
    - name: k8s_rules
      rules:
        - record: "namespace:container_cpu_usage_seconds_total:sum_rate"
          expr: >-
            sum(rate(container_cpu_usage_seconds_total{job="kubelet",
            metrics_path="/metrics/cadvisor",
            image!="",
            container!="POD"}[5m]))
            by (namespace)
status:
  conditions:
    - type: RemoteSynced
      status: "True"
      reason: RemoteSyncedSuccessfully
      observedGeneration: 3
      lastTransitionTime: "2025-05-27T08:49:26Z"
      message: 'Remote resource synced'

View File

@@ -0,0 +1,13 @@
# Fixture: RecordingRuleGroupSet that has no status section at all.
apiVersion: coralogix.com/v1alpha1
kind: RecordingRuleGroupSet
metadata:
  name: rules
spec:
  groups:
    - name: k8s_rules
      rules:
        - record: "namespace:container_cpu_usage_seconds_total:sum_rate"
          expr: >-
            sum(rate(container_cpu_usage_seconds_total{job="kubelet",
            metrics_path="/metrics/cadvisor",
            image!="",
            container!="POD"}[5m]))
            by (namespace)