feat: Add health check for Ceph CRD (#24111)

Signed-off-by: Arnaud Farbos <afarbos@nvidia.com>
This commit is contained in:
afarbos
2025-11-18 14:16:40 -08:00
committed by GitHub
parent 27715cd556
commit 9ee5cca38b
17 changed files with 310 additions and 0 deletions

View File

@@ -0,0 +1,66 @@
-- CRD documentation: https://rook.github.io/docs/rook/latest-release/CRDs/Cluster/ceph-cluster-crd/
-- Health result returned at the end of the script. It starts as "Progressing"
-- with an empty message; the checks further down refine both fields.
local hs = {
  status = "Progressing",
  message = ""
}

-- Appends a fragment to hs.message, joining fragments with " - ".
-- An empty fragment is ignored so the message never gets a dangling separator.
-- Declared `local` so the helper does not leak into the global environment.
-- @param message string fragment to append
local function append_to_message(message)
  if message == "" then
    return
  end
  if hs.message ~= "" then
    hs.message = hs.message .. " - " .. message
  else
    hs.message = message
  end
end
-- No status block yet: report that and keep the initial status.
if obj.status == nil then
  append_to_message("Waiting for status to be reported")
  return hs
end

-- Check the main Ceph health status first - https://github.com/ceph/ceph/blob/v20.3.0/src/include/health.h#L12
if obj.status.ceph ~= nil and obj.status.ceph.health ~= nil then
  local ceph_health = obj.status.ceph.health
  local details_message = ""
  -- Build details message from status.ceph.details if available
  if obj.status.ceph.details ~= nil then
    local details = obj.status.ceph.details
    -- pairs() visits keys in an unspecified order, so collect the keys of
    -- the entries that carry a message and sort them; this keeps the
    -- resulting health message stable between evaluations.
    local detail_keys = {}
    for detail_type, detail_info in pairs(details) do
      if type(detail_info) == "table" and detail_info.message ~= nil then
        detail_keys[#detail_keys + 1] = detail_type
      end
    end
    -- Compare via tostring so sorting cannot fail on non-string keys.
    table.sort(detail_keys, function(a, b)
      return tostring(a) < tostring(b)
    end)
    local detail_parts = {}
    for _, detail_type in ipairs(detail_keys) do
      detail_parts[#detail_parts + 1] = details[detail_type].message
    end
    if #detail_parts > 0 then
      details_message = " (" .. table.concat(detail_parts, "; ") .. ")"
    end
  end
  -- Any other health value leaves the status untouched (still "Progressing").
  if ceph_health == "HEALTH_ERR" or ceph_health == "HEALTH_WARN" then
    hs.status = "Degraded"
  elseif ceph_health == "HEALTH_OK" then
    hs.status = "Healthy"
  end
  append_to_message("Ceph health is " .. ceph_health .. details_message)
end

-- Check state - https://github.com/rook/rook/blob/v1.17.7/pkg/apis/ceph.rook.io/v1/types.go#L621
-- The state is only consulted when the Ceph health check above produced
-- "Healthy"; it can then confirm that result or downgrade it.
-- NOTE(review): when status.ceph.health is absent, the state (even "Error")
-- is not evaluated and the result stays "Progressing" - confirm this is intended.
if obj.status.state ~= nil then
  if hs.status == "Healthy" then
    append_to_message("Ceph cluster state is " .. obj.status.state)
    if obj.status.state == "Created" or obj.status.state == "Connected" then
      hs.status = "Healthy"
    elseif obj.status.state == "Error" then
      hs.status = "Degraded"
    else
      hs.status = "Progressing"
    end
  end
end

-- The top-level status message, when present, is always appended last.
if obj.status.message ~= nil then
  append_to_message(obj.status.message)
end
return hs

View File

@@ -0,0 +1,21 @@
tests:
- healthStatus:
status: Healthy
message: 'Ceph health is HEALTH_OK - Ceph cluster state is Created - Cluster created successfully'
inputPath: testdata/healthy.yaml
- healthStatus:
status: Degraded
message: 'Ceph health is HEALTH_WARN (4 osds down; 2 pools degraded) - Cluster has warnings'
inputPath: testdata/degraded_warn.yaml
- healthStatus:
status: Degraded
message: 'Ceph health is HEALTH_ERR (8 osds down) - Cluster has critical errors'
inputPath: testdata/degraded_error.yaml
- healthStatus:
status: Progressing
message: 'Cluster is being created'
inputPath: testdata/state_creating.yaml
- healthStatus:
status: Progressing
message: 'Waiting for status to be reported'
inputPath: testdata/no_status.yaml

View File

@@ -0,0 +1,17 @@
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: test-ceph-cluster
namespace: rook-ceph
spec:
cephVersion:
image: quay.io/ceph/ceph:v19.2.0
status:
ceph:
health: HEALTH_ERR
details:
OSD_DOWN:
message: 8 osds down
severity: HEALTH_ERR
state: Error
message: Cluster has critical errors

View File

@@ -0,0 +1,20 @@
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: test-ceph-cluster
namespace: rook-ceph
spec:
cephVersion:
image: quay.io/ceph/ceph:v19.2.0
status:
ceph:
health: HEALTH_WARN
details:
OSD_DOWN:
message: 4 osds down
severity: HEALTH_WARN
POOL_DEGRADED:
message: 2 pools degraded
severity: HEALTH_WARN
state: Created
message: Cluster has warnings

View File

@@ -0,0 +1,44 @@
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: test-ceph-cluster
namespace: rook-ceph
spec:
cephVersion:
image: quay.io/ceph/ceph:v19.2.0
mon:
count: 3
mgr:
count: 2
storage:
useAllNodes: true
useAllDevices: false
status:
ceph:
health: HEALTH_OK
lastChecked: "2025-08-11T16:03:08Z"
fsid: c121226d-cac9-492f-8b0b-c05693243380
capacity:
bytesAvailable: 35183103942656
bytesTotal: 35184372088832
bytesUsed: 1268146176
lastUpdated: "2025-08-11T16:03:08Z"
conditions:
- lastHeartbeatTime: "2025-08-11T16:03:08Z"
lastTransitionTime: "2025-08-11T16:03:08Z"
message: Cluster created successfully
reason: ClusterCreated
status: "True"
type: Ready
- lastHeartbeatTime: "2025-08-11T16:03:08Z"
lastTransitionTime: "2025-08-11T16:03:08Z"
message: All OSDs are running
reason: OSDsRunning
status: "True"
type: Progressing
message: Cluster created successfully
phase: Ready
state: Created
version:
image: quay.io/ceph/ceph:v19.2.0
version: 19.2.0-0

View File

@@ -0,0 +1,8 @@
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: test-ceph-cluster
namespace: rook-ceph
spec:
cephVersion:
image: quay.io/ceph/ceph:v19.2.0

View File

@@ -0,0 +1,11 @@
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: test-ceph-cluster
namespace: rook-ceph
spec:
cephVersion:
image: quay.io/ceph/ceph:v19.2.0
status:
state: Creating
message: Cluster is being created

View File

@@ -0,0 +1,26 @@
-- CRD documentation: https://rook.github.io/docs/rook/latest-release/CRDs/Object-Storage/ceph-object-store-crd/
-- Status documentation: https://github.com/rook/rook/blob/v1.17.7/pkg/apis/ceph.rook.io/v1/types.go#L1960
-- Result table; these defaults are returned as-is when no status exists.
local health = {
  status = "Progressing",
  message = "Waiting for status to be reported"
}

local status = obj.status
if status == nil then
  return health
end

-- phase status check - https://github.com/rook/rook/blob/v1.17.7/pkg/apis/ceph.rook.io/v1/types.go#L596
-- Phases that change the status; every other phase keeps "Progressing".
local status_by_phase = {
  Ready = "Healthy",
  Failure = "Degraded"
}

local phase = status.phase
if phase ~= nil then
  health.message = "Ceph object store phase is " .. phase
  local mapped = status_by_phase[phase]
  if mapped ~= nil then
    health.status = mapped
  end
end

-- Append the endpoint to the message when one is reported and non-empty.
local info = status.info
if info ~= nil then
  local endpoint = info.endpoint
  if endpoint ~= nil and endpoint ~= "" then
    health.message = health.message .. " - endpoint: " .. endpoint
  end
end

return health

View File

@@ -0,0 +1,13 @@
tests:
- healthStatus:
status: Healthy
message: "Ceph object store phase is Ready - endpoint: http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80"
inputPath: testdata/healthy.yaml
- healthStatus:
status: Progressing
message: "Ceph object store phase is Pending"
inputPath: testdata/progressing.yaml
- healthStatus:
status: Degraded
message: "Ceph object store phase is Failure"
inputPath: testdata/degraded.yaml

View File

@@ -0,0 +1,7 @@
apiVersion: ceph.rook.io/v1
kind: CephObjectStore
metadata:
name: ceph-objectstore
spec: {}
status:
phase: Failure

View File

@@ -0,0 +1,9 @@
apiVersion: ceph.rook.io/v1
kind: CephObjectStore
metadata:
name: ceph-objectstore
spec: {}
status:
phase: Ready
info:
endpoint: "http://rook-ceph-rgw-ceph-objectstore.rook-ceph.svc:80"

View File

@@ -0,0 +1,7 @@
apiVersion: ceph.rook.io/v1
kind: CephObjectStore
metadata:
name: ceph-objectstore
spec: {}
status:
phase: Pending

View File

@@ -0,0 +1,21 @@
-- CRD documentation: https://doc.crds.dev/github.com/kube-object-storage/lib-bucket-provisioner/objectbucket.io/ObjectBucketClaim/v1alpha1@kubernetes-v1.14.1
-- Result table; these defaults are returned as-is when no status exists.
local health = {
  status = "Progressing",
  message = "Waiting for status to be reported"
}

local status = obj.status
if status == nil then
  return health
end

-- phase status check - https://github.com/kube-object-storage/lib-bucket-provisioner/blob/ffa47d5/pkg/apis/objectbucket.io/v1alpha1/objectbucketclaim_types.go#L58
local phase = status.phase
if phase == nil then
  -- Status exists but carries no phase yet.
  health.message = "Waiting for phase to be reported"
  return health
end

health.message = "Object bucket claim phase is " .. phase
-- Only "Bound" and "Failed" change the status; other phases stay "Progressing".
if phase == "Bound" then
  health.status = "Healthy"
elseif phase == "Failed" then
  health.status = "Degraded"
end

return health

View File

@@ -0,0 +1,13 @@
tests:
- healthStatus:
status: Healthy
message: "Object bucket claim phase is Bound"
inputPath: testdata/healthy.yaml
- healthStatus:
status: Progressing
message: "Object bucket claim phase is Pending"
inputPath: testdata/progressing.yaml
- healthStatus:
status: Degraded
message: "Object bucket claim phase is Failed"
inputPath: testdata/degraded.yaml

View File

@@ -0,0 +1,9 @@
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: test-bucket-claim
spec:
bucketName: test-bucket
storageClassName: ceph-bucket
status:
phase: Failed

View File

@@ -0,0 +1,9 @@
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: test-bucket-claim
spec:
bucketName: test-bucket
storageClassName: ceph-bucket
status:
phase: Bound

View File

@@ -0,0 +1,9 @@
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: test-bucket-claim
spec:
bucketName: test-bucket
storageClassName: ceph-bucket
status:
phase: Pending