Skip to content

Commit

Permalink
Add DownPGs healthcheck
Browse files Browse the repository at this point in the history
Signed-off-by: Igor Shishkin <[email protected]>
  • Loading branch information
teran committed May 26, 2024
1 parent 641d00c commit a77ebbf
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 0 deletions.
13 changes: 13 additions & 0 deletions models/clusterhealth.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,19 @@ const (
// Dangerous: n/a
ClusterHealthIndicatorTypeQuorum ClusterHealthIndicatorType = "QUORUM"

// ClusterHealthIndicatorTypeDownPGs reflects the number of PGs which are down
//
// Description: Down PGs indicator shows how many PGs are down, i.e. are stored
// on OSDs which are down, with no available copy and no way to reconstruct
// them
//
// Ref: https://docs.ceph.com/en/latest/rados/operations/monitoring-osd-pg/#monitoring-pg-states
//
// Good: 0
// AtRisk: n/a
// Dangerous: >0
ClusterHealthIndicatorTypeDownPGs ClusterHealthIndicatorType = "DOWN_PGS"

// ClusterHealthIndicatorTypeUncleanPGs reflects amount of PGs which are not in clean state
//
// Description: Inactive PGs indicator shows how many PGs are inactive i.e. can not be
Expand Down
15 changes: 15 additions & 0 deletions service/cluster_health/cluster_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,21 @@ func DeviceHealth(ctx context.Context, cr models.ClusterReport) (models.ClusterH
}, nil
}

// DownPGs builds the down-PGs health indicator from the given cluster report.
//
// It reads the "down" entry of cr.NumPGsByState (an absent entry counts as
// zero) and renders the current value as "<down> of <total>", where total is
// cr.NumPGs. The status is Dangerous when at least one PG is reported as down
// and Good otherwise. ctx is not used, and the returned error is always nil.
func DownPGs(ctx context.Context, cr models.ClusterReport) (models.ClusterHealthIndicator, error) {
	// Indexing a map with a missing key yields the zero value, so no explicit
	// presence check is required here.
	down := cr.NumPGsByState["down"]

	indicator := models.ClusterHealthIndicator{
		Indicator:          models.ClusterHealthIndicatorTypeDownPGs,
		CurrentValue:       fmt.Sprintf("%d of %d", down, cr.NumPGs),
		CurrentValueStatus: models.ClusterHealthIndicatorStatusGood,
	}

	// Any down PG at all escalates the indicator straight to Dangerous.
	if down > 0 {
		indicator.CurrentValueStatus = models.ClusterHealthIndicatorStatusDangerous
	}

	return indicator, nil
}

func InactivePGs(ctx context.Context, cr models.ClusterReport) (models.ClusterHealthIndicator, error) {
st := models.ClusterHealthIndicatorStatusGood

Expand Down
45 changes: 45 additions & 0 deletions service/cluster_health/cluster_health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,51 @@ func TestDeviceHealth(t *testing.T) {
}
}

// TestDownPGs verifies the indicator produced by DownPGs for two reports:
// one without a "down" entry in NumPGsByState (expects "0 of 10" and Good)
// and one with five PGs counted as "down" (expects "5 of 10" and Dangerous).
func TestDownPGs(t *testing.T) {
	tcs := []testCase{
		{
			// The "down" key is absent from the map, so the count must
			// fall back to zero.
			name: "no down PGs",
			in: models.ClusterReport{
				NumPGs: 10,
				NumPGsByState: map[string]uint32{
					"active": 10,
					"clean":  10,
				},
			},
			expOut: models.ClusterHealthIndicator{
				Indicator:          models.ClusterHealthIndicatorTypeDownPGs,
				CurrentValue:       "0 of 10",
				CurrentValueStatus: models.ClusterHealthIndicatorStatusGood,
			},
		},
		{
			name: "some down PGs",
			in: models.ClusterReport{
				NumPGs: 10,
				NumPGsByState: map[string]uint32{
					"active": 3,
					"down":   5,
				},
			},
			expOut: models.ClusterHealthIndicator{
				Indicator:          models.ClusterHealthIndicatorTypeDownPGs,
				CurrentValue:       "5 of 10",
				CurrentValueStatus: models.ClusterHealthIndicatorStatusDangerous,
			},
		},
	}

	for _, tc := range tcs {
		t.Run(tc.name, func(t *testing.T) {
			r := require.New(t)

			i, err := DownPGs(context.Background(), tc.in)
			r.NoError(err)
			r.Equal(tc.expOut, i)
		})
	}
}

func TestInactivePGs(t *testing.T) {
tcs := []testCase{
{
Expand Down
1 change: 1 addition & 0 deletions service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ var clusterHealthChecksList = []clusterHealth.ClusterHealthCheck{
clusterHealth.OSDsDown,
clusterHealth.OSDsOut,
clusterHealth.MutesAmount,
clusterHealth.DownPGs,
clusterHealth.UncleanPGs,
clusterHealth.InactivePGs,
clusterHealth.AllowCrimson,
Expand Down
5 changes: 5 additions & 0 deletions service/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@ func (s *serviceTestSuite) TestCheckClusterHealth() {
CurrentValue: "0 of 0",
CurrentValueStatus: models.ClusterHealthIndicatorStatusGood,
},
{
Indicator: models.ClusterHealthIndicatorTypeDownPGs,
CurrentValue: "0 of 330",
CurrentValueStatus: models.ClusterHealthIndicatorStatusGood,
},
{
Indicator: models.ClusterHealthIndicatorTypeUncleanPGs,
CurrentValue: "52 of 330",
Expand Down

0 comments on commit a77ebbf

Please sign in to comment.