Skip to content

Commit

Permalink
feat(health): fancy composable status
Browse files Browse the repository at this point in the history
  • Loading branch information
vknabel committed Jul 26, 2024
1 parent 9ba8b42 commit 14b8a91
Show file tree
Hide file tree
Showing 6 changed files with 411 additions and 179 deletions.
111 changes: 111 additions & 0 deletions healthstatus/async-check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package healthstatus

import (
"context"
"log/slog"
"time"

"golang.org/x/sync/semaphore"
)

// AsyncHealthCheck wraps a HealthCheck and answers Check with the most
// recently stored result instead of evaluating the wrapped check on every
// call. The stored result is refreshed by the goroutine launched in Start
// and by ForceUpdateStatus.
//
// NOTE(review): current and ticker are read and written without a lock
// although the refresh runs in its own goroutine; confirm whether callers
// rely on this being safe for concurrent use.
type AsyncHealthCheck struct {
// healthCheck is the wrapped check that updateStatus evaluates.
healthCheck HealthCheck
log *slog.Logger
// healthCheckInterval is the ticker period; updateStatus uses half of it
// as the timeout of a single evaluation.
healthCheckInterval time.Duration

// sem has weight 1 and keeps ticker-driven and forced updates from
// running at the same time.
sem *semaphore.Weighted
// current is the last result stored by updateStatus.
current currentState
// ticker stays nil until Start has been called.
ticker *time.Ticker
}

// Async wraps hc in an AsyncHealthCheck that re-evaluates hc every interval
// once it has been started and reports through log.
//
// Until the first evaluation has finished, the returned check reports
// HealthStatusHealthy with an empty message and no error.
func Async(log *slog.Logger, interval time.Duration, hc HealthCheck) *AsyncHealthCheck {
	initial := currentState{
		Status: HealthResult{
			Status:  HealthStatusHealthy,
			Message: "",
		},
	}

	check := new(AsyncHealthCheck)
	check.log = log
	check.healthCheck = hc
	check.healthCheckInterval = interval
	// Weight 1: at most one status update may run at any time.
	check.sem = semaphore.NewWeighted(1)
	check.current = initial
	return check
}

// ServiceName returns the service name reported by the wrapped health check.
func (r *AsyncHealthCheck) ServiceName() string {
	name := r.healthCheck.ServiceName()
	return name
}

func (c *AsyncHealthCheck) Check(context.Context) (HealthResult, error) {
c.log.Debug("checked async")
if c.ticker == nil {
c.Start(context.Background())

Check failure on line 43 in healthstatus/async-check.go

View workflow job for this annotation

GitHub Actions / build

Non-inherited new context, use function like `context.WithXXX` instead (contextcheck)
}
return c.current.Status, c.current.Err
}

// Start begins the periodic background evaluation of the wrapped check. One
// evaluation is attempted immediately, further ones on every tick of
// healthCheckInterval. The loop ends when ctx is done.
//
// Every evaluation, including the initial one, runs only while holding the
// semaphore, so it can never overlap with ForceUpdateStatus or with another
// tick; if an update is already in progress the evaluation is skipped.
//
// If a ticker already exists it is reset to the interval instead of being
// replaced.
//
// NOTE(review): every call launches a new goroutine, and Stop only stops the
// ticker; the goroutine itself ends only when ctx is done.
func (r *AsyncHealthCheck) Start(ctx context.Context) {
	r.log.Debug("started async updates")
	if r.ticker != nil {
		r.ticker.Reset(r.healthCheckInterval)
	} else {
		r.ticker = time.NewTicker(r.healthCheckInterval)
	}
	// Hand the goroutine its own reference so that it does not re-read the
	// struct field on every loop iteration.
	ticker := r.ticker

	// refresh performs one guarded evaluation of the wrapped check.
	refresh := func() {
		if !r.sem.TryAcquire(1) {
			r.log.Info("skip updating health status because update is still running")
			return
		}
		defer r.sem.Release(1)

		if err := r.updateStatus(ctx); err != nil {
			r.log.Error("services are unhealthy", "error", err)
		}
	}

	go func() {
		refresh()

		for {
			select {
			case <-ctx.Done():
				r.log.Info("stop health checking, context is done")
				return
			case <-ticker.C:
				refresh()
			}
		}
	}()
}

// Stop stops the ticker so that no further periodic updates are triggered.
// Calling Stop before the updates were ever started is a no-op instead of a
// nil pointer dereference.
//
// The ctx parameter is currently unused. The goroutine launched by Start is
// not terminated by Stop; it ends when the context passed to Start is done.
func (r *AsyncHealthCheck) Stop(ctx context.Context) {
	if r.ticker == nil {
		return
	}
	r.ticker.Stop()
}

// ForceUpdateStatus evaluates the wrapped check right away and stores the
// result. It waits until no other update is running; if ctx is done before
// the semaphore could be acquired, the error from Acquire is returned and
// nothing is evaluated.
//
// The error of the evaluation is logged and returned.
func (r *AsyncHealthCheck) ForceUpdateStatus(ctx context.Context) error {
	if err := r.sem.Acquire(ctx, 1); err != nil {
		return err
	}
	// Release via defer: if the wrapped check panics and a caller recovers,
	// the semaphore must not stay held, or every later update would be
	// skipped or block forever.
	defer r.sem.Release(1)

	err := r.updateStatus(ctx)
	if err != nil {
		r.log.Error("services are unhealthy", "error", err)
	}
	return err
}

func (r *AsyncHealthCheck) updateStatus(ctx context.Context) error {

Check failure on line 98 in healthstatus/async-check.go

View workflow job for this annotation

GitHub Actions / build

Non-inherited new context, use function like `context.WithXXX` instead (contextcheck)
r.log.Info("evaluating current service health statuses")
if ctx == nil {
ctx = context.Background()
}

ctx, cancel := context.WithTimeout(ctx, r.healthCheckInterval/2)
defer cancel()

res, err := r.healthCheck.Check(ctx)
r.current = currentState{res, err}
r.log.Debug("evaluated current service health statuses", "current", r.current)
return err
}
44 changes: 44 additions & 0 deletions healthstatus/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package healthstatus

import "context"

// HealthStatus indicates the health of a service. It is a plain string type;
// the constants below are the values defined by this package.
type HealthStatus string

const (
// HealthStatusHealthy is returned when the service is healthy.
HealthStatusHealthy HealthStatus = "healthy"
// HealthStatusUnhealthy is returned when the service is not healthy.
HealthStatusUnhealthy HealthStatus = "unhealthy"
// HealthStatusDegraded is returned when the service is degraded.
HealthStatusDegraded HealthStatus = "degraded"
// HealthStatusPartiallyUnhealthy is returned when the service is partially not healthy.
// Note that its string value is "partial-outage".
HealthStatusPartiallyUnhealthy HealthStatus = "partial-outage"
)

// HealthCheck defines an interface for health checks.
type HealthCheck interface {
// ServiceName returns the name of the service that is health checked.
ServiceName() string
// Check evaluates the service and returns its health result and an error.
Check(ctx context.Context) (HealthResult, error)
}

// HealthResult holds the health state of a service.
type HealthResult struct {
// Status indicates the overall health state.
Status HealthStatus
// Message gives additional information on the overall health state.
Message string
// Services contains the individual health results of the services as
// evaluated by the HealthCheck interface. The overall HealthStatus is then
// derived automatically from the results of the health checks.
//
// Note that the individual HealthResults evaluated by the HealthCheck
// interface may again consist of a plurality of services. While this is
// optional, it allows for creating nested health structures. These can be
// used for more sophisticated scenarios like evaluating platform health,
// describing service availability in different locations, or similar.
//
// If using nested HealthResults, the status of the parent service can be
// derived automatically from the status of its children by leaving the
// parent's health status field blank.
Services map[string]HealthResult
}

// currentState pairs a HealthResult with the error that was returned
// together with it by a health check.
type currentState struct {
// Status is the health result returned by the check.
Status HealthResult
// Err is the error returned by the check; nil if the check succeeded.
Err error
}
45 changes: 45 additions & 0 deletions healthstatus/deferred-error-check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package healthstatus

import (
"context"
)

// DeferredErrorHealthCheck wraps a HealthCheck and answers with the last
// successful result while the wrapped check fails, until more than
// maxIgnoredErrors errors have occurred since the last success.
type DeferredErrorHealthCheck struct {
// maxIgnoredErrors is the number of errors since the last success that
// Check still answers with the last successful result.
maxIgnoredErrors int
// errorCountSinceSuccess counts the errors since the wrapped check last
// succeeded; it is reset to 0 on every success.
errorCountSinceSuccess int
// lastSuccess is the most recent result that came without an error.
// DeferErrors seeds it with a healthy result.
lastSuccess currentState
// healthCheck is the wrapped check.
healthCheck HealthCheck
}

// DeferErrors wraps hc so that its errors only surface once more than
// maxIgnoredErrors of them have occurred since the last successful check.
// Before the first successful check, the remembered result is
// HealthStatusHealthy with an empty message.
func DeferErrors(maxIgnoredErrors int, hc HealthCheck) *DeferredErrorHealthCheck {
	deferred := &DeferredErrorHealthCheck{
		healthCheck:      hc,
		maxIgnoredErrors: maxIgnoredErrors,
	}
	deferred.lastSuccess.Status = HealthResult{
		Status:  HealthStatusHealthy,
		Message: "",
	}
	return deferred
}

// ServiceName returns the service name reported by the wrapped health check.
func (c *DeferredErrorHealthCheck) ServiceName() string {
	name := c.healthCheck.ServiceName()
	return name
}

func (c *DeferredErrorHealthCheck) Check(ctx context.Context) (HealthResult, error) {
status, err := c.healthCheck.Check(ctx)
state := currentState{status, err}

if err == nil {
c.errorCountSinceSuccess = 0
c.lastSuccess = state
return status, err

Check failure on line 38 in healthstatus/deferred-error-check.go

View workflow job for this annotation

GitHub Actions / build

error is nil (line 32) but it returns error (nilerr)
}
c.errorCountSinceSuccess++
if c.errorCountSinceSuccess > c.maxIgnoredErrors {
return status, err
}
return c.lastSuccess.Status, c.lastSuccess.Err
}
141 changes: 141 additions & 0 deletions healthstatus/grouped-check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package healthstatus

import (
"context"
"log/slog"
"sync"

"golang.org/x/sync/errgroup"
)

// groupedHealthCheck combines several health checks under one service name.
type groupedHealthCheck struct {
// serviceName is the name returned by ServiceName for the whole group.
serviceName string
// hcs are the member checks that Check evaluates.
hcs []HealthCheck
// log receives an error entry for every member check that fails.
log *slog.Logger
}

// Grouped creates a health check named serviceName that combines the given
// checks. Failing member checks are reported through log.
func Grouped(log *slog.Logger, serviceName string, checks ...HealthCheck) *groupedHealthCheck {
	group := new(groupedHealthCheck)
	group.log = log
	group.serviceName = serviceName
	group.hcs = checks
	return group
}

// Add appends hc to the checks of the group. It performs no locking.
func (c *groupedHealthCheck) Add(hc HealthCheck) {
	checks := append(c.hcs, hc)
	c.hcs = checks
}

// ServiceName returns the name the group was created with.
func (c *groupedHealthCheck) ServiceName() string {
	name := c.serviceName
	return name
}
// Check evaluates all checks of the group concurrently and combines their
// results.
//
// The result of every member check is stored in Services under the member's
// service name; for a failing member the message is replaced by the error
// text and the failure is logged. The overall status is derived from the
// member results with DeriveOverallHealthStatus. The returned error is the
// one reported by the errgroup (nil if no member failed); its text also
// becomes the overall message.
//
// A group without checks is healthy and has nil Services.
func (c *groupedHealthCheck) Check(ctx context.Context) (HealthResult, error) {
	if len(c.hcs) == 0 {
		return HealthResult{
			Status:   HealthStatusHealthy,
			Message:  "",
			Services: nil,
		}, nil
	}

	var (
		mu       sync.Mutex // guards services
		services = make(map[string]HealthResult)
	)

	// The context derived by the errgroup is deliberately not used: a
	// failing member must not cancel the evaluation of the other members.
	g, _ := errgroup.WithContext(ctx)

	for _, member := range c.hcs {
		member := member
		name := member.ServiceName()

		g.Go(func() error {
			res, err := member.Check(ctx)
			if err != nil {
				res.Message = err.Error()
				c.log.Error("unhealthy service", "name", name, "status", res.Status, "error", err)
			}

			mu.Lock()
			services[name] = res
			mu.Unlock()

			return err
		})
	}

	err := g.Wait()

	overall := HealthResult{Services: services}
	if err != nil {
		overall.Message = err.Error()
	}
	overall.Status = DeriveOverallHealthStatus(services)
	return overall, err
}

// DeriveOverallHealthStatus computes the status of a parent from the
// statuses of its services.
//
// A service with an empty status and nested services first gets its own
// status derived recursively; the derived value is written back into the
// given map. Degraded services count as degraded; every status other than
// healthy or degraded (including unknown and empty ones) counts as
// unhealthy.
//
// The result is, in order of precedence:
//   - HealthStatusHealthy if services is empty,
//   - HealthStatusUnhealthy if all services count as unhealthy,
//   - HealthStatusPartiallyUnhealthy if at least one counts as unhealthy,
//   - HealthStatusDegraded if at least one is degraded,
//   - HealthStatusHealthy otherwise.
func DeriveOverallHealthStatus(services map[string]HealthResult) HealthStatus {
	var degradedCount, unhealthyCount int

	for name, svc := range services {
		if svc.Status == "" && len(svc.Services) > 0 {
			svc.Status = DeriveOverallHealthStatus(svc.Services)
		}
		services[name] = svc

		switch svc.Status {
		case HealthStatusHealthy:
			// Healthy services do not influence the counters.
		case HealthStatusDegraded:
			degradedCount++
		default:
			unhealthyCount++
		}
	}

	switch total := len(services); {
	case total == 0:
		return HealthStatusHealthy
	case unhealthyCount == total:
		return HealthStatusUnhealthy
	case unhealthyCount > 0:
		return HealthStatusPartiallyUnhealthy
	case degradedCount > 0:
		return HealthStatusDegraded
	default:
		return HealthStatusHealthy
	}
}
Loading

0 comments on commit 14b8a91

Please sign in to comment.